In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line in os.walk() traversal order.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data exploration

## Data extraction

In [None]:
# Load the competition's training and test tables; in both files the 'id'
# column becomes the DataFrame index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col = 'id')
    for csv_path in (
        '/kaggle/input/30-days-of-ml/train.csv',
        '/kaggle/input/30-days-of-ml/test.csv',
    )
)

## Basic information

In [None]:
# Print the (rows, columns) shape of each frame under its label; the empty
# string yields the blank line that separates the two sections.
print('df_train:', df_train.shape, '', 'df_test:', df_test.shape, sep = '\n')

In [None]:
# Print the first rows of each frame under its label; the empty string
# yields the blank line that separates the two sections.
print('df_train:', df_train.head(), '', 'df_test:', df_test.head(), sep = '\n')

In [None]:
# Print the column/dtype/non-null summary of each frame under its label.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly: wrapping it in str()/print() would append a stray
# "None" line after each report.
print('df_train:')
df_train.info()
print('\ndf_test:')
df_test.info()

In [None]:
# Print the describe() summary statistics of each frame under its label; the
# empty string yields the blank line that separates the two sections.
print('df_train:', df_train.describe(), '', 'df_test:', df_test.describe(), sep = '\n')

## Missing values information

In [None]:
# Count the null entries in every column of each frame.
df_train_missing_val_count = df_train.isnull().sum()
df_test_missing_val_count = df_test.isnull().sum()

# Print both per-column counts, separated by a blank line.
print(
    'df_train:',
    df_train_missing_val_count,
    '',
    'df_test:',
    df_test_missing_val_count,
    sep = '\n',
)

## Categorical variables information

In [None]:
# Columns stored with dtype 'object' are treated as the categorical variables.
object_cols = [col for col in df_train.columns if df_train[col].dtype == 'object']

# Show how many distinct labels each categorical column has in each frame.
for col in object_cols:
    print('df_train[' + col + ']:' + str(df_train[col].nunique()))
    print('df_test[' + col + ']:' + str(df_test[col].nunique()) + '\n')

# A column is "good" when every label that appears in the test set also
# appears in the training set: an encoder fitted on the training data has
# then seen every label it will meet at prediction time. (The check must be
# test-within-train; the opposite direction says nothing about unseen labels.)
good_label_cols = [col for col in object_cols if set(df_test[col]).issubset(set(df_train[col]))]

# Remaining columns, kept in their original column order.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

if not bad_label_cols:
    print('No bad label columns.')
else:
    # str() is required: concatenating a list onto a str raises TypeError.
    print('Bad label column(s): ' + str(bad_label_cols))

# Data preparation

## Data split

In [None]:
# Separate the prediction target from the feature columns.
y = df_train['target']
X = df_train.drop(columns = ['target'])

# Print the shape of the feature frame and of the target, separated by a
# blank line.
print('X:', X.shape, '', 'y:', y.shape, sep = '\n')

## Column selection

In [None]:
# Keep only categorical columns with fewer than 10 distinct labels in the
# training features, so one-hot encoding adds a bounded number of columns.
# The previous condition OR-ed this count with itself (a no-op), so the
# selection is unchanged.
# NOTE(review): if the intent was to also bound the cardinality in df_test,
# add `and df_test[col].nunique() < 10` — confirm.
categorical_cols = [col for col in object_cols if X[col].nunique() < 10]

# Numerical features are the columns stored as 64-bit integers or floats.
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

# Restrict both the training and the test features to the same selected
# columns; .copy() keeps the new frames independent of X and df_test.
cols_selected = categorical_cols + numerical_cols
X_train = X[cols_selected].copy()
X_test = df_test[cols_selected].copy()

print('columns selected: \n' + str(cols_selected) + '\n' * 2 + 'X_train:')
print(str(X_train.shape) + '\n' * 2 + 'X_test:')
print(X_test.shape)

# Model building

## Hyperparameter selection

In [None]:
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score

# Grid search over n_estimators x learning_rate for an XGBoost regressor,
# scored by the mean 5-fold cross-validated mean squared error (lower is
# better).

# The preprocessing is the same for every candidate, so it is built once.
# cross_val_score fits clones of the pipeline, so this shared instance is
# left unfitted and can be reused when training the final model.
categorical_transformer = Pipeline(steps = [
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])
preprocessor = ColumnTransformer(
    transformers = [
        ('cat', categorical_transformer, categorical_cols)
    ],
    # ColumnTransformer defaults to remainder='drop', which would silently
    # discard every column not listed above (i.e. all numerical features).
    # Pass them through unchanged so the model actually sees them.
    remainder = 'passthrough'
)

# Maps '<n_estimators>_<learning_rate>' to the mean cross-validated MSE.
average_score = {}

# dtype=int replaces the np.int alias, which was removed in NumPy 1.24;
# .tolist() hands plain Python ints (300, 400, 500, 600) to XGBoost.
for n_estimators in np.linspace(300, 600, 4, dtype = int).tolist():
    for learning_rate in [0.1, 0.075, 0.05, 0.025, 0.001]:
        model = XGBRegressor(
            n_estimators = n_estimators,
            learning_rate = learning_rate,
            random_state = 0,
            booster = 'gbtree',
            max_depth = 3
        )
        pipeline = Pipeline(steps = [
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        # The scorer returns negated MSE; flip the sign back to a plain MSE.
        scores = -1 * cross_val_score(pipeline, X_train, y,
                                     cv = 5,
                                     scoring = 'neg_mean_squared_error')
        mean_score = scores.mean()
        print(str(n_estimators) + ' and ' + str(learning_rate) + ': ' + str(mean_score))
        average_score[str(n_estimators) + '_' + str(learning_rate)] = mean_score

# The best candidate is the key with the smallest mean MSE.
best_parameters = min(average_score, key = average_score.get)
print('Best parameters: ' + str(best_parameters))

## Model training

In [None]:
# Hyperparameters of the final model, written out explicitly.
chosen_params = dict(
    n_estimators = 300,
    learning_rate = 0.025,
    random_state = 0,
    booster = 'gbtree',
    max_depth = 3,
)
model_chosen = XGBRegressor(**chosen_params)

# Chain the `preprocessor` object created in the parameter-search cell with
# the final model.
chosen_steps = [
    ('preprocessor', preprocessor),
    ('model', model_chosen),
]
pipeline_chosen = Pipeline(steps = chosen_steps)

# Fit on the full training set; as the cell's last expression, the fitted
# pipeline is also displayed.
pipeline_chosen.fit(X_train, y)

## Prediction

In [None]:
# Run the fitted pipeline (preprocessing followed by the model) on the test
# features to obtain one predicted target per test row.
y_pred = pipeline_chosen.predict(X_test)
# Print the predictions as a quick visual sanity check.
print(y_pred)

# Data submission

In [None]:
# Pair every test-row id (the index of X_test) with its predicted target.
submission_columns = {'id': X_test.index, 'target': y_pred}
output = pd.DataFrame(submission_columns)

# Write the submission file; index = False keeps the frame's own row index
# out of the CSV, leaving only the 'id' and 'target' columns.
output.to_csv('my_submission.csv', index = False)