In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
#Import OS and utility libraries
import os
import time
from pathlib import Path
import sys
import pickle
import joblib

#Import data manipulation libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Import plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Splitting the data and cross-validation
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score,KFold, GroupKFold, StratifiedKFold

#Metrics
#from sklearn.metrics import classification_report,accuracy_score , roc_auc_score
from sklearn.metrics import mean_squared_error
#Classifiers
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

#Feature engineering
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_transformer



In [2]:
# Notebook-wide settings: warning handling, pandas display options, plot theme
import warnings

# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# pandas display: no limit on the number of columns shown, 2-digit display precision
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)

# Alternative seaborn theme, currently disabled
#sns.set_theme(style="darkgrid")

# Use matplotlib's dark background style for all figures
plt.style.use('dark_background')

In [3]:
# Load the competition data. Both files sit in the same Kaggle input folder, so the
# folder is spelled out once and reused for each file.
DATA_DIR = Path("/kaggle/input/tabular-playground-series-feb-2021")

train_import = pd.read_csv(DATA_DIR / "train.csv")
test_import = pd.read_csv(DATA_DIR / "test.csv")

#### List of different feature sets

In [4]:
# Lists of columns grouped by their role
# Name of the column to predict
target_label = "target"
# Every column except the target (driven by target_label so the name lives in one place)
all_feature = [col for col in train_import.columns if col != target_label]
# Features stored as float64
all_numeric = [col for col in all_feature if train_import[col].dtype == "float64"]
# Features stored with object dtype (strings)
all_categorical = [col for col in all_feature if train_import[col].dtype == "object"]
# NOTE(review): a column of any other dtype (e.g. an integer identifier) stays in
# all_feature without appearing in either sub-list -- confirm that is intended.

In [5]:
#Step 0: Drop non-essential columns
#Step 1: Perform imputation or encoding of missing data
#Step 2: Perform encoding for the different categorical columns
#Step 3: feature transformation of numeric features
#Step 4: feature generation by aggregation/groupby function
#Step 5: 

### Create the feature matrix and target values

In [6]:
# Build the training feature matrix and the matching target array.
# The rows are shuffled with a fixed seed: without it every run produces a different
# row order, and therefore different folds and different cross-validation scores.
train = train_import.sample(frac=1, random_state=42).copy()
train_X = train.loc[:, all_feature]
# Selecting with a list keeps the target two-dimensional, i.e. shape (n_rows, 1)
train_y = train.loc[:, [target_label]].values
train_X.shape, train_y.shape

((300000, 25), (300000, 1))

### Encoding

In [7]:
#Start with basic OHE-encoding
import category_encoders as ce

ohe_encoder = ce.one_hot.OneHotEncoder(use_cat_names = True) 
#ohe_transformer = Pipeline(steps=[('ohe',ce.one_hot.OneHotEncoder(use_cat_names = True))])

In [8]:
# Apply the one-hot encoder to the categorical columns only; all remaining columns
# are passed through untouched.
preproc = make_column_transformer(
    (ohe_encoder, all_categorical),
    remainder="passthrough",
)

In [9]:
# Fit the preprocessor once on the whole training matrix, outside the cross-validation
# loop, and keep the encoded result for the folds further down.
preproc_df = preproc.fit_transform(train_X)
preproc_df.shape
# Alternative kept for reference: wrap the encoded result in a DataFrame with column names
#preproc_df = pd.DataFrame( preproc.fit_transform(train_X), columns = preproc.get_feature_names())


#preprocessor = ColumnTransformer(
 #   transformers=[('ohe', ohe_transformer, all_categorical)])
    

(300000, 71)

In [10]:
# Check which container types hold the encoded features and the target
type(preproc_df),type(train_y)

(numpy.ndarray, numpy.ndarray)

In [11]:
#preproc_df.info()

### Test/Train split

In [12]:
# n_learning_rate_steps, n_folds = 5, 5 #A
# learning_rates = np.linspace(0.02, 0.1, num=n_learning_rate_steps)

# trn_err = np.zeros((n_learning_rate_steps, n_folds)) #C
# val_err = np.zeros((n_learning_rate_steps, n_folds))

In [13]:
# Validation strategy: shuffled K-fold with a fixed seed, so the split of the
# training rows is repeatable
n_folds = 5
splitter = KFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=42,
)

In [14]:
#Validation loop with one hyperparameter for optimisation

# for i, rate in enumerate(learning_rates): #D
#     for j, (trn, val) in enumerate(splitter.split(preproc_df)):  
#         model = xgb.XGBRegressor(eta=rate, tree_method='gpu_hist')
#         model.fit(preproc_df[trn, :], train_y[trn])
#         pred_error_train = model.predict(preproc_df[trn, :])
#         pred_error_val = model.predict(preproc_df[val, :])
        
#         trn_err[i,j] = mean_squared_error(train_y[trn],pred_error_train)
#         val_err[i,j] = mean_squared_error(train_y[val],pred_error_val)

# trn_err = np.mean(trn_err, axis = 1) #E
# val_err = np.mean(val_err, axis = 1)
#print(trn_err)
#print(val_err)

In [15]:
# print(trn_err)
# print(val_err)

In [16]:
#model = LinearRegression()
#model = xgb.XGBRegressor(n_estimators=1000, max_depth=7, eta= 0.1, subsample=0.7, colsample_bytree=0.8)

In [17]:
# Per-fold mean squared error on the fitting part and on the held-out part
train_err = np.zeros(n_folds)
valid_err = np.zeros(n_folds)

for fold, (trn, val) in enumerate(splitter.split(preproc_df)):
    # Slice the encoded features and the target for this fold
    X_trn, y_trn = preproc_df[trn, :], train_y[trn]
    X_val, y_val = preproc_df[val, :], train_y[val]

    # A fresh regressor per fold.
    # NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled XGBoost -- confirm
    model = xgb.XGBRegressor(eta=0.1, tree_method='gpu_hist')
    model.fit(X_trn, y_trn)

    train_err[fold] = mean_squared_error(y_trn, model.predict(X_trn))
    valid_err[fold] = mean_squared_error(y_val, model.predict(X_val))

# Average the errors across folds
tr = train_err.mean()
valid = valid_err.mean()
#print(trn_err)
#print(val_err)

In [18]:
# Mean training error first, mean validation error second, one per line
print(tr, valid, sep="\n")

0.6914794871663341
0.7180451586476149


In [19]:
# Train on the whole training set.
# `model` is the estimator object left over from the last cross-validation fold; fit() is
# called on that same object and its return value is stored under a new name.
model_full_train = model.fit(preproc_df, train_y)

In [20]:
# Inspect the size of the test set before transforming it
test_import.shape


(200000, 25)

In [21]:
# Encode the test features with the preprocessor that was already fitted on the training
# data. Only transform() is called: running fit_transform on train_X again would merely
# rebuild preproc_df and leave the test set unencoded.
test_X = test_import.loc[:, all_feature]
test_preproc_df = preproc.transform(test_X)

In [22]:
#Think about validation strategy
#Perform train-test split
#Look at adversarial validation to understand whether the train and test datasets have different
# distributions
#Look at featuretools