In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# For ordinal encoding of categorical variables and splitting data into folds
from sklearn.preprocessing import OrdinalEncoder
# from sklearn.model_selection import train_test_split - not needed anymore as we have split the dataset using KFold
from sklearn.model_selection import KFold

# For training random forest and XGBoost models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# For model evaluation and selection
from sklearn.metrics import mean_squared_error

In [2]:
# Load the training data; this file already carries a `kfold` column that
# assigns every row to one cross-validation fold.
# To use the `id` column as the index instead of the default RangeIndex,
# pass index_col=0 (or index_col="id") to read_csv.
train = pd.read_csv(filepath_or_buffer="../input/30days-folds/train_kfolds.csv")
train.head(n=5)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
0,1,B,B,B,C,B,B,A,E,C,...,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,0
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,2
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,4
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,3
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,1


In [3]:
# Load the test set and preview its first five rows.
test = pd.read_csv(filepath_or_buffer='../input/30-days-of-ml/test.csv')
test.head(n=5)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,0,B,B,B,C,B,B,A,E,E,...,0.476739,0.37635,0.337884,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702
1,5,A,B,A,C,B,C,A,E,C,...,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.69394
2,15,B,A,A,A,B,B,A,E,D,...,0.697272,0.6836,0.404089,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099
3,16,B,B,A,C,B,D,A,E,A,...,0.719306,0.77789,0.730954,0.644315,1.024017,0.39109,0.98834,0.411828,0.393585,0.461372
4,17,B,B,A,C,B,C,A,E,C,...,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412


In [4]:
# Load the submission template (an `id` column plus a placeholder `target`)
# and preview its first five rows.
sample_submission = pd.read_csv(
    filepath_or_buffer='../input/30-days-of-ml/sample_submission.csv'
)
sample_submission.head(n=5)

Unnamed: 0,id,target
0,0,0.5
1,5,0.5
2,15,0.5
3,16,0.5
4,17,0.5


In [5]:
# Model inputs: every training column except the row id, the label, and the
# fold assignment.
useful_features = [
    name
    for name in train.columns
    if name not in ("id", "target", "kfold")
]

# Categorical columns are identified by the substring 'cat' in their name.
cat_col = [name for name in train.columns if 'cat' in name]

# Restrict the test frame to exactly the columns the model is trained on.
test = test[useful_features]

In [6]:
# Train one XGBoost model per fold, score it on the held-out fold, and collect
# that model's predictions for the test set (one array per fold).
final_predictions = []
for fold in range(5):
    # Rows whose `kfold` label equals `fold` are held out for validation;
    # all remaining rows are used for training.
    train_fold = train[train.kfold != fold].reset_index(drop=True)
    valid_fold = train[train.kfold == fold].reset_index(drop=True)

    Y_train = train_fold.target
    Y_valid = valid_fold.target

    # Take explicit copies: the categorical columns are overwritten below, and
    # assigning into a bare column selection triggers SettingWithCopyWarning.
    X_train = train_fold[useful_features].copy()
    X_valid = valid_fold[useful_features].copy()
    X_test = test.copy()

    # Fit the encoder on this fold's training rows only, then apply the same
    # fitted mapping to the validation and test rows.
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_col] = ordinal_encoder.fit_transform(X_train[cat_col])
    X_valid[cat_col] = ordinal_encoder.transform(X_valid[cat_col])
    X_test[cat_col] = ordinal_encoder.transform(X_test[cat_col])

    # The fold number doubles as the random seed, so each fold's model is
    # seeded differently but reproducibly.
    model = XGBRegressor(n_estimators=150, random_state=fold, n_jobs=4)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_valid)

    test_pred = model.predict(X_test)
    final_predictions.append(test_pred)

    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    print(fold, np.sqrt(mean_squared_error(Y_valid, Y_pred)))

0 0.7249711030553708
1 0.7241914703264882
2 0.7263588804804968
3 0.7259016502676974
4 0.725091914850526


In [7]:
# `np.column_stack(final_predictions)` places each fold's test predictions in
# its own column, giving one row per test sample and one column per fold.
# Averaging along axis=1 produces one blended prediction per test row; without
# `axis`, np.mean would collapse the whole array into a single scalar and every
# submission row would receive the same value.
preds = np.mean(np.column_stack(final_predictions), axis=1)

In [8]:
# Overwrite the placeholder `target` column with the predictions and write the
# submission file without the DataFrame index.
sample_submission["target"] = preds
sample_submission.to_csv(path_or_buf="submission2.csv", index=False)