# Importing required modules 

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

### Loading data

In [2]:
train_data = pd.read_csv("../input/30-days-of-ml/train.csv")
test_data = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

### K-Fold model
* having 5 splits

In [3]:
train_data["kfold"] = -1
Kf_model = KFold(n_splits=5, random_state=1, shuffle=True)
# fold (0, 1, 2, 3, 4); train_index (0, 2, 3...); valid_index(1, 4, 6)
for fold, (train_index, valid_index) in enumerate(Kf_model.split(X=train_data)):
    train_data.loc[valid_index, "kfold"] = fold

#### Training data after making K-fold column using K-fold model

In [4]:
print(train_data.shape)
train_data.head()

(300000, 27)


Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
0,1,B,B,B,C,B,B,A,E,C,...,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,4
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,0
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,4
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,1
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,0


#### Setting up columns
1. num_cols stores all columns of training data which have numerical columns
2. cat_cols stores all columns of training data which have categorical columns
3. useful_cols stores both num_cols and cat_cols
* useful_cols are required for target encoding, hence test_data also uses the useful_cols only

In [5]:
num_cols = [col for col in train_data.columns if "cont" in col]
cat_cols = [col for col in train_data.columns if "cat" in col]
useful_cols = cat_cols + num_cols
test_data = test_data[useful_cols]

In [6]:
def aggregate_1(x):
    return round(((x**0.5)*2), 6)

### Target Encoding

In [7]:
# Iterate over categorical columns
for col in cat_cols:
    """
    Based on each categorical column, one target fold is created
    total 5 folds for one column
    """
    temp_train = []
    temp_test_target = None
    for fold in range(5):
        # making training data and validating data for each fold
        X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
        X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
        
        # getting the mean of training data target
        mean_target = dict(X_train.groupby(col)["target"].agg("median"))
        
        # adding the mean_target to X_valid (valid dataset of each fold)
        X_valid.loc[:, f"target_enc_{col}"] = X_valid[col].map(mean_target)
        temp_train.append(X_valid)
        if (temp_test_target is None):
            temp_test_target = test_data[col].map(mean_target)
        else:
            temp_test_target += test_data[col].map(mean_target)
        
    # getting the average of temporary test target on each column
    temp_test_target /= 5
    # adding the temporary test target to test data on each column (total=10)
    test_data.loc[:, f"target_enc_{col}"] = temp_test_target
    
    # setting training data as temp_train
    train_data = pd.concat(temp_train)

#### Training data after Target Encoding

In [8]:
train_data.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,target_enc_cat0,target_enc_cat1,target_enc_cat2,target_enc_cat3,target_enc_cat4,target_enc_cat5,target_enc_cat6,target_enc_cat7,target_enc_cat8,target_enc_cat9
0,2,B,B,A,A,B,D,A,F,A,...,8.200139,8.162371,8.192867,8.234425,8.190434,8.172714,8.19078,8.234335,8.143371,8.175322
1,6,A,A,A,C,B,D,A,E,A,...,8.187044,8.22051,8.192867,8.185586,8.190434,8.172714,8.19078,8.192015,8.143371,8.213467
2,8,B,A,A,A,B,D,A,E,C,...,8.200139,8.22051,8.192867,8.234425,8.190434,8.172714,8.19078,8.192015,8.246532,8.207318
3,10,A,B,A,C,B,D,A,E,G,...,8.187044,8.162371,8.192867,8.185586,8.190434,8.172714,8.19078,8.192015,8.184368,8.159815
4,18,B,A,A,C,B,D,A,E,A,...,8.200139,8.22051,8.192867,8.185586,8.190434,8.172714,8.19078,8.192015,8.143371,8.184239


In [9]:
train_data.columns

Index(['id', 'cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7',
       'cat8', 'cat9', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5',
       'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
       'cont13', 'target', 'kfold', 'target_enc_cat0', 'target_enc_cat1',
       'target_enc_cat2', 'target_enc_cat3', 'target_enc_cat4',
       'target_enc_cat5', 'target_enc_cat6', 'target_enc_cat7',
       'target_enc_cat8', 'target_enc_cat9'],
      dtype='object')

#### Test data after Target Encoding

In [10]:
test_data.head()

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,target_enc_cat0,target_enc_cat1,target_enc_cat2,target_enc_cat3,target_enc_cat4,target_enc_cat5,target_enc_cat6,target_enc_cat7,target_enc_cat8,target_enc_cat9
0,B,B,B,C,B,B,A,E,E,I,...,8.199411,8.162038,8.186756,8.185477,8.189992,8.199043,8.190005,8.191722,8.145356,8.158563
1,A,B,A,C,B,C,A,E,C,H,...,8.186798,8.162038,8.192303,8.185477,8.189992,8.221418,8.190005,8.191722,8.248797,8.195199
2,B,A,A,A,B,B,A,E,D,K,...,8.199411,8.219706,8.192303,8.232115,8.189992,8.199043,8.190005,8.191722,8.205808,8.232337
3,B,B,A,C,B,D,A,E,A,N,...,8.199411,8.162038,8.192303,8.185477,8.189992,8.171351,8.190005,8.191722,8.134364,8.21
4,B,B,A,C,B,C,A,E,C,F,...,8.199411,8.162038,8.192303,8.185477,8.189992,8.221418,8.190005,8.191722,8.248797,8.206115


#### Setting useful features in training data and test data

In [11]:
useful_cols = [col for col in train_data.columns if (train_data[col].dtypes == "int64") or (train_data[col].dtypes =="float64")][1:]
train_data = train_data[useful_cols]
test_data = test_data[useful_cols[:14]+useful_cols[17:]]

In [12]:
train_data.head()

Unnamed: 0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,target_enc_cat0,target_enc_cat1,target_enc_cat2,target_enc_cat3,target_enc_cat4,target_enc_cat5,target_enc_cat6,target_enc_cat7,target_enc_cat8,target_enc_cat9
0,0.743068,0.367411,1.021605,0.365798,0.276853,0.533087,0.558922,0.516294,0.594928,0.341439,...,8.200139,8.162371,8.192867,8.234425,8.190434,8.172714,8.19078,8.234335,8.143371,8.175322
1,1.058291,0.367492,-0.052389,0.232407,0.287595,0.686964,0.420667,0.648182,0.684501,0.956692,...,8.187044,8.22051,8.192867,8.185586,8.190434,8.172714,8.19078,8.192015,8.143371,8.213467
2,0.743661,0.234794,0.339026,0.424034,0.281511,0.396705,0.273454,0.824573,0.656325,0.677114,...,8.200139,8.22051,8.192867,8.234425,8.190434,8.172714,8.19078,8.192015,8.246532,8.207318
3,0.523472,0.492059,0.16544,0.749995,0.28111,0.472564,0.414036,0.809142,1.013301,0.761183,...,8.187044,8.162371,8.192867,8.185586,8.190434,8.172714,8.19078,8.192015,8.184368,8.159815
4,0.814649,0.884882,0.725545,0.787947,0.27688,0.728022,0.553966,0.504254,0.555141,0.810336,...,8.200139,8.22051,8.192867,8.185586,8.190434,8.172714,8.19078,8.192015,8.143371,8.184239


In [13]:
test_data.head()

Unnamed: 0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,cont13,target_enc_cat1,target_enc_cat2,target_enc_cat3,target_enc_cat4,target_enc_cat5,target_enc_cat6,target_enc_cat7,target_enc_cat8,target_enc_cat9
0,0.296227,0.686757,0.587731,0.392753,0.476739,0.37635,0.337884,0.321832,0.445212,0.290258,...,0.845702,8.162038,8.186756,8.185477,8.189992,8.199043,8.190005,8.191722,8.145356,8.158563
1,0.543707,0.364761,0.452967,0.929645,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,...,0.69394,8.162038,8.192303,8.185477,8.189992,8.221418,8.190005,8.191722,8.248797,8.195199
2,0.408961,0.296129,0.690999,0.740027,0.697272,0.6836,0.404089,0.879379,0.275549,0.427871,...,0.508099,8.219706,8.192303,8.232115,8.189992,8.199043,8.190005,8.191722,8.205808,8.232337
3,1.031239,0.356062,0.303651,0.895591,0.719306,0.77789,0.730954,0.644315,1.024017,0.39109,...,0.461372,8.162038,8.192303,8.185477,8.189992,8.171351,8.190005,8.191722,8.134364,8.21
4,0.530447,0.729004,0.281723,0.444698,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,...,0.900412,8.162038,8.192303,8.185477,8.189992,8.221418,8.190005,8.191722,8.248797,8.206115


In [14]:
useful_cols_2 = test_data.columns
final_predictions = []
mse_scores = []

# As total folds are 5, getting predictions and rmse at every fold
for fold in range(5):
    # Splitting the training and validating data according to fold and kfold
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    
    X_test = test_data.copy()
    
    # setting the target
    y_train = X_train.target
    y_valid = X_valid.target
    
    # setting the training data and validating data
    X_train = X_train[useful_cols_2]
    X_valid = X_valid[useful_cols_2]
    
    # Making XGBoost Regressor model
    model = XGBRegressor(n_estimators=2000, learning_rate=0.05, random_state=fold**2, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_valid, y_valid)], verbose=False)
    
    # predicting the target on training and test data
    train_preds = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    
    # MSE at each prediction
    mse = mean_squared_error(y_valid, train_preds, squared=False)
    mse_scores.append(mse)
    
    print(f"Fold: {fold}, RMSE: {mse}, \nPredictions on test data: {test_preds}")

Fold: 0, RMSE: 0.7194723789656678, 
Predictions on test data: [8.003034  8.302616  8.406588  ... 8.427282  8.101332  7.9288735]
Fold: 1, RMSE: 0.7272996571086815, 
Predictions on test data: [8.093162  8.300066  8.398706  ... 8.448269  7.97737   7.9625144]
Fold: 2, RMSE: 0.7226288879751521, 
Predictions on test data: [8.049716 8.399328 8.373241 ... 8.41401  8.084784 8.099085]
Fold: 3, RMSE: 0.7216766353296825, 
Predictions on test data: [8.007561  8.33773   8.394438  ... 8.262458  8.1585865 7.8434086]
Fold: 4, RMSE: 0.7244861108621717, 
Predictions on test data: [8.023723 8.330841 8.380571 ... 8.350329 8.088068 8.136006]


### Final predictions

In [15]:
final_preds = np.mean(np.column_stack(final_predictions), axis=1)
final_preds

array([8.0354395, 8.334116 , 8.390709 , ..., 8.38047  , 8.082028 ,
       7.9939775], dtype=float32)

### Storing the results in 'submission.csv'

In [16]:
# Submitting predicted target to `submission.csv`
output = pd.DataFrame({"Id" : sample_submission.id,"target" : final_preds})
output.to_csv('submission.csv', index=False)