## Importing required modules

In [1]:
# importing required modules
import math
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, KFold

## Loading data

In [2]:
# Read the competition's train and test CSVs (paths are relative to the
# notebook's working directory) into two DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/30-days-of-ml/train.csv",
        "../input/30-days-of-ml/test.csv",
    )
)

#### Making K-fold column in training data

In [3]:
# Add a placeholder fold id to every row; -1 means "not assigned to a fold yet".
train_data = train_data.assign(kfold=-1)

## Assigning rows to 5 folds with K-Fold

In [4]:
# Shuffle the rows into 5 folds (seeded for reproducibility) and record, for
# every row, the fold in which it serves as validation data.
Kf_model = KFold(n_splits=5, random_state=1, shuffle=True)

# KFold.split yields *positional* row indices, so write through .iloc.
# Label-based .loc only gives the same result while the frame still has its
# default 0..n-1 index.
kfold_position = train_data.columns.get_loc("kfold")
for fold, (_, valid_index) in enumerate(Kf_model.split(X=train_data)):
    train_data.iloc[valid_index, kfold_position] = fold

In [5]:
# Sanity check after adding the fold column: print (rows, columns) and show
# the first five rows.
n_rows, n_cols = train_data.shape
print((n_rows, n_cols))
train_data.head(n=5)

(300000, 27)


Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
0,1,B,B,B,C,B,B,A,E,C,...,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,4
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,0
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,4
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,1
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,0


### Categorical and numerical feature columns

In [6]:
# Prediction target and features
cat_cols = ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']
num_cols = ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13']
test_data_2 = test_data[cat_cols + num_cols]

## Getting final predictions & RMSE on each fold

In [7]:
def one_hot_frame(encoder, frame, columns):
    """Return `frame` with `columns` replaced by their one-hot encoding.

    Parameters
    ----------
    encoder : an already-fitted OneHotEncoder.
    frame : DataFrame containing `columns` plus any other feature columns.
    columns : list of the categorical column names to encode.

    Returns
    -------
    DataFrame with the remaining columns of `frame` first, followed by the
    encoded columns named ``ohe_0``, ``ohe_1``, ...
    """
    encoded = encoder.transform(frame[columns])
    # The encoder is left at its default (sparse) output and densified here,
    # which avoids the `sparse` / `sparse_output` keyword that differs
    # between scikit-learn releases.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()
    encoded = pd.DataFrame(
        encoded,
        columns=[f"ohe_{i}" for i in range(encoded.shape[1])],
        # Reuse the source index so the column-wise concat below lines rows up
        # by label even when `frame` does not have a 0..n-1 index.
        index=frame.index,
    )
    return pd.concat([frame.drop(columns, axis=1), encoded], axis=1)


feature_cols = cat_cols + num_cols
# Take the number of folds from the splitter instead of repeating the literal.
n_folds = Kf_model.get_n_splits()

final_predictions = []  # one array of test-set predictions per fold
rmse_scores = []        # validation RMSE of each fold

for fold in range(n_folds):
    # Rows tagged with this fold are the validation set; the rest are training data
    is_valid = train_data["kfold"] == fold
    train_fold = train_data[~is_valid].reset_index(drop=True)
    valid_fold = train_data[is_valid].reset_index(drop=True)

    # setting the target
    y_train = train_fold["target"]
    y_valid = valid_fold["target"]

    # One-hot encoder is fitted on the training rows only; categories that
    # appear only in validation/test rows are encoded as all zeros
    OH_encoder = OneHotEncoder(handle_unknown="ignore")
    OH_encoder.fit(train_fold[cat_cols])

    X_train = one_hot_frame(OH_encoder, train_fold[feature_cols], cat_cols)
    X_valid = one_hot_frame(OH_encoder, valid_fold[feature_cols], cat_cols)
    X_test = one_hot_frame(OH_encoder, test_data_2, cat_cols)

    # Making XGBoost Regressor model (seeded per fold)
    # NOTE(review): newer XGBoost releases deprecate `gpu_hist` / `gpu_id` /
    # `predictor` and passing `early_stopping_rounds` to fit() -- confirm
    # against the installed version before upgrading.
    model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
    )
    # The validation fold doubles as the early-stopping set, so the RMSE
    # reported below is measured on data that influenced when training stopped.
    model.fit(
        X_train,
        y_train,
        early_stopping_rounds=5,
        eval_set=[(X_valid, y_valid)],
        verbose=False,
    )

    # predicting the target on validation and test data
    valid_preds = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)

    # RMSE of this fold's validation predictions
    rmse = math.sqrt(mean_squared_error(y_valid, valid_preds))
    rmse_scores.append(rmse)

    print(f"Fold: {fold}, RMSE: {rmse}, Predictions: {test_preds}")

# Overall cross-validation score across the folds
print(f"Mean RMSE: {np.mean(rmse_scores)}, Std: {np.std(rmse_scores)}")


Fold: 0, RMSE: 0.7195125328386832, Predictions: [7.9917154 8.342595  8.400791  ... 8.396838  8.145232  7.946934 ]
Fold: 1, RMSE: 0.7274188326325031, Predictions: [8.0653305 8.291491  8.350918  ... 8.417305  8.038082  8.004667 ]
Fold: 2, RMSE: 0.7225344716276871, Predictions: [8.034033 8.378964 8.350615 ... 8.399388 8.032452 8.098599]
Fold: 3, RMSE: 0.7218584158950778, Predictions: [8.004924 8.319611 8.323311 ... 8.350578 8.101683 7.872486]
Fold: 4, RMSE: 0.717712085061957, Predictions: [8.052923 8.386763 8.353857 ... 8.354397 8.325792 8.13462 ]


### Getting the final predictions of the target by averaging the per-fold arrays in `final_predictions`

In [8]:
# Average the per-fold test predictions row by row: stack the fold arrays as
# columns (one column per fold) and take the mean across the columns.
# The isinstance guard keeps the cell idempotent: it overwrites its own input,
# so without the guard a second run would average the already-averaged vector
# and collapse it to a single number.
if isinstance(final_predictions, list):
    final_predictions = np.mean(np.column_stack(final_predictions), axis=1)
final_predictions

array([8.029785, 8.343884, 8.355899, ..., 8.383701, 8.128649, 8.011461],
      dtype=float32)

## Submitting the final predictions

In [9]:
# Submitting predicted target to `submission.csv`
output = pd.DataFrame({'Id': test_data.id,'target': final_predictions})
output.to_csv('submission.csv', index=False)