# Importing required modules

In [1]:
import math
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

## Loading data

In [2]:
train_data = pd.read_csv("../input/30daysofml-train-folds/train_folds.csv")
test_data = pd.read_csv("../input/30-days-of-ml/test.csv")

#### Making data columns

In [3]:
cat_cols = [col for col in train_data.columns if col.startswith("cat")]
num_cols = [col for col in train_data.columns if col.startswith("cont")]

### Predicting target using XGB Regressor and kfold (total folds = 5)
* Applying *One Hot Encoding* to categorical columns

In [4]:
final_preds = []
mse_scores = []
for fold in range(5):
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    
    X_test = test_data.copy()
    X_test = X_test[cat_cols]
    
    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[cat_cols]
    X_valid = X_valid[cat_cols]    
    
    # Applying One Hot Encoding
    OH_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
    OH_encoder.fit(X_train[cat_cols])
    OH_X_train = OH_encoder.transform(X_train[cat_cols])
    OH_X_valid = OH_encoder.transform(X_valid[cat_cols])
    OH_X_test = OH_encoder.transform(X_test[cat_cols])
    
    # making DataFrame of one hot encoded columns and naming them
    OH_X_train = pd.DataFrame(OH_X_train, columns=[f"ohe_{i}" for i in range(OH_X_train.shape[1])])
    OH_X_valid = pd.DataFrame(OH_X_valid, columns=[f"ohe_{i}" for i in range(OH_X_valid.shape[1])])
    OH_X_test = pd.DataFrame(OH_X_test, columns=[f"ohe_{i}" for i in range(OH_X_test.shape[1])])
    
    # One Hot Encoding removes Index, adding the Index back
    OH_X_train.index = X_train[cat_cols].index
    OH_X_valid.index = X_valid[cat_cols].index
    OH_X_test.index = X_test[cat_cols].index
    
    # Concatinating both normal and one hot encoded columns
    X_train = pd.concat([X_train, OH_X_train], axis=1)
    X_valid = pd.concat([X_valid, OH_X_valid], axis=1)
    X_test = pd.concat([X_test, OH_X_test], axis=1)
    
    # Dropping the categorical columns (as they are one hot encoded)
    X_train = X_train.drop(cat_cols, axis=1)
    X_valid = X_valid.drop(cat_cols, axis=1)
    X_test = X_test.drop(cat_cols, axis=1)
    
    # XGB Regressor model
    model = XGBRegressor(random_state=fold, tree_method="gpu_hist", gpu_id=0, predictor="gpu_predictor")
    model.fit(X_train, y_train)
    train_preds = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_preds.append(test_preds)
    rmse = math.sqrt(mean_squared_error(y_valid, train_preds))
    mse_scores.append(rmse)
    print(f"Fold: {fold}, RMSE: {rmse}")    

Fold: 0, RMSE: 0.7454918963073176
Fold: 1, RMSE: 0.7449422944733177
Fold: 2, RMSE: 0.7462892116818625
Fold: 3, RMSE: 0.7469621106629815
Fold: 4, RMSE: 0.7473844446393453


#### MSE scores on each fold

In [5]:
print(f"MSE: {mse_scores}")

MSE: [0.7454918963073176, 0.7449422944733177, 0.7462892116818625, 0.7469621106629815, 0.7473844446393453]


#### Mean of mse_scores

In [6]:
print(f"Mean: {sum(mse_scores)/len(mse_scores)}")

Mean: 0.7462139915529649


### Final Predictions on each fold

In [7]:
print(f"Final predictions: {final_preds}")

Final predictions: [array([8.112866 , 8.2709255, 8.314985 , ..., 8.183537 , 8.256744 ,
       8.237152 ], dtype=float32), array([8.104092 , 8.252947 , 8.2794075, ..., 8.1989155, 8.248346 ,
       8.225908 ], dtype=float32), array([8.123636, 8.240576, 8.448338, ..., 8.197838, 8.247944, 8.228142],
      dtype=float32), array([8.111397 , 8.2638855, 8.404632 , ..., 8.186489 , 8.245812 ,
       8.227277 ], dtype=float32), array([8.0988455, 8.261955 , 8.3078785, ..., 8.1967745, 8.247262 ,
       8.216866 ], dtype=float32)]


### Making stack of final predictions

In [8]:
stack = np.column_stack(final_preds)
stack

array([[8.112866 , 8.104092 , 8.123636 , 8.111397 , 8.0988455],
       [8.2709255, 8.252947 , 8.240576 , 8.2638855, 8.261955 ],
       [8.314985 , 8.2794075, 8.448338 , 8.404632 , 8.3078785],
       ...,
       [8.183537 , 8.1989155, 8.197838 , 8.186489 , 8.1967745],
       [8.256744 , 8.248346 , 8.247944 , 8.245812 , 8.247262 ],
       [8.237152 , 8.225908 , 8.228142 , 8.227277 , 8.216866 ]],
      dtype=float32)

## Final prediction 

In [9]:
final_prediction = np.mean(stack, axis=1)
final_prediction

array([8.110167, 8.258059, 8.351048, ..., 8.19271 , 8.249222, 8.227069],
      dtype=float32)

## Submitting the final prediction

In [10]:
# Submitting predicted target to `submission.csv`
output = pd.DataFrame({'Id': test_data.id, 'target': final_prediction})
output.to_csv('submission.csv', index=False)