# Importing required modules

In [1]:
import optuna
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

## 1st model
#### where:
##### **K-fold** to split the training data(`train.csv`) for training and testing where folds = 5
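##### **Target Encoding** to get the per-fold `median` target value of each categorical value (note: the target-encoded columns are not among the features the XGBoost models are trained on)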
##### **Ordinal Encoding** on optuna_model_1 to get numerical values of categorical values
##### **One Hot Encoding** on model_1 to get numerical values of categorical values
##### **XGBoost** to make the model and predict validation data and test data, with `n_estimators=2000`
____________________________________________________________________________________________________
## Reading data from csv files
* `train.csv`
* `test.csv`
* `sample_submission.csv`

In [2]:
# 1st model
def model_1():
    """Train and evaluate model 1: 5-fold XGBoost on one-hot encoded categoricals.

    Steps:
      1. Read ``train.csv``, ``test.csv`` and ``sample_submission.csv``.
      2. Assign every training row to one of five shuffled folds (``kfold`` column).
      3. Tune the XGBoost hyper-parameters with Optuna (5 trials) on fold 0, using
         ordinal-encoded categorical columns.
      4. Refit with the best parameters on every fold, using one-hot encoded
         categorical columns, and collect out-of-fold and test predictions.
      5. Write ``train_preds_1.csv`` (out-of-fold predictions) and
         ``test_preds_1.csv`` (test predictions averaged over the folds).

    NOTE(review): a previous revision also computed per-fold median target
    encodings at this point. Those columns were dropped again before any model
    was fitted, so they never influenced the predictions; the computation was
    removed. Wire target encoding into ``feature_cols`` explicitly if it is wanted.

    Returns:
        tuple: ``(scores, test_preds_1)`` where ``scores`` is a list with the
        validation RMSE of each fold and ``test_preds_1`` is a DataFrame with
        the columns ``id`` and ``pred_1``.
    """
    n_folds = 5

    train_data = pd.read_csv("../input/30-days-of-ml/train.csv")
    test_data = pd.read_csv("../input/30-days-of-ml/test.csv")
    test_preds_1 = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

    # K-fold splitting: every row gets the number of the fold it validates in
    train_data["kfold"] = -1
    Kf_model = KFold(n_splits=n_folds, random_state=1, shuffle=True)
    for fold, (_, valid_index) in enumerate(Kf_model.split(X=train_data)):
        train_data.loc[valid_index, "kfold"] = fold

    #==========================================================================#
    # Feature columns: categorical (object dtype) first, then continuous.
    # The full list is used on purpose: the former positional slice
    # `useful_cols[:24] + useful_cols[26:]` is a no-op for 24 feature columns and
    # would silently drop two real features for any wider data set.
    num_cols = [col for col in train_data.columns if "cont" in col]
    cat_cols = [col for col in train_data.columns if train_data[col].dtypes == "object"]
    feature_cols = cat_cols + num_cols
    test_features = test_data[feature_cols]

    def split_fold(fold):
        """Return (X_train, y_train, X_valid, y_valid, valid_ids) for one fold."""
        train_part = train_data[train_data.kfold != fold].reset_index(drop=True)
        valid_part = train_data[train_data.kfold == fold].reset_index(drop=True)
        # .copy() so callers may overwrite columns without SettingWithCopyWarning
        return (
            train_part[feature_cols].copy(),
            train_part.target,
            valid_part[feature_cols].copy(),
            valid_part.target,
            valid_part.id.values.tolist(),
        )

    def one_hot(encoder, frame):
        """Replace the categorical columns of `frame` by `ohe_<i>` indicator columns."""
        encoded = encoder.transform(frame[cat_cols])
        ohe = pd.DataFrame(encoded, columns=[f"ohe_{i}" for i in range(encoded.shape[1])])
        # `frame` must carry a default RangeIndex so that the rows line up
        return pd.concat([frame, ohe], axis=1).drop(cat_cols, axis=1)

    #==========================================================================#
    # optuna model 1 for final model 1
    def optuna_model_1(trial):
        """Optuna objective: validation RMSE on fold 0 with ordinal-encoded categoricals."""
        # making parameters (suggest_float(log=True) replaces the deprecated
        # suggest_loguniform; the sampling order is unchanged)
        learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.5, log=True)
        reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
        reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
        subsample = trial.suggest_float("subsample", 0.1, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
        max_depth = trial.suggest_int("max_depth", 1, 10)

        # only fold 0 is used for the hyper-parameter search
        X_train, y_train, X_valid, y_valid, _ = split_fold(0)

        ## Ordinal for categorical data (fitted on the training part only)
        Ord_encoder = OrdinalEncoder()
        Ord_encoder.fit(X_train[cat_cols])
        X_train[cat_cols] = Ord_encoder.transform(X_train[cat_cols])
        X_valid[cat_cols] = Ord_encoder.transform(X_valid[cat_cols])

        # making model
        model = XGBRegressor(
            n_estimators=2000,
            learning_rate=learning_rate,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            max_depth=max_depth,
            tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",
            random_state=50,
        )
        # fitting data into the model
        model.fit(X_train, y_train, early_stopping_rounds=300,
                  eval_set=[(X_valid, y_valid)], verbose=1000)

        # RMSE of the validation predictions is the value Optuna minimizes
        valid_preds = model.predict(X_valid)
        return mean_squared_error(y_valid, valid_preds, squared=False)

    #==========================================================================#
    # Optimizing optuna_model and getting best parameters
    optuna_1 = optuna.create_study(direction="minimize")
    optuna_1.optimize(optuna_model_1, n_trials=5)
    best_params_model_1 = optuna_1.best_params

    #==========================================================================#
    # final_model made by XGB Regressor
    final_valid_predictions = {}
    final_test_predictions = []
    scores = []

    for fold in range(n_folds):
        X_train, y_train, X_valid, y_valid, valid_ids = split_fold(fold)

        ## One Hot Encoding for categorical data (fitted on the training part only)
        OH_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
        OH_encoder.fit(X_train[cat_cols])
        X_train = one_hot(OH_encoder, X_train)
        X_valid = one_hot(OH_encoder, X_valid)
        X_test = one_hot(OH_encoder, test_features)

        # making the model
        model = XGBRegressor(
            n_estimators=2000,
            **best_params_model_1,
            tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",
            random_state=50,
        )
        # fitting the data in the model
        model.fit(X_train, y_train, early_stopping_rounds=300,
                  eval_set=[(X_valid, y_valid)], verbose=1000)

        # getting valid predictions and test predictions
        valid_preds = model.predict(X_valid)
        test_preds = model.predict(X_test)
        final_valid_predictions.update(dict(zip(valid_ids, valid_preds)))
        final_test_predictions.append(test_preds)

        # rmse on valid predictions
        scores.append(mean_squared_error(y_valid, valid_preds, squared=False))

    #==========================================================================#
    # out-of-fold predictions, keyed by the training row id
    valid_preds_1 = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
    valid_preds_1.columns = ["id", "pred_1"]
    valid_preds_1.to_csv("train_preds_1.csv", index=False)

    # test predictions averaged over the folds
    test_preds_1["target"] = np.mean(np.column_stack(final_test_predictions), axis=1)
    test_preds_1.columns = ["id", "pred_1"]
    test_preds_1.to_csv("test_preds_1.csv", index=False)
    return scores, test_preds_1

## RMSE and Test Predictions on model 1

In [3]:
# model_1 per-fold validation RMSE and fold-averaged test predictions.
# The values are RMSE, not MSE: model_1 scores every fold with
# mean_squared_error(..., squared=False). The name `mse_1` is kept unchanged
# so that any later cell referring to it keeps working.
mse_1, final_preds_1 = model_1()
print(f"RMSE: {mse_1}\nTest data predictions: {final_preds_1}")

[32m[I 2021-09-01 01:54:09,667][0m A new study created in memory with name: no-name-8c7d9bec-b94b-404e-b0d7-522b6508ff1a[0m


[0]	validation_0-rmse:7.14876
[1000]	validation_0-rmse:0.71694
[1528]	validation_0-rmse:0.71708


[32m[I 2021-09-01 01:54:17,273][0m Trial 0 finished with value: 0.7168378950538234 and parameters: {'learning_rate': 0.08133944387862009, 'reg_lambda': 91.96725706667004, 'reg_alpha': 9.137793303669227e-08, 'subsample': 0.6990687555707329, 'colsample_bytree': 0.5686016109955158, 'max_depth': 4}. Best is trial 0 with value: 0.7168378950538234.[0m


[0]	validation_0-rmse:6.76476
[444]	validation_0-rmse:0.72559


[32m[I 2021-09-01 01:54:22,168][0m Trial 1 finished with value: 0.7220245838974617 and parameters: {'learning_rate': 0.13119332427712838, 'reg_lambda': 0.14169864669904933, 'reg_alpha': 1.4712023503641833e-06, 'subsample': 0.29869237594397197, 'colsample_bytree': 0.2769129760715793, 'max_depth': 6}. Best is trial 0 with value: 0.7168378950538234.[0m


[0]	validation_0-rmse:6.50524
[401]	validation_0-rmse:0.73011


[32m[I 2021-09-01 01:54:26,726][0m Trial 2 finished with value: 0.7233763044455628 and parameters: {'learning_rate': 0.16487509719268212, 'reg_lambda': 0.0005619946547244303, 'reg_alpha': 0.0459393619370181, 'subsample': 0.3790941303657904, 'colsample_bytree': 0.5824886507986692, 'max_depth': 6}. Best is trial 0 with value: 0.7168378950538234.[0m


[0]	validation_0-rmse:6.81722
[679]	validation_0-rmse:0.71936


[32m[I 2021-09-01 01:54:32,942][0m Trial 3 finished with value: 0.7177978383792132 and parameters: {'learning_rate': 0.12434298582395381, 'reg_lambda': 1.2632361311719154, 'reg_alpha': 1.37836852882006e-05, 'subsample': 0.6870507957204991, 'colsample_bytree': 0.18695451114722828, 'max_depth': 6}. Best is trial 0 with value: 0.7168378950538234.[0m


[0]	validation_0-rmse:5.51380
[426]	validation_0-rmse:0.72373


[32m[I 2021-09-01 01:54:37,114][0m Trial 4 finished with value: 0.7199435099151316 and parameters: {'learning_rate': 0.29391906110343075, 'reg_lambda': 0.8863725041351904, 'reg_alpha': 3.7245295352231127e-07, 'subsample': 0.9666774776812908, 'colsample_bytree': 0.43004202164162153, 'max_depth': 5}. Best is trial 0 with value: 0.7168378950538234.[0m


[0]	validation_0-rmse:7.14876
[1000]	validation_0-rmse:0.71709
[1530]	validation_0-rmse:0.71713
[0]	validation_0-rmse:7.15224
[1000]	validation_0-rmse:0.72464
[1552]	validation_0-rmse:0.72468
[0]	validation_0-rmse:7.14819
[1000]	validation_0-rmse:0.72021
[1618]	validation_0-rmse:0.72018
[0]	validation_0-rmse:7.15468
[1000]	validation_0-rmse:0.71980
[1593]	validation_0-rmse:0.71989
[0]	validation_0-rmse:7.15438
[1000]	validation_0-rmse:0.71473
[1565]	validation_0-rmse:0.71460
MSE: [0.7169609374018332, 0.7244526982367475, 0.7199796820374822, 0.7197370484788822, 0.7144074191873216]
Test data predictions:             id    pred_1
0            0  8.090875
1            5  8.369916
2           15  8.402437
3           16  8.499657
4           17  8.180559
...        ...       ...
199995  499987  8.103170
199996  499990  8.423574
199997  499991  8.470532
199998  499994  8.113050
199999  499995  7.908584

[200000 rows x 2 columns]


# 2nd model
#### where:
##### **K-fold** to split the training data(`train.csv`) for training and testing where folds = 5
##### **Target Encoding** to get the per-fold `median` target value of each categorical value (note: the target-encoded columns are not among the features the XGBoost models are trained on)
##### **One Hot Encoding** on optuna_model_2 to get numerical values of categorical values
##### **Ordinal Encoding** on model_2 to get numerical values of categorical values
##### **XGB Regressor** to make the model and predict validation data and test data n estimators=5000
____________________________________________________________________________________________________
## Reading data from csv files
* `train.csv`
* `test.csv`
* `sample_submission.csv`

In [4]:
# 2nd model
def model_2():
    """Train and evaluate model 2: 5-fold XGBoost on ordinal-encoded categoricals.

    Steps:
      1. Read ``train.csv``, ``test.csv`` and ``sample_submission.csv``.
      2. Assign every training row to one of five shuffled folds (``kfold`` column).
      3. Tune the XGBoost hyper-parameters with Optuna (5 trials) on fold 0, using
         one-hot encoded categorical columns.
      4. Refit with the best parameters on every fold, using ordinal-encoded
         categorical columns, and collect out-of-fold and test predictions.
      5. Write ``train_preds_2.csv`` (out-of-fold predictions) and
         ``test_preds_2.csv`` (test predictions averaged over the folds).

    NOTE(review): a previous revision also computed per-fold median target
    encodings at this point. Those columns were dropped again before any model
    was fitted, so they never influenced the predictions; the computation was
    removed. Wire target encoding into ``feature_cols`` explicitly if it is wanted.

    Returns:
        tuple: ``(scores, test_preds_2)`` where ``scores`` is a list with the
        validation RMSE of each fold and ``test_preds_2`` is a DataFrame with
        the columns ``id`` and ``pred_2``.
    """
    n_folds = 5

    train_data = pd.read_csv("../input/30-days-of-ml/train.csv")
    test_data = pd.read_csv("../input/30-days-of-ml/test.csv")
    test_preds_2 = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

    # K-fold splitting: every row gets the number of the fold it validates in
    train_data["kfold"] = -1
    Kf_model = KFold(n_splits=n_folds, random_state=1, shuffle=True)
    for fold, (_, valid_index) in enumerate(Kf_model.split(X=train_data)):
        train_data.loc[valid_index, "kfold"] = fold

    #==========================================================================#
    # Feature columns: categorical (object dtype) first, then continuous.
    # The full list is used on purpose: the former positional slice
    # `useful_cols[:24] + useful_cols[26:]` is a no-op for 24 feature columns and
    # would silently drop two real features for any wider data set.
    num_cols = [col for col in train_data.columns if "cont" in col]
    cat_cols = [col for col in train_data.columns if train_data[col].dtypes == "object"]
    feature_cols = cat_cols + num_cols
    test_features = test_data[feature_cols]

    def split_fold(fold):
        """Return (X_train, y_train, X_valid, y_valid, valid_ids) for one fold."""
        train_part = train_data[train_data.kfold != fold].reset_index(drop=True)
        valid_part = train_data[train_data.kfold == fold].reset_index(drop=True)
        # .copy() so callers may overwrite columns without SettingWithCopyWarning
        return (
            train_part[feature_cols].copy(),
            train_part.target,
            valid_part[feature_cols].copy(),
            valid_part.target,
            valid_part.id.values.tolist(),
        )

    def one_hot(encoder, frame):
        """Replace the categorical columns of `frame` by `ohe_<i>` indicator columns."""
        encoded = encoder.transform(frame[cat_cols])
        ohe = pd.DataFrame(encoded, columns=[f"ohe_{i}" for i in range(encoded.shape[1])])
        # `frame` must carry a default RangeIndex so that the rows line up
        return pd.concat([frame, ohe], axis=1).drop(cat_cols, axis=1)

    #==========================================================================#
    # optuna model 2 for final model 2
    def optuna_model_2(trial):
        """Optuna objective: validation RMSE on fold 0 with one-hot encoded categoricals."""
        # making parameters (suggest_float(log=True) replaces the deprecated
        # suggest_loguniform; the sampling order is unchanged)
        learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.05, log=True)
        reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
        reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
        subsample = trial.suggest_float("subsample", 0.1, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
        max_depth = trial.suggest_int("max_depth", 1, 10)

        # only fold 0 is used for the hyper-parameter search
        X_train, y_train, X_valid, y_valid, _ = split_fold(0)

        ## One Hot Encoding for categorical data (fitted on the training part only)
        OH_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
        OH_encoder.fit(X_train[cat_cols])
        X_train = one_hot(OH_encoder, X_train)
        X_valid = one_hot(OH_encoder, X_valid)

        # making model
        model = XGBRegressor(
            n_estimators=5000,
            learning_rate=learning_rate,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            max_depth=max_depth,
            tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",
            random_state=100,
        )
        # fitting data into the model
        model.fit(X_train, y_train, early_stopping_rounds=500,
                  eval_set=[(X_valid, y_valid)], verbose=2000)

        # RMSE of the validation predictions is the value Optuna minimizes
        valid_preds = model.predict(X_valid)
        return mean_squared_error(y_valid, valid_preds, squared=False)

    #==========================================================================#
    # Optimizing optuna_model and getting best parameters
    optuna_2 = optuna.create_study(direction="minimize")
    optuna_2.optimize(optuna_model_2, n_trials=5)
    best_params_model_2 = optuna_2.best_params

    #==========================================================================#
    # final_model made by XGB Regressor
    final_valid_predictions = {}
    final_test_predictions = []
    scores = []

    for fold in range(n_folds):
        X_train, y_train, X_valid, y_valid, valid_ids = split_fold(fold)
        # fresh copy per fold: the categorical columns are overwritten below
        X_test = test_features.copy()

        ## Ordinal for categorical data (fitted on the training part only)
        Ord_encoder = OrdinalEncoder()
        Ord_encoder.fit(X_train[cat_cols])
        X_train[cat_cols] = Ord_encoder.transform(X_train[cat_cols])
        X_valid[cat_cols] = Ord_encoder.transform(X_valid[cat_cols])
        X_test[cat_cols] = Ord_encoder.transform(X_test[cat_cols])

        # making the model
        model = XGBRegressor(
            n_estimators=5000,
            **best_params_model_2,
            tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",
            random_state=100,
        )
        # fitting the data in the model
        model.fit(X_train, y_train, early_stopping_rounds=500,
                  eval_set=[(X_valid, y_valid)], verbose=2000)

        # getting valid predictions and test predictions
        valid_preds = model.predict(X_valid)
        test_preds = model.predict(X_test)
        final_valid_predictions.update(dict(zip(valid_ids, valid_preds)))
        final_test_predictions.append(test_preds)

        # rmse on valid predictions
        scores.append(mean_squared_error(y_valid, valid_preds, squared=False))

    #==========================================================================#
    # out-of-fold predictions, keyed by the training row id
    valid_preds_2 = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
    valid_preds_2.columns = ["id", "pred_2"]
    valid_preds_2.to_csv("train_preds_2.csv", index=False)

    # test predictions averaged over the folds
    test_preds_2["target"] = np.mean(np.column_stack(final_test_predictions), axis=1)
    test_preds_2.columns = ["id", "pred_2"]
    test_preds_2.to_csv("test_preds_2.csv", index=False)
    return scores, test_preds_2

## RMSE and Test Predictions on model 2

In [5]:
# model_2 per-fold validation RMSE and fold-averaged test predictions.
# The values are RMSE, not MSE: model_2 scores every fold with
# mean_squared_error(..., squared=False). The name `mse_2` is kept unchanged
# so that any later cell referring to it keeps working.
mse_2, final_preds_2 = model_2()
print(f"RMSE: {mse_2}\nTest data predictions: {final_preds_2}")

[32m[I 2021-09-01 01:55:40,661][0m A new study created in memory with name: no-name-220fd072-4dea-4750-86eb-714479cfe057[0m


[0]	validation_0-rmse:7.65826
[2000]	validation_0-rmse:0.72013
[2151]	validation_0-rmse:0.72023


[32m[I 2021-09-01 01:56:09,035][0m Trial 0 finished with value: 0.7199314404999175 and parameters: {'learning_rate': 0.015173066720321393, 'reg_lambda': 0.08454325416428177, 'reg_alpha': 3.249016813790478e-08, 'subsample': 0.2897787354798519, 'colsample_bytree': 0.9842939095643247, 'max_depth': 7}. Best is trial 0 with value: 0.7199314404999175.[0m


[0]	validation_0-rmse:7.59976
[1033]	validation_0-rmse:0.72430


[32m[I 2021-09-01 01:56:34,721][0m Trial 1 finished with value: 0.7229272220579959 and parameters: {'learning_rate': 0.022753860330639808, 'reg_lambda': 6.589764870652293e-08, 'reg_alpha': 6.248059142746158e-05, 'subsample': 0.10668869447273817, 'colsample_bytree': 0.22833640595679433, 'max_depth': 9}. Best is trial 0 with value: 0.7199314404999175.[0m


[0]	validation_0-rmse:7.63957
[2000]	validation_0-rmse:0.72505
[4000]	validation_0-rmse:0.72104
[4999]	validation_0-rmse:0.71988


[32m[I 2021-09-01 01:56:52,705][0m Trial 2 finished with value: 0.719878015740238 and parameters: {'learning_rate': 0.01759525685674353, 'reg_lambda': 0.3490040805860492, 'reg_alpha': 0.022701877538455064, 'subsample': 0.8055536799979458, 'colsample_bytree': 0.6573107702778506, 'max_depth': 2}. Best is trial 2 with value: 0.719878015740238.[0m


[0]	validation_0-rmse:7.52992
[1071]	validation_0-rmse:0.72084


[32m[I 2021-09-01 01:57:28,933][0m Trial 3 finished with value: 0.7196248224818687 and parameters: {'learning_rate': 0.03182949044196197, 'reg_lambda': 1.9109285658982982e-05, 'reg_alpha': 3.991316425829756, 'subsample': 0.6729636731518039, 'colsample_bytree': 0.8513150675199644, 'max_depth': 9}. Best is trial 3 with value: 0.7196248224818687.[0m


[0]	validation_0-rmse:7.65918
[2000]	validation_0-rmse:0.71862
[4000]	validation_0-rmse:0.71780
[4759]	validation_0-rmse:0.71784


[32m[I 2021-09-01 01:57:56,712][0m Trial 4 finished with value: 0.71777300602988 and parameters: {'learning_rate': 0.015053867176857803, 'reg_lambda': 2.255514823127675e-07, 'reg_alpha': 0.003093819699313589, 'subsample': 0.33014474729008103, 'colsample_bytree': 0.5850581273919265, 'max_depth': 5}. Best is trial 4 with value: 0.71777300602988.[0m


[0]	validation_0-rmse:7.65918
[2000]	validation_0-rmse:0.71871
[4000]	validation_0-rmse:0.71789
[4149]	validation_0-rmse:0.71795
[0]	validation_0-rmse:7.66237
[2000]	validation_0-rmse:0.72641
[4000]	validation_0-rmse:0.72560
[4258]	validation_0-rmse:0.72560
[0]	validation_0-rmse:7.65842
[2000]	validation_0-rmse:0.72227
[4000]	validation_0-rmse:0.72110
[4673]	validation_0-rmse:0.72103
[0]	validation_0-rmse:7.66486
[2000]	validation_0-rmse:0.72114
[3855]	validation_0-rmse:0.72040
[0]	validation_0-rmse:7.66459
[2000]	validation_0-rmse:0.71684
[4000]	validation_0-rmse:0.71550
[4571]	validation_0-rmse:0.71550
MSE: [0.7178632073212032, 0.7255448064223806, 0.7209973852429764, 0.7203936121150996, 0.7154673262846395]
Test data predictions:             id    pred_2
0            0  8.079402
1            5  8.367846
2           15  8.383591
3           16  8.440797
4           17  8.179073
...        ...       ...
199995  499987  8.132765
199996  499990  8.401813
199997  499991  8.442299
199998  4

# 3rd model
#### where:
##### **K-fold** to split the training data(`train.csv`) for training and testing where folds = 5
##### **Target Encoding** to get numerical `median` values of categorical values
##### **One Hot Encoding** on optuna_model_1 to get numerical values of categorical values
##### **Ordinal Encoding** on model_1 to get numerical values of categorical values
##### **XGB Regressor** to make the model and predict validation data and test data n estimators=10000
____________________________________________________________________________________________________
## Reading data from csv files
* `train.csv`
* `test.csv`
* `sample_submission.csv`

In [6]:
# 3rd model
def model_3():
    """Train the third base model (ordinal-encoded XGBoost) and export its predictions.

    Pipeline:
        1. Read ``train.csv``, ``test.csv`` and ``sample_submission.csv``.
        2. Tag every training row with one of 5 shuffled folds (``random_state=1``).
        3. Tune six XGBoost hyper-parameters with Optuna (5 trials); each trial is
           scored by its validation RMSE on fold 0 only.
        4. Refit with the best parameters once per fold, collecting out-of-fold
           predictions for the training rows and averaging the five test-set
           predictions.

    The model features are the ``cat*`` columns (ordinal-encoded per fold, with the
    encoder fitted on that fold's training part) followed by the ``cont*`` columns.

    Side effects:
        Writes ``train_preds_3.csv`` and ``test_preds_3.csv`` (columns ``id`` and
        ``pred_3``) to the current working directory.

    Returns:
        tuple: ``(scores, test_preds_3)`` where ``scores`` is the list of the five
        per-fold validation RMSEs and ``test_preds_3`` is the DataFrame that was
        saved as ``test_preds_3.csv``.

    NOTE(review): this function used to run an out-of-fold median target-encoding
    pass as well, but its columns never reached a model: the fold frames were taken
    from the raw training frame and the feature list was reset to cat + cont
    columns. That dead pass was removed, so the feature set is unchanged. If target
    encoding is wanted, it has to be added to ``feature_cols`` deliberately.
    """
    n_folds = 5

    train_data = pd.read_csv("../input/30-days-of-ml/train.csv")
    test_data = pd.read_csv("../input/30-days-of-ml/test.csv")
    test_preds_3 = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

    #==============================================================================================================================================================================#
    # K-fold splitting: tag each training row with the fold in which it is validation data
    train_data["kfold"] = -1
    Kf_model = KFold(n_splits=n_folds, random_state=1, shuffle=True)
    for fold, (_, valid_index) in enumerate(Kf_model.split(X=train_data)):
        # valid_index is positional; it is usable as a label because read_csv
        # produced the default 0..n-1 index
        train_data.loc[valid_index, "kfold"] = fold

    #==============================================================================================================================================================================#
    # Feature columns: categorical first, then continuous
    # NOTE(review): assumes the "cat*" columns are exactly the string-typed columns - confirm
    num_cols = [col for col in train_data.columns if "cont" in col]
    cat_cols = [col for col in train_data.columns if "cat" in col]
    feature_cols = cat_cols + num_cols

    def _prepare_fold(fold):
        """Split on ``fold`` and ordinal-encode the categorical features.

        Returns ``(X_train, X_valid, y_train, y_valid, valid_ids, encoder)``; the
        encoder is fitted on the training part only.
        """
        train_part = train_data[train_data.kfold != fold].reset_index(drop=True)
        valid_part = train_data[train_data.kfold == fold].reset_index(drop=True)

        # explicit copies so the encoded values are not written into a slice
        X_train = train_part[feature_cols].copy()
        X_valid = valid_part[feature_cols].copy()

        encoder = OrdinalEncoder()
        encoder.fit(X_train[cat_cols])
        X_train[cat_cols] = encoder.transform(X_train[cat_cols])
        X_valid[cat_cols] = encoder.transform(X_valid[cat_cols])

        valid_ids = valid_part.id.values.tolist()
        return X_train, X_valid, train_part.target, valid_part.target, valid_ids, encoder

    def _build_model(params):
        """Return an unfitted GPU XGBRegressor: fixed settings plus ``params``."""
        return XGBRegressor(
            n_estimators=10000,
            **params,
            tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",
            random_state=100,
        )

    def _rmse(y_true, y_pred):
        """Root mean squared error as a plain float.

        Computed as sqrt(MSE) so it does not depend on the ``squared`` keyword of
        ``mean_squared_error``.
        """
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

    #==============================================================================================================================================================================#
    # optuna model 3 for final model 3
    # The fold-0 split is identical for every trial, so it is prepared once.
    opt_X_train, opt_X_valid, opt_y_train, opt_y_valid, _, _ = _prepare_fold(0)

    def optuna_model_3(trial):
        """Optuna objective: fold-0 validation RMSE for one sampled parameter set."""
        params = {
            "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.5, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),
            "subsample": trial.suggest_float("subsample", 0.1, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
            "max_depth": trial.suggest_int("max_depth", 1, 10),
        }

        model = _build_model(params)
        model.fit(opt_X_train, opt_y_train, early_stopping_rounds=1000,
                  eval_set=[(opt_X_valid, opt_y_valid)], verbose=5000)

        return _rmse(opt_y_valid, model.predict(opt_X_valid))

    #==============================================================================================================================================================================#
    # Optimizing optuna_model and getting best parameters
    # (seeded sampler so the sampled hyper-parameters are repeatable between runs)
    optuna_3 = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=100),
    )
    optuna_3.optimize(optuna_model_3, n_trials=5)
    best_params_model_3 = optuna_3.best_params

    #==============================================================================================================================================================================#
    # final_model made by XGB Regressor
    final_valid_predictions = {}
    final_test_predictions = []
    scores = []

    for fold in range(n_folds):
        X_train, X_valid, y_train, y_valid, valid_ids, encoder = _prepare_fold(fold)

        # the test features are encoded with this fold's encoder
        X_test = test_data[feature_cols].copy()
        X_test[cat_cols] = encoder.transform(X_test[cat_cols])

        model = _build_model(best_params_model_3)
        model.fit(X_train, y_train, early_stopping_rounds=1000,
                  eval_set=[(X_valid, y_valid)], verbose=5000)

        # out-of-fold predictions keyed by row id, plus this fold's test predictions
        valid_preds = model.predict(X_valid)
        final_valid_predictions.update(dict(zip(valid_ids, valid_preds)))
        final_test_predictions.append(model.predict(X_test))

        scores.append(_rmse(y_valid, valid_preds))

    #==============================================================================================================================================================================#
    # Out-of-fold predictions for the training rows
    valid_preds_3 = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
    valid_preds_3.columns = ["id", "pred_3"]
    valid_preds_3.to_csv("train_preds_3.csv", index=False)

    # Test predictions: mean over the five fold models, written into the submission template
    test_preds_3["target"] = np.mean(np.column_stack(final_test_predictions), axis=1)
    test_preds_3.columns = ["id", "pred_3"]
    test_preds_3.to_csv("test_preds_3.csv", index=False)
    return scores, test_preds_3

## RMSE (printed as `MSE`) and Test Predictions on model 3

In [7]:
# Run the whole model-3 pipeline (Optuna search + 5-fold refit) and show its results.
# NOTE: the list printed under the "MSE" label holds the per-fold validation RMSEs
# that model_3() returns.
mse_3, final_preds_3 = model_3()
print("MSE: {}\nTest data predictions: {}".format(mse_3, final_preds_3))

[32m[I 2021-09-01 01:59:46,311][0m A new study created in memory with name: no-name-4500c508-b8fa-49c9-ac76-1a8381fa4161[0m


[0]	validation_0-rmse:7.54340
[1872]	validation_0-rmse:0.71864


[32m[I 2021-09-01 02:00:34,031][0m Trial 0 finished with value: 0.7167820921125675 and parameters: {'learning_rate': 0.030080370238632186, 'reg_lambda': 2.886592511768368, 'reg_alpha': 8.737119208333445, 'subsample': 0.5131082925009189, 'colsample_bytree': 0.2064826486494481, 'max_depth': 10}. Best is trial 0 with value: 0.7167820921125675.[0m


[0]	validation_0-rmse:6.97251
[1048]	validation_0-rmse:0.84534


[32m[I 2021-09-01 02:01:20,603][0m Trial 1 finished with value: 0.7397773831443202 and parameters: {'learning_rate': 0.10421488593502182, 'reg_lambda': 1.4699421094873095e-08, 'reg_alpha': 0.0011492671090863576, 'subsample': 0.17233686845516344, 'colsample_bytree': 0.9639028272592877, 'max_depth': 10}. Best is trial 0 with value: 0.7167820921125675.[0m


[0]	validation_0-rmse:7.56422
[5000]	validation_0-rmse:0.71778
[9999]	validation_0-rmse:0.71637


[32m[I 2021-09-01 02:01:43,160][0m Trial 2 finished with value: 0.716370745656987 and parameters: {'learning_rate': 0.02737521297057318, 'reg_lambda': 4.889331552327708e-06, 'reg_alpha': 3.4319611252803963e-07, 'subsample': 0.6776601857441027, 'colsample_bytree': 0.9714058621268021, 'max_depth': 2}. Best is trial 2 with value: 0.716370745656987.[0m


[0]	validation_0-rmse:5.62003
[1022]	validation_0-rmse:1.02959


[32m[I 2021-09-01 02:01:59,745][0m Trial 3 finished with value: 0.7437111505616736 and parameters: {'learning_rate': 0.28023637353468944, 'reg_lambda': 0.0012009972875386228, 'reg_alpha': 0.005599371211538415, 'subsample': 0.11793435721863486, 'colsample_bytree': 0.10170424835876453, 'max_depth': 9}. Best is trial 2 with value: 0.716370745656987.[0m


[0]	validation_0-rmse:7.68024
[3632]	validation_0-rmse:0.71859


[32m[I 2021-09-01 02:02:34,491][0m Trial 4 finished with value: 0.7183112721565323 and parameters: {'learning_rate': 0.01231768046871694, 'reg_lambda': 0.3591646374199854, 'reg_alpha': 1.1923275257125785, 'subsample': 0.6221668819112247, 'colsample_bytree': 0.8776710186958979, 'max_depth': 7}. Best is trial 2 with value: 0.716370745656987.[0m


[0]	validation_0-rmse:7.56422
[5000]	validation_0-rmse:0.71778
[9999]	validation_0-rmse:0.71637
[0]	validation_0-rmse:7.56744
[5000]	validation_0-rmse:0.72522
[9999]	validation_0-rmse:0.72377
[0]	validation_0-rmse:7.56352
[5000]	validation_0-rmse:0.72119
[9999]	validation_0-rmse:0.71961
[0]	validation_0-rmse:7.56990
[5000]	validation_0-rmse:0.72047
[9999]	validation_0-rmse:0.71930
[0]	validation_0-rmse:7.56971
[5000]	validation_0-rmse:0.71581
[9999]	validation_0-rmse:0.71422
MSE: [0.716370745656987, 0.7237654879154056, 0.7196079078182203, 0.7192916067877397, 0.7142102739605147]
Test data predictions:             id    pred_3
0            0  8.099949
1            5  8.381259
2           15  8.411623
3           16  8.506899
4           17  8.168657
...        ...       ...
199995  499987  8.114617
199996  499990  8.452845
199997  499991  8.479716
199998  499994  8.198995
199999  499995  7.953298

[200000 rows x 2 columns]


## Reading data from csv files
* `train.csv`
* `test.csv`
* `sample_submission.csv`
### Training predicted data
* `train_preds_1.csv`
* `train_preds_2.csv`
* `train_preds_3.csv`
### Test predicted data
* `test_preds_1.csv`
* `test_preds_2.csv`
* `test_preds_3.csv`

In [8]:
# Raw competition files
train_data = pd.read_csv("../input/30-days-of-ml/train.csv")
test_data = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Predictions saved by the three base models: one file per model for the
# training rows and one for the test rows
train_pred_1, train_pred_2, train_pred_3 = map(
    pd.read_csv, ("train_preds_1.csv", "train_preds_2.csv", "train_preds_3.csv")
)
test_pred_1, test_pred_2, test_pred_3 = map(
    pd.read_csv, ("test_preds_1.csv", "test_preds_2.csv", "test_preds_3.csv")
)

# Training rows: attach each model's prediction column by matching on `id`
train_data = (
    train_data
    .merge(train_pred_1, on="id", how="left")
    .merge(train_pred_2, on="id", how="left")
    .merge(train_pred_3, on="id", how="left")
)

# Test rows: the frames are glued side by side (aligned on the row index, not on `id`).
# That leaves four `id` columns, so all of them are dropped and a single one is
# restored from the first prediction file.
test_data = pd.concat(
    [test_data, test_pred_1, test_pred_2, test_pred_3], join="inner", axis=1
).drop(columns="id")
test_data["id"] = test_pred_1["id"]

### Getting useful features/columns from test data

In [9]:
# The stacker only sees the three base-model prediction columns
useful_cols = ["pred_1", "pred_2", "pred_3"]
test_data = test_data.loc[:, useful_cols]
test_data.head()

Unnamed: 0,pred_1,pred_2,pred_3
0,8.090875,8.079402,8.099949
1,8.369916,8.367846,8.381259
2,8.402437,8.383591,8.411623
3,8.499657,8.440797,8.506899
4,8.180559,8.179073,8.168657


## Final Model
#### where:
##### **K-fold** to split the training data (`train_data`) for training and testing
##### **Stacked features**: the predictions `pred_1`, `pred_2` and `pred_3` of the three models above (no target encoding is applied at this stage)
##### **Linear Regression** to make the model and predict validation data and test data

In [10]:
# Tag every row of the stacked training frame with the fold (0-4) in which it
# serves as validation data; rows start at -1 until a fold claims them.
Kf_model = KFold(n_splits=5, shuffle=True, random_state=1)
train_data["kfold"] = -1
# each split yields (train_index, valid_index); only the validation part is recorded
for fold, (train_index, valid_index) in enumerate(Kf_model.split(train_data)):
    train_data.loc[valid_index, "kfold"] = fold

In [11]:
# Level-2 model: a linear regression fitted on the three base-model predictions.
# It is trained once per fold; each fit is scored on its held-out fold and its
# test-set predictions are kept for the final blend.
useful_features = ["pred_1", "pred_2", "pred_3"]
test_data = test_data[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)

    y_train = X_train.target
    y_valid = X_valid.target

    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]

    model = LinearRegression()
    model.fit(X_train, y_train)

    valid_preds = model.predict(X_valid)
    # predicting does not modify the frame, so no per-fold copy of test_data is needed
    test_preds = model.predict(test_data)
    final_predictions.append(test_preds)

    # RMSE as sqrt(MSE): avoids the `squared` keyword of mean_squared_error,
    # which is not available in every scikit-learn release
    rmse = float(np.sqrt(mean_squared_error(y_valid, valid_preds)))
    print(f"Fold: {fold}, RMSE: {rmse}, \nTest predictions{test_preds}")
    scores.append(rmse)

Fold: 0, RMSE: 0.7162189975315332, 
Test predictions[8.09590519 8.3809012  8.41375117 ... 8.48379835 8.17324054 7.93478074]
Fold: 1, RMSE: 0.7236446905520985, 
Test predictions[8.09601876 8.37985191 8.41227977 ... 8.48182613 8.17114252 7.93499532]
Fold: 2, RMSE: 0.7194014916886058, 
Test predictions[8.09583753 8.38190605 8.41487582 ... 8.48520573 8.17506402 7.93483276]
Fold: 3, RMSE: 0.7191667910948852, 
Test predictions[8.09404372 8.38094019 8.41438411 ... 8.48517472 8.17410287 7.93245189]
Fold: 4, RMSE: 0.7139549511002987, 
Test predictions[8.09546049 8.38055705 8.4135681  ... 8.48378182 8.17604385 7.9355773 ]


# Submitting the Final Predictions

In [12]:
# Blend the five fold models: per test row, take the median of their predictions
# (one row per fold model after stacking, so the median runs down axis 0).
sample_submission["target"] = np.median(np.vstack(final_predictions), axis=0)
sample_submission.to_csv("submission.csv", index=False)