# Importing required modules

In [1]:
import optuna
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

## 1st model
#### where:
##### **K-fold** to split the training data(`train.csv`) for training and testing where folds = 5
##### **Target Encoding** to get numerical `median` values of categorical values
##### **Ordinal Encoding** on optuna_model_1 to get numerical values of categorical values
##### **One Hot Encoding** on model_1 to get numerical values of categorical values
##### **XGBoost** to make the model and predict validation data and test data n estimators=1000
____________________________________________________________________________________________________
## Reading data from csv files
* `train.csv`
* `test.csv`
* `sample_submission.csv`

In [2]:
# 1st model
def model_1():
    train_data = pd.read_csv("../input/30-days-of-ml/train.csv")
    test_data = pd.read_csv("../input/30-days-of-ml/test.csv")
    test_preds_1 = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
    
    # K-fold splitting where total folds = 5
    train_data["kfold"] = -1
    Kf_model = KFold(n_splits=5, random_state=1, shuffle=True)
    # fold (0, 1, 2, 3, 4); train_index (0, 2, 3...); valid_index(1, 4, 6)
    for fold, (train_index, valid_index) in enumerate(Kf_model.split(X=train_data)):
        train_data.loc[valid_index, "kfold"] = fold
    
    #==============================================================================================================================================================================#
    # Setting required models
    num_cols = [col for col in train_data.columns if "cont" in col]
    cat_cols = [col for col in train_data.columns if "cat" in col]
    useful_cols = cat_cols + num_cols
    model_1_train_data = train_data
    model_1_test_data = test_data
    test_data = test_data[useful_cols]
    
    #==============================================================================================================================================================================#
    # Target Encoding
    # Iterate over categorical columns
    for col in cat_cols:
        """
        Based on each categorical column, one target fold is created
        total 5 folds for one column
        """
        temp_train = []
        temp_test_target = None
        for fold in range(5):
            # making training data and validating data for each fold
            X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
            X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
            
            # getting the mean of training data target
            mean_target = dict(X_train.groupby(col)["target"].agg("median"))

            # adding the mean_target to X_valid (valid dataset of each fold)
            X_valid.loc[:, f"target_enc_{col}"] = X_valid[col].map(mean_target)
            temp_train.append(X_valid)
            if (temp_test_target is None):
                temp_test_target = test_data[col].map(mean_target)
            else:
                temp_test_target += test_data[col].map(mean_target)
                
        # getting the average of temporary test target on each column
        temp_test_target /= 5
        # adding the temporary test target to test data on each column (total=5)
        test_data.loc[:, f"target_enc_{col}"] = temp_test_target

        # setting training data as temp_train
        train_data = pd.concat(temp_train)
        
    #==============================================================================================================================================================================#
    # Setting required columns again
    useful_cols = [col for col in train_data.columns if ((train_data[col].dtypes == "int64") or (train_data[col].dtypes =="float64")) and (col not in ("id"))]
    cat_cols = [col for col in train_data.columns if (train_data[col].dtypes == "object")]
    useful_cols = cat_cols + num_cols
    train_data = train_data[useful_cols]
    test_data = test_data[useful_cols[:24] + useful_cols[26:]]
    
    #==============================================================================================================================================================================#
    # optuna model 1 for final model 1
    def optuna_model_1(trial):
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------#
        # Setting required columns again
        useful_cols = [col for col in model_1_train_data.columns if ((model_1_train_data[col].dtypes == "int64") or (model_1_train_data[col].dtypes =="float64")) and (col != "id")]
        cat_cols = [col for col in model_1_test_data.columns if (model_1_test_data[col].dtypes == "object")]
        useful_cols = cat_cols + num_cols
        train_data = model_1_train_data[useful_cols]
        test_data = model_1_test_data[useful_cols[:24] + useful_cols[26:]]

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------#
        # making parameteres
        learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.5, log=True)
        reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
        reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
        subsample = trial.suggest_float("subsample", 0.1, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
        max_depth = trial.suggest_int("max_depth", 1, 10)
        
        fold = 0
        # making training data and validating data for each fold
        X_train = model_1_train_data[model_1_train_data.kfold != fold].reset_index(drop=True)
        X_valid = model_1_train_data[model_1_train_data.kfold == fold].reset_index(drop=True)

        # setting the target
        y_train = X_train.target
        y_valid = X_valid.target

        # setting the training data and validating data
        X_train = X_train[test_data.columns]
        X_valid = X_valid[test_data.columns]

        ## Ordinal for categorical data
        Ord_encoder = OrdinalEncoder()

        # fitting and transforming the training and test data
        Ord_encoder.fit(X_train[cat_cols])
        X_train[cat_cols] = Ord_encoder.transform(X_train[cat_cols])
        X_valid[cat_cols] = Ord_encoder.transform(X_valid[cat_cols])

        # making model
        model = XGBRegressor(
            n_estimators=2000,
            learning_rate=learning_rate,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            max_depth=max_depth,
            tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",
            random_state=50,
        )
        # fitting data into the model
        model.fit(X_train, y_train, early_stopping_rounds=300,
                  eval_set=[(X_valid, y_valid)], verbose=1000)

        # getting predictions
        valid_preds = model.predict(X_valid)

        # getting RMSE
        rmse = mean_squared_error(y_valid, valid_preds, squared=False)
        return rmse
    #==============================================================================================================================================================================#
    # Optimizing optuna_model and getting best parameters
    optuna_1 = optuna.create_study(direction="minimize")
    optuna_1.optimize(optuna_model_1, n_trials=5)
    best_params_model_1 = optuna_1.best_params
    #==============================================================================================================================================================================#
    # final_model made by XGB Regressor
    final_valid_predictions = {}
    final_test_predictions = []
    scores = []

    for fold in range(5):
        # making training data and validating data for each fold
        X_train = model_1_train_data[model_1_train_data.kfold != fold].reset_index(drop=True)
        X_valid = model_1_train_data[model_1_train_data.kfold == fold].reset_index(drop=True)

        X_test = test_data.copy()

        valid_ids = X_valid.id.values.tolist()
        
        y_train = X_train.target
        y_valid = X_valid.target

        X_train = X_train[test_data.columns]
        X_valid = X_valid[test_data.columns]

        ## One Hot Encoding for categorical data
        OH_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

        # fitting and transforming the training and test data
        OH_encoder.fit(X_train[cat_cols])
        OH_X_train = OH_encoder.transform(X_train[cat_cols])
        OH_X_valid = OH_encoder.transform(X_valid[cat_cols])
        OH_X_test = OH_encoder.transform(X_test[cat_cols])

        # Naming the one hot encoded columns
        OH_X_train = pd.DataFrame(OH_X_train, columns=[f"ohe_{i}" for i in range(OH_X_train.shape[1])])
        OH_X_valid = pd.DataFrame(OH_X_valid, columns=[f"ohe_{i}" for i in range(OH_X_valid.shape[1])])
        OH_X_test = pd.DataFrame(OH_X_test, columns=[f"ohe_{i}" for i in range(OH_X_test.shape[1])])

        # Adding one hot encoded columns to main data (training, validating, test)
        X_train = pd.concat([X_train, OH_X_train], axis=1)
        X_valid = pd.concat([X_valid, OH_X_valid], axis=1)
        X_test = pd.concat([X_test, OH_X_test], axis=1)

        # Dropping the categorical columns, as their one hot encoded columns are added to main data
        X_train = X_train.drop(cat_cols, axis=1)
        X_valid = X_valid.drop(cat_cols, axis=1)
        X_test = X_test.drop(cat_cols, axis=1)

        # making the model
        model = XGBRegressor(
            n_estimators=2000,
            **best_params_model_1,
            tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",
            random_state=50,
        )
        # fitting the data in the model
        model.fit(X_train, y_train, early_stopping_rounds=300,
                  eval_set=[(X_valid, y_valid)], verbose=1000)

        # getting valid predictions and test predictions
        valid_preds = model.predict(X_valid)
        test_preds = model.predict(X_test)
        final_valid_predictions.update(dict(zip(valid_ids, valid_preds)))
        final_test_predictions.append(test_preds)

        # rmse on valid predictions
        rmse = mean_squared_error(y_valid, valid_preds, squared=False)
        scores.append(rmse)
    
    #==============================================================================================================================================================================#
    valid_preds_1 = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
    valid_preds_1.columns = ["id", "pred_1"]
    valid_preds_1.to_csv("train_preds_1.csv", index=False)
    
    test_preds_1.target = np.mean(np.column_stack(final_test_predictions), axis=1)
    test_preds_1.columns = ["id", "pred_1"]
    test_preds_1.to_csv("test_preds_1.csv", index=False)
    return scores, test_preds_1

## MSE and Test Predictions on model 1

In [3]:
# model_1 MSE and test_predictions
mse_1, final_preds_1 = model_1()
print(f"MSE: {mse_1}\nTest data predictions: {final_preds_1}")

[32m[I 2021-09-01 01:36:28,000][0m A new study created in memory with name: no-name-fd9fca7e-d25d-46ce-ab47-f85042ff14f7[0m


[0]	validation_0-rmse:6.89149
[846]	validation_0-rmse:0.72126


[32m[I 2021-09-01 01:36:32,906][0m Trial 0 finished with value: 0.7211777159433347 and parameters: {'learning_rate': 0.1147570270572494, 'reg_lambda': 6.825851382112473, 'reg_alpha': 3.213300893168289e-06, 'subsample': 0.10892909814702535, 'colsample_bytree': 0.46323340760229426, 'max_depth': 3}. Best is trial 0 with value: 0.7211777159433347.[0m


[0]	validation_0-rmse:7.26166
[1000]	validation_0-rmse:0.71794
[1086]	validation_0-rmse:0.71808


[32m[I 2021-09-01 01:36:41,010][0m Trial 1 finished with value: 0.7176918745026623 and parameters: {'learning_rate': 0.06664684744753252, 'reg_lambda': 0.018108864693262686, 'reg_alpha': 36.20447627410865, 'subsample': 0.8171198858875128, 'colsample_bytree': 0.9264069112538692, 'max_depth': 6}. Best is trial 1 with value: 0.7176918745026623.[0m


[0]	validation_0-rmse:7.08504
[442]	validation_0-rmse:0.73064


[32m[I 2021-09-01 01:36:54,492][0m Trial 2 finished with value: 0.7235744320960131 and parameters: {'learning_rate': 0.08954578797888178, 'reg_lambda': 1.928061183994041, 'reg_alpha': 3.621485314970313e-07, 'subsample': 0.5615787273460423, 'colsample_bytree': 0.725498930130339, 'max_depth': 9}. Best is trial 1 with value: 0.7176918745026623.[0m


[0]	validation_0-rmse:7.43607
[520]	validation_0-rmse:0.72659


[32m[I 2021-09-01 01:37:18,523][0m Trial 3 finished with value: 0.7240961981891074 and parameters: {'learning_rate': 0.044003648985582976, 'reg_lambda': 3.931013213625755e-05, 'reg_alpha': 7.990103809461327e-06, 'subsample': 0.32260766054084444, 'colsample_bytree': 0.32663613159565374, 'max_depth': 10}. Best is trial 1 with value: 0.7176918745026623.[0m


[0]	validation_0-rmse:7.49610
[1000]	validation_0-rmse:0.72160
[1002]	validation_0-rmse:0.72160


[32m[I 2021-09-01 01:37:29,725][0m Trial 4 finished with value: 0.7209834010553372 and parameters: {'learning_rate': 0.03621325990966776, 'reg_lambda': 0.013281140637444693, 'reg_alpha': 0.0040249175840555386, 'subsample': 0.3514272455549661, 'colsample_bytree': 0.7301529090132135, 'max_depth': 7}. Best is trial 1 with value: 0.7176918745026623.[0m


[0]	validation_0-rmse:7.26166
[1000]	validation_0-rmse:0.71788
[1076]	validation_0-rmse:0.71805
[0]	validation_0-rmse:7.26499
[1000]	validation_0-rmse:0.72562
[1125]	validation_0-rmse:0.72585
[0]	validation_0-rmse:7.26102
[1000]	validation_0-rmse:0.72076
[1215]	validation_0-rmse:0.72103
[0]	validation_0-rmse:7.26746
[1000]	validation_0-rmse:0.72013
[1117]	validation_0-rmse:0.72031
[0]	validation_0-rmse:7.26718
[1000]	validation_0-rmse:0.71575
[1142]	validation_0-rmse:0.71590
MSE: [0.7175903370259966, 0.7255340176048198, 0.7206253625231483, 0.7200180255998814, 0.7155452285583133]
Test data predictions:             id    pred_1
0            0  8.070074
1            5  8.362631
2           15  8.372488
3           16  8.466038
4           17  8.137090
...        ...       ...
199995  499987  8.085490
199996  499990  8.414785
199997  499991  8.434820
199998  499994  8.084948
199999  499995  7.998491

[200000 rows x 2 columns]


# 2nd model
#### where:
##### **K-fold** to split the training data(`train.csv`) for training and testing where folds = 5
##### **Target Encoding** to get numerical `median` values of categorical values
##### **One Hot Encoding** on optuna_model_1 to get numerical values of categorical values
##### **Ordinal Encoding** on model_1 to get numerical values of categorical values
##### **XGB Regressor** to make the model and predict validation data and test data n estimators=5000
____________________________________________________________________________________________________
## Reading data from csv files
* `train.csv`
* `test.csv`
* `sample_submission.csv`

In [4]:
# 2nd model
def model_2():
    train_data = pd.read_csv("../input/30-days-of-ml/train.csv")
    test_data = pd.read_csv("../input/30-days-of-ml/test.csv")
    test_preds_2 = sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
    
    # K-fold splitting where total folds = 5
    train_data["kfold"] = -1
    Kf_model = KFold(n_splits=5, random_state=1, shuffle=True)
    # fold (0, 1, 2, 3, 4); train_index (0, 2, 3...); valid_index(1, 4, 6)
    for fold, (train_index, valid_index) in enumerate(Kf_model.split(X=train_data)):
        train_data.loc[valid_index, "kfold"] = fold
    
    #==============================================================================================================================================================================#
    # Setting required models
    num_cols = [col for col in train_data.columns if "cont" in col]
    cat_cols = [col for col in train_data.columns if "cat" in col]
    useful_cols = cat_cols + num_cols
    model_2_train_data = train_data
    model_2_test_data = test_data
    test_data = test_data[useful_cols]
    
    #==============================================================================================================================================================================#
    # Target Encoding
    # Iterate over categorical columns
    for col in cat_cols:
        """
        Based on each categorical column, one target fold is created
        total 5 folds for one column
        """
        temp_train = []
        temp_test_target = None
        for fold in range(5):
            # making training data and validating data for each fold
            X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
            X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
            
            # getting the mean of training data target
            mean_target = dict(X_train.groupby(col)["target"].agg("median"))

            # adding the mean_target to X_valid (valid dataset of each fold)
            X_valid.loc[:, f"target_enc_{col}"] = X_valid[col].map(mean_target)
            temp_train.append(X_valid)
            if (temp_test_target is None):
                temp_test_target = test_data[col].map(mean_target)
            else:
                temp_test_target += test_data[col].map(mean_target)
                
        # getting the average of temporary test target on each column
        temp_test_target /= 5
        # adding the temporary test target to test data on each column (total=5)
        test_data.loc[:, f"target_enc_{col}"] = temp_test_target

        # setting training data as temp_train
        train_data = pd.concat(temp_train)
        
    #==============================================================================================================================================================================#
    # Setting required columns again
    useful_cols = [col for col in train_data.columns if ((train_data[col].dtypes == "int64") or (train_data[col].dtypes =="float64")) and (col not in ("id"))]
    cat_cols = [col for col in train_data.columns if (train_data[col].dtypes == "object")]
    useful_cols = cat_cols + num_cols
    train_data = train_data[useful_cols]
    test_data = test_data[useful_cols[:24] + useful_cols[26:]]
    
    #==============================================================================================================================================================================#
    # optuna model 2 for final model 2
    def optuna_model_2(trial):
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------#
        # Setting required columns again
        useful_cols = [col for col in model_2_train_data.columns if ((model_2_train_data[col].dtypes == "int64") or (model_2_train_data[col].dtypes =="float64")) and (col != "id")]
        cat_cols = [col for col in model_2_test_data.columns if (model_2_test_data[col].dtypes == "object")]
        useful_cols = cat_cols + num_cols
        train_data = model_2_train_data[useful_cols]
        test_data = model_2_test_data[useful_cols[:24] + useful_cols[26:]]

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------#
        # making parameteres
        learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.05, log=True)
        reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
        reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
        subsample = trial.suggest_float("subsample", 0.1, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
        max_depth = trial.suggest_int("max_depth", 1, 10)
        
        fold = 0
        # making training data and validating data for each fold
        X_train = model_2_train_data[model_2_train_data.kfold != fold].reset_index(drop=True)
        X_valid = model_2_train_data[model_2_train_data.kfold == fold].reset_index(drop=True)
        
        X_test = test_data.copy()
        
        # setting the target
        y_train = X_train.target
        y_valid = X_valid.target

        # setting the training data and validating data
        X_train = X_train[test_data.columns]
        X_valid = X_valid[test_data.columns]

        ## One Hot Encoding for categorical data
        OH_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

        # fitting and transforming the training and test data
        OH_encoder.fit(X_train[cat_cols])
        OH_X_train = OH_encoder.transform(X_train[cat_cols])
        OH_X_valid = OH_encoder.transform(X_valid[cat_cols])

        # Naming the one hot encoded columns
        OH_X_train = pd.DataFrame(OH_X_train, columns=[f"ohe_{i}" for i in range(OH_X_train.shape[1])])
        OH_X_valid = pd.DataFrame(OH_X_valid, columns=[f"ohe_{i}" for i in range(OH_X_valid.shape[1])])

        # Adding one hot encoded columns to main data (training, validating, test)
        X_train = pd.concat([X_train, OH_X_train], axis=1)
        X_valid = pd.concat([X_valid, OH_X_valid], axis=1)

        # Dropping the categorical columns, as their one hot encoded columns are added to main data
        X_train = X_train.drop(cat_cols, axis=1)
        X_valid = X_valid.drop(cat_cols, axis=1)

        # making model
        model = XGBRegressor(
            n_estimators=5000,
            learning_rate=learning_rate,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            max_depth=max_depth,
            tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",
            random_state=100,
        )
        # fitting data into the model
        model.fit(X_train, y_train, early_stopping_rounds=500,
                  eval_set=[(X_valid, y_valid)], verbose=2000)

        # getting predictions
        valid_preds = model.predict(X_valid)

        # getting RMSE
        rmse = mean_squared_error(y_valid, valid_preds, squared=False)
        return rmse
    #==============================================================================================================================================================================#
    # Optimizing optuna_model and getting best parameters
    optuna_2 = optuna.create_study(direction="minimize")
    optuna_2.optimize(optuna_model_2, n_trials=5)
    best_params_model_2 = optuna_2.best_params
    #==============================================================================================================================================================================#
    # final_model made by XGB Regressor
    final_valid_predictions = {}
    final_test_predictions = []
    scores = []

    for fold in range(5):
        # making training data and validating data for each fold
        X_train = model_2_train_data[model_2_train_data.kfold != fold].reset_index(drop=True)
        X_valid = model_2_train_data[model_2_train_data.kfold == fold].reset_index(drop=True)

        X_test = test_data.copy()

        valid_ids = X_valid.id.values.tolist()
        
        y_train = X_train.target
        y_valid = X_valid.target

        X_train = X_train[test_data.columns]
        X_valid = X_valid[test_data.columns]
        
        ## Ordinal for categorical data
        Ord_encoder = OrdinalEncoder()

        # fitting and transforming the training and test data
        Ord_encoder.fit(X_train[cat_cols])
        X_train[cat_cols] = Ord_encoder.transform(X_train[cat_cols])
        X_valid[cat_cols] = Ord_encoder.transform(X_valid[cat_cols])
        X_test[cat_cols] = Ord_encoder.transform(X_test[cat_cols])

        # making the model
        model = XGBRegressor(
            n_estimators=5000,
            **best_params_model_2,
            tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",
            random_state=100,
        )
        # fitting the data in the model
        model.fit(X_train, y_train, early_stopping_rounds=500,
                  eval_set=[(X_valid, y_valid)], verbose=2000)

        # getting valid predictions and test predictions
        valid_preds = model.predict(X_valid)
        test_preds = model.predict(X_test)
        final_valid_predictions.update(dict(zip(valid_ids, valid_preds)))
        final_test_predictions.append(test_preds)

        # rmse on valid predictions
        rmse = mean_squared_error(y_valid, valid_preds, squared=False)
        scores.append(rmse)
    
    #==============================================================================================================================================================================#
    valid_preds_2 = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
    valid_preds_2.columns = ["id", "pred_2"]
    valid_preds_2.to_csv("train_preds_2.csv", index=False)
    
    test_preds_2.target = np.mean(np.column_stack(final_test_predictions), axis=1)
    test_preds_2.columns = ["id", "pred_2"]
    test_preds_2.to_csv("test_preds_2.csv", index=False)
    return scores, test_preds_2

## MSE and Test Predictions on model 2

In [5]:
# model_2 MSE and test_predictions
mse_2, final_preds_2 = model_2()
print(f"MSE: {mse_2}\nTest data predictions: {final_preds_2}")

[32m[I 2021-09-01 01:38:42,138][0m A new study created in memory with name: no-name-b332ef5c-d474-4db4-8622-4c7f60d82625[0m


[0]	validation_0-rmse:7.46870
[2000]	validation_0-rmse:0.72055
[4000]	validation_0-rmse:0.71731
[4999]	validation_0-rmse:0.71665


[32m[I 2021-09-01 01:39:00,109][0m Trial 0 finished with value: 0.716648121574307 and parameters: {'learning_rate': 0.03977377364433992, 'reg_lambda': 4.411299029154334e-06, 'reg_alpha': 0.0001810029330122166, 'subsample': 0.822184849960225, 'colsample_bytree': 0.3429127494185461, 'max_depth': 2}. Best is trial 0 with value: 0.716648121574307.[0m


[0]	validation_0-rmse:7.66283
[2000]	validation_0-rmse:0.71789
[4000]	validation_0-rmse:0.71625
[4999]	validation_0-rmse:0.71615


[32m[I 2021-09-01 01:39:28,994][0m Trial 1 finished with value: 0.7161394154357016 and parameters: {'learning_rate': 0.01457554904308877, 'reg_lambda': 1.0992630150897692e-05, 'reg_alpha': 8.34628164572927e-07, 'subsample': 0.8711763606429522, 'colsample_bytree': 0.27848310455534064, 'max_depth': 5}. Best is trial 1 with value: 0.7161394154357016.[0m


[0]	validation_0-rmse:7.67449
[2000]	validation_0-rmse:0.71993
[4000]	validation_0-rmse:0.71738
[4999]	validation_0-rmse:0.71690


[32m[I 2021-09-01 01:39:52,634][0m Trial 2 finished with value: 0.7168974430479758 and parameters: {'learning_rate': 0.013063915233843178, 'reg_lambda': 4.9083797513294517e-05, 'reg_alpha': 6.268155107812823e-08, 'subsample': 0.4368924520929911, 'colsample_bytree': 0.41471170215563125, 'max_depth': 4}. Best is trial 1 with value: 0.7161394154357016.[0m


[0]	validation_0-rmse:7.60661
[2000]	validation_0-rmse:0.72090
[4000]	validation_0-rmse:0.71832
[4999]	validation_0-rmse:0.71776


[32m[I 2021-09-01 01:40:13,394][0m Trial 3 finished with value: 0.7177591065596528 and parameters: {'learning_rate': 0.021876088494678037, 'reg_lambda': 0.013567567916936871, 'reg_alpha': 70.74015350955486, 'subsample': 0.5360129060849486, 'colsample_bytree': 0.55202670782887, 'max_depth': 3}. Best is trial 1 with value: 0.7161394154357016.[0m


[0]	validation_0-rmse:7.56972
[2000]	validation_0-rmse:0.72859
[4000]	validation_0-rmse:0.72612
[4999]	validation_0-rmse:0.72527


[32m[I 2021-09-01 01:40:28,636][0m Trial 4 finished with value: 0.7252652354124913 and parameters: {'learning_rate': 0.02666227099329454, 'reg_lambda': 6.44329044918662e-06, 'reg_alpha': 2.4751915432790983, 'subsample': 0.5863318545554217, 'colsample_bytree': 0.9131362150130748, 'max_depth': 1}. Best is trial 1 with value: 0.7161394154357016.[0m


[0]	validation_0-rmse:7.66284
[2000]	validation_0-rmse:0.71777
[4000]	validation_0-rmse:0.71622
[4999]	validation_0-rmse:0.71607
[0]	validation_0-rmse:7.66605
[2000]	validation_0-rmse:0.72569
[4000]	validation_0-rmse:0.72407
[4999]	validation_0-rmse:0.72391
[0]	validation_0-rmse:7.66214
[2000]	validation_0-rmse:0.72129
[4000]	validation_0-rmse:0.71969
[4999]	validation_0-rmse:0.71948
[0]	validation_0-rmse:7.66851
[2000]	validation_0-rmse:0.72048
[4000]	validation_0-rmse:0.71906
[4999]	validation_0-rmse:0.71896
[0]	validation_0-rmse:7.66833
[2000]	validation_0-rmse:0.71591
[4000]	validation_0-rmse:0.71421
[4999]	validation_0-rmse:0.71401
MSE: [0.7160550903886324, 0.7239131575714605, 0.7194784348760122, 0.7189435754938459, 0.714003905062061]
Test data predictions:             id    pred_2
0            0  8.082083
1            5  8.389063
2           15  8.400261
3           16  8.462927
4           17  8.182848
...        ...       ...
199995  499987  8.081489
199996  499990  8.416911
19

# 3rd model
#### where:
##### **K-fold** to split the training data(`train.csv`) for training and testing where folds = 5
##### **Target Encoding** to get numerical `median` values of categorical values
##### **One Hot Encoding** on optuna_model_1 to get numerical values of categorical values
##### **Ordinal Encoding** on model_1 to get numerical values of categorical values
##### **XGB Regressor** to make the model and predict validation data and test data n estimators=10000
____________________________________________________________________________________________________
## Reading data from csv files
* `train.csv`
* `test.csv`
* `sample_submission.csv`

In [6]:
# 3rd model
def model_3():
    train_data = pd.read_csv("../input/30-days-of-ml/train.csv")
    test_data = pd.read_csv("../input/30-days-of-ml/test.csv")
    test_preds_3 = sample_submission_3 = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
    
    # K-fold splitting where total folds = 5
    train_data["kfold"] = -1
    Kf_model = KFold(n_splits=5, random_state=1, shuffle=True)
    # fold (0, 1, 2, 3, 4); train_index (0, 2, 3...); valid_index(1, 4, 6)
    for fold, (train_index, valid_index) in enumerate(Kf_model.split(X=train_data)):
        train_data.loc[valid_index, "kfold"] = fold
    
    #==============================================================================================================================================================================#
    # Setting required models
    num_cols = [col for col in train_data.columns if "cont" in col]
    cat_cols = [col for col in train_data.columns if "cat" in col]
    useful_cols = cat_cols + num_cols
    model_3_train_data = train_data
    model_3_test_data = test_data
    test_data = test_data[useful_cols]
    
    #==============================================================================================================================================================================#
    # Target Encoding
    # Iterate over categorical columns
    for col in cat_cols:
        """
        Based on each categorical column, one target fold is created
        total 5 folds for one column
        """
        temp_train = []
        temp_test_target = None
        for fold in range(5):
            # making training data and validating data for each fold
            X_train = train_data[train_data.kfold != fold].reset_index(drop=True)
            X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
            
            # getting the mean of training data target
            mean_target = dict(X_train.groupby(col)["target"].agg("median"))

            # adding the mean_target to X_valid (valid dataset of each fold)
            X_valid.loc[:, f"target_enc_{col}"] = X_valid[col].map(mean_target)
            temp_train.append(X_valid)
            if (temp_test_target is None):
                temp_test_target = test_data[col].map(mean_target)
            else:
                temp_test_target += test_data[col].map(mean_target)
                
        # getting the average of temporary test target on each column
        temp_test_target /= 5
        # adding the temporary test target to test data on each column (total=5)
        test_data.loc[:, f"target_enc_{col}"] = temp_test_target

        # setting training data as temp_train
        train_data = pd.concat(temp_train)
        
    #==============================================================================================================================================================================#
    # Setting required columns again
    useful_cols = [col for col in train_data.columns if ((train_data[col].dtypes == "int64") or (train_data[col].dtypes =="float64")) and (col not in ("id"))]
    cat_cols = [col for col in train_data.columns if (train_data[col].dtypes == "object")]
    useful_cols = cat_cols + num_cols
    train_data = train_data[useful_cols]
    test_data = test_data[useful_cols[:24] + useful_cols[26:]]
    
    #==============================================================================================================================================================================#
    # optuna model 3 for final model 3
    def optuna_model_3(trial):
        
        #---------------------------------------------------------------------------------------------------------------------------------------------------------------#
        # Setting required columns again
        useful_cols = [col for col in model_3_train_data.columns if ((model_3_train_data[col].dtypes == "int64") or (model_3_train_data[col].dtypes =="float64")) and (col != "id")]
        cat_cols = [col for col in model_3_test_data.columns if (model_3_test_data[col].dtypes == "object")]
        useful_cols = cat_cols + num_cols
        train_data = model_3_train_data[useful_cols]
        test_data = model_3_test_data[useful_cols[:24] + useful_cols[26:]]

        #---------------------------------------------------------------------------------------------------------------------------------------------------------------#
        # making parameteres
        learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.5, log=True)
        reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
        reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
        subsample = trial.suggest_float("subsample", 0.1, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
        max_depth = trial.suggest_int("max_depth", 1, 10)
        
        fold = 0
        # making training data and validating data for each fold
        X_train = model_3_train_data[model_3_train_data.kfold != fold].reset_index(drop=True)
        X_valid = model_3_train_data[model_3_train_data.kfold == fold].reset_index(drop=True)

        # setting the target
        y_train = X_train.target
        y_valid = X_valid.target

        # setting the training data and validating data
        X_train = X_train[test_data.columns]
        X_valid = X_valid[test_data.columns]

        ## Ordinal for categorical data
        Ord_encoder = OrdinalEncoder()

        # fitting and transforming the training and test data
        Ord_encoder.fit(X_train[cat_cols])
        X_train[cat_cols] = Ord_encoder.transform(X_train[cat_cols])
        X_valid[cat_cols] = Ord_encoder.transform(X_valid[cat_cols])

        # making model
        model = XGBRegressor(
            n_estimators=10000,
            learning_rate=learning_rate,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            max_depth=max_depth,
            tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",
            random_state=100,
        )
        # fitting data into the model
        model.fit(X_train, y_train, early_stopping_rounds=1000,
                  eval_set=[(X_valid, y_valid)], verbose=5000)

        # getting predictions
        valid_preds = model.predict(X_valid)

        # getting RMSE
        rmse = mean_squared_error(y_valid, valid_preds, squared=False)
        return rmse
    #==============================================================================================================================================================================#
    
    # Optimizing optuna_model and getting best parameters
    optuna_3 = optuna.create_study(direction="minimize")
    optuna_3.optimize(optuna_model_3, n_trials=5)
    best_params_model_3 = optuna_3.best_params
    #==============================================================================================================================================================================#
    
    # final_model made by XGB Regressor
    final_valid_predictions = {}
    final_test_predictions = []
    scores = []

    for fold in range(5):
        # making training data and validating data for each fold
        X_train = model_3_train_data[model_3_train_data.kfold != fold].reset_index(drop=True)
        X_valid = model_3_train_data[model_3_train_data.kfold == fold].reset_index(drop=True)

        X_test = test_data.copy()

        valid_ids = X_valid.id.values.tolist()
        
        y_train = X_train.target
        y_valid = X_valid.target

        X_train = X_train[test_data.columns]
        X_valid = X_valid[test_data.columns]

         ## Ordinal for categorical data
        Ord_encoder = OrdinalEncoder()

        # fitting and transforming the training and test data
        Ord_encoder.fit(X_train[cat_cols])
        X_train[cat_cols] = Ord_encoder.transform(X_train[cat_cols])
        X_valid[cat_cols] = Ord_encoder.transform(X_valid[cat_cols])
        X_test[cat_cols] = Ord_encoder.transform(X_test[cat_cols])

        # making the model
        model = XGBRegressor(
            n_estimators=10000,
            **best_params_model_3,
            tree_method="gpu_hist",
            gpu_id=0,
            predictor="gpu_predictor",
            random_state=100,
        )
        # fitting the data in the model
        model.fit(X_train, y_train, early_stopping_rounds=1000,
                  eval_set=[(X_valid, y_valid)], verbose=5000)

        # getting valid predictions and test predictions
        valid_preds = model.predict(X_valid)
        test_preds = model.predict(X_test)
        final_valid_predictions.update(dict(zip(valid_ids, valid_preds)))
        final_test_predictions.append(test_preds)

        # rmse on valid predictions
        rmse = mean_squared_error(y_valid, valid_preds, squared=False)
        scores.append(rmse)
    
    #==============================================================================================================================================================================#
    valid_preds_3 = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
    valid_preds_3.columns = ["id", "pred_3"]
    valid_preds_3.to_csv("train_preds_3.csv", index=False)
    
    test_preds_3.target = np.mean(np.column_stack(final_test_predictions), axis=1)
    test_preds_3.columns = ["id", "pred_3"]
    test_preds_3.to_csv("test_preds_3.csv", index=False)
    return scores, test_preds_3

## MSE and Test Predictions on model 3

In [7]:
# model_3 MSE and test_predictions
mse_3, final_preds_3 = model_3()
print(f"MSE: {mse_3}\nTest data predictions: {final_preds_3}")

[32m[I 2021-09-01 01:42:36,477][0m A new study created in memory with name: no-name-070f35b2-0d27-44a8-9414-60f66e46974c[0m


[0]	validation_0-rmse:4.93026
[1044]	validation_0-rmse:0.78538


[32m[I 2021-09-01 01:42:53,200][0m Trial 0 finished with value: 0.7234899167795706 and parameters: {'learning_rate': 0.3704086420738438, 'reg_lambda': 58.44132678232211, 'reg_alpha': 3.436644860302557, 'subsample': 0.4741806916865723, 'colsample_bytree': 0.2162045932217823, 'max_depth': 8}. Best is trial 0 with value: 0.7234899167795706.[0m


[0]	validation_0-rmse:7.46218
[1907]	validation_0-rmse:0.71786


[32m[I 2021-09-01 01:43:10,465][0m Trial 1 finished with value: 0.7168681313976885 and parameters: {'learning_rate': 0.04062768193511658, 'reg_lambda': 10.06719904269574, 'reg_alpha': 11.079171880172636, 'subsample': 0.5040656301684638, 'colsample_bytree': 0.28190955566916, 'max_depth': 7}. Best is trial 1 with value: 0.7168681313976885.[0m


[0]	validation_0-rmse:7.67652
[5000]	validation_0-rmse:0.71707
[5894]	validation_0-rmse:0.71712


[32m[I 2021-09-01 01:43:36,072][0m Trial 2 finished with value: 0.7170435206401045 and parameters: {'learning_rate': 0.012797488456053238, 'reg_lambda': 1.3190033017972656e-07, 'reg_alpha': 0.0012425080786613982, 'subsample': 0.9347587316738429, 'colsample_bytree': 0.5545234943960127, 'max_depth': 5}. Best is trial 1 with value: 0.7168681313976885.[0m


[0]	validation_0-rmse:6.80807
[1176]	validation_0-rmse:0.73234


[32m[I 2021-09-01 01:44:11,463][0m Trial 3 finished with value: 0.7194809320263664 and parameters: {'learning_rate': 0.12555913631539878, 'reg_lambda': 5.0683845546986964e-08, 'reg_alpha': 2.827968746968803, 'subsample': 0.9998097181189507, 'colsample_bytree': 0.13613448200317219, 'max_depth': 10}. Best is trial 1 with value: 0.7168681313976885.[0m


[0]	validation_0-rmse:4.47806
[1043]	validation_0-rmse:0.77762


[32m[I 2021-09-01 01:44:19,756][0m Trial 4 finished with value: 0.7243339131541159 and parameters: {'learning_rate': 0.4294271113748058, 'reg_lambda': 21.710833827518666, 'reg_alpha': 1.279712114236112e-07, 'subsample': 0.7143892197440618, 'colsample_bytree': 0.6666124639917888, 'max_depth': 6}. Best is trial 1 with value: 0.7168681313976885.[0m


[0]	validation_0-rmse:7.46218
[1908]	validation_0-rmse:0.71787
[0]	validation_0-rmse:7.46545
[2014]	validation_0-rmse:0.72594
[0]	validation_0-rmse:7.46141
[1997]	validation_0-rmse:0.72106
[0]	validation_0-rmse:7.46787
[1979]	validation_0-rmse:0.72086
[0]	validation_0-rmse:7.46767
[2142]	validation_0-rmse:0.71634
MSE: [0.7168681313976885, 0.724654647863053, 0.7199917763674756, 0.7195306230476601, 0.7150173920468939]
Test data predictions:             id    pred_3
0            0  8.110340
1            5  8.352780
2           15  8.397600
3           16  8.441732
4           17  8.148008
...        ...       ...
199995  499987  8.057126
199996  499990  8.411471
199997  499991  8.436836
199998  499994  8.081011
199999  499995  7.965829

[200000 rows x 2 columns]


## Reading data from csv files
* `train.csv`
* `test.csv`
* `sample_submission.csv`
### Training predicted data
* `train_pred_1.csv`
* `train_pred_2.csv`
* `train_pred_3.csv`
### Test predicted data
* `test_pred_1.csv`
* `test_pred_2.csv`
* `test_pred_3.csv`

In [8]:
train_data = pd.read_csv("../input/30-days-of-ml/train.csv")
test_data = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

train_pred_1 = pd.read_csv("train_preds_1.csv")
train_pred_2 = pd.read_csv("train_preds_2.csv")
train_pred_3 = pd.read_csv("train_preds_3.csv")

test_pred_1 = pd.read_csv("test_preds_1.csv")
test_pred_2 = pd.read_csv("test_preds_2.csv")
test_pred_3 = pd.read_csv("test_preds_3.csv")

train_data = train_data.merge(train_pred_1, on="id", how="left")
train_data = train_data.merge(train_pred_2, on="id", how="left")
train_data = train_data.merge(train_pred_3, on="id", how="left")

test_data = pd.concat([test_data, test_pred_1, test_pred_2, test_pred_3], join="inner", axis=1)
test_data = test_data.drop("id", axis=1)
test_data["id"] = test_pred_1["id"]

### Getting useful features/columns from test data

In [9]:
useful_cols = ["pred_1", "pred_2", "pred_3"]
test_data = test_data[useful_cols]
test_data.head()

Unnamed: 0,pred_1,pred_2,pred_3
0,8.070074,8.082083,8.11034
1,8.362631,8.389063,8.35278
2,8.372488,8.400261,8.3976
3,8.466038,8.462927,8.441732
4,8.13709,8.182848,8.148008


## Final Model
#### where:
##### **K-fold** to split the training data(`train_dadta`) for training and testing
##### **Target Encoding** to get numerical `median` values of categorical values
##### **Linear Regression** to make the model and predict validation data and test data

In [10]:
# K-fold splitting where total folds = 5
train_data["kfold"] = -1
Kf_model = KFold(n_splits=5, random_state=1, shuffle=True)
# fold (0, 1, 2, 3, 4); train_index (0, 2, 3...); valid_index(1, 4, 6)
for fold, (train_index, valid_index) in enumerate(Kf_model.split(X=train_data)):
    train_data.loc[valid_index, "kfold"] = fold

In [11]:
useful_features = ["pred_1", "pred_2", "pred_3"]
test_data = test_data[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    X_train =  train_data[train_data.kfold != fold].reset_index(drop=True)
    X_valid = train_data[train_data.kfold == fold].reset_index(drop=True)
    X_test = test_data.copy()

    y_train = X_train.target
    y_valid = X_valid.target
    
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    valid_preds = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, valid_preds, squared=False)
    print(f"Fold: {fold}, RMSE: {rmse}, \nTest predictions{test_preds}")
    scores.append(rmse)

Fold: 0, RMSE: 0.7159473211978672, 
Test predictions[8.08463427 8.38272217 8.40257001 ... 8.44349284 8.11749513 7.9118475 ]
Fold: 1, RMSE: 0.7237873644001014, 
Test predictions[8.08450619 8.38252023 8.40210127 ... 8.44308438 8.11768394 7.91229768]
Fold: 2, RMSE: 0.7193191382604242, 
Test predictions[8.08424944 8.38462479 8.40437404 ... 8.44481154 8.11897187 7.90718247]
Fold: 3, RMSE: 0.7188223838165171, 
Test predictions[8.08234949 8.38306543 8.40307351 ... 8.4435804  8.11668739 7.90494678]
Fold: 4, RMSE: 0.7139512618883442, 
Test predictions[8.08355385 8.38199414 8.40270361 ... 8.44361367 8.1151673  7.90950595]


# Submitting the Final Predictions

In [12]:
sample_submission.target = np.mean(np.column_stack(final_predictions), axis=1)
sample_submission.to_csv("submission.csv", index=False)