### Importing Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor 

### Load Data

In [None]:
# Read the competition's train and test CSV files into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-jan-2021/train.csv",
        "/kaggle/input/tabular-playground-series-jan-2021/test.csv",
    )
)

In [None]:
# Preview the first rows of the training data as loaded from the CSV.
df_train.head()

In [None]:
# Use the "id" column as the row index so it is not treated as a feature.
# The guard keeps this cell idempotent: once "id" has been moved to the index
# it is no longer a column, and an unguarded set_index("id") would raise KeyError
# when the cell is re-run.
if "id" in df_train.columns:
    df_train = df_train.set_index("id")
df_train.head()

### EDA of train data

In [None]:
# Print column dtypes, non-null counts and memory usage of the training frame.
df_train.info()

In [None]:
# Shape of the sub-frame holding rows that repeat an earlier row
# (the first occurrence of each duplicate group is not flagged).
duplicate_mask = df_train.duplicated(keep="first")
df_train.loc[duplicate_mask].shape

In [None]:
# Summary statistics, transposed so that each row describes one column.
df_train.describe().T

In [None]:
# Names of the model input features: every column whose name contains "cont".
input_columns = list(filter(lambda name: "cont" in name, df_train.columns))

In [None]:
# One box plot per input feature, laid out on a two-row grid.
# The training frame in this notebook is `df_train`; the name `df` is never
# defined, so referencing it fails with NameError on a fresh kernel.
n_rows = 2
# Ceiling division so every feature gets a panel; at least one column.
n_cols = max(1, -(-len(input_columns) // n_rows))

plt.figure(figsize=(20, 10))
for index, feature in enumerate(input_columns):
    ax = plt.subplot(n_rows, n_cols, index + 1)
    # The x axis carries no information for a single box, so hide it.
    ax.get_xaxis().set_visible(False)
    sns.boxplot(data=df_train[feature], width=0.2, ax=ax).set_title(feature)

In [None]:
# Histogram (with KDE) of every input feature on a shared two-row grid.
# sns.displot is a figure-level function: each call opens its own figure and
# ignores the current subplot, which leaves the grid empty. The axes-level
# sns.histplot draws into the axes it is given.
n_rows = 2
# Ceiling division so every feature gets a panel; at least one column.
n_cols = max(1, -(-len(input_columns) // n_rows))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10), squeeze=False)
panels = axes.ravel()
for ax, feature in zip(panels, input_columns):
    sns.histplot(df_train[feature], color="blue", kde=True, bins=120, ax=ax)
    ax.get_xaxis().set_visible(False)
    ax.set_title(feature)

# Hide any panel left over when the feature count does not fill the grid.
for ax in panels[len(input_columns):]:
    ax.set_visible(False)

In [None]:
# Horizontal box plots of all input features on one axes; the mean of each
# feature is drawn as a line in addition to the median.
box_plot_options = dict(figsize=(14, 6), vert=False, meanline=True, showmeans=True)
df_train[input_columns].plot.box(**box_plot_options);

In [None]:
# Distribution of the target variable.
# displot is figure-level and creates its own figure, so a preceding
# plt.figure() call would only add a stray empty figure to the output.
sns.displot(df_train["target"]);

In [None]:
# Horizontal box plot of the target, with its mean drawn as a line.
target_box_options = dict(figsize=(14, 6), vert=False, meanline=True, showmeans=True)
df_train["target"].plot.box(**target_box_options);

In [None]:
# Pairwise correlation matrix of all columns, rendered as a colour-coded table
# with three decimals. Styler.set_precision is deprecated and no longer exists
# in pandas 2.x; Styler.format with an explicit format string gives the same
# display.
corr = df_train.corr()
corr.style.background_gradient(cmap='coolwarm').format("{:.3f}")

In [None]:
# Hold out 10% of the labelled rows for a final check of the selected model.
# Note that X_test / y_test come from df_train, not from the df_test frame.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.loc[:, input_columns],
    df_train.loc[:, "target"],
    random_state=27,
    test_size=0.1,
)

### Model selection

In [None]:
# Base estimators compared by the grid searches below.
# Each one involves randomness (random splits, bootstrapping, subsampling), so
# a fixed seed is passed to make the model comparison repeatable across runs.
# The value matches the seed used for the train/validation split.
MODEL_SEED = 27

tree_regressor = DecisionTreeRegressor(random_state=MODEL_SEED)
random_forests_regressor = RandomForestRegressor(random_state=MODEL_SEED)
gradient_boosting_regressor = GradientBoostingRegressor(random_state=MODEL_SEED)
ada_boost_regressor = AdaBoostRegressor(random_state=MODEL_SEED)
xgboost_regressor = XGBRegressor(random_state=MODEL_SEED)
lgb_regressor = LGBMRegressor(random_state=MODEL_SEED)

In [None]:
# Hyper-parameter grids, one per estimator, consumed by GridSearchCV.
#
# The tree and forest grids do not list "criterion": the only value searched
# was "mse", which is the estimators' default split criterion. Current
# scikit-learn spells it "squared_error" and rejects "mse", so relying on the
# default keeps the grids valid under either spelling.
grid_tree_regressor = {"splitter": ["best", "random"],
                       "max_depth": [2, 3, 4, 5, 6, None],
                       "min_samples_split": [2, 5, 10],
                       "min_samples_leaf": [2, 5, 10]
                       }

grid_random_forests_regressor = {"n_estimators": [150],
                                 "max_depth": [3, 5, 10, 15, 20],
                                 "max_features": ["sqrt", 3, 4, 5, 6]
                                 }

grid_gradient_boosting_regressor = {"learning_rate": [0.05, 0.10, 0.50],
                                    "n_estimators": [50, 100, 200],
                                    "max_depth": [1, 2, 3, 4, 5, 6],
                                    "subsample": [1.0, 0.8, 0.5],
                                    "max_features": ["sqrt", 3, 4, 5, 6]
                                    }

grid_ada_boost_regressor = {"loss": ["linear", "square", "exponential"],
                            "learning_rate": [0.05, 0.10, 0.50],
                            "n_estimators": [10, 20, 50, 100, 500, 1000]
                            }

# NOTE(review): max_depth 0 presumably means "no depth limit" for XGBoost;
# confirm that this is intended and supported by the tree method in use.
grid_xgboost_regressor = {"learning_rate": [0.05, 0.10, 0.50],
                          "n_estimators": [10, 100, 1000],
                          "max_depth": [2, 3, 4, 5, 6, 0]
                          }

grid_lgb_regressor = {"n_estimators": [1000, 2000, 5000],
                      "importance_type": ["split"]
                      }

In [None]:
# Every search uses the same protocol: 5-fold cross-validation scored by
# negative MSE, all available cores, progress messages, and train scores kept.
grid_search_options = dict(cv=5,
                           scoring="neg_mean_squared_error",
                           verbose=1,
                           n_jobs=-1,
                           return_train_score=True)

gs_tree_regressor = GridSearchCV(
    tree_regressor, grid_tree_regressor, **grid_search_options)

gs_random_forests_regressor = GridSearchCV(
    random_forests_regressor, grid_random_forests_regressor, **grid_search_options)

gs_gradient_boosting_regressor = GridSearchCV(
    gradient_boosting_regressor, grid_gradient_boosting_regressor, **grid_search_options)

gs_ada_boost_regressor = GridSearchCV(
    ada_boost_regressor, grid_ada_boost_regressor, **grid_search_options)

gs_xgboost_regressor = GridSearchCV(
    xgboost_regressor, grid_xgboost_regressor, **grid_search_options)

gs_lgb_regressor = GridSearchCV(
    lgb_regressor, grid_lgb_regressor, **grid_search_options)

In [None]:
# Registry mapping a readable name to each grid-search object, so they can be
# fitted and compared in a loop.
all_grid_searchs_regresion = dict(
    gs_tree_regressor=gs_tree_regressor,
    gs_random_forests_regressor=gs_random_forests_regressor,
    gs_gradient_boosting_regressor=gs_gradient_boosting_regressor,
    gs_ada_boost_regressor=gs_ada_boost_regressor,
    gs_xgboost_regressor=gs_xgboost_regressor,
    gs_lgb_regressor=gs_lgb_regressor,
)


In [None]:
# Fit every registered grid search on the training split, announcing each one.
for name in all_grid_searchs_regresion:
    print(f"Training Grid Search de {name}...")
    all_grid_searchs_regresion[name].fit(X_train, y_train)

In [None]:
# Collect (search name, best cross-validated score) pairs. The scores are
# negative MSE values because of the scoring function passed to GridSearchCV.
best_score_each_gridsearch_regresion = []
for gs_name, fitted_search in all_grid_searchs_regresion.items():
    best_score_each_gridsearch_regresion.append((gs_name, fitted_search.best_score_))

best_score_each_gridsearch_regresion

In [None]:
# Tabulate the best score of every grid search, lowest error first.
df_best_score_each_gridsearch_regresion = pd.DataFrame(best_score_each_gridsearch_regresion,
                                                       columns=["GridSearchCV", "Best MSE"])
# The searches report negative MSE; take the absolute value to get the MSE.
# The columns must be read from the DataFrame: the similarly named
# `best_score_each_gridsearch_regresion` is a plain list of tuples and cannot
# be indexed by column name.
df_best_score_each_gridsearch_regresion["Best MSE"] = (
    df_best_score_each_gridsearch_regresion["Best MSE"].abs())
# np.sqrt is used because no bare `sqrt` is imported in this notebook.
df_best_score_each_gridsearch_regresion["Best RMSE"] = np.sqrt(
    df_best_score_each_gridsearch_regresion["Best MSE"])
df_best_score_each_gridsearch_regresion = (
    df_best_score_each_gridsearch_regresion.sort_values(by="Best MSE", ascending=True))
df_best_score_each_gridsearch_regresion

In [None]:
# Keep only the row(s) whose MSE equals the lowest MSE, renumbered from zero.
lowest_mse = min(df_best_score_each_gridsearch_regresion["Best MSE"])
is_lowest = df_best_score_each_gridsearch_regresion["Best MSE"] == lowest_mse
df_best_regressor = df_best_score_each_gridsearch_regresion[is_lowest].reset_index(drop=True)
df_best_regressor

In [None]:
# Name of the winning grid search: the "GridSearchCV" value of the first row.
regressor_model = df_best_regressor["GridSearchCV"].iloc[0]

In [None]:
# Look up the fitted grid-search object registered under the winning name.
best_regressor = all_grid_searchs_regresion[regressor_model]

In [None]:
# Take the estimator configured with the best parameters found by that search.
best_pipeline_regresion = best_regressor.best_estimator_

In [None]:
# Fit the selected estimator on the full training split.
# NOTE(review): GridSearchCV is created here without `refit`, and its default
# presumably already refits best_estimator_ on the data passed to fit(); if so,
# this extra fit is redundant - confirm before removing.
best_pipeline_regresion.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out split produced by train_test_split.
predictions_test = best_pipeline_regresion.predict(X_test)

In [None]:
# Root mean squared error of the selected model on the held-out split.
mse_model = mean_squared_error(y_test, predictions_test)
rmse_model = np.sqrt(mse_model)
print(rmse_model)

In [None]:
# Predict the target for the competition test set, using the same feature
# columns the model was trained on.
pred_model = best_pipeline_regresion.predict(df_test[input_columns])

In [None]:
# Build the submission table (test ids plus predicted target) and write it to
# a CSV file without the row index.
output = df_test[["id"]].assign(target=pred_model)
output.to_csv('Kaggle_Playground_Submission.csv', index=False)