In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

import lightgbm
import xgboost as xgb

from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# Load dataset

In [None]:
# Read train.csv of the Tabular Playground Series (Jan 2021) into `df`.
df = pd.read_csv("../input/tabular-playground-series-jan-2021/train.csv")
df.head()  # preview the first rows as the cell output

In [None]:
# Read test.csv of the same competition into `df_test`; it is used later to build the submission.
df_test = pd.read_csv("../input/tabular-playground-series-jan-2021/test.csv")
df_test.head()  # preview the first rows as the cell output

# Check for null values

In [None]:
# Count missing cells in BOTH frames: the models below are fit on `df`, so checking
# only `df_test` would leave missing values in the training data unnoticed.
pd.Series(
    {"train": df.isnull().sum().sum(), "test": df_test.isnull().sum().sum()},
    name="missing_values",
)

# High correlation

In [None]:
def high_correlation(df,threshold):
    """Print and return the column pairs whose correlation is at least ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. Only numeric and boolean columns take part in the
        correlation; other columns are ignored instead of raising.
    threshold : float
        Minimum correlation for a pair to be reported. The comparison is
        signed (``value >= threshold``), so strongly *negative* correlations
        are not reported.

    Returns
    -------
    list of tuple
        ``(column_i, column_j, correlation)`` for every reported pair, in
        lower-triangle order (each unordered pair appears once, never a column
        with itself).
    """
    corr = df.select_dtypes(include=["number", "bool"]).corr()
    corr_list = []
    # Visit only the strict lower triangle: the matrix is symmetric and the
    # diagonal is trivially 1.
    for i in range(len(corr)):
        for j in range(i):
            value = corr.iloc[i, j]
            if value >= threshold:  # NaN compares False and is skipped
                print(f"Correlation between {corr.columns[i]} and {corr.columns[j]} is {value * 100:.2f}")
                corr_list.append((corr.columns[i], corr.columns[j], float(value)))
    return corr_list

In [None]:
# Report every pair of training columns whose correlation is at least 0.7.
high_correlation(df,0.7) 

# Check for duplicates

In [None]:
# Number of rows that exactly repeat an earlier row of the training frame.
print(f"{df.duplicated().sum()} ligne(s) dupliquée(s)")

# Correlation matrix
Consider dropping one feature from each highly correlated pair.

In [None]:
# Pairwise correlation matrix of the training frame's columns.
corr = df.corr()

# Draw the matrix as a heatmap on a 20x20 figure.
plt.figure(figsize=(20,20))
x = sns.heatmap(corr)  # the return value is stored in `x` but not used in this cell
plt.show()

# Prepare data

In [None]:
def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=10, scoring_fit='neg_mean_squared_error',
                       do_probabilities = False):
    """Grid-search ``model`` over ``param_grid`` and predict on ``X_test_data``.

    Parameters
    ----------
    X_train_data, y_train_data
        Features and target the grid search is fitted on.
    X_test_data
        Features to predict once the search has been fitted.
    y_test_data
        Accepted but not used inside this function.
    model
        Estimator handed to ``GridSearchCV``.
    param_grid
        Parameter grid handed to ``GridSearchCV``.
    cv : int, default 10
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring_fit : str, default 'neg_mean_squared_error'
        Scoring forwarded to ``GridSearchCV``.
    do_probabilities : bool, default False
        When true, predict with ``predict_proba`` instead of ``predict``.

    Returns
    -------
    tuple
        ``(fitted search object, predictions for X_test_data)``.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring_fit,
        cv=cv,
        n_jobs=-1,   # forwarded as-is to GridSearchCV
        verbose=2,
    )
    fitted_model = search.fit(X_train_data, y_train_data)

    # Select the prediction method once, then apply it to the hold-out features.
    predict = fitted_model.predict_proba if do_probabilities else fitted_model.predict
    return fitted_model, predict(X_test_data)

In [None]:
# Target column to predict.
y = df['target']
# Feature frame: every column except the target and the 'id' column.
X = df.drop(['target','id'], axis=1)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=6311)

# Scale data

In [None]:
# Fit the scaler on the training split only, then apply the same fitted scaler to the
# hold-out split. X_train / X_test are rebound to the scaled outputs here, so every later
# cell works on scaled features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)        
X_test = scaler.transform (X_test)     

# LGBMRegressor

In [None]:
def model_lightgbm():
    """Fit LightGBM regressors, print hold-out errors and predict the test set.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train``, ``y_test``,
    ``scaler`` and ``df_test``. A default-parameter baseline is fitted and
    scored first, then a grid-searched model (via ``algorithm_pipeline``).

    Returns
    -------
    tuple
        ``(predictions for the scaled df_test features,
        predictions for X_test)``, both from the grid-searched model.
    """
    def show_errors(predicted):
        # Print MAE, MSE and RMSE of `predicted` against the hold-out target.
        mse = metrics.mean_squared_error(y_test, predicted)
        print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predicted))
        print('Mean Squared Error:', mse)
        print('Root Mean Squared Error:', np.sqrt(mse))

    # Baseline: default hyper-parameters, fitted directly without a grid search.
    baseline = lightgbm.LGBMRegressor(n_jobs=-1, random_state=6311)
    baseline.fit(X_train, y_train)
    baseline_pred = baseline.predict(X_test)
    print("Base test")
    show_errors(baseline_pred)

    # Single-candidate grid. A wider search can be run by listing more values per key,
    # e.g. num_leaves [7, 14, 21, 28, 31, 50], learning_rate [0.1, 0.03, 0.003],
    # max_depth [-1, 3, 5], n_estimators [50, 100, 200, 500].
    search_space = dict(
        learning_rate=[0.1],
        max_depth=[5],
        n_estimators=[500],
        num_leaves=[28],
    )
    candidate = lightgbm.LGBMRegressor(n_jobs=-1, random_state=6311)
    search, holdout_pred = algorithm_pipeline(
        X_train, X_test, y_train, y_test, candidate, search_space, cv=5
    )
    print(search.best_estimator_)
    print(search.best_params_)
    show_errors(holdout_pred)

    # Apply the already-fitted scaler to the test features before predicting.
    test_features = scaler.transform(df_test.drop('id', axis=1))
    return search.predict(test_features), holdout_pred


In [None]:
# Run the LightGBM workflow. `preds_lightgbm` holds the predictions for df_test, while
# `pred_test_lightgbm` holds the predictions for the hold-out split X_test.
preds_lightgbm, pred_test_lightgbm = model_lightgbm()

In [None]:
# RMSE of the grid-searched LightGBM predictions on the hold-out split.
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, pred_test_lightgbm)))

# XGBoost

In [None]:
def model_XGBoost():
    """Fit an XGBoost regressor, print hold-out errors and predict the test set.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train``, ``y_test``,
    ``scaler`` and ``df_test``. The model is fitted through
    ``algorithm_pipeline`` with 5-fold cross-validation.

    Returns
    -------
    tuple
        ``(predictions for the scaled df_test features,
        predictions for X_test)``.
    """
    # No separate baseline model is fitted in this function; only the banner is printed.
    print("Base test")

    model = xgb.XGBRegressor(n_estimators  = 400,max_depth=3, learning_rate=0.15)

    # An empty grid makes the search evaluate a single candidate: `model` exactly as
    # configured above. To tune, list candidate values here, e.g.
    # learning_rate [0.15, 0.20], max_depth [3, 4], min_child_weight [3, 5],
    # gamma [0.1, 0.2], colsample_bytree [0.3, 0.4].
    param_grid = {}

    model_xgb, pred = algorithm_pipeline(X_train, X_test, y_train, y_test, model, param_grid, cv=5)
    print(model_xgb.best_estimator_)
    print(model_xgb.best_params_)

    # Compute the MSE once and reuse it for the RMSE line.
    mse = metrics.mean_squared_error(y_test, pred)
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, pred))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))

    # Apply the already-fitted scaler to the test features before predicting.
    X_test_xgb = scaler.transform(df_test.drop('id', axis=1))
    return model_xgb.predict(X_test_xgb), pred

In [None]:
# Run the XGBoost workflow. `preds_xgb` holds the predictions for df_test, while
# `pred_test_xgb` holds the predictions for the hold-out split X_test.
preds_xgb, pred_test_xgb = model_XGBoost()

In [None]:
# Blend of the two models' hold-out predictions, weighting LightGBM 2:1 over XGBoost
# (two LightGBM terms plus one XGBoost term, divided by three).
# NOTE(review): confirm the 2:1 weighting is intended rather than a leftover third term.
new_preds = (2 * pred_test_lightgbm + pred_test_xgb) / 3
blend_rmse = np.sqrt(metrics.mean_squared_error(y_test, new_preds))
print('Root Mean Squared Error:', blend_rmse)

# Make submission

In [None]:
# Use the XGBoost predictions for df_test alone as the submission values.
# NOTE(review): this rebinds `new_preds`, which held the hold-out blend in the previous
# cell; the blend itself is never applied to the test predictions — confirm this is intended.
new_preds = preds_xgb

In [None]:
# Load the sample submission file and overwrite its 'target' column with the predictions.
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv — confirm.
submission = pd.read_csv("../input/tabular-playground-series-jan-2021/sample_submission.csv")
submission['target'] = new_preds
# Write the result to the current working directory without the index column.
submission.to_csv("submissio_l_x.csv", index=False)

In [None]:
# Show the submission frame as the cell output for a final visual check.
submission