## Importing Libraries 📗

In [None]:
import os
import gc
import sys
import time
import pickle
import random
import numpy as np
import pandas as pd
import datatable as dt
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
 
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from sklearn.manifold import TSNE

from colorama import Fore, Back, Style
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings('ignore')

import optuna
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoost, Pool

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import (StandardScaler,RobustScaler ,
                                   PowerTransformer,KBinsDiscretizer,
                                   QuantileTransformer ,LabelEncoder, 
                                   OneHotEncoder,OrdinalEncoder)

## Loading Data 💽

In [None]:
# Directory that holds the competition CSV files.
folder_path = '../input/tabular-playground-series-feb-2021'

# Build the three file locations first, then read each one into a DataFrame.
train_csv = f'{folder_path}/train.csv'
test_csv = f'{folder_path}/test.csv'
sample_csv = f'{folder_path}/sample_submission.csv'

train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)
sample = pd.read_csv(sample_csv)

In [None]:
# Display the first five rows of the training frame.
train_data.head(n=5)

## Data Preprocessing

In [None]:
# Column groups: categorical columns come first in the model matrix,
# followed by the continuous ones.
cat_features = [f'cat{i}' for i in range(10)]
# cat_features = ['cat1','cat3','cat5','cat8','cat9']
cont_features = [f'cont{i}' for i in range(14)]
all_features = cat_features + cont_features
target_feature = 'target'

# Discretise the continuous target into 1 + log2(n_rows) equal-width bins;
# the integer bin labels are what the fold splitters stratify on.
num_bins = int(1 + np.log2(len(train_data)))
train_data['bins'] = pd.cut(
    train_data['target'].to_numpy(),
    bins=num_bins,
    labels=False,
)
bins = train_data['bins'].to_numpy()

# From here on the frames are replaced by plain arrays in `all_features` order.
target = train_data[target_feature].to_numpy()
train_data = train_data[all_features].to_numpy()
test_data = test_data[all_features].to_numpy()

# Ordinal-encode the leading categorical columns and quantile-transform the
# trailing continuous columns.
# NOTE(review): the first step is named 'onehot' but wraps an OrdinalEncoder.
n_cat = len(cat_features)
n_all = n_cat + len(cont_features)
ct = ColumnTransformer([
    ('onehot', OrdinalEncoder(), slice(n_cat)),
    ('at', QuantileTransformer(), slice(n_cat, n_all)),
])

# Fit on the training rows only, then apply the same mapping to the test rows.
train_data = ct.fit_transform(train_data)
test_data = ct.transform(test_data)

In [None]:
def rmse_score(y_true,y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    The mean squared error is computed by scikit-learn's
    ``mean_squared_error``; this function takes its square root.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

## LGBM Model

In [None]:
# Cross-validation settings shared by the model cells below.
nfolds = 5
seed = 0

# LightGBM training settings.
params = dict(
    reg_alpha=6.147694913504962,
    reg_lambda=0.002457826062076097,
    colsample_bytree=0.3,
    subsample=0.8,
    learning_rate=0.001,
    max_depth=20,
    num_leaves=111,
    min_child_samples=285,
    # Column positions 0 .. len(cat_features) - 1 are flagged as categorical.
    categorical_features=list(range(len(cat_features))),
    random_state=48,
    verbose=-1,
    n_estimators=10000,
    metric='rmse',
    cat_smooth=39,
)

In [None]:
# Running average of the per-fold LightGBM predictions on the test rows.
lgbm_preds = np.zeros(test_data.shape[0])

# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn releases raise a ValueError when a seed is given without it.
kfold = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed)
lgbm_scores = list()
for k, (train_idx, valid_idx) in enumerate(kfold.split(X=train_data, y=bins)):
    # Folds are stratified on the binned target; the models fit the raw target.
    lgb_train = lgb.Dataset(train_data[train_idx], target[train_idx])
    lgb_valid = lgb.Dataset(train_data[valid_idx], target[valid_idx], reference=lgb_train)

    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases; newer ones expect callbacks instead.
    # Confirm against the installed version before upgrading.
    lgb_model = lgb.train(params,
                      lgb_train,
                      valid_sets=[lgb_train, lgb_valid],
                      verbose_eval=0,
                      early_stopping_rounds=800,
                      )

    # Score the held-out fold and add this fold's share of the test prediction.
    rmse = rmse_score(target[valid_idx], lgb_model.predict(train_data[valid_idx]))
    print(f"fold {k}: rmse:{rmse}")
    lgbm_scores.append(rmse)
    lgbm_preds += lgb_model.predict(test_data)/nfolds

print("mean rmse score",np.mean(lgbm_scores))

In [None]:
def plot_feature_importance(model,features):
    """Plot gain-based and split-based feature importances as two stacked bar charts.

    Args:
        model: Object exposing ``feature_importance(importance_type=...)``.
        features: Feature names, one per importance value returned by the model.

    The top panel shows ``'gain'`` importances and the bottom panel ``'split'``
    importances, each sorted in descending order with the value printed next
    to its bar. Nothing is returned; the figure is drawn with matplotlib.
    """
    def _ranked(kind):
        # Importance table for one importance type, largest value first.
        table = pd.DataFrame({
            "feature": features,
            "importance": model.feature_importance(importance_type=kind),
        })
        return table.sort_values(by='importance', ascending=False)

    def _draw(position, table):
        # One horizontal bar chart with each bar's value written beside it.
        plt.subplot(position)
        sns.barplot(data=table, x='importance', y='feature')
        for row, value in enumerate(table.importance):
            plt.text(value, row, "  {:.2e}".format(value))

    by_gain = _ranked('gain')
    plt.figure(figsize=(10, 10))
    _draw(211, by_gain)
    _draw(212, _ranked('split'))

In [None]:
# Importances of the LightGBM model left over from the last trained fold.
plot_feature_importance(model=lgb_model, features=all_features)

## CatBoost Model

In [None]:
# CatBoost training settings (rebinds `params`, replacing the LightGBM ones).
params = dict(
    l2_leaf_reg=0.02247766515106271,
    max_bin=364,
    subsample=0.6708650091202213,
    learning_rate=0.0010290546311954876,
    max_depth=10,
    verbose=0,
    random_state=seed,
    min_data_in_leaf=300,
    loss_function='RMSE',
    # Very large cap on the number of trees; early_stopping_rounds below is
    # what is expected to end training.
    n_estimators=1600000,
    rsm=0.5,
    early_stopping_rounds=800,
)

In [None]:
# Running average of the per-fold CatBoost predictions on the test rows.
cat_preds = np.zeros(test_data.shape[0])

# shuffle=True is needed for random_state to have any effect; recent
# scikit-learn releases raise a ValueError when a seed is given without it.
kfold = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed)
cat_scores = list()
# enumerate() supplies the fold index used in the log line below; without it
# `k` would be a stale value from an earlier cell, or undefined.
for k, (train_idx, valid_idx) in enumerate(kfold.split(X=train_data, y=bins)):
    # Folds are stratified on the binned target; the models fit the raw target.
    cat_train = Pool(train_data[train_idx], target[train_idx])
    cat_valid = Pool(train_data[valid_idx], target[valid_idx])

    # The validation pool is passed as eval_set so the early-stopping setting
    # in `params` has a metric to monitor.
    cat_model = CatBoost(params)
    cat_model.fit(cat_train, eval_set=cat_valid)

    # Score the held-out fold and add this fold's share of the test prediction.
    score = rmse_score(target[valid_idx], cat_model.predict(train_data[valid_idx]))
    print(f"fold: {k}, score: {score}")
    cat_scores.append(score)
    cat_preds += cat_model.predict(test_data)/nfolds

print('mean rmse score:',np.mean(cat_scores))

## Correlation Matrix of Model Predictions

In [None]:
# One column of test predictions per model.
predictions = pd.DataFrame(data={"lgbm": lgbm_preds, 'catboost': cat_preds})

# Annotated heatmap of the pairwise correlation between the two columns.
plt.figure(figsize=(7, 7))
sns.heatmap(data=predictions.corr(), annot=True);

## Submission

In [None]:
# Blend the two models with equal weight. Bracket assignment always writes a
# DataFrame column, whereas attribute assignment only does so when the column
# already exists and otherwise sets a plain Python attribute.
sample['target'] = (lgbm_preds.ravel() + cat_preds.ravel())/2
sample.to_csv("submission.csv",index=False)
sample.head()

In [None]:
# Three side-by-side panels comparing predicted and training target values.
plt.figure(figsize=(15,7))

# Left: distribution of the predictions stored in sample.target.
ax = plt.subplot(131)
sns.distplot(sample.target, ax=ax)
ax.set_title("test-target distribution")

# Middle: distribution of the training target.
ax = plt.subplot(132)
sns.distplot(target, ax=ax)
ax.set_title("train-target distribution")

# Right: both distributions overlaid on the same axes.
ax = plt.subplot(133)
sns.distplot(sample.target.to_numpy(), label='test', ax=ax)
sns.distplot(target, label='target', ax=ax)
ax.legend()
ax.set_title("train and test target distribution");