In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Hello there!**  
  This is not only my first competition on Kaggle, but also my first off-course assignment. My goal is to test some methods that I've learned so far, and also to try data visualization techniques.  
  So I would be very glad to get some feedback 

# Importing Libraries

In [None]:
# Basics
import numpy as np 
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import squarify
import seaborn as sns

# Model stuff
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import accuracy_score, mean_squared_error

# Appearence
import warnings
warnings.filterwarnings('ignore')
from colorama import Fore

# Task Detail 

## Goal
For this competition, you will be predicting a **continuous target** based on a number of feature columns given in the data. All of the feature columns, **cat0 - cat9 are categorical**, and the feature columns **cont0 - cont13 are continuous**.  
  
The dataset used for this competition is synthetic, but based on a real dataset and generated using a CTGAN. The original dataset deals with predicting the amount of an insurance claim. Although the features are **anonymized**, they have properties relating to real-world features.  
  
  ## Metric
Submissions are scored on the root mean squared error. RMSE is defined as:
$$\text{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2}$$

where $\hat{y}_i$ is the predicted value, $y_i$ is the original value, and $n$ is the number of rows in the test data.

# First Look On The Data  
## Train.csv

In [None]:
# Load the training split of the competition data.
train_csv = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv')

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
train_csv.info(verbose=True)

In [None]:
# Preview the first rows of the training frame.
train_csv.head()

## Test.csv

In [None]:
# Load the test split of the competition data.
test_csv = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv')

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
test_csv.info(verbose=True)

As we see there is no null data and all continuous features are normalized

# Data Visualization 📊

## Target  
We can try to visualize target data with boxplot. That would show us such characteristics as percentiles, min (max) values and the set of extreme values 

In [None]:
# Keep a handle on the target column; the outlier cells below reuse it.
target_data = train_csv['target']

# Draw the box plot on an explicitly created 9x9-inch axes.
target_fig, target_ax = plt.subplots(figsize=(9, 9))
target_ax.set_title('Target Boxplot', size=14)
sns.boxplot(data=target_data, color='royalblue', ax=target_ax)

As we can see, there are some extreme points in our dataset. These points may correspond to reality, in the case of a *large number of them*.  
But if there are only few of them, it would be better to **get rid of them**, because the submissions are scored on the root mean squared error, which is very sensitive to outliers in the data

In [None]:
"""the max value = 10.3, but 25% quartile is ~6.8 so we have a few count of outliers in our data"""
# Summary statistics of the target column (count, mean, std, min, quartiles, max).
target_data.describe()

**Let's count how many outliers there are**

In [None]:
# plt.boxplot returns a dict of the artists it drew; 'whiskers' holds the lower
# and the upper whisker line, and the second y-value of each line is read as
# the whisker end. The plot is built once and both ends are taken from it
# (previously the identical box plot was computed and drawn twice).
whisker_lines = plt.boxplot(target_data)['whiskers']

# 0.1 is the "tolerance" in case the test data will be shifted

# lower boundary
lower_whisker = whisker_lines[0].get_ydata()[1]
print((target_data <= (lower_whisker - 0.1)).sum(), 'value(s) is(are) below lower whisker')

# higher boundary
higher_whisker = whisker_lines[1].get_ydata()[1]
print((target_data >= (higher_whisker + 0.1)).sum(), 'value(s) is(are) above the upper whisker')

**258** values are only **0.086%** of all data, so we can delete these rows in our DataFrame

In [None]:
# Flag every row whose target lies beyond a whisker by more than the 0.1
# tolerance. The mask is computed from the current train_csv, so running this
# cell a second time finds nothing left to drop instead of failing.
#
# The previous version located each outlier with list(target_data).index(value),
# which returns only the FIRST position holding that value: rows sharing an
# outlier value with an earlier row were never dropped, and every lookup
# rescanned the whole column.
target_values = train_csv['target']
outlier_mask = (target_values <= (lower_whisker - 0.1)) | (target_values >= (higher_whisker + 0.1))

# Index labels of the rows to delete.
index_list = train_csv.index[outlier_mask].tolist()

# Rows deletion: rebinding the name (rather than inplace=True) keeps the cell idempotent.
train_csv = train_csv.drop(index=index_list)

In [None]:
# Box plot of the target column as it currently stands in train_csv.
trimmed_fig, trimmed_ax = plt.subplots(figsize=(9, 9))
trimmed_ax.set_title('Target Boxplot', size=14)
sns.boxplot(data=train_csv['target'], color='royalblue', ax=trimmed_ax)

## Categorical  
Let's create a treemap of our categorical data

In [None]:
def treemap(DataFrame = train_csv):
    """Draw one treemap per categorical column on a 3x3 grid of axes.

    Each tile is one category of the column; its area and its label show how
    many rows of ``DataFrame`` fall into that category.

    Parameters
    ----------
    DataFrame : pandas.DataFrame
        Frame whose object-dtype columns are named ``cat0``, ``cat1``, ...
        Defaults to ``train_csv`` as it was bound when this cell was run.

    Notes
    -----
    The number of panels is derived from the frame that was passed in (the
    earlier version always inspected the global ``train_csv``) and is capped
    at the nine axes of the grid, so ``cat0`` .. ``cat8`` are drawn for a
    frame with ten categorical columns. Axes left without a treemap are hidden.
    """
    f, axes = plt.subplots(nrows = 3, ncols= 3, figsize = (24,16))
    flat_axes = axes.flatten()

    n_cat_cols = DataFrame.select_dtypes(include='object').shape[1]
    n_plots = min(n_cat_cols, len(flat_axes))

    for i in range(n_plots):
        col = 'cat{}'.format(i)
        ax = flat_axes[i]

        # Set labels & size for treemap: one row per category with its row count
        counts = DataFrame.groupby(col).size().reset_index(name = 'counts')
        labels = ['{}\n ({})'.format(category, count)
                  for category, count in zip(counts[col], counts['counts'])]
        size = counts['counts'].values

        # Spread the tiles evenly over the coolwarm colour map
        colors = [plt.cm.coolwarm(k / float(len(labels))) for k in range(len(labels))]
        squarify.plot(sizes=size, label=labels, color = colors, alpha = 0.8, ax = ax)

        # Decorate
        ax.set_title('Treemap of Cat{}'.format(i))
        ax.axis('off')

    # Hide any axes that did not receive a treemap
    for ax in flat_axes[n_plots:]:
        ax.axis('off')

In [None]:
# Draw the treemaps for the default frame (train_csv).
treemap()

In [None]:
# Count plot of cat6 in the training data; the title is set on the grid's own axes.
train_cat6_grid = sns.catplot(x="cat6", kind="count", palette="coolwarm", data=train_csv)
train_cat6_grid.ax.set_title('cat6 in train', size=13.5)

# Same plot for the test data, for a side-by-side comparison.
test_cat6_grid = sns.catplot(x="cat6", kind="count", palette="coolwarm", data=test_csv)
test_cat6_grid.ax.set_title('cat6 in test', size=13.5)

As we can see, there is not much complexity in the categorical features. We have  
* 5 columns, where one value is dominating others
* 3 columns, where two values are dominating others  
* 1 column with wide variability  
  
**Another point to notice is that, for some categorical features (e.g. `cat6`), the** `number of distinct categories in test != number of distinct categories in train` 

## Continuous

In [None]:
# Select cont data only (every float64 / int64 column of train_csv)
cont_data = train_csv.select_dtypes(include =['float64', 'int64'])

num_rows, num_cols = 4, 4
f, axes = plt.subplots(nrows = num_rows, ncols= num_cols, figsize = (24,16))

for i, col_name in enumerate(cont_data):
    # Row-major placement: the row index must be derived from the number of
    # COLUMNS. The earlier `i // num_rows` was only right because the grid
    # happens to be square.
    ax = axes[i // num_cols, i % num_cols]
    sns.kdeplot(cont_data[col_name], fill=True, color = 'royalblue',
                   alpha=.5, linewidth=0, ax = ax)

# Remove the first and the last panel.
# NOTE(review): presumably these are the 'id' and 'target' panels — confirm column order.
f.delaxes(axes[0, 0])
f.delaxes(axes[3, 3])
plt.tight_layout()
plt.show()

## Correlation between features

In [None]:
# plot heatmap
plt.figure(figsize=(15,15))

# Compute the correlation matrix once and reuse it for the mask and the plot
# (it was previously computed twice).
corr_matrix = cont_data.corr()

# Non-zero entries of the upper triangle (diagonal included) are masked out,
# so each pair of columns is annotated once.
mask = np.triu(corr_matrix)

# NOTE(review): vmin=0 puts every negative correlation at the bottom of the
# colour scale, same as 0 — confirm this is intended.
sns.heatmap(corr_matrix, annot=True, mask = mask, fmt=".2f", cmap='coolwarm',
            cbar_kws={"shrink": .8}, vmin=0, vmax=1)
# yticks
plt.yticks(rotation=0)
plt.show()

As we can see, there is no high correlation between variables

# Feature Modify 🧰

In [None]:
def modify_df(df):
    """Collapse rare categories and add pairwise product features.

    For ``cat4`` .. ``cat8`` every value outside that column's keep-list is
    replaced by ``'Z'``. Seven columns ``cont001`` .. ``cont007`` are then
    added, each the element-wise product of two existing ``cont*`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``cat4``-``cat8`` columns and the ``cont*``
        columns used below. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Categories to keep per column; everything else becomes 'Z'.
    kept_levels = {
        'cat4': ['B'],
        'cat5': ['B', 'D'],
        'cat6': ['A'],
        'cat7': ['E', 'D'],
        'cat8': ['E', 'C', 'G', 'A'],
    }
    for column, levels in kept_levels.items():
        df[column] = df[column].where(df[column].isin(levels), 'Z')

    # (new column, left factor, right factor) — order fixes the column order.
    products = (
        ('cont001', 'cont8', 'cont0'),
        ('cont002', 'cont9', 'cont0'),
        ('cont003', 'cont9', 'cont5'),
        ('cont004', 'cont8', 'cont5'),
        ('cont005', 'cont2', 'cont4'),
        ('cont006', 'cont1', 'cont3'),
        ('cont007', 'cont13', 'cont1'),
    )
    for new_column, left, right in products:
        df[new_column] = df[left] * df[right]

    return df

# modify_df changes the frame it receives, so it is given copies;
# train_csv and test_csv themselves stay as they are.
mod_train_csv = modify_df(train_csv.copy())
mod_test_csv = modify_df(test_csv.copy())

# Re-draw the treemaps for the modified training frame.
treemap(DataFrame = mod_train_csv)

## Train

In [None]:
# Feature columns: every column except the row id and the target.
feature_cols = mod_train_csv.columns.drop(['id', 'target'])
train_features = mod_train_csv[feature_cols]

# Split the features by dtype: object columns vs. everything else.
X_cat = train_features.select_dtypes(include='object')
X_cont = train_features.select_dtypes(exclude='object')
y = mod_train_csv['target']

## Test

In [None]:
# Feature columns of the test frame: every column except the row id.
feature_cols = mod_test_csv.columns.drop(['id'])
test_features = mod_test_csv[feature_cols]

# Split the features by dtype: object columns vs. everything else.
X_cat_TEST = test_features.select_dtypes(include='object')
X_cont_TEST = test_features.select_dtypes(exclude='object')

# One-Hot-Encoding 📟

## Train

In [None]:
# One-hot encode the categorical columns. DictVectorizer takes one
# {column: value} mapping per row; to_dict(orient='records') builds exactly
# those mappings without first transposing the frame (X_cat.T creates one
# column per row, which is very slow on a long frame).
encoder = DV(sparse = False)
X_cat_oh = encoder.fit_transform(X_cat.to_dict(orient='records'))

# Merging data: numeric columns first, then the one-hot columns.
X = np.hstack((X_cont, X_cat_oh))

## Test

In [None]:
# Encode the test categories with the encoder that was already fitted on the
# training data. Calling fit_transform here re-learned the vocabulary from the
# test set, so the one-hot columns could come out in a different order or
# number than the columns the models were trained on. transform() keeps the
# training layout and ignores categories the encoder has not seen.
X_cat_oh_TEST = encoder.transform(X_cat_TEST.to_dict(orient='records'))

# Merging data in the same order as the training matrix: numeric, then one-hot.
X_TEST = np.hstack((X_cont_TEST, X_cat_oh_TEST))

# Train Test Split 🪓
For checking our models we have to split our data 

In [None]:
# Hold out 20% of the rows for validation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Linear Model 📉

In [None]:
# Import library
from sklearn import linear_model

# Base estimator: linear regression trained with stochastic gradient descent.
estimator1 = linear_model.SGDRegressor(random_state=42)

# Grid search space: three penalties x five evenly spaced alpha values.
parameters_grid = dict(
    penalty=['l1', 'l2', 'elasticnet'],
    alpha=np.linspace(0.0001, 0.001, num=5),
)

# Five random 80/20 splits for cross-validation.
cv = model_selection.ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

grid_cv_LM = model_selection.GridSearchCV(
    estimator=estimator1,
    param_grid=parameters_grid,
    scoring='neg_root_mean_squared_error',
    cv=cv,
)

In [None]:
%%time
grid_cv_LM.fit(X_train, y_train)

predictions_LM = grid_cv_LM.best_estimator_.predict(X_test)

score_rmse_LM = (mean_squared_error(y_test, predictions_LM))**0.5
print(Fore.GREEN + 'Base Linear SGDRegressor RMSE: {}'.format(score_rmse_LM))

# Random Forest 🌳

In [None]:
# Import library
from sklearn.ensemble import RandomForestRegressor

# Random forest with the library defaults, apart from n_jobs=-1 (use all
# available cores) and a fixed random_state for repeatable results.
estimator_RF = RandomForestRegressor(n_jobs=-1, random_state=42)

In [None]:
%%time
estimator_RF.fit(X_train, y_train)

# Test
predictions_RF = estimator_RF.predict(X_test)

score_rmse_RF = (mean_squared_error(y_test, predictions_RF))**0.5
print(Fore.GREEN + 'RandomForest RMSE: {}'.format(score_rmse_RF))

# XGBoost 🏃🏻

In [None]:
%%time
from xgboost import XGBRegressor
# Baseline XGBoost regressor: only the seed and the tree construction method are set.
# NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled session — confirm.
XGB_default = XGBRegressor(random_state=42, tree_method='gpu_hist');

# The trailing semicolon suppresses the fitted-model repr in the cell output.
XGB_default.fit(X_train, y_train);

In [None]:
# Hold-out RMSE of the baseline XGBoost model.
predictions_XGB = XGB_default.predict(X_test)

mse_XGB = mean_squared_error(y_test, predictions_XGB)
score_rmse_XGB = mse_XGB ** 0.5
print(Fore.GREEN + f'Base XGBoost RMSE: {score_rmse_XGB}')

# XGBoost Parameter Tuning 🚵🏻

In [None]:
# Best params found for the tuned XGBoost model.
# 'objective' used to be listed twice with the same value; a dict literal keeps
# only the last occurrence of a repeated key, so the duplicate entry is removed.
xgb_params = {
    'booster':'gbtree',
    'n_estimators':10000,
    'max_depth':7, 
    'eta':0.01,
    'gamma':1.8,
    'objective':'reg:squarederror',
    'verbosity':0,
    'subsample':0.85,
    'colsample_bytree':0.4,
    'lambda':2.7,
    'alpha':6,
    'scale_pos_weight':1,
    'eval_metric':'rmse',
    'seed': 42,
    'tree_method':'gpu_hist',
    'gpu_id':0
}

In [None]:
%%time

# Build the XGBoost regressor from xgb_params and fit it on the training split.
# The trailing semicolons suppress the repr in the cell output.
XGB_tune = XGBRegressor(**xgb_params);
XGB_tune.fit(X_train, y_train);

In [None]:
# Predict on the hold-out split with the tuned model. This overwrites the
# predictions_XGB / score_rmse_XGB values produced by the baseline XGBoost model.
predictions_XGB = XGB_tune.predict(X_test)

mse_XGB_tuned = mean_squared_error(y_test, predictions_XGB)
score_rmse_XGB = mse_XGB_tuned ** 0.5
print(Fore.GREEN + f'Tune XGB RMSE: {score_rmse_XGB}')

# kNN 👯‍

In [None]:
# import library
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with the library's default settings,
# fitted on the training split.
estimator_kNN = KNeighborsRegressor()
estimator_kNN.fit(X_train, y_train)

In [None]:
# Hold-out RMSE of the kNN model.
predictions_kNN = estimator_kNN.predict(X_test)

mse_kNN = mean_squared_error(y_test, predictions_kNN)
score_rmse_kNN = mse_kNN ** 0.5
print(Fore.GREEN + f'Base kNN RMSE: {score_rmse_kNN}')

# LGBM
I want to try an LGBM model with the optimized params that I found at https://www.kaggle.com/andreshg/tps-feb-a-complete-study#5.-Optimized-LGBM-CrossValidated-%F0%9F%A7%AE

In [None]:
from lightgbm import LGBMRegressor
import lightgbm as lgb

# Hyper-parameters for the LightGBM regressor (3000 boosting rounds, GPU device).
best_params = dict(
    reg_lambda=0.015979956459638782,
    reg_alpha=9.103977313355028,
    colsample_bytree=0.3,
    subsample=1.0,
    learning_rate=0.009,
    n_estimators=3000,
    max_depth=15,
    min_child_samples=142,
    num_leaves=84,
    random_state=42,
    device='gpu',
)

# Build the model from those parameters and fit it on the training split.
estimator_LGBM = LGBMRegressor(**best_params)
estimator_LGBM.fit(X_train, y_train)

# Score the fitted model on the hold-out split.
predictions_LGBM = estimator_LGBM.predict(X_test)

mse_LGBM = mean_squared_error(y_test, predictions_LGBM)
score_rmse_LGBM = mse_LGBM ** 0.5
print(Fore.GREEN + f'Tuned LGBM RMSE: {score_rmse_LGBM}')

In [None]:
# Hold-out RMSE of every model tried above (lower is better).
results = {'score': [score_rmse_LM, score_rmse_RF, score_rmse_XGB, score_rmse_kNN, score_rmse_LGBM], 'model': ['Linear', 'RF', 'XGB', 'kNN', 'LGBM']}
print(Fore.WHITE + 'Results:\n')

# Pair each model with its score directly instead of indexing with a
# hard-coded range(5), so the loop follows the lists if a model is added.
for model_name, model_score in zip(results['model'], results['score']):
    print(Fore.GREEN + model_name, '\t', model_score, '\n')

# Name the winner from the scores; the message used to hard-code "LGBM" and
# would have been wrong whenever another model scored a lower RMSE.
best_score, best_model = min(zip(results['score'], results['model']))
print(Fore.WHITE + '{} is the best model'.format(best_model))

# Train on full data & Submit 🔩

In [None]:
# Refit the LGBM model on the full training matrix (X, y) rather than the
# train split only, then predict the competition test matrix.
estimator_LGBM.fit(X, y)

predictions = estimator_LGBM.predict(X_TEST)

In [None]:
# Refit the tuned XGBoost model on the full training matrix (X, y), then
# predict the competition test matrix.
XGB_tune.fit(X, y)

predictions2 = XGB_tune.predict(X_TEST)

# Prepare Submission File 📝

In [None]:
# Submission frame: test row ids next to the predictions held in `predictions`.
my_submission = pd.DataFrame({'id': test_csv['id'], 'target': predictions})
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Submission frame for the predictions held in `predictions2`.
my_submission2 = pd.DataFrame({'id': test_csv.id, 'target': predictions2})

# Written to its own file: this cell used to write 'submission.csv' as well,
# which silently overwrote the file saved by the previous cell. The name
# follows the submission3.csv / submission4.csv files written further down.
my_submission2.to_csv('submission2.csv', index=False)

# Another way to Feature Modify

In [None]:
# Keep the listed cat6 values as they are and relabel every other value as 'A'.
cat6_kept_levels = ['A', 'B', 'C', 'D', 'E', 'I', 'H']
train_csv['cat6'] = train_csv['cat6'].where(train_csv['cat6'].isin(cat6_kept_levels), 'A')

# Count plot of the regrouped column; the title is set on the grid's own axes.
regrouped_cat6_grid = sns.catplot(x="cat6", kind="count", palette="coolwarm", data=train_csv)
regrouped_cat6_grid.ax.set_title('cat6 in train', size=13.5)

## Train

In [None]:
# Feature columns: every column except the row id and the target.
feature_cols = train_csv.columns.drop(['id', 'target'])
train_features = train_csv[feature_cols]

# Split the features by dtype: object columns vs. everything else.
X_cat = train_features.select_dtypes(include='object')
X_cont = train_features.select_dtypes(exclude='object')
y = train_csv['target']

## Test

In [None]:
# Feature columns of the test frame: every column except the row id.
feature_cols = test_csv.columns.drop(['id'])
test_features = test_csv[feature_cols]

# Split the features by dtype: object columns vs. everything else.
X_cat_TEST = test_features.select_dtypes(include='object')
X_cont_TEST = test_features.select_dtypes(exclude='object')

## Train O-H-E

In [None]:
# One-hot encode the categorical columns. DictVectorizer takes one
# {column: value} mapping per row; to_dict(orient='records') builds exactly
# those mappings without first transposing the frame (X_cat.T creates one
# column per row, which is very slow on a long frame).
encoder = DV(sparse = False)
X_cat_oh = encoder.fit_transform(X_cat.to_dict(orient='records'))

# Merging data: numeric columns first, then the one-hot columns.
X = np.hstack((X_cont, X_cat_oh))

## Test O-H-E

In [None]:
# Encode the test categories with the encoder that was already fitted on the
# training data. Calling fit_transform here re-learned the vocabulary from the
# test set, so the one-hot columns could come out in a different order or
# number than the columns the models were trained on. transform() keeps the
# training layout and ignores categories the encoder has not seen.
X_cat_oh_TEST = encoder.transform(X_cat_TEST.to_dict(orient='records'))

# Merging data in the same order as the training matrix: numeric, then one-hot.
X_TEST = np.hstack((X_cont_TEST, X_cat_oh_TEST))

## Train test split

In [None]:
# Hold out 20% of the rows for validation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

## XGB

In [None]:
# Best params for the tuned XGBoost model.
# 'objective' used to be listed twice with the same value; a dict literal keeps
# only the last occurrence of a repeated key, so the duplicate entry is removed.
xgb_params = {
    'booster':'gbtree',
    'n_estimators':10000,
    'max_depth':7, 
    'eta':0.01,
    'gamma':1.8,
    'objective':'reg:squarederror',
    'verbosity':0,
    'subsample':0.85,
    'colsample_bytree':0.4,
    'lambda':2.7,
    'alpha':6,
    'scale_pos_weight':1,
    'eval_metric':'rmse',
    'seed': 42,
    'tree_method':'gpu_hist',
    'gpu_id':0
}

# Fit a fresh model on the current training split.
XGB_tune1 = XGBRegressor(**xgb_params)
XGB_tune1.fit(X_train, y_train)

# Predict on the hold-out split and report its RMSE.
predictions_XGB1 = XGB_tune1.predict(X_test)

score_rmse_XGB1 = (mean_squared_error(y_test, predictions_XGB1))**0.5
print(Fore.GREEN + 'Tune XGB RMSE: {}'.format(score_rmse_XGB1))

## LGBM

In [None]:
# Hyper-parameters for the LightGBM regressor (3000 boosting rounds, GPU device).
best_params = dict(
    reg_lambda=0.015979956459638782,
    reg_alpha=9.103977313355028,
    colsample_bytree=0.3,
    subsample=1.0,
    learning_rate=0.009,
    n_estimators=3000,
    max_depth=15,
    min_child_samples=142,
    num_leaves=84,
    random_state=42,
    device='gpu',
)

# Build the model from those parameters and fit it on the training split.
estimator_LGBM1 = LGBMRegressor(**best_params)
estimator_LGBM1.fit(X_train, y_train)

# Score the fitted model on the hold-out split.
predictions_LGBM1 = estimator_LGBM1.predict(X_test)

mse_LGBM1 = mean_squared_error(y_test, predictions_LGBM1)
score_rmse_LGBM1 = mse_LGBM1 ** 0.5
print(Fore.GREEN + f'Tuned LGBM RMSE: {score_rmse_LGBM1}')

In [None]:
# Refit on the full training matrix (X, y), then predict the competition test matrix.
estimator_LGBM1.fit(X, y)

predictions3 = estimator_LGBM1.predict(X_TEST)

In [None]:
# Submission frame for the predictions held in `predictions3`, saved to its own file.
my_submission3 = pd.DataFrame({'id': test_csv['id'], 'target': predictions3})
my_submission3.to_csv('submission3.csv', index=False)

In [None]:
# Display the submission frame as the cell output.
my_submission3

In [None]:
# Fit the same estimator again, this time on the training split only
# (X_train, y_train) instead of the full (X, y).
# NOTE(review): presumably done to compare a split-only fit against the full fit — confirm.
estimator_LGBM1.fit(X_train, y_train)

In [None]:
# Predict the competition test matrix with the estimator in its current fitted state.
predictions4 = estimator_LGBM1.predict(X_TEST)

In [None]:
# Submission frame for the predictions held in `predictions4`, saved to its own file.
my_submission4 = pd.DataFrame({'id': test_csv['id'], 'target': predictions4})
my_submission4.to_csv('submission4.csv', index=False)