# Importing Libraries 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

import pandas_profiling
%matplotlib inline

import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

import sklearn.metrics as metrics

import warnings
warnings.filterwarnings("ignore")

# Loading Data


In [None]:
# Print the full path of every file found under the Kaggle input directory
import os

for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
%%time
train = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/test.csv')
sample_sub = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv')
org_len = len(train.drop('id',axis=1))

# Understanding Data

In [None]:
# Preview the first rows of the training frame
train.head()

In [None]:
# Preview the first rows of the test frame
test.head()

In [None]:
# For each split print a header, its shape and how many columns there are per dtype;
# a row of asterisks separates the two reports.
for position, (header, frame) in enumerate((("Train data set dtypes: \n", train),
                                            ("Test data set dtypes: \n", test))):
    if position > 0:
        print('*'*30)
    print(header)
    print(f"Shape : {frame.shape}")
    print(f"{frame.dtypes.value_counts()}")

In [None]:
# Summary statistics for every column: count of non-null observations, mean,
# standard deviation, minimum, quartiles and maximum
train.describe()

### No Missing Values
**As we can see above, the count of non-null values equals the number of rows (300000) for every column.**

# Pandas Profiling 🐼 
**pandas_profiling extends the pandas DataFrame for quick data analysis.**

In [None]:
# Build a profiling report for the training frame.
# NOTE(review): minimal=True presumably selects pandas_profiling's reduced configuration — confirm against its docs.
profile = pandas_profiling.ProfileReport(train,minimal=True)
# Write the report to output.html
profile.to_file(output_file="output.html")
# Last expression of the cell, so the report object is shown as the cell output
profile

# Univariate Analysis
**For continuous values we use distribution plots to visualize how the data is distributed.**

In [None]:
# Working copy of the training data without the 'id' column (the target column is kept)
train_cont = train.drop('id',axis=1)

In [None]:
# One histogram per column of train_cont, placed on a 5x3 grid of panels
fig = plt.figure(figsize=(18, 16))

for slot, column in enumerate(train_cont.columns, start=1):
    panel = fig.add_subplot(5, 3, slot)
    sns.distplot(train_cont[column], kde=False, ax=panel)

fig.tight_layout(pad=1.0)

In [None]:
    # For every column draw a box plot, a violin plot and a strip plot side by side
    for column in train_cont.columns:
        fig, (box_ax, violin_ax, strip_ax) = plt.subplots(1, 3, figsize=(16, 5))

        sns.boxplot(y=column, data=train_cont, ax=box_ax)
        box_ax.set_title('Box Plot')

        sns.violinplot(y=column, data=train_cont, ax=violin_ax)
        violin_ax.set_title('Violin Plot')

        sns.stripplot(y=column, data=train_cont, size=4, color=".3", linewidth=0, ax=strip_ax)
        strip_ax.set_title('Strip Plot')

        # Column name as the figure heading, lifted above the panel titles
        fig.suptitle(column, fontsize=15, y=1.1)

        plt.tight_layout()
        plt.show()

**A few outliers in target, cont10, cont9 and cont7** <br>
**cont2 has gaps at regular intervals**<br>
**cont5 is dominated by low values**

# Bi-variate Analysis

Scatterplot with the target

In [None]:
# Scatter every column against the target on a 5x3 grid of panels.
# The plot uses its own frame instead of reassigning `train_cont`, so re-running
# this cell can never overwrite features or filtering applied to `train_cont`
# by other cells.
fig = plt.figure(figsize=(18,16))
scatter_data = train.drop('id',axis=1)
for index, col in enumerate(scatter_data.columns):
    ax = fig.add_subplot(5, 3, index + 1)
    sns.scatterplot(x=scatter_data[col], y=scatter_data['target'], alpha=0.5, ax=ax)
fig.tight_layout(pad=1.0)

Heatmap

In [None]:
# Pairwise correlation matrix of all columns in train_cont, drawn as an annotated heatmap
corr = train_cont.corr()

plt.figure(figsize=(16, 12))
sns.heatmap(corr, cmap='Blues', linewidth=0.5, annot=True)

No feature shows a strong correlation with the target column 👀

# Data Processing 

### **Feature Engineering**

**Tried some random combinations**

In [None]:
# Interaction feature 'new': product of six columns, added to both the train and the test frame.
# The columns are multiplied one after another in the listed order.
new_factors = ['cont2', 'cont3', 'cont6', 'cont7', 'cont11', 'cont12']
for frame in (train_cont, test):
    product = frame[new_factors[0]]
    for factor in new_factors[1:]:
        product = product * frame[factor]
    frame['new'] = product

In [None]:
# Interaction feature 'new1': product of three columns, added to both the train and the test frame.
# The columns are multiplied one after another in the listed order.
new1_factors = ['cont9', 'cont10', 'cont1']
for frame in (train_cont, test):
    product = frame[new1_factors[0]]
    for factor in new1_factors[1:]:
        product = product * frame[factor]
    frame['new1'] = product

In [None]:
# Row-wise mean over every feature column currently in train_cont (everything except the target).
# 'mean' itself is excluded so that re-running this cell gives the same result
# instead of averaging the previous mean back into the new one.
features = train_cont.drop('target',axis=1).columns.drop('mean', errors='ignore')
train_cont['mean'] = train_cont[features].mean(axis=1)
test['mean'] = test[features].mean(axis=1)

### **Removing Outliers** 

In [None]:
# Columns whose lower tail is trimmed
low_cont = ['target', 'cont10', 'cont9', 'cont7']
# Columns whose upper tail is trimmed
up_cont = ['cont10']

# Cut-off values, one per column and in the same order as the lists above:
# the 0.1th percentile for the lower tail and the 99.9th percentile for the upper tail
n001 = [np.percentile(train_cont[name], 0.1) for name in low_cont]
n999 = [np.percentile(train_cont[name], 99.9) for name in up_cont]

In [None]:
import gc

# Keep only the rows strictly above each lower cut-off ...
for column, lower_bound in zip(low_cont, n001):
    above = train_cont[column] > lower_bound
    train_cont = train_cont[above]
    gc.collect()

# ... and strictly below each upper cut-off
for column, upper_bound in zip(up_cont, n999):
    below = train_cont[column] < upper_bound
    train_cont = train_cont[below]
    gc.collect()

**After removing outlier**

In [None]:
    # For every column draw a box plot and a strip plot of the filtered data side by side
    for column in train_cont.columns:
        fig, (box_ax, strip_ax) = plt.subplots(1, 2, figsize=(16, 5))

        sns.boxplot(y=column, data=train_cont, ax=box_ax)
        box_ax.set_title('Box Plot')

        sns.stripplot(y=column, data=train_cont, size=4, color=".3", linewidth=0, ax=strip_ax)
        strip_ax.set_title('Strip Plot')

        # Column name as the figure heading, lifted above the panel titles
        fig.suptitle(column, fontsize=15, y=1.1)

        plt.tight_layout()
        plt.show()

**Percentage of data removed**

In [None]:
# Share of the original training rows that the outlier filter removed, as a percentage string
f"{round(((org_len - len(train_cont)) / org_len) * 100, 2)}%"

# Modelling 

**We'll be using XGBRegressor**

In [None]:
# Separate the regression target from the feature matrix
y_train = train_cont['target']
X_train = train_cont.drop(columns='target')

# Tuning

In [None]:
import optuna
from sklearn.model_selection  import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb

def objective(trial):
    """Optuna objective: mean cross-validated RMSE of an XGBRegressor.

    Samples one set of hyper-parameters from ``trial``, evaluates it with
    3-fold shuffled cross-validation on the notebook-level ``X_train`` /
    ``y_train`` and returns the average validation RMSE (lower is better).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean RMSE over the validation folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 350, 1000),
        'max_depth': trial.suggest_int('max_depth', 6, 13),
        'learning_rate': trial.suggest_float('learning_rate', 0.009, 0.10),
        'subsample': trial.suggest_float('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 1),
        # gamma is continuous: an integer suggestion over [0, 0.05] can only ever yield 0
        'gamma': trial.suggest_float('gamma', 0, 0.05),
        'objective':'reg:squarederror',
        'eval_metric' : 'rmse',
        'tree_method':'gpu_hist',
       }

    clf = xgb.XGBRegressor(**params)
    rmse_scores = []
    X_train_k = X_train.values
    y_train_k = y_train.values
    # Fixed seed: every trial is scored on the same folds, so trial values are comparable
    skf = KFold(n_splits=3, shuffle=True, random_state=42)
    for train_idx, valid_idx in skf.split(X_train_k, y_train_k):
        clf.fit(X_train_k[train_idx, :], y_train_k[train_idx])
        pred = clf.predict(X_train_k[valid_idx, :])
        rmse = np.sqrt(mean_squared_error(y_train_k[valid_idx], pred))
        rmse_scores.append(rmse)
    print(f'Trial done: RMSE values on folds: {rmse_scores}')
    return np.average(rmse_scores)

In [None]:
# Only a few trials are run to keep the runtime short; increase n_trials for a more thorough search
n_trials = 5

# Switch for running the hyper-parameter search
FIT_XGB = True

if FIT_XGB:
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    print(f"Number of finished trials: {len(study.trials)}")

    # Report the best trial: its objective value and the parameters that produced it
    trial = study.best_trial
    print("Best trial:")
    print(f"  Value: {trial.value}")
    print("  Params: ")
    for name, setting in trial.params.items():
        print(f"    {name}: {setting}")

In [None]:
# Slice plot of the study's trials.
# NOTE: `study` is only created when FIT_XGB is True in the tuning cell; with FIT_XGB = False this line raises NameError.
optuna.visualization.plot_slice(study)

## Fitting

In [None]:
# Fixed hyper-parameters used for the final model.
# To use the result of the current study instead, start from: best_param = study.best_params
#best_param = study.best_params
best_param = {
    'n_estimators': 751,
    'max_depth': 10,
    'learning_rate': 0.019789645280696613,
    'subsample': 0.8730019407814834,
    'colsample_bytree': 0.6012295369579667,
    'gamma': 0,
    # Settings that are not tuned
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
    'eval_metric': 'rmse',
}

In [None]:
# Train the final regressor on the full (filtered) training set with the chosen parameters
model = xgb.XGBRegressor(**best_param)
model.fit(X_train,y_train)

### Model Importance 

In [None]:
# Plot the feature importances of the fitted model
xgb.plot_importance(model)

# Submission

In [None]:
# Select the test columns by the training feature names, so the model receives
# exactly the features it was fitted on, in the same order, regardless of any
# extra columns or different column order in `test`.
predictions_final = model.predict(test[X_train.columns])

In [None]:
# Two-column submission frame: the test ids and the predicted target
submission = pd.DataFrame({"id": test["id"]})
submission["target"] = predictions_final

# Write the file without the index column
submission.to_csv('my_submission.csv', index=False)

# If you liked it, please do upvote ✌✔😺