In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
import optuna

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

## Data visualization

In [None]:
# Load the training set from the competition input directory
train_csv_path = '../input/tabular-playground-series-jan-2021/train.csv'
train_df = pd.read_csv(train_csv_path)

# Preview the first rows (head() shows five by default), then the
# per-column summary statistics
for preview in (train_df.head(), train_df.describe()):
    print(preview)


In [None]:
# Load the test set from the competition input directory
test_csv_path = '../input/tabular-playground-series-jan-2021/test.csv'
test_df = pd.read_csv(test_csv_path)

# Preview the first rows (head() shows five by default), then the
# per-column summary statistics
for preview in (test_df.head(), test_df.describe()):
    print(preview)


The training set has 300,000 rows and the test set has 200,000 rows.  
There are 14 features, cont1 through cont14, all continuous; there are no categorical features.  

Next, check if there are any missing values.  

In [None]:
# DataFrame.info() writes its report (dtypes, non-null counts, memory usage)
# to stdout itself and returns None, so it is called directly: wrapping it in
# print() would only append a stray "None" line after each report.
train_df.info()
test_df.info()

There appear to be no missing values.  

Next, check the correlation between each of cont1–cont14 and the target.  

In [None]:
# Pairwise correlation of every column in the training frame
# (this includes 'id' and 'target', since no column is dropped first)
cor = train_df.corr()

# Ten-step diverging palette, with the colour scale pinned to the full [-1, 1] range
heatmap_palette = sns.color_palette('coolwarm', 10)

plt.figure(figsize=(10, 8))
sns.heatmap(cor, vmin=-1, vmax=1, cmap=heatmap_palette);


No feature from cont1 to cont14 shows a notable correlation with the target.   
cont2 and cont14 are only weakly correlated with the other features. Weak negative correlations appear only occasionally, for example between cont3 and cont1 and between cont3 and cont9.
On the other hand, many pairs are positively correlated: cont1 with cont6, 9, 10 and 12; cont6 with cont9, 10, 11, 12 and 13; and the cont11–cont12 pair is particularly strongly correlated.  

Check the distribution of each variable. First from target.  

In [None]:
# The last column of the training frame is plotted
# (presumably 'target' -- this relies on the column order of train.csv)
column = train_df.columns[-1]

# Single-axes figure holding a 50-bin histogram, titled with the column name
fig, ax1 = plt.subplots()
ax1.hist(train_df[column], bins=50)
ax1.set_title(column);

The target does not follow a single unimodal distribution; its shape looks like two overlapping distributions.

Check the distribution of each cont for train and test.

In [None]:
# One histogram per training column, skipping the 'id' identifier
train_cols = [name for name in train_df.columns if name != 'id']

train_df[train_cols].hist(bins=100, figsize=(20, 20), alpha=0.5, color='blue')
plt.show()


In [None]:
# One histogram per test column.
# The test set has no 'target', so it is removed from the training column list
# by name (not by position), which stays correct whatever the column order is.
test_cols = [name for name in train_cols if name != 'target']

test_df[test_cols].hist(figsize=(20,20), bins=100, color='orange', alpha = 0.5)
plt.show()

Check the distribution by overlaying the training data and test data

In [None]:
# Overlay the train (blue) and test (orange) distribution of every feature,
# one feature per panel, four panels per row.
n_cols = 4
# Ceiling division so the grid always has room for every feature
# (at least one row, so plt.subplots never receives nrows=0).
n_rows = max(1, -(-len(test_cols) // n_cols))

# squeeze=False keeps `axes` two-dimensional even when there is a single row
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20,20), squeeze=False)

# NOTE(review): sns.distplot is flagged as deprecated by recent seaborn releases;
# confirm against the installed version before relying on it long-term.
for i, col in enumerate(test_cols):
    ax = axes[i // n_cols, i % n_cols]
    sns.distplot(train_df[col],bins=50, hist=True, color='blue', ax=ax)
    sns.distplot(test_df[col],bins=50, hist=True, color='orange', ax=ax)

# The spacing is a property of the whole figure, so it is set once after the loop
fig.subplots_adjust(wspace=0.2, hspace=0.2);


Since the training and test distributions have the same shape, a model that predicts well on the training data is likely to predict well on the test data too.  
Let's actually predict with a model.


I referred to the following notebook.
+ https://www.kaggle.com/dwin183287/tps-jan-2021-eda-models


## Modeling and Prediction

In [None]:
# Every column except the identifier and the label is used as a feature
excluded = ('id', 'target')
features = [name for name in train_df.columns if name not in excluded]

# Training features / labels, and the matching feature columns of the test set
X_train, y_train = train_df[features], train_df['target']
X_test = test_df[features]

  
Let's try Random Forest first.  

In [None]:
%%time

# Random forest

forest_reg = RandomForestRegressor(random_state=121, n_jobs=-1)
# Learn using training data and calculate score by cross-validation (CV = 4)
# Random forest learning takes time, so I am running it with CV = 4.
scores = cross_val_score(forest_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=4)

forest_rmse_scores = np.sqrt(-scores)
print('Random Forest performance:', forest_rmse_scores)
print('Random Forest performance_mean:', forest_rmse_scores.mean())


Learning Random Forest takes time.  
Next, let's predict with xgboost and lightGBM.


In [None]:
%%time
# xgboost

xgb_reg = XGBRegressor(random_state=121, objective = 'reg:squarederror', n_jobs=-1)

scores = cross_val_score(xgb_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
xgb_rmse_scores = np.sqrt(-scores)
print('XGBoost performance:', xgb_rmse_scores)
print('XGBoost performance_mean:', xgb_rmse_scores.mean())


XGBoost trains faster and predicts more accurately than Random Forest.

In [None]:
%%time
# lightGBM
lgbm_reg = LGBMRegressor(random_state=121)

scores = cross_val_score(lgbm_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
lgbm_rmse_scores = np.sqrt(-scores)
print('LGBM performance:', lgbm_rmse_scores)
print('LGBM performance_mean:', lgbm_rmse_scores.mean())


LightGBM trains faster than XGBoost.  
I don't expect much from linear models, but let's also take a look at Lasso and Ridge regression.

In [None]:
# LASSO (L1-regularized linear regression) baseline.
# Lasso() is built with no arguments, i.e. the library default strength alpha = 1.0.
lasso_reg = Lasso()

# 5-fold cross-validation
scores = cross_val_score(
    lasso_reg,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)

# The scorer returns negated MSE, so flip the sign before taking the root
lasso_rmse_scores = np.sqrt(-scores)
print('LASSO performance:', lasso_rmse_scores)
print('LASSO performance_mean:', lasso_rmse_scores.mean())

In [None]:
# Ridge (L2-regularized linear regression) baseline.
# Ridge() is built with no arguments, i.e. the library default strength alpha = 1.0.
ridge_reg = Ridge()

# 5-fold cross-validation
scores = cross_val_score(
    ridge_reg,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)

# The scorer returns negated MSE, so flip the sign before taking the root
ridge_rmse_scores = np.sqrt(-scores)
print('Ridge performance:', ridge_rmse_scores)
print('Ridge performance_mean:', ridge_rmse_scores.mean())

The performance of both LASSO and Ridge is not high.  
Try to improve the prediction performance by tuning the parameters.  

In [None]:
# Change the value of alpha and try to improve the model.

# To try and compare multiple alpha conditions, 
# create a function that calculates rmse_scores and returns the mean.
def rmse_mean(model):
    """Return the mean cross-validated RMSE of ``model`` on the training data.

    The model is scored with 5-fold cross-validation on the notebook-level
    ``X_train`` / ``y_train``. Each fold's negated MSE is converted to an
    RMSE, and the per-fold RMSE values are then averaged.

    Parameters
    ----------
    model : estimator
        An unfitted estimator accepted by ``cross_val_score``.

    Returns
    -------
    The mean of the five per-fold RMSE values.
    """
    # With cv=5 each fold trains on 80% of the rows and validates on the other 20%
    neg_mse_per_fold = cross_val_score(
        model,
        X_train,
        y_train,
        scoring="neg_mean_squared_error",
        cv=5,
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()

# Six L1 regularization strengths (alpha), from 1 down to 1e-5.
# A larger alpha gives a simpler (more strongly regularized) model.
alphas = [1, 10**-1, 10**-2, 10**-3, 10**-4, 10**-5]

# Mean 5-fold CV RMSE of a Lasso model at each strength, indexed by alpha
lasso_regs = pd.Series(
    [rmse_mean(Lasso(alpha=alpha)) for alpha in alphas],
    index=alphas,
)

# Full table of scores
print("LASSO RMSE loss:")
print(lasso_regs, "\n")

# The alpha that achieved the lowest mean RMSE
print("LASSO RMSE best_alpha :", lasso_regs.idxmin())
# The lowest mean RMSE itself
print("LASSO RMSE best_score value :", lasso_regs.min(), "\n")

# Plot the score against the regularization strength.
# The alphas are spaced by powers of ten, so the x-axis is logarithmic;
# on a linear axis all but the largest alpha collapse onto the left edge.
plt.figure(figsize=(10, 5))
plt.plot(lasso_regs)
plt.xscale("log")
plt.grid()
plt.title("LASSO: Validation_score - by regularization strength")
plt.xlabel("Alpha")
plt.ylabel("RMSE")
plt.show()

The optimum point was not found in the searched range.  
Similarly, try increasing the strength of the regularization of the Ridge regression to improve the model.

In [None]:
# Nine L2 regularization strengths (alpha) between 1 and 20.
# A larger alpha gives a simpler (more strongly regularized) model.
alphas = [ 1, 5, 8, 10, 11, 12, 13, 15, 20]

# Mean 5-fold CV RMSE of a Ridge model at each strength, indexed by alpha
redge_regs = pd.Series(
    [rmse_mean(Ridge(alpha=alpha)) for alpha in alphas],
    index=alphas,
)

# Full table of scores
print("Ridge RMSE loss:")
print(redge_regs, "\n")

# The alpha that achieved the lowest mean RMSE
print("Ridge RMSE best_alpha :", redge_regs.idxmin())
# The lowest mean RMSE itself
print("Ridge RMSE Loss best_score value :", redge_regs.min(), "\n")

# Plot the score against the regularization strength
plt.figure(figsize=(10, 5))
plt.plot(redge_regs)
plt.grid()
plt.title("Ridge: Validation_score - by regularization strength")
plt.xlabel("Alpha")
plt.ylabel("RMSE")
plt.show()


Alpha = 11 was the best parameter in the searched range. Unfortunately, no major improvement can be expected.

For XGboost, try optimizing hyperparameters using Optuna.  

The code is taken from the notebook below.  

+ https://www.kaggle.com/hamzaghanmi/xgboost-hyperparameter-tuning-using-optuna
+ https://www.kaggle.com/sakuraandblackcat/leaning-validation-curve-and-optuna-for-gbdts




In [None]:

# Feature matrix (X) and label vector (y) taken from the training frame
X, y = train_df[features], train_df['target']

In [None]:
def objective(trial,data=X,target=y):
    """Optuna objective: hold-out RMSE of an XGBoost regressor.

    The data is split 85% / 15% with a fixed seed, an ``XGBRegressor`` is
    trained with hyperparameters suggested by ``trial`` (using early stopping
    on the held-out part), and the RMSE on that held-out part is returned.

    Parameters
    ----------
    trial : optuna trial
        Supplies the hyperparameter suggestions.
    data, target :
        Features and labels; default to the notebook-level ``X`` and ``y``
        as they are when this function is defined.

    Returns
    -------
    RMSE on the 15% hold-out split (lower is better).
    """
    # Fixed random_state, so every trial is scored on the same hold-out rows
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.15, random_state=42)

    # NOTE(review): 'gpu_hist', trial.suggest_loguniform and passing
    # early_stopping_rounds to fit() are reported as deprecated by newer
    # XGBoost / Optuna releases -- verify against the installed versions.
    param = {
        'tree_method':'gpu_hist',  # 'gpu_hist'this parameter means using the GPU 
        
        'lambda': 
            trial.suggest_loguniform('lambda', 1e-3, 1),
        'alpha': 
            trial.suggest_loguniform('alpha', 1e-3, 1),
        'colsample_bytree': 
            trial.suggest_categorical('colsample_bytree', 
                                      [0.1, 0.2, 0.3,0.5,0.7,0.9]),
        'subsample': 
            trial.suggest_categorical('subsample', 
                                      [0.1, 0.2,0.3,0.4,0.5,0.8,1.0]),
        'learning_rate': 
            trial.suggest_categorical('learning_rate', 
                                      [0.0008, 0.01, 0.015, 0.02,0.03, 0.05,0.08,0.1]),
        # Upper bound on the number of trees; early stopping (below) may stop sooner
        'n_estimators': 4000,
        'max_depth': 
            trial.suggest_categorical('max_depth', 
                                      [5,7,9,11,13,15,17,20,23,25]),
        'random_state': 48,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 400),
    }
    
    model = xgb.XGBRegressor(**param)
    
    # The hold-out split doubles as the early-stopping evaluation set
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)], 
              early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    # RMSE as the square root of the MSE: this does not depend on the
    # `squared=` keyword of mean_squared_error (not accepted by every
    # scikit-learn release) and matches how RMSE is computed elsewhere here.
    rmse = np.sqrt(mean_squared_error(test_y, preds))
    
    return rmse




In [None]:
# Create the Optuna study object that records every trial
study = optuna.create_study(
    direction='minimize',  # the objective function is to be minimized
)

In [None]:
# This cell takes a while to run.
# 25 trials may not search the space thoroughly; lower N_TRIALS if it is too slow.
# Re-running this cell adds further trials to the same study.
N_TRIALS = 25

study.optimize(objective, n_trials=N_TRIALS)

In [None]:
# Report how many trials the study holds and the parameters of the best one
finished_trials = study.trials
best_params = study.best_trial.params

print('Number of finished trials:', len(finished_trials))
print('Best trial:', best_params)

You can check the search results in the data frame.   
It is also possible to output and save as a csv file.

In [None]:
# Collect the trials into a DataFrame and save a copy as CSV
study_csv_path = 'study_xgboost.csv'
study_data_table = study.trials_dataframe()
study_data_table.to_csv(study_csv_path, index=False)

# Last expression of the cell: shows the table in the notebook
study_data_table

Make predictions using the parameters that achieved the best score.

When I created this notebook, the best result was as follows.  
Number of finished trials: 25  


Trial 22 finished with value: 0.6936483703969993 and parameters: {'lambda': 0.03349655513592068, 'alpha': 0.12097952030992898, 'colsample_bytree': 0.5, 'subsample': 0.4, 'learning_rate': 0.01, 'max_depth': 11, 'min_child_weight': 179}. Best is trial 22 with value: 0.6936483703969993.



In [None]:
# Hyperparameters copied from the best Optuna trial quoted in the text above,
# plus the fixed settings (tree method, tree budget, seed) used for the final model
best_trial_paras = {
    'tree_method': 'gpu_hist',
    'lambda': 0.03349655513592068,
    'alpha': 0.12097952030992898,
    'colsample_bytree': 0.5,
    'subsample': 0.4,
    'learning_rate': 0.01,
    'n_estimators': 4000,
    'max_depth': 11,
    'min_child_weight': 179,
    'random_state': 2021,
}

In [None]:
# 5-fold cross-validation with the tuned parameters.
# Each fold trains one model; the test-set predictions of the five models are
# averaged into `preds`, and each fold's validation RMSE is collected in `rmse`.

# Slice the frames once rather than on every fold
X_all = train_df[features]
y_all = train_df['target']
X_submit = test_df[features]

preds = np.zeros(test_df.shape[0])
kf = KFold(n_splits=5,random_state=48,shuffle=True)
rmse=[]  # list contains rmse for each fold

for fold, (trn_idx, test_idx) in enumerate(kf.split(X_all, y_all), start=1):
    X_tr, X_val = X_all.iloc[trn_idx], X_all.iloc[test_idx]
    y_tr, y_val = y_all.iloc[trn_idx], y_all.iloc[test_idx]

    model = xgb.XGBRegressor(**best_trial_paras)
    # NOTE(review): the validation fold drives early stopping and is also the
    # fold the RMSE is reported on, so the reported score may be optimistic.
    model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100, verbose=False)

    # Each fold contributes 1/n_splits of the final test prediction
    preds += model.predict(X_submit) / kf.n_splits

    # RMSE as the square root of the MSE: this does not depend on the
    # `squared=` keyword of mean_squared_error (not accepted by every
    # scikit-learn release).
    rmse.append(np.sqrt(mean_squared_error(y_val, model.predict(X_val))))
    print(fold, rmse[-1])


In [None]:
# Average of the per-fold validation RMSE values
np.asarray(rmse).mean()

## Submission

In [None]:
# Load the sample submission file, which serves as the template for the output
sample_submission_path = '../input/tabular-playground-series-jan-2021/sample_submission.csv'
sub = pd.read_csv(sample_submission_path)
print(sub.head())

In [None]:
# Put the averaged fold predictions into the 'target' column
# (assumes `preds` is in the same row order as the sample submission -- TODO confirm)
sub = sub.assign(target=preds)
print(sub.head())

# Write the submission file without the index column
sub.to_csv('xgboost_submission.csv', index=False)

Thank you for reading my notebook.  
I hope the content of the article will be useful to you.