In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
import pylab as plot

import os

# Load the three competition files: training data, test data and the sample submission.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
        '../input/tabular-playground-series-aug-2021/sample_submission.csv',
    )
)

# Display options: no row/column truncation, and floats printed in fixed-point notation.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('float_format', '{:f}'.format),
):
    pd.set_option(option_name, option_value)

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory;
# print the full path of every file found under /kaggle/input.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

1. Train dataset
As stated before, the train dataset is mainly used to train the predictive model, since the target variable is available in this set. It is also used to explore the data itself, including finding the relationship between each predictor and the target variable.

Observations:



Below are the first 5 rows of the train dataset:

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

The dimensions and the number of missing values in the train dataset are shown below:

In [None]:
# Row count, column count and total number of missing cells in the training frame.
train_rows, train_cols = train.shape
train_missing = sum(train.isna().sum())
print(f'Number of rows: {train_rows};  Number of columns: {train_cols}; No of missing values: {train_missing}')

**Data types:**
Except for the id, f1, f16, f27, f55, f60, f86 and loss columns, which are of type int64, all other columns are float64. (To see the details, please expand the output.)

In [None]:
# Show the dtype of every column of the training frame as the cell output.
train.dtypes

**Basic statistics:**
Below are the basic statistics for each variable: count, mean, standard deviation, minimum, 1st quartile, median, 3rd quartile and maximum.

In [None]:
# Summary statistics, transposed so that each variable occupies one row.
train.describe().transpose()

2. Test dataset
The test dataset is used to make predictions with the model that was previously trained. Exploring this dataset is also needed to see how the data is structured and, especially, how similar it is to the train dataset.


Below is the first 5 rows of test dataset:

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [None]:
# Row count, column count and total number of missing cells in the test frame.
test_rows, test_cols = test.shape
test_missing = sum(test.isna().sum())
print(f'Number of rows: {test_rows};  Number of columns: {test_cols}; No of missing values: {test_missing}')

In [None]:
# Summary statistics of the test frame, transposed so that each variable occupies one row.
test.describe().transpose()

**Data types:**
Except for the id, f1, f16, f27, f55, f60 and f86 columns, which are of type int64, all other columns are float64, which is consistent with the train dataset. (To see the details, please expand the output.)

In [None]:
# Show the dtype of every column of the test frame as the cell output.
test.dtypes

There are 100 features available for building a prediction model. The analysis starts by looking at the number of unique values of the integer features, which are f1, f16, f27, f55, f60 and f86.

In [None]:
# Integer-typed predictors named in the accompanying text, followed by the target.
# 'f60' was missing from the original list although the text lists it as an integer feature.
features = ['f1', 'f16', 'f27', 'f55', 'f60', 'f86', 'loss']

# Print the number of distinct values observed in the training frame for each column.
for col in features:
    print(f'{col} unique value : {train[col].nunique()}')

3.  Submission
The submission file is expected to have an id column and a loss column.

Below is the first 5 rows of submission file:

In [None]:
# Preview the first five rows of the sample submission.
submission.head(n=5)

In [None]:
# Remove the `id` column from both frames.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell safe to
# re-run: a second execution no longer raises KeyError once the column is already gone.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(17, 8))

# Number of training rows for each distinct loss value, ordered by loss value.
target_count = train['loss'].value_counts().sort_index()

# One colour per bar, alternating dark/light. The list is sized to the number of bars;
# a shorter list is cycled by matplotlib, which breaks the alternation at each wrap.
bar_colors = ['#1520E6' if i % 2 == 0 else '#93D1FF' for i in range(len(target_count))]

ax.bar(target_count.index, target_count, color=bar_colors,
       width=0.55,
       edgecolor='black',
       linewidth=0.7)

ax.margins(0.02, 0.05)

# Label the bars of the 20 lowest loss values with their share of the rows (in percent),
# placed slightly above the bar top. Iterating over the actual (loss, count) pairs
# avoids a KeyError when one of the integers 0..19 does not occur in the data.
total_rows = len(train)
for loss_value, count in target_count.head(20).items():
    ax.annotate(f'{count/total_rows*100:.3}', xy=(loss_value, count+1000),
                va='center', ha='center')

ax.set_title('Target Distribution', weight='bold', fontsize=15)
ax.set_xlabel('loss')
ax.set_ylabel('Number of rows')
ax.grid(axis='y', linestyle='-', alpha=0.4)

fig.tight_layout()
plt.show()

There are a total of 43 discrete losses.
The top 12 distributions account for 80% of the total.
All except the order of 2 and 1 are in increasing order.

In [None]:
# Count of rows per loss value together with its percentage of all rows,
# listed from the largest share to the smallest.
target_count = train['loss'].value_counts().sort_index()
target_count_df = target_count.to_frame()
target_count_df['loss(%)'] = target_count / target_count.sum() * 100
target_count_df = target_count_df.sort_values('loss(%)', ascending=False)
display(target_count_df)

K-Fold Benchmark Visualization
Let's create a total of 4 models.

For demonstration, I used Decision Tree, SVM, Random Forest, and AdaBoost.

The score is RMSE.

In [None]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [None]:
def train_model(model_cls, X, y, n_splits=5, random_state=None):
    """Cross-validate a default-constructed model and report the RMSE of every fold.

    Parameters
    ----------
    model_cls : callable
        Called without arguments to build the estimator. The result must provide
        ``fit(X, y)`` and ``predict(X)``.
    X : pandas.DataFrame
        Feature matrix; fold rows are selected positionally with ``.iloc``.
    y : array-like
        Target values, aligned positionally with the rows of ``X``.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the original unseeded behaviour;
        pass an integer to make the folds reproducible.

    Returns
    -------
    tuple of (list, list)
        ``(train_scores, valid_scores)`` with one RMSE value per fold.
    """
    model = model_cls()
    # Convert once so that the index arrays below always select by position,
    # even when a pandas Series with a non-default index is passed.
    y = np.asarray(y)
    train_scores, valid_scores = [], []

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for tr_idx, va_idx in kfold.split(X, y):
        X_train, X_val = X.iloc[tr_idx], X.iloc[va_idx]
        y_train, y_val = y[tr_idx], y[va_idx]
        model.fit(X_train, y_train)

        # The notebook reports RMSE, so take the square root of the MSE.
        train_score = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
        valid_score = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))

        train_scores.append(train_score)
        valid_scores.append(valid_score)

    print('train score mean : ',np.mean(train_scores))
    print('valid score mean : ',np.mean(valid_scores))
    return train_scores, valid_scores

For a fast run, only a random sample of 1000 rows is used.


In [None]:

# Re-read the raw training file and keep a random sample of 1000 rows.
# The sample is seeded so the benchmark below uses the same rows on every run.
# Note: the freshly read frame still contains the `id` column.
train = pd.read_csv('../input/tabular-playground-series-aug-2021/train.csv').sample(1000, random_state=42)

# Split the target off from the predictors.
target = train['loss']
train = train.drop(['loss'], axis=1)


In [None]:
# Run the K-fold benchmark for each regressor on the sampled frame;
# every call returns a (train_scores, valid_scores) pair.
(dt_train, dt_val), (svm_train, svm_val), (rf_train, rf_val), (ada_train, ada_val) = [
    train_model(regressor_cls, train, target.values)
    for regressor_cls in (DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor)
]

In [None]:
# Select the feature subset by column name rather than by position, so the selection
# does not silently change if the column layout of `train` changes.
# NOTE(review): assumes the frame's columns are id, f0 ... f99, so these names match
# the former positions 2, 17, 28, 56 and 87 - confirm against the CSV header.
train2 = train[['f1', 'f16', 'f27', 'f55', 'f86']]
train2.head()

In [None]:
# Repeat the K-fold benchmark for each regressor on the reduced frame `train2`;
# every call returns a (train_scores, valid_scores) pair.
(dt_train2, dt_val2), (svm_train2, svm_val2), (rf_train2, rf_val2), (ada_train2, ada_val2) = [
    train_model(regressor_cls, train2, target.values)
    for regressor_cls in (DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor)
]

The results are stacked with NumPy's `vstack` and wrapped in a DataFrame.

In [None]:
# For every model the four score lists are ordered as: train on `train`, train on
# `train2`, validation on `train`, validation on `train2` - the same order as the
# second level of the column labels below. After transposing, each row is one fold.
fold_scores = np.vstack([
    scores
    for model_scores in (
        (dt_train, dt_train2, dt_val, dt_val2),
        (svm_train, svm_train2, svm_val, svm_val2),
        (rf_train, rf_train2, rf_val, rf_val2),
        (ada_train, ada_train2, ada_val, ada_val2),
    )
    for scores in model_scores
]).T

# Append the mean over the folds as the last row.
raw_data = np.vstack([fold_scores, fold_scores.mean(axis=0)])

row_labels = pd.Index([*(f'Fold {idx}' for idx in range(5)), 'Mean'], name='#:')
column_labels = pd.MultiIndex.from_product(
    [['Decision Tree', 'SVM', 'Random Forest', 'AdaBoost'],
     ['Train', 'Train1', 'Valid', 'Valid1']],
    names=['Model:', 'Train/Split'],
)

df = pd.DataFrame(raw_data, index=row_labels, columns=column_labels)
display(df)

# Styler that renders every score with three decimals; later cells add CSS to it.
s = df.style.format('{:.3f}')

In [None]:
# Palette shared by all rules below.
main_color = '#00539C'
sub_color = '#FFD662'

# Each rule is a dict with a CSS selector and a list of (property, value) pairs.

# Highlight the data cell under the mouse pointer.
cell_hover = dict(
    selector='td:hover',
    props=[('background-color', sub_color), ('color', main_color), ('font-weight', 'bold')],
)

# Names of the index levels.
index_names = dict(
    selector='.index_name',
    props=[('font-style', 'italic'), ('color', 'darkgrey'), ('font-weight', 'normal')],
)

# Every header cell except the index-name cells.
headers = dict(
    selector='th:not(.index_name)',
    props=[('background-color', main_color), ('color', 'white')],
)

# Column heading cells are centred.
headers_head = dict(
    selector='th.col_heading',
    props=[('text-align', 'center')],
)

# Top-level column headings: bold, accent colour, white separator on the left.
border_head1 = dict(
    selector='th.col_heading.level0',
    props=[('font-weight', 'bold'), ('color', sub_color), ('border-left', '1px solid white')],
)

# Left separators on every second header cell / data cell.
border_head2 = dict(
    selector='th:nth-child(2n+2)',
    props=[('border-left', '1px solid white')],
)
border_body = dict(
    selector='td:nth-child(2n+2)',
    props=[('border-left', f'1px solid {main_color}')],
)

# Last table row: a top border on its cells (the same rule is defined twice, as before)
# and a tinted background built by appending '20' to the main colour.
border_footer1 = dict(
    selector='tr:last-child td',
    props=[('border-top', f'1px solid {main_color}')],
)
border_footer2 = dict(
    selector='tr:last-child td',
    props=[('border-top', f'1px solid {main_color}')],
)
border_footer3 = dict(
    selector='tr:last-child',
    props=[('background-color', main_color+'20')],
)

In [None]:
# Attach all CSS rules to the Styler; the bare `s` on the last line renders the table.
table_styles = [
    cell_hover, index_names, headers, headers_head,
    border_head1, border_head2, border_body,
    border_footer1, border_footer2, border_footer3,
]
s = s.set_table_styles(table_styles)
s