# Multi-Class Prediction of Cirrhosis Outcomes

https://www.kaggle.com/competitions/playground-series-s3e26/data?select=train.csv

## 0 - IMPORTS AND FUNCTIONS

In [1]:
# Data Analysis and Data Manipulation Libraries
import pandas                  as pd
import numpy                   as np
import seaborn                 as sns
import scikitplot              as skplt
import matplotlib.pyplot       as plt
import plotly.express          as px
import inflection
from ydata_profiling           import ProfileReport 

# Machine Learning Models
import optuna
from sklearn                   import ensemble        as en
from sklearn.linear_model      import LogisticRegression
from catboost                  import CatBoostClassifier
from sklearn.neighbors         import KNeighborsClassifier
import lightgbm                as lgbm

from sklearn                   import model_selection as ms
from sklearn                   import preprocessing   as pp
from sklearn.model_selection   import  KFold, StratifiedKFold

# Metrics and Performance
from sklearn import metrics as mt

# Data Transformer
#from feature_engine.encoding       import CountFrequencyEncoder

# Dysplay
%matplotlib inline
from IPython.core.display    import HTML
from IPython.display         import Image

import warnings
warnings.filterwarnings( 'ignore' )

  from .autonotebook import tqdm as notebook_tqdm


### 0.1 - HELPER FUNCTIONS

In [2]:
le = pp.LabelEncoder()
ss = pp.StandardScaler()
rr = pp.RobustScaler()

In [3]:
def data_trans(df_trans):
    # CATEGORICAL VARIABLES
    df_trans['drug']         = le.fit_transform( df_trans[['drug']].values )
    df_trans['sex']          = le.fit_transform( df_trans[['sex']].values )
    df_trans['ascites']      = le.fit_transform( df_trans[['ascites']].values )
    df_trans['hepatomegaly'] = le.fit_transform( df_trans[['hepatomegaly']].values )
    df_trans['spiders']      = le.fit_transform( df_trans[['spiders']].values )
    df_trans['edema']        = le.fit_transform( df_trans[['edema']].values )
    
    # NUMERICAL VARIABLES
    # STANDARD SCALER

    #df_trans['n_days']    = ss.fit_transform( df_trans[['n_days']].values )
    #df_trans['age']       = ss.fit_transform( df_trans[['age']].values )
    #df_trans['albumin']   = ss.fit_transform( df_trans[['albumin']].values )
    #df_trans['platelets'] = ss.fit_transform( df_trans[['platelets']].values )

    # ROBUST SCALER
    
    #df_trans['bilirubin']     = rr.fit_transform( df_trans[['bilirubin']].values )
    #df_trans['cholesterol']   = rr.fit_transform( df_trans[['cholesterol']].values )
    #df_trans['copper']        = rr.fit_transform( df_trans[['copper']].values )
    #df_trans['sgot']          = rr.fit_transform( df_trans[['sgot']].values )
    #df_trans['tryglicerides'] = rr.fit_transform( df_trans[['tryglicerides']].values )
    #df_trans['platelets']     = rr.fit_transform( df_trans[['platelets']].values )
    #df_trans['prothrombin']   = rr.fit_transform( df_trans[['prothrombin']].values )
    
    #df_trans = df_trans[['bilirubin','n_days','copper', 'hepatomegaly','prothrombin']]
    
    return(df_trans)

def jupyter_settings():
    %matplotlib inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [18, 9]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
   
    sns.set()
jupyter_settings()

In [4]:
pd.set_option('display.float_format', lambda x: '%.6f' %x)

## 1 - DATAS

In [5]:
# Train Datas - Windows
df_raw = pd.read_csv('../data/train.csv')

In [6]:
## Renomeando as colunas, removendo letras maiusculas e substituindo espacos por _
cols_old = ['id', 'N_Days', 'Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly','Spiders', 'Edema', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
            'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin','Stage', 'Status']

snakecase = lambda x: inflection.underscore(x.replace(' ', '_'))
cols_new = list( map( snakecase, cols_old ) )

df_raw.columns = cols_new

### 1.1 - DATASET SPLIT

In [7]:
# Split dataset into train, test and validation datasets
X = df_raw.drop( ['status'], axis=1 )
y = df_raw['status'].copy()
x_train, x_valid, y_train, y_valid = ms.train_test_split( X, y, test_size=0.20, random_state=42 )
x_train, x_test, y_train, y_test   = ms.train_test_split( x_train, y_train, test_size=0.25, shuffle=True, random_state=42 )

print(f" x_train: {x_train.shape}/n y_train: {y_train.shape}/n x_valid: {x_valid.shape}/n y_valid: {y_valid.shape}/n x_test: {x_test.shape}/n y_test: {y_test.shape}/n")

 x_train: (4743, 19)/n y_train: (4743,)/n x_valid: (1581, 19)/n y_valid: (1581,)/n x_test: (1581, 19)/n y_test: (1581,)/n


In [None]:
# Split dataset into train and validation datasets
X = df_raw.drop( ['status'], axis=1 )
y = df_raw['status'].copy()
x_train, x_valid, y_train, y_valid = ms.train_test_split( X, y, test_size=0.20, random_state=42 )

print(f"x_train: {x_train.shape}/n y_train: {y_train.shape}/n x_valid: {x_valid.shape}/n y_valid: {y_valid.shape}/n")

### 1.2 - DATA DESCRIPTION

In [8]:
df1 = df_raw.copy()

In [9]:
#prof = ProfileReport(df1)
#rof.to_file(output_file='data_descriptive.html')

In [10]:
# Data Dimensions
print( 'Number of Rows: {}'.format( df1.shape[0] ) )
print( 'Number of Cols: {}'.format( df1.shape[1] ) )

Number of Rows: 7905
Number of Cols: 20


In [11]:
# Data Types
df1.dtypes

id                 int64
n_days             int64
drug              object
age                int64
sex               object
ascites           object
hepatomegaly      object
spiders           object
edema             object
bilirubin        float64
cholesterol      float64
albumin          float64
copper           float64
alk_phos         float64
sgot             float64
tryglicerides    float64
platelets        float64
prothrombin      float64
stage            float64
status            object
dtype: object

In [12]:
# Check NAN Values
df1.isna().sum()

id               0
n_days           0
drug             0
age              0
sex              0
ascites          0
hepatomegaly     0
spiders          0
edema            0
bilirubin        0
cholesterol      0
albumin          0
copper           0
alk_phos         0
sgot             0
tryglicerides    0
platelets        0
prothrombin      0
stage            0
status           0
dtype: int64

### 1.3 - DATA DESCRIPTIVE

In [None]:
df1.info()

In [None]:
df1.describe().T

In [None]:
# Select columns where the content is different 0 or 1.
# numerical features
num_attributes = df1.select_dtypes(include=['int64', 'float64'])

# categorical features
cat_attributes = df1.select_dtypes(exclude=['int64', 'float64'])

In [None]:
num_attributes.hist(bins=25);
plt.tight_layout()

In [None]:
# Central Tendency - mean, median
ct1 = pd.DataFrame( num_attributes.apply( np.mean ) ).T
ct2 = pd.DataFrame( num_attributes.apply( np.median ) ).T

# Dispersion - std, min, max, range, skew, kurtosis
d1 = pd.DataFrame( num_attributes.apply( np.std ) ).T
d2 = pd.DataFrame( num_attributes.apply( min ) ).T
d3 = pd.DataFrame( num_attributes.apply( max ) ).T
d4 = pd.DataFrame( num_attributes.apply( lambda x: x.max() - x.min() ) ).T
d5 = pd.DataFrame( num_attributes.apply( lambda x: x.skew() ) ).T
d6 = pd.DataFrame( num_attributes.apply( lambda x: x.kurtosis() ) ).T

# Concatenar
m = pd.concat( [ d2, d3, d4, ct1, ct2, d1, d5, d6 ] ).T.reset_index()
m.columns = ['attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis']
m

# 2 - FEATURE ENGINEERING

In [None]:
df2 = df1.copy()

# 3 - DATA FILTERING

In [None]:
df3 = df2.copy()

# 4 - EXPLORATORY DATA ANALYSIS (EDA)

In [None]:
df4 = df3.copy()

### 4.1 - UNIVARIATE ANALYSIS

In [None]:
# DRUG
aux = df1[['id','drug','status']].groupby(['status','drug'] ).count().reset_index()
aux

In [None]:
# SEX
aux_one = df1[['id','sex','status']].groupby(['status','sex'] ).count().reset_index()
aux_one

In [None]:
# ASCITES
aux_two = df1[['id','ascites','status']].groupby(['status','ascites'] ).count().reset_index()
aux_two

In [None]:
# HEPATOMEGALY
aux_three = df1[['id','hepatomegaly','status']].groupby(['status','hepatomegaly'] ).count().reset_index()
aux_three

In [None]:
# SPIDERS
aux_four = df1[['id','spiders','status']].groupby(['status','spiders'] ).count().reset_index()
aux_four

In [None]:
# EDEMA
aux_five = df1[['id','edema','status']].groupby(['status','edema'] ).count().reset_index()
aux_five

### 4.2 - BIVARIATE ANALYSIS

In [None]:
#1.The interest on purchase the vehicle insurance is greater for customers that damaged their vehicle before and doesn't have insurance. (FALSE)
d1 = df4[( df4['previously_insured'] == 0) & (df4['vehicle_damage'] == 1)]

ax1 = d1[['response','id']].groupby('response').count().reset_index()
ax1['percentage'] = round(ax1['id'] / d1['id'].count()*100)
ax1['response'] = ax1['response'].apply(lambda x: 'No' if (x == 0) else 'Yes')

# Bar chart
fig1 = px.bar(ax1, x='response', 
              y='percentage', 
              color='response', 
              text='percentage', 
              width=1500,
              height=650,
              color_discrete_sequence=px.colors.qualitative.Pastel2)
fig1.update_traces(texttemplate='%{text:.2s}', 
                   textposition='outside')
fig1.update_layout(title_text="Hypothesis 1: Purchase Interest vs Vehicle Damage.", 
                   title_x=0.25,
                   font=dict(size=20))
fig1.show()

Hiphotesys is false once 75% of customers doesn't show interest in buying the vehicle insurance. 

In [None]:
# 2. The interest on purchase the vehicle insurance is greater for woman than men. (FALSE)
ax2 = pd.crosstab(df4['gender'], df4['response'])
ax2['percentage'] = round(ax2[1]/(ax2[0]+ax2[1])*100)

# Creating new dataframe for a bar chart
aux2 = [['Female', 10],['Male',14]]
aux2_df = pd.DataFrame(aux2, columns=['gender','percentage'])

# Bar chart
fig = px.bar(aux2_df, 
             x='gender', 
             y='percentage', 
             color='gender', 
             text='percentage',
             width=1500,
             height=650, 
             color_discrete_sequence=px.colors.qualitative.Pastel2)
fig.update_traces(texttemplate='%{text:.2s}', 
                  textposition='outside')
fig.update_layout(title_text="Hypothesis 2: Purchase Interest vs Gender.", 
                   title_x=0.25,
                   font=dict(size=20))
fig.show()


Hiphotesys is false once 14% of the men show interest in buying the vehicle insurance against 10% of women. 

Hiphotesys is false, according the analysis, the period that the customers are with the company doesn't affect the interest in buying vehicle insurance. 

In [None]:
# 3. The interest on purchase the vehicle insurance is greater for customers that spend less than 30k for annual premium.
aux5 = df4[df4['annual_premium'] > 30000][['id','response']]
aux6 = df4[df4['annual_premium'] <= 30000][['id','response']]

# Percentage of interested customers that spend more than 30k in annual premium
aux16 = aux5[['response','id']].groupby('response').count().reset_index()
aux16['percentage'] = round( aux16['id'] / aux5.shape[0] * 100 )

# Percentage of interested customers that spend less than 30k in annual premium
aux17 = aux6[['response','id']].groupby('response').count().reset_index()
aux17['percentage'] = round( aux17['id'] / aux6.shape[0] * 100 )

# Creating new dataframe for a bar chart
ax4 = [['30k or more in annual premium', 13],['30k or less in annual premium', 11]]
ax4_df = pd.DataFrame(ax4, columns=['annual_premium','percentage'])

# Bar chart
fig = px.bar(ax4_df,
             x='annual_premium',
             y='percentage', 
             color='annual_premium', 
             text='percentage', 
             width=1500, 
             height=650, 
             color_discrete_sequence=px.colors.qualitative.Pastel2)
fig.update_traces(texttemplate='%{text:.2s}',
                  textposition='outside')
fig.update_layout(title_text="Hypothesis 3: Purchase Interest vs Annual Premium",
                  title_x=0.17,
                  font=dict(size=20))
fig.show()

False, customers that spend more than 30k yearly show greter interest on purchase vehicle insurance.

In [None]:
# 4. The interest on purchase the vehicle insurance is greater for customers that have driver license.
aux9 = pd.crosstab(df4['driving_license'], df4['response'])
aux9['percentage'] = round(aux9[1]/(aux9[0]+aux9[1])*100)

# Creating new dataframe for a bar chart
ax9 = [['No', 5 ],['Yes', 12]]
ax9_df = pd.DataFrame(ax9, columns=['driving_license','percentage'])

# Bar chart
fig = px.bar(ax9_df, 
             x='driving_license', 
             y='percentage', 
             color='driving_license', 
             text='percentage', 
             width=1500,
             height=650, 
             color_discrete_sequence=px.colors.qualitative.Pastel2 )
fig.update_traces(texttemplate='%{text:.2s}', 
                  textposition='outside')
fig.update_layout(title_text="Hypothesis 4: Purchase Interest vs Driving License.", 
                  title_x=0.25,
                  font=dict(size=20))
fig.show()

In [None]:
# 5. The interest on purchase the vehicle insurance is greater for customers that have new cars.
aux10 = pd.crosstab(df4['vehicle_age'], df4['response'])
aux10['percentage'] = round(aux10[1]/(aux10[0]+aux10[1])*100)

# Creating new dataframe for a bar chart
ax10 = [['New', 4 ],['Used', 17],['Old', 29]]
ax10_df = pd.DataFrame(ax10, columns=['vehicle_age','percentage'])

# Bar chart
fig = px.bar(ax10_df, 
             x='vehicle_age', 
             y='percentage', 
             color='vehicle_age', 
             text='percentage',
             width=1500,
             height=650,
             color_discrete_sequence=px.colors.qualitative.Pastel2)
fig.update_traces(texttemplate='%{text:.2s}', 
                  textposition='outside')
fig.update_layout(title_text="Hypothesis 5: Purchase Interest vs Vehicle Age.", 
                   title_x=0.25,
                   font=dict(size=20))
fig.show()

In [None]:
# 6. The interest on purchase the vehicle insurance is greater for customers that have new cars and have damaged their vehicles.
aux11 = df4[(df4['vehicle_damage'] == 1 )]
aux12 = pd.crosstab(aux11['vehicle_age'], aux11['response'])
aux12['percentage'] = round(aux12[1]/(aux12[0]+aux12[1])*100)

# Creating new dataframe for a bar chart
ax11 = [['New', 14 ],['Used', 27],['Old', 29]]
ax11_df = pd.DataFrame(ax11, columns=['vehicle_age','percentage'])

# Bar chart
fig = px.bar(ax11_df, 
             x='vehicle_age', 
             y='percentage', 
             color='vehicle_age', 
             text='percentage',
             width=1500,
             height=650, 
             color_discrete_sequence=px.colors.qualitative.Pastel2)
fig.update_traces(texttemplate='%{text:.2s}', 
                  textposition='outside')
fig.update_layout(title_text="Hypothesis 6: Purchase Interest vs New (damaged) cars.", 
                   title_x=0.25,
                   font=dict(size=20))
fig.show()

In [None]:
# 7. The interest on purchase the vehicle insurance is lower for customers that are already insured.
aux15 = pd.crosstab(df4['previously_insured'], df4['response'])
aux15['percentage'] = round(aux15[1]/(aux15[0]+aux15[1])*100)
aux15

# Creating new dataframe for a bar chart
ax14 = [['No', 23 ],['Yes', 0]] 
ax14_df = pd.DataFrame(ax14, columns=['previously_insured','percentage'])

# Bar chart
fig = px.bar(ax14_df, 
             x='previously_insured', 
             y='percentage', 
             color='previously_insured', 
             text='percentage',
             width = 1500,
             height=650, 
             color_discrete_sequence=px.colors.qualitative.Pastel2)
fig.update_traces(texttemplate='%{text:.2s}', 
                  textposition='outside')
fig.update_layout(title_text="Hypothesis 7: Purchase Interest vs Previusly Insured.", 
                   title_x=0.25,
                   font=dict(size=20))
fig.show()


**Hypothesis Validation**

1. The interest on purchase the vehicle insurance is greater for customers that damaged their vehicle before and doesn't have insurance.
    **False, of the customers that damaged their car and doesn't have insurance, only 25% show interest in acquire vehicle insurance.**

2. The interest on purchase the vehicle insurance is greater for woman than men.
    **False, only 10% of women show interest in acquire vehicle insurance, whereas 13% of the men show interest on acquire vehicle insurance.**

3. The interest on purchase vehicle insurance is greater for vintage customers ( 7 months or more ).
    **False, the period that customers are on the company doensn't show influency on interest in buying vehicle insurance.**

4. The interest on purchase the vehicle insurance is greater for young customers.(Between 18 and 30 years old.)
    **False, customers that spend more than 30k yearly show greter interest on purchase vehicle insurance.**

5. The interest on purchase the vehicle insurance is greater for young customers.(Between 18 and 30 years old.)
    **False, adults and elderlies show greater interest on buying vehicle insurance.**

6. The interest on purchase the vehicle insurance is greater for customers that have driver license.
    **True, arround 12% of customers that hold a driving license show interest in buying the vehicle insurance.**

7. The interest on purchase the vehicle insurance is greater for customers that have new cars.
    **False, the interest is greater for customers that own an old car.**

8. The interest on purchase the vehicle insurance is greater for customers that have new cars and have damaged their vehicles.
    **False, of the customers who damaged their car, the ones that own a old car show greater interest in buying the vehicle insurance (29%), followed by customers that own used cars (27%).**

9. The interest on purchase the vehicle insurance is greater for elderly women.
    **False, adult women show greater interest in buying the vehicle insurance.**

10. The interest on purchase the vehicle insurance is lower for customers that are already insured.
    **True, less than 1% of customers already insured show interest on purchase the vehicle insurance.**


### 4.3 - MULTIVARIATE ANALYSIS

In [None]:
correlation = df4.corr().round(2)
plt.figure(figsize = (14,7))
sns.heatmap(correlation, annot = True)

# 5 - DATA PREPARATION

n_days - standard_scaler
age    - parece que esta em dias, posso transformar em anos, standard_scaler
bilirubin - tem outliers, que parecem parte do processo. Aplicar robust_scaller ou filtrar valore e usar apenas menores que 10.
cholesterol - tem outliers, que parecem parte do processo. Aplicar robust_scaller
albumin - standard_scaler
copper - Aplicar robust_scaller
alk_phos - Aplicar robust_scaller
sgot - Aplicar robust_scaller
tryglicerides - Aplicar robust_scaller
platelets - standard_scaler
prothrombin - Aplicar robust_scaller

In [None]:
df5 = df4.copy()

In [None]:
# ENCODER
le = pp.LabelEncoder()

df5['drug'] = le.fit_transform( df5[['drug']].values )
df5['sex'] = le.fit_transform( df5[['sex']].values )
df5['ascites'] = le.fit_transform( df5[['ascites']].values )
df5['hepatomegaly'] = le.fit_transform( df5[['hepatomegaly']].values )
df5['spiders'] = le.fit_transform( df5[['spiders']].values )
df5['edema'] = le.fit_transform( df5[['edema']].values )

In [None]:
# STANDARD SCALER
ss = pp.StandardScaler()

df5['n_days'] = ss.fit_transform( df5[['n_days']].values )
df5['age']    = ss.fit_transform( df5[['age']].values )
df5['albumin'] = ss.fit_transform( df5[['albumin']].values )
df5['platelets'] = ss.fit_transform( df5[['platelets']].values )


# ROBUST SCALER
rr = pp.RobustScaler()
df5['bilirubin'] = rr.fit_transform( df5[['bilirubin']].values )
df5['cholesterol'] = rr.fit_transform( df5[['cholesterol']].values )
df5['copper'] = rr.fit_transform( df5[['copper']].values )
df5['sgot'] = rr.fit_transform( df5[['sgot']].values )
df5['tryglicerides'] = rr.fit_transform( df5[['tryglicerides']].values )
df5['platelets'] = rr.fit_transform( df5[['platelets']].values )
df5['prothrombin'] = rr.fit_transform( df5[['prothrombin']].values )

In [13]:
x_training   = data_trans(x_train)
x_validation = data_trans(x_valid)
x_testing    = data_trans(x_test)

# 6 - FEATURE SELECTION

In [None]:
df6 = x_train.copy()

In [None]:
# model definition
forest = en.ExtraTreesClassifier( n_estimators=500, random_state=0, n_jobs=-1 )

# data preparation
df6_n = df6
y_train_n  = y_train.values
forest.fit( df6_n, y_train_n )

In [None]:
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0 )
indices = np.argsort(importances)[::-1]

# Print the feature ranking 
print("Feature ranking:")
df = pd.DataFrame()
for i,j in zip( df6_n, forest.feature_importances_ ):
    aux = pd.DataFrame( {'feature': i, 'importance': j}, index=[0] )
    df = pd.concat( [df,aux], axis = 0 )
    
print( df.sort_values( 'importance', ascending=False ) )

# Plot the impurity-based feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(df6_n.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(df6_n.shape[1]), indices)
plt.xlim([-1, df6_n.shape[1]])
plt.show()

# 7 - MACHINE LEARNING MODELS

## 7.1 - Logistic Regression

In [14]:
# Model Definition
clf_lr = LogisticRegression()

# Model Training
clf_lr.fit(x_training, y_train)

# Model Prediction
yhat_lr = clf_lr.predict_proba(x_validation)

In [15]:
# Evaluation
lr_logloss = mt.log_loss(y_valid, yhat_lr )
lr_logloss

0.6445539313614146

## 7.2 - LGBM

In [31]:
# Model Definition
clf_lgbm = lgbm.LGBMClassifier()

# Model Training
clf_lgbm.fit(x_training, y_train)

# Model Prediction
yhat_lgbm = clf_lgbm.predict_proba(x_validation)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001172 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2157
[LightGBM] [Info] Number of data points in the train set: 4743, number of used features: 19
[LightGBM] [Info] Start training from score -0.451744
[LightGBM] [Info] Start training from score -3.352437
[LightGBM] [Info] Start training from score -1.113267


In [32]:
# Evaluation
lgbm_logloss = mt.log_loss(y_valid, yhat_lgbm )
lgbm_logloss

0.4825904318802329

# 7.3 - CatBoost

In [40]:
cb_train = data_trans(X)
cby_train = y

In [41]:
# Definition
cb_model = CatBoostClassifier()

# Model training
cb_model.fit(cb_train, cby_train)

# Model Prediction
#yhat_cb = cb_model.predict_proba(x_validation)

Learning rate set to 0.087926
0:	learn: 1.0094352	total: 18.2ms	remaining: 18.1s
1:	learn: 0.9368265	total: 37.9ms	remaining: 18.9s
2:	learn: 0.8790452	total: 54.4ms	remaining: 18.1s
3:	learn: 0.8296673	total: 64.9ms	remaining: 16.2s
4:	learn: 0.7870421	total: 76.1ms	remaining: 15.1s
5:	learn: 0.7517556	total: 88.8ms	remaining: 14.7s
6:	learn: 0.7209905	total: 99.4ms	remaining: 14.1s
7:	learn: 0.6938450	total: 111ms	remaining: 13.7s
8:	learn: 0.6698595	total: 121ms	remaining: 13.3s
9:	learn: 0.6493445	total: 130ms	remaining: 12.9s
10:	learn: 0.6312896	total: 140ms	remaining: 12.6s
11:	learn: 0.6153405	total: 149ms	remaining: 12.3s
12:	learn: 0.6004008	total: 159ms	remaining: 12.1s
13:	learn: 0.5877668	total: 170ms	remaining: 12s
14:	learn: 0.5763207	total: 180ms	remaining: 11.8s
15:	learn: 0.5655510	total: 190ms	remaining: 11.7s
16:	learn: 0.5564930	total: 201ms	remaining: 11.6s
17:	learn: 0.5478166	total: 212ms	remaining: 11.6s
18:	learn: 0.5407116	total: 222ms	remaining: 11.4s
19:	le

171:	learn: 0.3750142	total: 1.65s	remaining: 7.94s
172:	learn: 0.3746218	total: 1.66s	remaining: 7.93s
173:	learn: 0.3743024	total: 1.67s	remaining: 7.92s
174:	learn: 0.3737941	total: 1.68s	remaining: 7.91s
175:	learn: 0.3734048	total: 1.69s	remaining: 7.9s
176:	learn: 0.3728838	total: 1.7s	remaining: 7.89s
177:	learn: 0.3724980	total: 1.7s	remaining: 7.87s
178:	learn: 0.3720065	total: 1.71s	remaining: 7.86s
179:	learn: 0.3716497	total: 1.72s	remaining: 7.85s
180:	learn: 0.3712097	total: 1.73s	remaining: 7.84s
181:	learn: 0.3708980	total: 1.74s	remaining: 7.83s
182:	learn: 0.3706074	total: 1.75s	remaining: 7.81s
183:	learn: 0.3703026	total: 1.76s	remaining: 7.81s
184:	learn: 0.3696536	total: 1.77s	remaining: 7.8s
185:	learn: 0.3691733	total: 1.78s	remaining: 7.79s
186:	learn: 0.3685912	total: 1.79s	remaining: 7.78s
187:	learn: 0.3680533	total: 1.81s	remaining: 7.82s
188:	learn: 0.3673129	total: 1.82s	remaining: 7.81s
189:	learn: 0.3667196	total: 1.83s	remaining: 7.8s
190:	learn: 0.366

342:	learn: 0.3103719	total: 3.3s	remaining: 6.32s
343:	learn: 0.3100971	total: 3.31s	remaining: 6.31s
344:	learn: 0.3098300	total: 3.32s	remaining: 6.31s
345:	learn: 0.3095809	total: 3.33s	remaining: 6.3s
346:	learn: 0.3094166	total: 3.35s	remaining: 6.29s
347:	learn: 0.3091542	total: 3.35s	remaining: 6.29s
348:	learn: 0.3087728	total: 3.36s	remaining: 6.27s
349:	learn: 0.3084727	total: 3.37s	remaining: 6.26s
350:	learn: 0.3081085	total: 3.38s	remaining: 6.25s
351:	learn: 0.3078346	total: 3.39s	remaining: 6.24s
352:	learn: 0.3074885	total: 3.4s	remaining: 6.23s
353:	learn: 0.3071605	total: 3.4s	remaining: 6.21s
354:	learn: 0.3068285	total: 3.42s	remaining: 6.21s
355:	learn: 0.3064836	total: 3.43s	remaining: 6.2s
356:	learn: 0.3062010	total: 3.44s	remaining: 6.19s
357:	learn: 0.3059159	total: 3.45s	remaining: 6.18s
358:	learn: 0.3056604	total: 3.46s	remaining: 6.18s
359:	learn: 0.3053478	total: 3.47s	remaining: 6.17s
360:	learn: 0.3052057	total: 3.48s	remaining: 6.16s
361:	learn: 0.304

515:	learn: 0.2633413	total: 4.96s	remaining: 4.65s
516:	learn: 0.2632045	total: 4.97s	remaining: 4.64s
517:	learn: 0.2629411	total: 4.98s	remaining: 4.63s
518:	learn: 0.2623995	total: 4.99s	remaining: 4.62s
519:	learn: 0.2620826	total: 5s	remaining: 4.61s
520:	learn: 0.2619970	total: 5s	remaining: 4.6s
521:	learn: 0.2615669	total: 5.01s	remaining: 4.59s
522:	learn: 0.2614121	total: 5.02s	remaining: 4.58s
523:	learn: 0.2611272	total: 5.03s	remaining: 4.57s
524:	learn: 0.2608853	total: 5.04s	remaining: 4.56s
525:	learn: 0.2606347	total: 5.05s	remaining: 4.55s
526:	learn: 0.2603918	total: 5.06s	remaining: 4.54s
527:	learn: 0.2602335	total: 5.07s	remaining: 4.53s
528:	learn: 0.2599401	total: 5.08s	remaining: 4.52s
529:	learn: 0.2597540	total: 5.08s	remaining: 4.51s
530:	learn: 0.2594445	total: 5.09s	remaining: 4.5s
531:	learn: 0.2591875	total: 5.11s	remaining: 4.49s
532:	learn: 0.2589541	total: 5.12s	remaining: 4.49s
533:	learn: 0.2586366	total: 5.13s	remaining: 4.48s
534:	learn: 0.258437

688:	learn: 0.2244675	total: 6.64s	remaining: 3s
689:	learn: 0.2242844	total: 6.65s	remaining: 2.99s
690:	learn: 0.2240863	total: 6.66s	remaining: 2.98s
691:	learn: 0.2238913	total: 6.67s	remaining: 2.97s
692:	learn: 0.2237305	total: 6.68s	remaining: 2.96s
693:	learn: 0.2236509	total: 6.69s	remaining: 2.95s
694:	learn: 0.2235053	total: 6.7s	remaining: 2.94s
695:	learn: 0.2233552	total: 6.71s	remaining: 2.93s
696:	learn: 0.2231840	total: 6.72s	remaining: 2.92s
697:	learn: 0.2229582	total: 6.72s	remaining: 2.91s
698:	learn: 0.2228491	total: 6.73s	remaining: 2.9s
699:	learn: 0.2226990	total: 6.74s	remaining: 2.89s
700:	learn: 0.2225433	total: 6.75s	remaining: 2.88s
701:	learn: 0.2224290	total: 6.76s	remaining: 2.87s
702:	learn: 0.2221784	total: 6.77s	remaining: 2.86s
703:	learn: 0.2220496	total: 6.78s	remaining: 2.85s
704:	learn: 0.2218692	total: 6.79s	remaining: 2.84s
705:	learn: 0.2216725	total: 6.8s	remaining: 2.83s
706:	learn: 0.2215064	total: 6.81s	remaining: 2.82s
707:	learn: 0.2212

854:	learn: 0.1946279	total: 8.32s	remaining: 1.41s
855:	learn: 0.1945080	total: 8.32s	remaining: 1.4s
856:	learn: 0.1943115	total: 8.33s	remaining: 1.39s
857:	learn: 0.1941246	total: 8.34s	remaining: 1.38s
858:	learn: 0.1939088	total: 8.35s	remaining: 1.37s
859:	learn: 0.1937467	total: 8.35s	remaining: 1.36s
860:	learn: 0.1936934	total: 8.36s	remaining: 1.35s
861:	learn: 0.1934936	total: 8.37s	remaining: 1.34s
862:	learn: 0.1933082	total: 8.38s	remaining: 1.33s
863:	learn: 0.1932016	total: 8.39s	remaining: 1.32s
864:	learn: 0.1930574	total: 8.39s	remaining: 1.31s
865:	learn: 0.1929567	total: 8.4s	remaining: 1.3s
866:	learn: 0.1928214	total: 8.41s	remaining: 1.29s
867:	learn: 0.1927456	total: 8.42s	remaining: 1.28s
868:	learn: 0.1924725	total: 8.43s	remaining: 1.27s
869:	learn: 0.1922358	total: 8.44s	remaining: 1.26s
870:	learn: 0.1919765	total: 8.45s	remaining: 1.25s
871:	learn: 0.1918541	total: 8.46s	remaining: 1.24s
872:	learn: 0.1915167	total: 8.47s	remaining: 1.23s
873:	learn: 0.1

<catboost.core.CatBoostClassifier at 0x12003762dd0>

In [20]:
# Evaluation
cb_logloss = mt.log_loss(y_valid, yhat_cb)
cb_logloss

0.4734070439173265

## 9 - HYPERPARAMETER FINE TUNING

# 10 - SUBMISSIONS

In [42]:
df_test = pd.read_csv('../data/test.csv')

In [43]:
## Renomeando as colunas, removendo letras maiusculas e substituindo espacos por _
cols_old = ['id', 'N_Days', 'Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides',\
            'Platelets', 'Prothrombin', 'Stage']

snakecase = lambda x: inflection.underscore(x.replace(' ', '_'))
cols_new = list( map( snakecase, cols_old ) )

df_test.columns = cols_new

In [44]:
df_pred = data_trans(df_test)

In [45]:
# Model Prediction
yhat_submission = cb_model.predict_proba(df_pred)

In [47]:
sub_mission = df_test.copy()
sub_mission['Status_C'] = yhat_submission[:, 0]
sub_mission['Status_CL'] = yhat_submission[:, 1]
sub_mission['Status_D'] = yhat_submission[:, 2]

In [48]:
submission = sub_mission[['id','Status_C','Status_CL','Status_D']]

In [50]:
submission.to_csv("submission_three.csv", index=None, sep=',')