# 03 - Model Building

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
sns.set_style("darkgrid")
pd.set_option('display.max_columns', None)  
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import sys, os, yaml

SEED = 12312

## Datasets

In [2]:
df_train = pd.read_pickle("data/train.pkl")
print(df_train.shape)
df_train.head(1)

(116191, 165)


Unnamed: 0,x000,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,x011,x012,x013,x014,x015,x016,x017,x018,x019,x020,x021,x022,x023,x024,x025,x026,x027,x028,x029,x030,x031,x032,x033,x034,x035,x036,x037,x038,x039,x040,x041,x042,x043,x044,x045,x046,x047,x048,x049,x050,x051,x052,x053,x054,x055,x056,x057,x058,x059,x060,x061,x062,x063,x064,x065,x066,x067,x068,x069,x070,x071,x072,x073,x074,x075,x076,x077,x078,x079,x080,x081,x082,x083,x084,x085,x086,x087,x088,x089,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099,x100,x101,x102,x103,x104,x105,x106,x107,x108,x109,x110,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120,x121,x122,x123,x124,x125,x126,x127,x128,x129,x130,x131,x132,x133,x134,x135,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145,x146,x147,x148,x149,x150,x151,x152,x153,x154,x155,x156,x157,x158,x159,x160,x161,x162,x163,Target
174751,1.21459,7,-7.518755,0.508802,-0.523882,x005_002,0.978668,1,x008_000,-0.49315,1.707698,x011_000,0.786082,0.284092,1.369472,0.104588,x016_048,1.614284,x018_000,0.321757,1.369472,3,0.482551,1.707698,0.63204,0.578781,-0.845579,0.51478,0.687255,2.449408,33,-2.692525,0.734246,-0.523882,x034_009,x035_014,-0.845579,5.300132,0.284181,0.51478,5,3.33274,0,0.123406,x044_001,6.347903,x046_043,0,0.321757,0.720373,0.933439,1.362273,0.036903,x053_001,0.388479,-6.252365,0.265856,0.0,0.963896,1.614284,0.576614,0.470437,x062_012,0,-1.422448,0.732592,0.036903,0.924275,1,0.963896,8,x071_262,x072_001,x073_014,x074_001,5,0.583409,0.681257,0.749298,0,-0.992194,0.42349,x082_003,0.486232,1.434464,-5.641925,0.039399,0.771266,3,3.183357,-0.614246,3.183357,x092_005,0.123406,0.491216,4.239643,1.80209,x097_004,1.752007,0.294131,1.21459,1,-1.753608,0.836078,0.732592,0.880611,x106_010,x107_006,0.517673,x109_001,x110_057,x111_000,0.687255,0.736414,3.931558,1.970665,0.404195,0.66199,3.617007,0.836078,0.517673,0.0,0.366659,10,x124_141,x125_002,x126_000,0.734246,0.720373,x129_010,0.353233,-2.487784,x132_027,1,x134_000,2.114684,0.902612,x137_001,0,1.085685,0.42349,0.576614,-1.753608,-3.188953,1.085685,6.347903,1.434464,1.362273,0.039399,4.239643,x150_001,-1.422448,31,0.66199,0.578781,0.933439,0.924275,x157_026,33,0.276511,-0.992194,0.849202,0.067726,x163_003,Class_3


In [3]:
df_test = pd.read_pickle("data/test.pkl")
print(df_test.shape)
df_test.head(1)

(77462, 165)


Unnamed: 0,x000,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,x011,x012,x013,x014,x015,x016,x017,x018,x019,x020,x021,x022,x023,x024,x025,x026,x027,x028,x029,x030,x031,x032,x033,x034,x035,x036,x037,x038,x039,x040,x041,x042,x043,x044,x045,x046,x047,x048,x049,x050,x051,x052,x053,x054,x055,x056,x057,x058,x059,x060,x061,x062,x063,x064,x065,x066,x067,x068,x069,x070,x071,x072,x073,x074,x075,x076,x077,x078,x079,x080,x081,x082,x083,x084,x085,x086,x087,x088,x089,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099,x100,x101,x102,x103,x104,x105,x106,x107,x108,x109,x110,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120,x121,x122,x123,x124,x125,x126,x127,x128,x129,x130,x131,x132,x133,x134,x135,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145,x146,x147,x148,x149,x150,x151,x152,x153,x154,x155,x156,x157,x158,x159,x160,x161,x162,x163,Target
27645,3.264616,0,-2.601464,0.720072,1.735794,x005_000,1.343014,1,x008_000,-0.477114,1.616468,x011_001,0.82489,0.621359,0.118163,-0.881664,x016_048,1.013106,x018_000,0.950694,0.118163,3,0.545123,1.616468,-2.6363,0.643326,0.550672,0.028714,0.377223,-0.359695,33,-1.767291,0.777358,1.735794,x034_006,x035_014,0.550672,-5.419521,0.428992,0.028714,4,-3.140256,0,0.781694,x044_005,4.251926,x046_043,8,0.950694,-0.266988,1.708252,-2.254745,-0.494557,x053_001,0.348384,-0.419942,0.995117,0.0,0.218484,1.013106,0.606012,0.513714,x062_012,6,-1.961242,0.812633,-0.494557,2.419912,1,0.218484,6,x071_186,x072_000,x073_002,x074_000,5,-0.348171,0.801081,0.225894,0,0.682962,0.414943,x082_000,0.418329,1.463937,-5.036523,2.551437,0.681832,1,-1.434285,0.964303,-1.434285,x092_001,0.781694,-0.136789,0.097106,1.086943,x097_002,0.082585,0.520159,3.264616,1,2.716104,0.830059,0.812633,0.767977,x106_010,x107_012,-0.330239,x109_001,x110_057,x111_000,0.377223,0.779995,-0.145459,-3.566871,0.297113,0.630096,0.859477,0.830059,-0.330239,0.0,0.223768,0,x124_213,x125_002,x126_000,0.777358,-0.266988,x129_050,0.339389,2.171469,x132_003,1,x134_000,-4.124905,0.078903,x137_001,0,0.288397,0.414943,0.606012,2.716104,-6.717663,0.288397,4.251926,1.463937,-2.254745,2.551437,0.097106,x150_001,-1.961242,174,0.630096,0.643326,1.708252,2.419912,x157_035,33,0.28627,0.682962,0.259458,-0.611174,x163_005,Class_1


In [4]:
df_score = pd.read_pickle("data/score.pkl")
print(df_score.shape)
df_score.head(1)

(90000, 164)


Unnamed: 0,x000,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,x011,x012,x013,x014,x015,x016,x017,x018,x019,x020,x021,x022,x023,x024,x025,x026,x027,x028,x029,x030,x031,x032,x033,x034,x035,x036,x037,x038,x039,x040,x041,x042,x043,x044,x045,x046,x047,x048,x049,x050,x051,x052,x053,x054,x055,x056,x057,x058,x059,x060,x061,x062,x063,x064,x065,x066,x067,x068,x069,x070,x071,x072,x073,x074,x075,x076,x077,x078,x079,x080,x081,x082,x083,x084,x085,x086,x087,x088,x089,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099,x100,x101,x102,x103,x104,x105,x106,x107,x108,x109,x110,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120,x121,x122,x123,x124,x125,x126,x127,x128,x129,x130,x131,x132,x133,x134,x135,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145,x146,x147,x148,x149,x150,x151,x152,x153,x154,x155,x156,x157,x158,x159,x160,x161,x162,x163
0,2.319538,0,5.020483,0.448529,-0.852546,x005_000,-0.998906,0,x008_000,1.288086,4.673962,x011_001,0.289747,0.284042,0.572018,-1.336891,x016_006,-1.13796,x018_002,2.442883,0.572018,3,0.298722,4.673962,1.958812,0.282664,4.85593,0.290681,0.480461,-0.091239,33,-0.047759,0.288419,-0.852546,x034_011,x035_014,4.85593,2.191471,0.302269,0.290681,4,-2.573493,1,0.789406,x044_000,-2.778244,x046_011,0,2.442883,-3.219411,-1.538807,0.634541,0.358364,x053_001,0.410863,4.025498,0.31478,1.0,0.885479,-1.13796,0.647797,0.475543,x062_001,0,-0.084794,0.36046,0.358364,-1.810576,1,0.885479,3,x071_018,x072_000,x073_002,x074_000,5,4.5169,0.73564,0.477497,0,0.084926,0.510748,x082_000,0.310362,-3.29531,2.907393,0.377336,0.680271,1,-1.684449,-1.089001,-1.684449,x092_004,0.789406,-1.758931,-0.920925,-1.816968,x097_002,-3.853039,0.33526,2.319538,1,1.21015,0.823916,0.36046,0.422275,x106_010,x107_006,-2.845971,x109_001,x110_059,x111_000,0.480461,0.221567,1.620115,-3.280507,0.619155,0.612774,-1.956931,0.823916,-2.845971,0.0,0.431122,12,x124_022,x125_000,x126_000,0.288419,-3.219411,x129_023,0.153943,3.630768,x132_009,1,x134_000,-0.712996,1.072984,x137_001,0,0.708755,0.510748,0.647797,1.21015,4.905962,0.708755,-2.778244,-3.29531,0.634541,0.377336,-0.920925,x150_003,-0.084794,159,0.612774,0.282664,-1.538807,-1.810576,x157_051,33,0.307058,0.084926,0.27934,0.626329,x163_007


### Pre Processing

### Stupidly small eda

In [5]:
target = 'Target'
target_levels = sorted(df_train[target].unique())
target_levels

['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5']

In [6]:
df_train[target].value_counts(normalize=True)

Class_5    0.239950
Class_1    0.226386
Class_2    0.224045
Class_3    0.187493
Class_4    0.122126
Name: Target, dtype: float64

In [7]:
df_train[target].value_counts(normalize=True).sort_index().values

array([0.22638586, 0.22404489, 0.18749301, 0.1221265 , 0.23994974])

### Aside - log loss and example submissions

In [8]:
df_submission = pd.DataFrame(index=df_score.index,columns=target_levels)
df_submission.index.name = "id"
df_submission.loc[:] = [0.22638586, 0.22404489, 0.18749301, 0.1221265 , 0.23994974]

df_submission.to_csv("output/example_class_1_observed.csv")
print(df_submission.shape)

df_submission.head()

(90000, 5)


Unnamed: 0_level_0,Class_1,Class_2,Class_3,Class_4,Class_5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.226386,0.224045,0.187493,0.122126,0.23995
1,0.226386,0.224045,0.187493,0.122126,0.23995
2,0.226386,0.224045,0.187493,0.122126,0.23995
3,0.226386,0.224045,0.187493,0.122126,0.23995
4,0.226386,0.224045,0.187493,0.122126,0.23995


### Local Scoring(using test)

In [9]:
df_submission = pd.DataFrame(index=df_test.index,columns=target_levels)
df_submission.index.name = "id"
df_submission.loc[:] = [0.22638586, 0.22404489, 0.18749301, 0.1221265 , 0.23994974]


In [10]:
from sklearn.metrics import log_loss

y_test = df_test[target]

log_loss(y_test , df_submission.values)

1.5846028862409596

### Simple Model

* only "num" features
* naive full na with 0
* scaling(ss)

### pre processing

In [11]:
df_train.fillna(0, inplace=True)
df_test.fillna(0, inplace=True)
df_score.fillna(0, inplace=True)

### Define features

In [12]:
cat_features = list(df_train.select_dtypes(include=['object']).columns)
num_features = list(df_train.select_dtypes(['float','int']))
features = cat_features + num_features
assert target is not features, f"hey, target is in features"

print(f"{target=}")
print(f"{len(cat_features)=}")
print(f"{len(num_features)=}")

target='Target'
len(cat_features)=34
len(num_features)=131


### Standardize numerical features

In [13]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

ss = StandardScaler()
ss.fit(df_train[num_features])
X_train = ss.transform(df_train[num_features])
X_test = ss.transform(df_test[num_features])
X_score = ss.transform(df_score[num_features])

### Encode target variable

In [14]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train[target])

## Model Building

In [15]:
# Initialize XGBoost model
model = XGBClassifier()

In [16]:
# Fit the model
model.fit(X_train, y_train)

#### Attempted grid search but my laptop kept crashing due to poor hardware

#### Used smote on xgboost and got back a worse score than just using xgboost

#### Tried combining 2 models to see if they would perform better but resulted in a worse kaggle score

### Scoring

In [17]:
y_test = label_encoder.transform(df_test[target])

#df_test[target] = label_encoder.fit_transform(df_test[target])

### Submission

In [18]:
y_pred_proba = model.predict_proba(X_score)

#scaling each row so that row total equals 1
y_pred_proba = (y_pred_proba.T / y_pred_proba.sum(axis=1)).T

y_pred_proba.shape

(90000, 5)

In [19]:
df_submission = pd.DataFrame(index=df_score.index,columns=target_levels)
df_submission.index.name = "id"

df_submission.loc[:] = y_pred_proba

df_submission.to_csv("output/xgboost_submission.csv")
df_submission.head()

Unnamed: 0_level_0,Class_1,Class_2,Class_3,Class_4,Class_5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.489024,0.073181,0.163305,0.258446,0.016044
1,0.007425,0.201352,0.244238,0.007583,0.539403
2,0.265941,0.206005,0.310409,0.171561,0.046083
3,0.223668,0.152354,0.262009,0.147669,0.214301
4,0.375312,0.100494,0.207172,0.201293,0.115729
