### Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import lightgbm as lgb
import xgboost as xgb
# import optuna

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

from matplotlib import pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
#import lux

import pickle

### Loading datasets

In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

### Data Exploration

In [3]:
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
profile = ProfileReport(train_df, title="Spaceship Profiling")
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

### Preprocessing

In [6]:
def preprocessing(df):
    # split to group and ID
    df[['Group', 'ID']] = df['PassengerId'].str.split('_', expand=True)
    df.drop(['PassengerId', 'ID'], axis=1, inplace=True)
    
    # if home planet is missing, fill it with group planet if possible
    df2=df.loc[:,['Group','HomePlanet']].dropna().drop_duplicates('Group')
    df.loc[(pd.isna(df.HomePlanet)), 'HomePlanet'] = df2.HomePlanet
    
    df['CryoSleep'].fillna('No value', inplace=True)

    # if Cabin value missing, fill it from group if posiible. Not always true but mostly
    df2=df.loc[:,['Group','Cabin']].dropna().drop_duplicates('Group')
    df.loc[(pd.isna(df.Cabin)), 'Cabin'] = df2.Cabin
    # split Cabin to deck/num/side
    df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)
    df.drop(['Cabin', 'Num'], axis=1, inplace=True)

    # high correlation of G_deck with Earth destination
    df.loc[(pd.isna(df.HomePlanet) & df.Deck == 'G'), 'HomePlanet'] = 'Earth'
    df['HomePlanet'].fillna('No value', inplace=True)

    df['Destination'].fillna('No value', inplace=True)

    df['Age'].fillna(df['Age'].mean(), inplace=True)

    # if VIP value missing, fill it from group if posiible
    df2=df.loc[:,['Group','VIP']].dropna().drop_duplicates('Group')
    df.loc[(pd.isna(df.VIP)), 'VIP'] = df2.VIP
    df['VIP'].fillna('No value', inplace=True)

    # if CryoSleep or Age<10 then no expenses
    expenses_col = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
    df.loc[(df.CryoSleep == True), expenses_col] = 0
    df.loc[(df.Age < 10), expenses_col] = 0

    num_imp = SimpleImputer(strategy='mean')
    df[expenses_col] = pd.DataFrame(num_imp.fit_transform(df[expenses_col]),columns=expenses_col)
    df['ShoppingMall'] = round(df['ShoppingMall'], 1)

    df.drop('Name', axis=1, inplace=True)
    df.drop('Group', axis=1, inplace=True)

In [7]:
X = train_df.copy()
preprocessing(X)

# split to train/test data
y = X['Transported']
X = X.drop(['Transported'], axis=1)

X = pd.get_dummies(X)

In [8]:
X.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_No value,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,39.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
1,24.0,109.0,9.0,25.0,549.0,44.0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,58.0,43.0,3576.0,0.0,6715.0,49.0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
3,33.0,0.0,1283.0,371.0,3329.0,193.0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
4,16.0,303.0,70.0,151.0,565.0,2.0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [9]:
fig = plt.subplots(figsize=(100,50))   
sns.heatmap(test_df.corr(),annot=True)

<AxesSubplot:>

### Building a ML Pipeline

In [13]:
pipelines = {
    'svm': make_pipeline(StandardScaler(), SVC()),
    'lgbm': make_pipeline(StandardScaler(), lgb.LGBMClassifier()),
    'xgb': make_pipeline(StandardScaler(), xgb.XGBClassifier())
}

In [20]:
grid = {
    'svm': {'svc__C': [0.1, 1, 10], 'svc__gamma': [0.1, 1, 10]},
    'lgbm':{'lgbmclassifier__n_estimators':[100,250,400], 'lgbmclassifier__max_depth':[4,5], 'lgbmclassifier__learning_rate':[0.1, 0.05]},
    'xgb':{'xgbclassifier__n_estimators':[100,250,400], 'xgbclassifier__max_depth':[4,5], 'xgbclassifier__learning_rate':[0.1, 0.05]} 
}  

In [21]:
pipelines.items()

dict_items([('svm', Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())])), ('lgbm', Pipeline(steps=[('standardscaler', StandardScaler()),
                ('lgbmclassifier', LGBMClassifier())])), ('xgb', Pipeline(steps=[('standardscaler', StandardScaler()),
                ('xgbclassifier',
                 XGBClassifier(base_score=None, booster=None, callbacks=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None,
                               early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=None, gpu_id=None,
                               grow_policy=None, importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=N

In [22]:
fit_models = {}

for algorythm, pipeline in pipelines.items():

    skf = StratifiedKFold(n_splits=10, random_state=None)
    model = GridSearchCV(pipeline, grid[algorythm], n_jobs=-1, cv=skf)
    model.fit(X, y)
    fit_models[algorythm] = model

    print(f'{algorythm} model have been trained.')

svm model have been trained.
lgbm model have been trained.
xgb model have been trained.


### Evaluation

In [34]:
fig, ax = plt.subplots(1, 3)
i = 0
for algorythm, model in fit_models.items():
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    
    print(f'Metrics for {algorythm}= Accuracy: {accuracy}, Recall: {recall}, Precision: {precision}')

    cm = confusion_matrix(y, y_pred)
    sns.heatmap(cm, ax=ax[i],  annot=True)
    i += 1

Metrics for svm= Accuracy: 0.8223858276774416, Recall: 0.8184102329830973, Precision: 0.827100646352724
Metrics for lgbm= Accuracy: 0.8284826872196019, Recall: 0.8547281863864779, Precision: 0.8140091363933
Metrics for xgb= Accuracy: 0.8250316346485678, Recall: 0.854956601187757, Precision: 0.808597969323828


In [35]:
# save the model
model_name = 'lgbm'

with open(f'models/{model_name}.pkl', 'wb') as f: 
  pickle.dump(fit_models[f'{model_name}'], f)

In [None]:
# read the model
with open('models/gradientboosted.pkl', 'rb') as f: 
  reloaded_model = pickle.load(f)

### Predict on test data

In [36]:
pred_model = 'lgbm'

pred_test = test_df.copy()
preprocessing(pred_test)
pred_test = pd.get_dummies(pred_test)

y_pred_test = fit_models[f'{pred_model}'].predict(pred_test)

# yhat_test_log = []
# for i in yhat_test:
#     if i == 1:
#         yhat_test_log.append('True')
#     if i == 0:
#         yhat_test_log.append('False')
# logits = pd.DataFrame(yhat_test_log, columns=['Transported'])

array([ True, False,  True, ...,  True,  True,  True])

In [37]:
submission = pd.DataFrame([test_df['PassengerId'], y_pred_test]).T
submission.columns = ['PassengerID', 'Transported']

In [38]:
submission.to_csv(f'kaggle_submission_{pred_model}.csv', index=False)