In [4]:
import numpy as np
import random as rd
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.impute import KNNImputer

In [187]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['Transported'] = False
df = pd.concat([df_train, df_test], sort=False)
df.drop(['Name'], axis=1, inplace=True)
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [188]:
df_train.shape, df_test.shape, df.shape

((8693, 14), (4277, 14), (12970, 13))

In [189]:
# see null values in each column
df.isna().sum() 

PassengerId       0
HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Transported       0
dtype: int64

In [190]:
df['HomePlanet'].value_counts()

HomePlanet
Earth     6865
Europa    3133
Mars      2684
Name: count, dtype: int64

In [191]:
df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)

In [192]:
df = df.drop(columns=['Cabin'])

In [193]:

df['Deck'] = df['Deck'].fillna('U')
df['Num'] = df['Num'].fillna(-1)
df['Side'] = df['Side'].fillna('U')

In [194]:
df.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Transported       0
Deck              0
Num               0
Side              0
dtype: int64

In [195]:
# label encoding - converting string labels into numerical values

# using sklearn LabelEncoder
from sklearn.preprocessing import LabelEncoder
le_deck = LabelEncoder()
df['Deck'] = le_deck.fit_transform(df['Deck'])

In [196]:
# using pandas
df['Deck_encoded'] = df['Deck'].astype('category').cat.codes

In [197]:
# using builtin map function
df['Side'] = df['Side'].map({'U':-1, 'P':1, 'S':2})

In [198]:
impute_list = [
'CryoSleep',
'Age',
'VIP',
'Transported',
'RoomService',
'FoodCourt',
'ShoppingMall',
'Spa',
'VRDeck',
'Deck',
'Num',
'Side']

rest = list(set(df.columns) - set(impute_list))
df_rest = df[rest]

In [199]:
imp = KNNImputer(n_neighbors=5)
df_imputed  = imp.fit_transform(df[impute_list])
df_imputed = pd.DataFrame(df_imputed, columns=impute_list)

In [200]:
df = pd.concat([df_rest.reset_index(drop=True), df_imputed.reset_index(drop=True)], axis=1)

In [201]:
df['HomePlanet'] = df['HomePlanet'].fillna('U')
df['Destination'] = df['Destination'].fillna('U')


# we want to one-hot encode these columns
category_colls = ['HomePlanet', 'Destination']
for col in category_colls:
    # pd.get_dummies(df[col], prefix=col) # its going to convert all the categories into a new column
    df = pd.concat([df, pd.get_dummies(df[col], prefix=col)], axis=1)

In [202]:
df = df.drop(columns=category_colls)

In [203]:
df.isna().sum()

PassengerId                  0
Deck_encoded                 0
CryoSleep                    0
Age                          0
VIP                          0
Transported                  0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
Deck                         0
Num                          0
Side                         0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
HomePlanet_U                 0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
Destination_U                0
dtype: int64

### Feature Engineering

In [204]:
bill_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['amt_spent'] = df[bill_cols].sum(axis=1)
df['std_amt_spent'] = df[bill_cols].std(axis=1)
df['mean_amt_spent'] = df[bill_cols].mean(axis=1)

In [205]:
df.drop(columns=['PassengerId'])

Unnamed: 0,Deck_encoded,CryoSleep,Age,VIP,Transported,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U,amt_spent,std_amt_spent,mean_amt_spent
0,1,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,False,True,False,0.0,0.000000,0.0
1,5,0.0,24.0,0.0,1.0,109.0,9.0,25.0,549.0,44.0,...,False,False,False,False,False,True,False,736.0,227.807375,147.2
2,0,0.0,58.0,1.0,0.0,43.0,3576.0,0.0,6715.0,49.0,...,True,False,False,False,False,True,False,10383.0,3013.383198,2076.6
3,0,0.0,33.0,0.0,0.0,0.0,1283.0,371.0,3329.0,193.0,...,True,False,False,False,False,True,False,5176.0,1373.410427,1035.2
4,5,0.0,16.0,0.0,1.0,303.0,70.0,151.0,565.0,2.0,...,False,False,False,False,False,True,False,1091.0,223.988169,218.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,6,1.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,True,False,0.0,0.000000,0.0
12966,8,0.0,42.0,0.0,0.0,0.0,847.0,17.0,10.0,144.0,...,False,False,False,False,False,True,False,1018.0,364.441079,203.6
12967,3,1.0,15.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,False,True,False,False,False,0.0,0.000000,0.0
12968,3,0.0,30.2,0.0,0.0,0.0,2680.0,0.0,0.0,523.0,...,True,False,False,False,False,False,True,3203.0,1162.334633,640.6


In [206]:
# finding the correlation between transported and other columns
df.corr()['Transported'].sort_values(ascending=False)

Transported                  1.000000
CryoSleep                    0.324446
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
Side                         0.059872
FoodCourt                    0.034737
PassengerId                  0.014628
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.004171
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018720
Num                         -0.035240
Age                         -0.050238
Destination_TRAPPIST-1e     -0.072731
Deck                        -0.084981
Deck_encoded                -0.084981
HomePlanet_Earth            -0.119644
std_amt_spent               -0.121154
amt_spent                   -0.140439
mean_amt_spent              -0.140439
VRDeck                      -0.142771
Spa                         -0.154815
RoomService                 -0.174781
Name: Transported, dtype: float64

In [207]:
# creating 3 new columns with correspondence to transported based on their correspondence
df['3_high_cols'] = df['CryoSleep'] + df['HomePlanet_Europa'] + df['Destination_55 Cancri e']
df['3_low_cols']  = df['mean_amt_spent'] + df['amt_spent'] + df['HomePlanet_Earth']

In [208]:
df.corr()['Transported'].sort_values(ascending=False)

Transported                  1.000000
CryoSleep                    0.324446
3_high_cols                  0.284198
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
Side                         0.059872
FoodCourt                    0.034737
PassengerId                  0.014628
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.004171
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018720
Num                         -0.035240
Age                         -0.050238
Destination_TRAPPIST-1e     -0.072731
Deck                        -0.084981
Deck_encoded                -0.084981
HomePlanet_Earth            -0.119644
std_amt_spent               -0.121154
amt_spent                   -0.140439
mean_amt_spent              -0.140439
3_low_cols                  -0.140463
VRDeck                      -0.142771
Spa                         -0.154815
RoomService 

### Testing out all different models

In [209]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [210]:
df = df.drop(columns='PassengerId', axis=1)

In [211]:
df_train, df_test = df[:df_train.shape[0]], df[df_train.shape[0]:]
df_test = df_test.drop(columns='Transported')
df_train.shape, df_test.shape

((8693, 26), (4277, 25))

In [212]:
X = df_train.drop(columns='Transported')
y = df_train['Transported']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [213]:
# we'll create 5 different models to see which one is the best
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = RandomForestClassifier()
model4 = XGBClassifier()
model5 = LGBMClassifier()

In [214]:
model1.fit(X_train, y_train)
pred = model1.predict(X_test)
accuracy_score(y_test, pred)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7740080506037953

In [215]:
model2.fit(X_train, y_train)
pred = model2.predict(X_test)
accuracy_score(y_test, pred)

0.753306497987349

In [216]:
model3.fit(X_train, y_train)
pred = model3.predict(X_test)
accuracy_score(y_test, pred)

0.7912593444508338

In [217]:
model4.fit(X_train, y_train)
pred = model4.predict(X_test)
accuracy_score(y_test, pred)

0.7941345600920069

In [218]:
model5.fit(X_train, y_train)
pred = model5.predict(X_test)
accuracy_score(y_test, pred)

0.7998849913743531

In [221]:
df_dummy = pd.read_csv('test.csv')
pred = model5.predict(df_test)
final = pd.DataFrame()

final['PassengerId'] = df_dummy['PassengerId']
final['Transported'] = pred.astype(bool)

final.to_csv('submission.csv', index=False)