### Import Library and Data

In [3]:
import pandas as pd
from sklearn.impute import KNNImputer

In [163]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['Transported'] = False
df = pd.concat([df_train,df_test], axis=0)
df.drop(columns=['Name','PassengerId'],axis=1, inplace=True)

In [164]:
df.isna().sum()

HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Transported       0
dtype: int64

## Exploratory Data

In [165]:
df[['Deck','Num','Side']] = df['Cabin'].str.split('/',expand=True)
df.drop(columns=['Cabin'],axis=1,inplace=True)

In [166]:
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S


In [167]:
df['Deck'] = df['Deck'].fillna('U')
df['Num'] = df['Num'].fillna(-1)
df['Side'] = df['Side'].fillna('U')

In [168]:
df['Side'].value_counts()

Side
S    6381
P    6290
U     299
Name: count, dtype: int64

In [169]:
df['Deck'] = df['Deck'].map({'F':0,'G':1,'E':2,'B':3,'C':4,'D':5,'A':6,'U':7,'T':8})
df['Side'] = df['Side'].map({'S':1,'P':2,'U':3})

In [170]:
imputed_list = ['CryoSleep','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Num']
rest = list(set(df.columns)-set(imputed_list))
df_rest = df[rest]
imp = KNNImputer()
df_imp = imp.fit_transform(df[imputed_list])
df_imp = pd.DataFrame(df_imp, columns=imputed_list)
df = pd.concat([df_rest.reset_index(drop=True),df_imp.reset_index(drop=True)],axis=1)

In [171]:
df['HomePlanet'] = df['HomePlanet'].fillna('U')
df['Destination'] = df['Destination'].fillna('U')

## One-Hot Encoding
apply encoding to categorical (object) columns

In [172]:
category = ['HomePlanet','Destination']

for i in category:
    df = pd.concat([df,pd.get_dummies(df[i],prefix=i)],axis=1)

df.drop(columns=category,axis=1,inplace=True)

In [173]:
df.head()

Unnamed: 0,Deck,Transported,Side,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Num,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U
0,3,False,2,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True,False,False,False,False,True,False
1,0,True,1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,0.0,True,False,False,False,False,False,True,False
2,6,False,1,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0.0,False,True,False,False,False,False,True,False
3,6,False,1,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0.0,False,True,False,False,False,False,True,False
4,0,True,1,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1.0,True,False,False,False,False,False,True,False


## Feature Engineering

Create several new columns to be predicted

In [174]:
bill_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

df['amt_spent'] = df[bill_cols].sum(axis=1)
df['std_amt_spent'] = df[bill_cols].std(axis=1)
df['mean_amt_spent'] = df[bill_cols].mean(axis=1) 

df['3_high_cols'] = df['CryoSleep'] + df['HomePlanet_Europa'] + df['Destination_55 Cancri e']
df['3_low_cols'] = df['amt_spent'] + df['mean_amt_spent'] + df['HomePlanet_Earth']

In [175]:
df.corr()['Transported'].sort_values(ascending=False)

Transported                  1.000000
CryoSleep                    0.324347
3_high_cols                  0.284149
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
Deck                         0.062790
FoodCourt                    0.034724
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.004140
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018644
Num                         -0.035240
Age                         -0.050694
Side                        -0.068138
Destination_TRAPPIST-1e     -0.072731
HomePlanet_Earth            -0.119644
std_amt_spent               -0.121138
amt_spent                   -0.140485
mean_amt_spent              -0.140485
3_low_cols                  -0.140508
VRDeck                      -0.142780
Spa                         -0.154849
RoomService                 -0.174839
Name: Transported, dtype: float64

## Models

Testing multiple models to identify the best-performing one

In [176]:
df_train, df_test = df[:len(df_train)],df[len(df_train):]
df_test.drop(columns=['Transported'] ,axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.drop(columns=['Transported'] ,axis=1, inplace=True)


In [177]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [202]:
X = df_train.drop(columns=['Transported'])
y = df_train['Transported']
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [203]:
model_1 = XGBClassifier()
model_1.fit(X_train, Y_train)
y_pred = model_1.predict(X_test)
accuracy_score(Y_test, y_pred)

0.7975848188614146

In [204]:
model_2 = DecisionTreeClassifier()
model_2.fit(X_train,Y_train)
y_pred = model_2.predict(X_test)
accuracy_score(Y_test,y_pred)

0.7423806785508913

In [205]:
model_3 = LGBMClassifier()
model_3.fit(X_train,Y_train)
y_pred = model_3.predict(X_test)
accuracy_score(Y_test,y_pred)

0.7958596894767107

In [206]:
model_4 = RandomForestClassifier()
model_4.fit(X_train,Y_train)
y_pred = model_4.predict(X_test)
accuracy_score(Y_test,y_pred)

0.7901092581943646

In [207]:
model_5 = LogisticRegression()
model_5.fit(X_train,Y_train)
y_pred = model_5.predict(X_test)
accuracy_score(Y_test,y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.772857964347326

*Model_3 give the best result so we use it to predict the data test*

In [208]:
pred = model_3.predict(df_test)

In [213]:
df_2 = pd.read_csv('test.csv')
final = pd.DataFrame()
final['PassengerId'] = df_2['PassengerId']
final['Predict'] = pred
final.to_csv('Submission.csv',index=False)