In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['Transported'] = False
df = pd.concat([df_train,df_test] , sort = False)


In [4]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
df.isna().sum()

PassengerId       0
HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
Transported       0
dtype: int64

In [6]:
df[['Deck','Num','Side']] = df['Cabin'].str.split('/', expand = True)

In [7]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S


In [8]:
df = df.drop( columns = ['Cabin'])

In [9]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S


In [10]:
df['Deck'].value_counts()

Deck
F    4239
G    3781
E    1323
B    1141
C    1102
D     720
A     354
T      11
Name: count, dtype: int64

In [12]:
df['Deck'] = df['Deck'].fillna('U')

In [13]:
df['Deck'].value_counts()

Deck
F    4239
G    3781
E    1323
B    1141
C    1102
D     720
A     354
U     299
T      11
Name: count, dtype: int64

In [15]:
df['Num'] = df['Num'].fillna(-1)
df['Side'] = df['Side'].fillna('U')

In [16]:
df['Deck'] = df['Deck'].map({'G' : 0, 'F' : 1, 'E' : 2, 'D' : 3, 'C' : 4, 'B' : 5, 'A' : 6, 'U' : 7, 'T' : 8})
df['Side'] = df['Side'].map({'U' : -1, 'P': 1, 'S': 2})

In [17]:
impute_lis = ['Age', 'VIP', 'Num', 'CryoSleep', 'Side', 'Deck', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
rest = list(set(df.columns) - set(impute_lis))
df_rest = df[rest]
imp = KNNImputer()
df_imputed = imp.fit_transform(df[impute_lis])
df_imputed = pd.DataFrame(df_imputed, columns = impute_lis)
df = pd.concat([df_rest.reset_index(drop = True), df_imputed.reset_index(drop = True)], axis = 1)


In [18]:
df.head()

Unnamed: 0,HomePlanet,PassengerId,Name,Transported,Destination,Age,VIP,Num,CryoSleep,Side,Deck,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,0001_01,Maham Ofracculy,False,TRAPPIST-1e,39.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0
1,Earth,0002_01,Juanna Vines,True,TRAPPIST-1e,24.0,0.0,0.0,0.0,2.0,1.0,109.0,9.0,25.0,549.0,44.0
2,Europa,0003_01,Altark Susent,False,TRAPPIST-1e,58.0,1.0,0.0,0.0,2.0,6.0,43.0,3576.0,0.0,6715.0,49.0
3,Europa,0003_02,Solam Susent,False,TRAPPIST-1e,33.0,0.0,0.0,0.0,2.0,6.0,0.0,1283.0,371.0,3329.0,193.0
4,Earth,0004_01,Willy Santantines,True,TRAPPIST-1e,16.0,0.0,1.0,0.0,2.0,1.0,303.0,70.0,151.0,565.0,2.0


In [19]:
df.isna().sum()

HomePlanet      288
PassengerId       0
Name            294
Transported       0
Destination     274
Age               0
VIP               0
Num               0
CryoSleep         0
Side              0
Deck              0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
dtype: int64

In [20]:
df.drop(['Name', 'PassengerId'], axis = 1, inplace = True)

In [21]:
df['Destination'] = df['Destination'].fillna('U')
df['HomePlanet'] = df['HomePlanet'].fillna('U')

In [22]:
df.isna().sum()

HomePlanet      0
Transported     0
Destination     0
Age             0
VIP             0
Num             0
CryoSleep       0
Side            0
Deck            0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [23]:
category_colls = ['HomePlanet', 'Destination']
for col in category_colls:
    df = pd.concat([df, pd.get_dummies(df[col], prefix = col)], axis = 1)




In [24]:
df = df.drop(columns = category_colls)


In [25]:
df.head()


Unnamed: 0,Transported,Age,VIP,Num,CryoSleep,Side,Deck,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U
0,False,39.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,False,True,False,False,False,False,True,False
1,True,24.0,0.0,0.0,0.0,2.0,1.0,109.0,9.0,25.0,549.0,44.0,True,False,False,False,False,False,True,False
2,False,58.0,1.0,0.0,0.0,2.0,6.0,43.0,3576.0,0.0,6715.0,49.0,False,True,False,False,False,False,True,False
3,False,33.0,0.0,0.0,0.0,2.0,6.0,0.0,1283.0,371.0,3329.0,193.0,False,True,False,False,False,False,True,False
4,True,16.0,0.0,1.0,0.0,2.0,1.0,303.0,70.0,151.0,565.0,2.0,True,False,False,False,False,False,True,False


In [26]:
df.corr()

Unnamed: 0,Transported,Age,VIP,Num,CryoSleep,Side,Deck,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U
Transported,1.0,-0.05052,-0.01872,-0.03524,0.324373,0.059872,0.077959,-0.174781,0.034746,0.004154,-0.154832,-0.142783,-0.119644,0.131977,0.005643,0.006403,0.083625,0.00076,-0.072731,-0.000554
Age,-0.05052,1.0,0.081136,-0.135687,-0.066183,0.010802,0.224186,0.067464,0.122608,0.036293,0.117329,0.101963,-0.207863,0.218534,0.024209,0.002761,0.022141,-0.032381,-0.000651,0.004959
VIP,-0.01872,0.081136,1.0,-0.091,-0.081129,-0.011211,0.156045,0.061711,0.122637,0.025194,0.080584,0.111061,-0.156609,0.139783,0.045486,-0.000597,0.038727,-0.006223,-0.026614,-0.009939
Num,-0.03524,-0.135687,-0.091,1.0,-0.039949,0.079494,-0.550705,-0.010884,-0.175264,-0.008314,-0.129783,-0.133408,0.359698,-0.4771,0.058,0.007981,-0.14499,0.105792,0.059594,0.001456
CryoSleep,0.324373,-0.066183,-0.081129,-0.039949,1.0,0.001871,-0.008525,-0.257433,-0.212606,-0.222795,-0.204402,-0.195308,-0.114836,0.102049,0.03356,0.000297,0.069125,0.087528,-0.108492,-0.018113
Side,0.059872,0.010802,-0.011211,0.079494,0.001871,1.0,-0.231266,-0.017612,0.007978,-0.011452,-0.002416,0.009292,-0.011429,0.014239,-0.004209,0.008924,0.009081,-0.008157,-0.00437,0.005035
Deck,0.077959,0.224186,0.156045,-0.550705,-0.008525,-0.231266,1.0,0.031076,0.285966,0.020585,0.221117,0.223562,-0.622228,0.775551,-0.050491,-0.006267,0.244529,-0.182194,-0.096539,-0.007727
RoomService,-0.174781,0.067464,0.061711,-0.010884,-0.257433,-0.017612,0.031076,1.0,-0.01874,0.059951,0.010343,-0.023413,-0.14083,-0.073826,0.253314,-0.004935,-0.023375,-0.061011,0.059772,-0.005594
FoodCourt,0.034746,0.122608,0.122637,-0.175264,-0.212606,0.007978,0.285966,-0.01874,1.0,0.000593,0.22948,0.242382,-0.204307,0.36306,-0.127287,-0.0126,0.130868,-0.062057,-0.071604,-0.010676
ShoppingMall,0.004154,0.036293,0.025194,-0.008314,-0.222795,-0.011452,0.020585,0.059951,0.000593,1.0,0.013033,0.003773,-0.074101,-0.032324,0.125051,0.001107,-0.015733,-0.029709,0.032671,-0.002067


In [27]:
bill_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['amt_spent'] = df[bill_cols].sum(axis = 1)
df['std_amt_spent'] = df[bill_cols].std(axis = 1)
df['mean_amt_spent'] = df[bill_cols].mean(axis = 1)




In [30]:
df.corr()['Transported'].sort_values()

RoomService                 -0.174781
Spa                         -0.154832
VRDeck                      -0.142783
amt_spent                   -0.140452
mean_amt_spent              -0.140452
std_amt_spent               -0.121173
HomePlanet_Earth            -0.119644
Destination_TRAPPIST-1e     -0.072731
Age                         -0.050520
Num                         -0.035240
VIP                         -0.018720
Destination_U               -0.000554
Destination_PSO J318.5-22    0.000760
ShoppingMall                 0.004154
HomePlanet_Mars              0.005643
HomePlanet_U                 0.006403
FoodCourt                    0.034746
Side                         0.059872
Deck                         0.077959
Destination_55 Cancri e      0.083625
HomePlanet_Europa            0.131977
CryoSleep                    0.324373
Transported                  1.000000
Name: Transported, dtype: float64

In [31]:
df['3_high_cols'] = df['CryoSleep'] + df['HomePlanet_Europa'] + df['Destination_55 Cancri e']
df['3_low_cols'] = df['mean_amt_spent'] + df['amt_spent'] + df['HomePlanet_Earth']


In [36]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [33]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB 640.0 kB/s eta 0:03:55
   ---------------------------------------- 0.0/150.0 MB 487.6 kB/s eta 0:05:08
   ---------------------------------------- 0.2/150.0 MB 1.2 MB/s eta 0:02:02
   ---------------------------------------- 0.8/150.0 MB 4.9 MB/s eta 0:00:31
   ---------------------------------------- 1.7/150.0 MB 7.8 MB/s eta 0:00:19
    --------------------------------------- 2.7/150.0 MB 9.9 MB/s eta 0:00:15
    --------------------------------------- 3.5/150.0 MB 11.2 MB/s eta 0:00:14
   - -------------------------------------- 4.4/150.0 MB 12.2 MB/s eta 0:00:12
   - -------------------------------------- 4.6/150.0 MB 11.8 MB/s eta 0:00:13
   - -------------------------------------- 5.2/150.0 MB 11.5 MB/s eta 0


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
    --------------------------------------- 0.0/1.5 MB 660.6 kB/s eta 0:00:03
   -- ------------------------------------- 0.1/1.5 MB 1.1 MB/s eta 0:00:02
   ---------------- ----------------------- 0.6/1.5 MB 4.6 MB/s eta 0:00:01
   ---------------------------------------  1.4/1.5 MB 9.2 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 8.4 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [37]:
df_train, df_test = df[:df_train.shape[0]], df[df_train.shape[0]:]
df_test = df_test.drop(columns = 'Transported')
df_train.shape, df_test.shape


((8693, 25), (4277, 24))

In [38]:
X = df_train.drop(columns = 'Transported')
y = df_train['Transported']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [40]:
model_1 = LogisticRegression()
model_2 = DecisionTreeClassifier()
model_3 = RandomForestClassifier()
model_4 = XGBClassifier()
model_5 = LGBMClassifier()


In [41]:
model_1.fit(X_train,y_train)
pred = model_1.predict(X_test)
accuracy_score(y_test,pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7705577918343876

In [42]:
model_2.fit(X_train,y_train)
pred = model_2.predict(X_test)
accuracy_score(y_test,pred)

0.7521564117308798

In [43]:
model_3.fit(X_train,y_train)
pred = model_3.predict(X_test)
accuracy_score(y_test,pred)

0.7929844738355377

In [44]:
model_4.fit(X_train,y_train)
pred = model_4.predict(X_test)
accuracy_score(y_test,pred)

0.7941345600920069

In [45]:
model_5.fit(X_train,y_train)
pred = model_5.predict(X_test)
accuracy_score(y_test,pred)

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001002 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2703
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230


0.79700977573318

In [46]:
df_dummy = pd.read_csv('test.csv')
pred = model_5.predict(df_test)
final = pd.DataFrame()
final['PassengerId'] = df_dummy['PassengerId']
final['Transported'] = pred
final.to_csv('submission.csv', index = False)
