In [1]:
# ===============================
# Author: Pranjal Kumar Shukla
# GitHub: https://github.com/PranjalKumar09/machine-learning-projects
# ===============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['Transported'] = False
df = pd.concat([df_train, df_test])
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
df.isna().sum()[df.isna().sum()>0]

HomePlanet      288
CryoSleep       310
Cabin           299
Destination     274
Age             270
VIP             296
RoomService     263
FoodCourt       289
ShoppingMall    306
Spa             284
VRDeck          268
Name            294
dtype: int64

In [4]:
df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)
df  = df.drop(columns=['Name', 'Cabin'])


In [5]:

df['Deck'] = df['Deck'].fillna('U')
df['Num'] = df['Num'].fillna(-1)
df['Side'] = df['Side'].fillna('U')

In [6]:
encoder = LabelEncoder()
df['Deck']  = encoder.fit_transform(df['Deck'])
df['Side'] = encoder.fit_transform(df['Side'])

In [8]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,0,0
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,5,0,1
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,0,0,1
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,0,0,1
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,5,1,1


In [9]:
impute_lis = ['Age', 'VIP', 'Num', 'CryoSleep', 'Side', 'Deck', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
rest = list(set(df.columns)- set(impute_lis))
df_rest = df[rest]
imp = KNNImputer()
df_imputed = imp.fit_transform(df[impute_lis])
df_imputed = pd.DataFrame(df_imputed, columns = impute_lis )
df = pd.concat([df_rest.reset_index(drop = True), df_imputed.reset_index(drop=True)], axis = 1)


In [10]:
bill_clos = ['RoomService', 'FoodCourt', 'ShoppingMall','Spa' ]
df['mean_amt_spent'] = df[bill_clos].mean(axis =1)
df['std_amt_spent'] = df[bill_clos].std(axis =1)

df= df.fillna('U')

category_colls  = ['Destination', 'HomePlanet']

for col in category_colls:
    df = pd.concat([df, pd.get_dummies(df[col], prefix=col)], axis = 1)


In [11]:
df = df.drop(columns = category_colls)

In [12]:
df.corr()['Transported'].sort_values(ascending = False)

Transported                  1.000000
CryoSleep                    0.324373
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
Side                         0.067358
FoodCourt                    0.034746
PassengerId                  0.014628
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.004154
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018720
Num                         -0.035240
Age                         -0.050478
Destination_TRAPPIST-1e     -0.072731
std_amt_spent               -0.077729
Deck                        -0.084981
mean_amt_spent              -0.099098
HomePlanet_Earth            -0.119644
VRDeck                      -0.142783
Spa                         -0.154832
RoomService                 -0.174781
Name: Transported, dtype: float64

In [14]:
df['3_high_cols'] = df['CryoSleep'] + df['HomePlanet_Europa'] + df['HomePlanet_Europa']+df['Destination_55 Cancri e']
df['2_low_cols'] = df['RoomService'] + df['Spa'] + df['HomePlanet_Earth']
df.corr()['Transported'].sort_values(ascending = False)

Transported                  1.000000
CryoSleep                    0.324373
3_high_cols                  0.251587
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
Side                         0.067358
FoodCourt                    0.034746
PassengerId                  0.014628
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
ShoppingMall                 0.004154
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018720
Num                         -0.035240
Age                         -0.050478
Destination_TRAPPIST-1e     -0.072731
std_amt_spent               -0.077729
Deck                        -0.084981
mean_amt_spent              -0.099098
HomePlanet_Earth            -0.119644
VRDeck                      -0.142783
Spa                         -0.154832
RoomService                 -0.174781
2_low_cols                  -0.220362
Name: Transported, dtype: float64

In [19]:
# ! pip install lightbgm
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# from lightbgm import LGBMClassifier

df_train , df_test  = df[:df_train.shape[0]], df[df_train.shape[0]:]




In [20]:
df_test = df_test.drop(columns="Transported")
df_train.shape, df_train.test

In [21]:
X = df_train.drop(columns="Transported")
Y = df_train["Transported"]

X_train, X_test , Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 23)

model_1 = LogisticRegression()
model_2 = DecisionTreeClassifier()
model_3 = RandomForestClassifier()
model_4 = XGBClassifier()


In [22]:
model_1.fit(X_train, Y_train)
pred = model_1.predict(X_test)
accuracy_score(Y_test, pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.78953421506613

In [23]:
model_2.fit(X_train, Y_train)
pred = model_2.predict(X_test)
accuracy_score(Y_test, pred)

0.7343300747556066

In [24]:
model_3.fit(X_train, Y_train)
pred = model_3.predict(X_test)
accuracy_score(Y_test, pred)

0.7981598619896493

In [25]:
model_4.fit(X_train, Y_train)
pred = model_4.predict(X_test)
accuracy_score(Y_test, pred)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:PassengerId: object

In [29]:
pred = model_3.predict(df_test)
final = pd.DataFrame()
final['PassengerId'] = df_test['PassengerId']
final['Transported'] = pred

In [30]:
final.to_csv('prediction.csv', index = False)