In [43]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [44]:
def OHE (x_train , x_test , test , column):
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    encoder_x_train = encoder.fit_transform(x_train[[column]])
    encoder_x_test = encoder.transform(x_test[[column]])
    encoder_test = encoder.transform(test[[column]])

    encoder_x_train_df = pd.DataFrame(encoder_x_train, columns=encoder.get_feature_names_out([column]))
    encoder_x_test_df = pd.DataFrame(encoder_x_test, columns=encoder.get_feature_names_out([column]))
    encoder_test_df = pd.DataFrame(encoder_test, columns=encoder.get_feature_names_out([column]))

    x_train = pd.concat([x_train.drop(column, axis=1).reset_index(drop=True), encoder_x_train_df], axis=1)
    x_test = pd.concat([x_test.drop(column, axis=1).reset_index(drop=True), encoder_x_test_df], axis=1)
    test = pd.concat([test.drop(column, axis=1).reset_index(drop=True), encoder_test_df], axis=1)

    return x_train, x_test, test

In [45]:
def ModeImputer (x_train , x_test , test , column):
    si = SimpleImputer (strategy = 'most_frequent')
    x_train[column] = si.fit_transform (x_train[[column]]).ravel()
    x_test[column] = si.transform (x_test[[column]]).ravel()
    test[column] = si.transform (test[[column]]).ravel()
    return x_train , x_test , test

In [46]:
def MeanImputer (x_train , x_test , test , column):
    si = SimpleImputer ()
    x_train[column] = si.fit_transform (x_train[[column]]).ravel()
    x_test[column] = si.transform (x_test[[column]]).ravel()
    test[column] = si.transform (test[[column]]).ravel()
    return x_train , x_test , test

In [47]:
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [48]:
data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [50]:
data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [51]:
data = data.drop(['PassengerId', 'Name'] , axis = 1)
test = test.drop(['PassengerId', 'Name'] , axis = 1)

In [52]:
x = data.drop('Transported' , axis = 1)
y = data['Transported']

In [54]:
x_train , x_test , y_train , y_test = train_test_split (x , y , test_size = 0.2 , random_state = 42)

In [55]:
x_train , x_test , test = ModeImputer (x_train , x_test , test , 'HomePlanet')

In [56]:
x_train['HomePlanet'].value_counts()

HomePlanet
Earth     3859
Europa    1702
Mars      1393
Name: count, dtype: int64

In [57]:
x_train , x_test , test = OHE (x_train , x_test , test , 'HomePlanet')



In [58]:
x_train , x_test , test = ModeImputer (x_train , x_test , test , 'CryoSleep')

In [59]:
x_train['CryoSleep'].value_counts()

CryoSleep
False    4554
True     2400
Name: count, dtype: int64

In [60]:
x_train['CryoSleep'] = x_train['CryoSleep'].map({True : 1 , False : 0})
x_test['CryoSleep'] = x_test['CryoSleep'].map({True : 1 , False : 0})
test['CryoSleep'] = test['CryoSleep'].map({True : 1 , False : 0})

In [61]:
x_train['Cabin'].value_counts()

Cabin
D/176/S    7
G/734/S    7
G/109/P    7
G/981/S    7
B/82/S     7
          ..
F/904/S    1
F/593/P    1
B/217/S    1
G/750/S    1
C/253/P    1
Name: count, Length: 5441, dtype: int64

In [62]:
x_train = x_train.drop('Cabin' , axis = 1)
x_test = x_test.drop('Cabin' , axis = 1)
test = test.drop('Cabin' , axis = 1)

In [63]:
x_train , x_test , test = ModeImputer (x_train , x_test , test , 'Destination')

In [64]:
x_train , x_test , test = OHE (x_train , x_test , test , 'Destination')



In [65]:
x_train['HomePlanet_Earth'].value_counts()

HomePlanet_Earth
1.0    3859
0.0    3095
Name: count, dtype: int64

In [66]:
x_train , x_test , test = MeanImputer (x_train , x_test , test , 'Age')

In [67]:
x_train['VIP'].value_counts()

VIP
False    6636
True      156
Name: count, dtype: int64

In [68]:
x_train , x_test , test = ModeImputer (x_train , x_test , test , 'VIP')

In [69]:
x_train['VIP'] = x_train['VIP'].map({True : 1 , False : 0})
x_test['VIP'] = x_test['VIP'].map({True : 1 , False : 0})
test['VIP'] = test['VIP'].map({True : 1 , False : 0})

In [70]:
x_train , x_test , test = MeanImputer (x_train , x_test , test , 'RoomService')

In [71]:
x_train , x_test , test = MeanImputer (x_train , x_test , test , 'FoodCourt')

In [72]:
x_train , x_test , test = MeanImputer (x_train , x_test , test , 'ShoppingMall')

In [73]:
x_train , x_test , test = MeanImputer (x_train , x_test , test , 'Spa')

In [74]:
x_train , x_test , test = MeanImputer (x_train , x_test , test , 'VRDeck')

In [75]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6954 entries, 0 to 6953
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  6954 non-null   int64  
 1   Age                        6954 non-null   float64
 2   VIP                        6954 non-null   int64  
 3   RoomService                6954 non-null   float64
 4   FoodCourt                  6954 non-null   float64
 5   ShoppingMall               6954 non-null   float64
 6   Spa                        6954 non-null   float64
 7   VRDeck                     6954 non-null   float64
 8   HomePlanet_Earth           6954 non-null   float64
 9   HomePlanet_Europa          6954 non-null   float64
 10  HomePlanet_Mars            6954 non-null   float64
 11  Destination_55 Cancri e    6954 non-null   float64
 12  Destination_PSO J318.5-22  6954 non-null   float64
 13  Destination_TRAPPIST-1e    6954 non-null   float

In [76]:
le = LabelEncoder()

In [77]:
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [78]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  4277 non-null   int64  
 1   Age                        4277 non-null   float64
 2   VIP                        4277 non-null   int64  
 3   RoomService                4277 non-null   float64
 4   FoodCourt                  4277 non-null   float64
 5   ShoppingMall               4277 non-null   float64
 6   Spa                        4277 non-null   float64
 7   VRDeck                     4277 non-null   float64
 8   HomePlanet_Earth           4277 non-null   float64
 9   HomePlanet_Europa          4277 non-null   float64
 10  HomePlanet_Mars            4277 non-null   float64
 11  Destination_55 Cancri e    4277 non-null   float64
 12  Destination_PSO J318.5-22  4277 non-null   float64
 13  Destination_TRAPPIST-1e    4277 non-null   float

In [79]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "XGBoost": XGBClassifier()
}

In [80]:
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name} : {accuracy}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression : 0.7745830937320299
Decision Tree : 0.7285796434732605
Random Forest : 0.7682576193214491
Gradient Boosting : 0.7832087406555491
AdaBoost : 0.7768832662449684
Support Vector Machine : 0.7648073605520413
K-Nearest Neighbors : 0.7619321449108684
XGBoost : 0.78953421506613


In [81]:
sample_submission_df = pd.read_csv('sample_submission.csv')
sample_submission_df['Transported'] = models['XGBoost'].predict(test)
sample_submission_df['Transported'] = le.inverse_transform(sample_submission_df['Transported'])
sample_submission_df.to_csv('submission.csv', index=False)
sample_submission_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
