## Importing required libraries

In [1]:
import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [2]:
# `data` is the already-encoded training table (see the X.head() preview below);
# `test_data` is the raw test file and is cleaned later in this notebook.
data = pd.read_csv("./data/preprocessed_data.csv")
test_data = pd.read_csv("./data/test.csv")

## Splitting into training and validation sets

In [3]:
# The last column of `data` is the target; every column before it is a feature.
target_position = data.shape[1] - 1
X = data.iloc[:, :target_position]
Y = data.iloc[:, target_position]
X.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,...,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,Cabin_deck_G,Cabin_deck_T,Cabin_side_P,Cabin_side_S
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,...,0,1,0,0,0,0,0,0,1,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,0,...,0,0,0,0,0,1,0,0,0,1
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,1,...,1,0,0,0,0,0,0,0,0,1
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,0,...,0,0,0,0,0,1,0,0,0,1


In [4]:
# Peek at the target column (the last column of `data`).
Y.head()

0    0
1    1
2    0
3    0
4    1
Name: Transported, dtype: int64

In [5]:
# Keep the training feature names and their order; the test frame is later
# re-selected with `cols` so both frames share the same column layout.
cols = X.columns
cols

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Cabin_num',
       'Expenses', 'Cabin_deck_A', 'Cabin_deck_B', 'Cabin_deck_C',
       'Cabin_deck_D', 'Cabin_deck_E', 'Cabin_deck_F', 'Cabin_deck_G',
       'Cabin_deck_T', 'Cabin_side_P', 'Cabin_side_S'],
      dtype='object')

In [6]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Sanity check: features and labels of the training split have matching row counts.
X_train.shape, y_train.shape

((6954, 26), (6954,))

In [8]:
# Sanity check: features and labels of the validation split have matching row counts.
X_valid.shape, y_valid.shape


((1739, 26), (1739,))

### Transforming test data

In [9]:
# Preview the raw test rows before any cleaning.
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [10]:
# Set the passenger ids aside (they are needed for the submission file) and
# remove them from the feature frame.
# The guard makes the cell safe to re-run: without it a second execution raises
# KeyError because the column has already been popped.
if "PassengerId" in test_data.columns:
    index = test_data.pop("PassengerId")
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [11]:
# Count missing values per column to see which columns need imputation.
test_data.isnull().sum()

HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [12]:
# Most frequent HomePlanet; this value is used to fill the missing entries below.
test_data["HomePlanet"].mode()

0    Earth
Name: HomePlanet, dtype: object

In [13]:
# Most frequent CryoSleep value; this value is used to fill the missing entries below.
test_data["CryoSleep"].mode()

0    False
Name: CryoSleep, dtype: object

In [14]:
# Impute both columns with their mode (checked in the two cells above).
# The result is assigned back instead of calling `fillna(..., inplace=True)` on
# a column selection: that form is chained assignment and, under pandas
# Copy-on-Write, only updates a temporary object and leaves `test_data` unchanged.
test_data["HomePlanet"] = test_data["HomePlanet"].fillna("Earth")
# Cast explicitly so the column is a real bool column on every pandas version.
test_data["CryoSleep"] = test_data["CryoSleep"].fillna(False).astype(bool)

In [15]:
# Labels of the 20 most frequent cabins (value_counts sorts by descending count);
# these form the pool used to impute missing cabins.
cabin_counts = test_data["Cabin"].value_counts()
cabin_values = cabin_counts.index[:20]
cabin_values

Index(['G/160/P', 'D/273/S', 'B/31/P', 'G/748/S', 'E/228/S', 'A/4/S',
       'G/597/P', 'G/73/S', 'F/579/P', 'D/92/P', 'B/214/P', 'G/587/P',
       'C/31/S', 'C/177/S', 'B/72/S', 'B/242/P', 'G/737/S', 'G/591/P',
       'C/295/P', 'G/1052/P'],
      dtype='object')

In [16]:
# Mark missing cabins with a sentinel string that the next cell replaces.
# Assigned back rather than using `inplace=True` on a column selection, which
# does not modify `test_data` under pandas Copy-on-Write.
test_data["Cabin"] = test_data["Cabin"].fillna("missing")

In [17]:
# Replace every missing cabin with one drawn from the 20 most frequent cabins.
# A dedicated, seeded generator makes the imputation (and therefore the final
# submission) reproducible on Restart & Run All; the module-level
# `random.choice` was unseeded and produced a different fill on every run.
cabin_rng = random.Random(0)
test_data["Cabin"] = test_data["Cabin"].map(
    # Treat both the "missing" sentinel and a raw NaN as missing, so this cell
    # is correct even if the sentinel fill did not take effect.
    lambda cabin: cabin_rng.choice(cabin_values)
    if pd.isna(cabin) or cabin == "missing"
    else cabin
)

In [18]:
# Confirm that no missing Cabin values remain.
test_data["Cabin"].isnull().sum()

0

In [19]:
# Review dtypes and the remaining non-null counts after the first imputations.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4277 non-null   object 
 1   CryoSleep     4277 non-null   bool   
 2   Cabin         4277 non-null   object 
 3   Destination   4185 non-null   object 
 4   Age           4186 non-null   float64
 5   VIP           4184 non-null   object 
 6   RoomService   4195 non-null   float64
 7   FoodCourt     4171 non-null   float64
 8   ShoppingMall  4179 non-null   float64
 9   Spa           4176 non-null   float64
 10  VRDeck        4197 non-null   float64
 11  Name          4183 non-null   object 
dtypes: bool(1), float64(6), object(5)
memory usage: 371.9+ KB


In [20]:
# Most frequent Destination; this value is used to fill the missing entries below.
test_data["Destination"].mode()

0    TRAPPIST-1e
Name: Destination, dtype: object

In [21]:
# Impute missing destinations with the mode found above. Assigned back rather
# than using `inplace=True` on a column selection, which does not modify
# `test_data` under pandas Copy-on-Write.
test_data["Destination"] = test_data["Destination"].fillna("TRAPPIST-1e")

In [22]:
# Median age of the test set; this value is used to fill the missing ages below.
test_data["Age"].median()

26.0

In [23]:
# Impute missing ages with the column median (26.0 for this file), computed
# here instead of hard-coding the number. Assigned back rather than using
# `inplace=True` on a column selection, which does not modify `test_data` under
# pandas Copy-on-Write.
test_data["Age"] = test_data["Age"].fillna(test_data["Age"].median())

In [24]:
# Most frequent VIP value; this value is used to fill the missing entries below.
test_data["VIP"].mode()

0    False
Name: VIP, dtype: object

In [25]:
# Impute missing VIP flags with the mode (False) and cast to a real bool column.
# Assigned back rather than using `inplace=True` on a column selection, which
# does not modify `test_data` under pandas Copy-on-Write.
test_data["VIP"] = test_data["VIP"].fillna(False).astype(bool)

In [26]:
# Check which columns still contain missing values.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4277 non-null   object 
 1   CryoSleep     4277 non-null   bool   
 2   Cabin         4277 non-null   object 
 3   Destination   4277 non-null   object 
 4   Age           4277 non-null   float64
 5   VIP           4277 non-null   bool   
 6   RoomService   4195 non-null   float64
 7   FoodCourt     4171 non-null   float64
 8   ShoppingMall  4179 non-null   float64
 9   Spa           4176 non-null   float64
 10  VRDeck        4197 non-null   float64
 11  Name          4183 non-null   object 
dtypes: bool(2), float64(6), object(4)
memory usage: 342.6+ KB


In [27]:
# Most frequent RoomService amount; used as the fill value below.
test_data["RoomService"].mode()

0    0.0
Name: RoomService, dtype: float64

In [28]:
# Missing spend is imputed with 0, the column's mode. Assigned back rather than
# using `inplace=True` on a column selection, which does not modify `test_data`
# under pandas Copy-on-Write.
test_data["RoomService"] = test_data["RoomService"].fillna(0)

In [29]:
# Most frequent FoodCourt amount; used as the fill value below.
test_data["FoodCourt"].mode()

0    0.0
Name: FoodCourt, dtype: float64

In [30]:
# Missing spend is imputed with 0, the column's mode. Assigned back rather than
# using `inplace=True` on a column selection, which does not modify `test_data`
# under pandas Copy-on-Write.
test_data["FoodCourt"] = test_data["FoodCourt"].fillna(0)

In [31]:
# Most frequent ShoppingMall amount; used as the fill value below.
test_data["ShoppingMall"].mode()

0    0.0
Name: ShoppingMall, dtype: float64

In [32]:
# Missing spend is imputed with 0, the column's mode. Assigned back rather than
# using `inplace=True` on a column selection, which does not modify `test_data`
# under pandas Copy-on-Write.
test_data["ShoppingMall"] = test_data["ShoppingMall"].fillna(0)

In [33]:
# Most frequent Spa amount; used as the fill value below.
test_data["Spa"].mode()

0    0.0
Name: Spa, dtype: float64

In [34]:
# Missing spend is imputed with 0, the column's mode. Assigned back rather than
# using `inplace=True` on a column selection, which does not modify `test_data`
# under pandas Copy-on-Write.
test_data["Spa"] = test_data["Spa"].fillna(0)

In [35]:
# Most frequent VRDeck amount; used as the fill value below.
test_data["VRDeck"].mode()

0    0.0
Name: VRDeck, dtype: float64

In [36]:
# Missing spend is imputed with 0, the column's mode. Assigned back rather than
# using `inplace=True` on a column selection, which does not modify `test_data`
# under pandas Copy-on-Write.
test_data["VRDeck"] = test_data["VRDeck"].fillna(0)

In [37]:
# Check the non-null counts after imputing the spend columns.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4277 non-null   object 
 1   CryoSleep     4277 non-null   bool   
 2   Cabin         4277 non-null   object 
 3   Destination   4277 non-null   object 
 4   Age           4277 non-null   float64
 5   VIP           4277 non-null   bool   
 6   RoomService   4277 non-null   float64
 7   FoodCourt     4277 non-null   float64
 8   ShoppingMall  4277 non-null   float64
 9   Spa           4277 non-null   float64
 10  VRDeck        4277 non-null   float64
 11  Name          4183 non-null   object 
dtypes: bool(2), float64(6), object(4)
memory usage: 342.6+ KB


In [38]:
# Preview the imputed frame before encoding the boolean columns.
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [39]:
# Encode the boolean flag as 1/0. The cast is assigned back instead of calling
# `replace(..., inplace=True)` on a column selection, which does not modify
# `test_data` under pandas Copy-on-Write. "int64" is spelled out so the dtype
# does not depend on the platform's default integer width.
test_data["CryoSleep"] = test_data["CryoSleep"].astype("int64")

In [40]:
# Verify that CryoSleep is now shown as 0/1.
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,Earth,1,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,Earth,0,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,Europa,1,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,Europa,0,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,Earth,0,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [41]:
# Encode the boolean flag as 1/0. The cast is assigned back instead of calling
# `replace(..., inplace=True)` on a column selection, which does not modify
# `test_data` under pandas Copy-on-Write. "int64" is spelled out so the dtype
# does not depend on the platform's default integer width.
test_data["VIP"] = test_data["VIP"].astype("int64")

In [42]:
# Verify that VIP is now shown as 0/1.
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,Earth,1,G/3/S,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,Earth,0,F/4/S,TRAPPIST-1e,19.0,0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,Europa,1,C/0/S,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,Europa,0,C/1/S,TRAPPIST-1e,38.0,0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,Earth,0,F/5/S,TRAPPIST-1e,20.0,0,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [43]:
# Total spend per passenger: row-wise sum over the five amenity columns.
spend_columns = ["FoodCourt", "RoomService", "ShoppingMall", "Spa", "VRDeck"]
test_data["Expense"] = test_data[spend_columns].sum(axis=1)

In [44]:
# Cabin strings look like "deck/num/side"; split them into three separate columns.
cabin_parts = test_data["Cabin"].str.split("/", expand=True)
test_data[["Cabin_deck", "Cabin_num", "Cabin_side"]] = cabin_parts
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Expense,Cabin_deck,Cabin_num,Cabin_side
0,Earth,1,G/3/S,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,0.0,G,3,S
1,Earth,0,F/4/S,TRAPPIST-1e,19.0,0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,2832.0,F,4,S
2,Europa,1,C/0/S,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,0.0,C,0,S
3,Europa,0,C/1/S,TRAPPIST-1e,38.0,0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,7418.0,C,1,S
4,Earth,0,F/5/S,TRAPPIST-1e,20.0,0,10.0,0.0,635.0,0.0,0.0,Brence Harperez,645.0,F,5,S


In [45]:
# The split above yields strings; convert the cabin number to an integer feature.
test_data["Cabin_num"] = test_data["Cabin_num"].astype(int)

In [46]:
# One-hot encode HomePlanet and append the indicator columns to the frame.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version (pandas 2.0 changed the get_dummies default to bool).
home_planet_dummies = pd.get_dummies(
    test_data["HomePlanet"], prefix="HomePlanet", dtype="uint8"
)
test_data = test_data.join(home_planet_dummies)
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Expense,Cabin_deck,Cabin_num,Cabin_side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars
0,Earth,1,G/3/S,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,0.0,G,3,S,1,0,0
1,Earth,0,F/4/S,TRAPPIST-1e,19.0,0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,2832.0,F,4,S,1,0,0
2,Europa,1,C/0/S,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,0.0,C,0,S,0,1,0
3,Europa,0,C/1/S,TRAPPIST-1e,38.0,0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,7418.0,C,1,S,0,1,0
4,Earth,0,F/5/S,TRAPPIST-1e,20.0,0,10.0,0.0,635.0,0.0,0.0,Brence Harperez,645.0,F,5,S,1,0,0


In [47]:
# One-hot encode Destination and append the indicator columns to the frame.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version (pandas 2.0 changed the get_dummies default to bool).
destination_dummies = pd.get_dummies(
    test_data["Destination"], prefix="Destination", dtype="uint8"
)
test_data = test_data.join(destination_dummies)
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,Expense,Cabin_deck,Cabin_num,Cabin_side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,Earth,1,G/3/S,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,...,0.0,G,3,S,1,0,0,0,0,1
1,Earth,0,F/4/S,TRAPPIST-1e,19.0,0,0.0,9.0,0.0,2823.0,...,2832.0,F,4,S,1,0,0,0,0,1
2,Europa,1,C/0/S,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,...,0.0,C,0,S,0,1,0,1,0,0
3,Europa,0,C/1/S,TRAPPIST-1e,38.0,0,0.0,6652.0,0.0,181.0,...,7418.0,C,1,S,0,1,0,0,0,1
4,Earth,0,F/5/S,TRAPPIST-1e,20.0,0,10.0,0.0,635.0,0.0,...,645.0,F,5,S,1,0,0,0,0,1


In [48]:
# One-hot encode the cabin deck letter and append the indicator columns.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version (pandas 2.0 changed the get_dummies default to bool).
cabin_deck_dummies = pd.get_dummies(
    test_data["Cabin_deck"], prefix="Cabin_deck", dtype="uint8"
)
test_data = test_data.join(cabin_deck_dummies)
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,Cabin_deck_G,Cabin_deck_T
0,Earth,1,G/3/S,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
1,Earth,0,F/4/S,TRAPPIST-1e,19.0,0,0.0,9.0,0.0,2823.0,...,0,1,0,0,0,0,0,1,0,0
2,Europa,1,C/0/S,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,Europa,0,C/1/S,TRAPPIST-1e,38.0,0,0.0,6652.0,0.0,181.0,...,0,1,0,0,1,0,0,0,0,0
4,Earth,0,F/5/S,TRAPPIST-1e,20.0,0,10.0,0.0,635.0,0.0,...,0,1,0,0,0,0,0,1,0,0


In [49]:
# One-hot encode the cabin side (P/S) and append the indicator columns.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version (pandas 2.0 changed the get_dummies default to bool).
cabin_side_dummies = pd.get_dummies(
    test_data["Cabin_side"], prefix="Cabin_side", dtype="uint8"
)
test_data = test_data.join(cabin_side_dummies)
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,Cabin_deck_G,Cabin_deck_T,Cabin_side_P,Cabin_side_S
0,Earth,1,G/3/S,TRAPPIST-1e,27.0,0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,1
1,Earth,0,F/4/S,TRAPPIST-1e,19.0,0,0.0,9.0,0.0,2823.0,...,0,0,0,0,0,1,0,0,0,1
2,Europa,1,C/0/S,55 Cancri e,31.0,0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1
3,Europa,0,C/1/S,TRAPPIST-1e,38.0,0,0.0,6652.0,0.0,181.0,...,0,0,1,0,0,0,0,0,0,1
4,Earth,0,F/5/S,TRAPPIST-1e,20.0,0,10.0,0.0,635.0,0.0,...,0,0,0,0,0,1,0,0,0,1


In [50]:
# The training features call this column "Expenses"; rename the test column to match.
test_data = test_data.rename(columns={"Expense": "Expenses"})

In [51]:
# Training feature names, shown again as the target layout for the test frame.
cols

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Cabin_num',
       'Expenses', 'Cabin_deck_A', 'Cabin_deck_B', 'Cabin_deck_C',
       'Cabin_deck_D', 'Cabin_deck_E', 'Cabin_deck_F', 'Cabin_deck_G',
       'Cabin_deck_T', 'Cabin_side_P', 'Cabin_side_S'],
      dtype='object')

In [52]:
pd.set_option("display.max_columns", 30)

# Align the test frame to the training feature layout (same columns, same order).
# `test_data[cols]` raises KeyError when a category seen in training never occurs
# in the test file, because get_dummies then creates no column for it. Such
# one-hot indicators are added as all-zero columns; any other missing feature is
# still treated as an error rather than being silently filled.
one_hot_prefixes = ("HomePlanet_", "Destination_", "Cabin_deck_", "Cabin_side_")
missing_cols = [col for col in cols if col not in test_data.columns]
unexpected_missing = [
    col for col in missing_cols if not col.startswith(one_hot_prefixes)
]
if unexpected_missing:
    raise KeyError(
        f"test_data is missing required feature columns: {unexpected_missing}"
    )
test_data = test_data.reindex(columns=cols, fill_value=0)
test_data

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Cabin_num,Expenses,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,Cabin_deck_G,Cabin_deck_T,Cabin_side_P,Cabin_side_S
0,1,27.0,0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,1,3,0.0,0,0,0,0,0,0,1,0,0,1
1,0,19.0,0,0.0,9.0,0.0,2823.0,0.0,1,0,0,0,0,1,4,2832.0,0,0,0,0,0,1,0,0,0,1
2,1,31.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,0,0.0,0,0,1,0,0,0,0,0,0,1
3,0,38.0,0,0.0,6652.0,0.0,181.0,585.0,0,1,0,0,0,1,1,7418.0,0,0,1,0,0,0,0,0,0,1
4,0,20.0,0,10.0,0.0,635.0,0.0,0.0,1,0,0,0,0,1,5,645.0,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1,34.0,0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,1,1496,0.0,0,0,0,0,0,0,1,0,0,1
4273,0,42.0,0,0.0,847.0,17.0,10.0,144.0,1,0,0,0,0,1,4,1018.0,1,0,0,0,0,0,0,0,0,1
4274,1,26.0,0,0.0,0.0,0.0,0.0,0.0,0,0,1,1,0,0,296,0.0,0,0,0,1,0,0,0,0,1,0
4275,0,26.0,0,0.0,2680.0,0.0,0.0,523.0,0,1,0,0,0,1,297,3203.0,0,0,0,1,0,0,0,0,1,0


In [53]:
# The column count here should match the training feature count checked next.
test_data.shape

(4277, 26)

In [54]:
# Training feature shape, for comparison with the test frame above.
X_train.shape

(6954, 26)

In [55]:
# Final check: every feature column is numeric and has no missing values.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  4277 non-null   int64  
 1   Age                        4277 non-null   float64
 2   VIP                        4277 non-null   int64  
 3   RoomService                4277 non-null   float64
 4   FoodCourt                  4277 non-null   float64
 5   ShoppingMall               4277 non-null   float64
 6   Spa                        4277 non-null   float64
 7   VRDeck                     4277 non-null   float64
 8   HomePlanet_Earth           4277 non-null   uint8  
 9   HomePlanet_Europa          4277 non-null   uint8  
 10  HomePlanet_Mars            4277 non-null   uint8  
 11  Destination_55 Cancri e    4277 non-null   uint8  
 12  Destination_PSO J318.5-22  4277 non-null   uint8  
 13  Destination_TRAPPIST-1e    4277 non-null   uint8

### Scaling data using `StandardScaler`

In [56]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the validation data does not leak into them.
# NOTE: X_train / X_valid are overwritten with the scaled arrays, so re-running
# this cell on its own would scale already-scaled data; re-run from the split cell.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)

X_train[:5]

array([[-0.73173649,  1.75620953, -0.15296914, -0.33270064,  0.07279194,
        -0.27768257, 13.45945185,  2.21793704, -1.09987433,  1.75254916,
        -0.51080794,  1.96370702, -0.31341858, -1.54453524, -0.8381773 ,
         6.17998876, -0.17989836, -0.31919523,  3.1906213 , -0.2440828 ,
        -0.3445662 , -0.69024135, -0.65617821, -0.02682402, -0.9879927 ,
         0.9879927 ],
       [-0.73173649, -0.61644503, -0.15296914, -0.33270064, -0.27019708,
        -0.27610002,  0.35501789, -0.26222388,  0.90919478, -0.5705974 ,
        -0.51080794, -0.50924094, -0.31341858,  0.64744395,  1.38686771,
        -0.26264184, -0.17989836, -0.31919523, -0.31341858, -0.2440828 ,
        -0.3445662 ,  1.44876861, -0.65617821, -0.02682402,  1.01215322,
        -1.01215322],
       [-0.73173649,  0.988586  , -0.15296914,  2.32668723, -0.28292395,
        -0.20330269, -0.24604506, -0.26222388, -1.09987433, -0.5705974 ,
         1.95768296, -0.50924094, -0.31341858,  0.64744395, -0.98846543,
       

In [57]:
# Apply the scaler fitted on the training split to the test features (no re-fit).
# NOTE: `test_data` is replaced by the scaled NumPy array, so this cell must not
# be executed twice without rebuilding the test frame first.
test_data = scaler.transform(test_data)
test_data[:5, :]

array([[ 1.36661219, -0.12795733, -0.15296914, -0.33270064, -0.28292395,
        -0.27768257, -0.27217823, -0.26222388,  0.90919478, -0.5705974 ,
        -0.51080794, -0.50924094, -0.31341858,  0.64744395, -1.1582715 ,
        -0.5210101 , -0.17989836, -0.31919523, -0.31341858, -0.2440828 ,
        -0.3445662 , -0.69024135,  1.52397624, -0.02682402, -0.9879927 ,
         0.9879927 ],
       [-0.73173649, -0.68622899, -0.15296914, -0.33270064, -0.27719686,
        -0.27768257,  2.27175088, -0.26222388,  0.90919478, -0.5705974 ,
        -0.51080794, -0.50924094, -0.31341858,  0.64744395, -1.15631971,
         0.49949048, -0.17989836, -0.31919523, -0.31341858, -0.2440828 ,
        -0.3445662 ,  1.44876861, -0.65617821, -0.02682402, -0.9879927 ,
         0.9879927 ],
       [ 1.36661219,  0.1511785 , -0.15296914, -0.33270064, -0.28292395,
        -0.27768257, -0.27217823, -0.26222388, -1.09987433,  1.75254916,
        -0.51080794,  1.96370702, -0.31341858, -1.54453524, -1.16412688,
       

### Logistic Regression model

In [58]:
# Logistic regression with default settings; random_state is fixed for repeatable runs.
model = LogisticRegression(random_state=0)
model.fit(X_train, y_train)

In [59]:
# Predicted class labels for the validation split; show the first ten.
preds = model.predict(X_valid)
preds[:10]

array([1, 1, 0, 0, 1, 1, 0, 1, 1, 0], dtype=int64)

In [60]:
# Mean accuracy of the classifier on the validation split.
model.score(X_valid, y_valid)

0.7832087406555491

In [61]:
# The same validation accuracy, expressed as a percentage.
accuracy_score(y_valid, preds) * 100

78.32087406555492

In [62]:
# Per-class precision, recall and F1 on the validation split.
# print() is needed here because the report is a pre-formatted multi-line string.
print(classification_report(y_valid, preds))

              precision    recall  f1-score   support

           0       0.80      0.75      0.78       863
           1       0.77      0.81      0.79       876

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



In [63]:
# Predict on the scaled test features; show the first ten labels.
test_preds = model.predict(test_data)
test_preds[:10]

array([1, 0, 1, 1, 1, 0, 1, 1, 1, 1], dtype=int64)

In [70]:
# Pair each prediction with the PassengerId values that were popped off the
# test frame earlier (`index`).
output = pd.DataFrame({"PassengerId": index, "Transported": test_preds})

In [71]:
# Convert the 0/1 predictions to False/True for the submission file.
# The cast is assigned back instead of calling `replace(..., inplace=True)` on a
# column selection, which does not modify `output` under pandas Copy-on-Write and
# would leave 0/1 in the written CSV.
output["Transported"] = output["Transported"].astype(bool)

In [72]:
# Write the submission file; index=False keeps the DataFrame row index out of the CSV.
output.to_csv("./data/submission_1.csv", index=False)