In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Read the training data from 'train.csv' and preview its first rows.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
# Number of missing values in each column of the training frame.
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [4]:
# Column dtypes and non-null counts of the raw training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


## Preprocessing the data

In [5]:
# Encode the boolean-like columns as integers: True -> 1, anything else -> 0.
# Missing values in CryoSleep / VIP therefore also end up as 0.
columns_to_convert = ['CryoSleep','VIP','Transported']
for flag_column in columns_to_convert:
    df[flag_column] = df[flag_column].eq(True).astype('int64')

In [6]:
# Remove the identifier columns that are not used as model features.
df = df.drop(columns=['PassengerId', 'Name'])

In [7]:
# Preview the frame after the boolean conversion and the column drop.
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0
1,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1
2,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0
3,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0
4,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1


In [8]:
# Re-check dtypes and non-null counts after the conversion and column drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8693 non-null   int64  
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8693 non-null   int64  
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Transported   8693 non-null   int64  
dtypes: float64(6), int64(3), object(3)
memory usage: 815.1+ KB


In [9]:
# Show the most frequent value of each categorical column that gets imputed next.
for column_name in ('HomePlanet', 'Destination'):
    print(df[column_name].mode()[0])

Earth
TRAPPIST-1e


In [10]:
# Replace missing HomePlanet / Destination values with the column's most
# frequent value (mode).
# The filled Series is assigned back to the frame: calling
# fillna(inplace=True) on `df[name]` mutates an intermediate object, which
# triggers a chained-assignment FutureWarning on pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is active.
col = ["HomePlanet","Destination"]

for i in col:
    df[i] = df[i].fillna(df[i].mode().iloc[0])

In [11]:
# Replace missing numeric values with the column median.
# Assign the result back instead of using fillna(inplace=True) on `df[col]`:
# the in-place form acts on an intermediate Series and does not update `df`
# under pandas Copy-on-Write (and warns on pandas 2.x).
columns_to_fill = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
for col in columns_to_fill:
    df[col] = df[col].fillna(df[col].median())

In [12]:
# Break the 'Cabin' string on '/' and store the pieces as the new columns
# Deck, Num and Side.
cabin_parts = df['Cabin'].str.split('/', expand=True)
df[['Deck', 'Num', 'Side']] = cabin_parts

In [13]:
# The original Cabin column is no longer needed once it has been split.
df.drop(columns='Cabin', inplace=True)

In [14]:
# Fill missing Deck / Side values with each column's most frequent value.
# The result is assigned back to the column because fillna(inplace=True) on
# `df[name]` is chained assignment: it warns on pandas 2.x and is a no-op
# under Copy-on-Write.
cols=["Deck","Side"]
for i in cols:
    df[i] = df[i].fillna(df[i].mode().iloc[0])

In [15]:
# Fill missing cabin numbers with the most frequent value. Assigning the
# result back avoids the chained in-place fillna, which does not update `df`
# under pandas Copy-on-Write (and warns on pandas 2.x).
df['Num'] = df['Num'].fillna(df['Num'].mode().iloc[0])

In [16]:
# The cabin number comes out of the string split as text; store it as int64.
df['Num'] = df['Num'].astype(np.int64)

In [17]:
# Confirm that no column has missing values left after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   object 
 1   CryoSleep     8693 non-null   int64  
 2   Destination   8693 non-null   object 
 3   Age           8693 non-null   float64
 4   VIP           8693 non-null   int64  
 5   RoomService   8693 non-null   float64
 6   FoodCourt     8693 non-null   float64
 7   ShoppingMall  8693 non-null   float64
 8   Spa           8693 non-null   float64
 9   VRDeck        8693 non-null   float64
 10  Transported   8693 non-null   int64  
 11  Deck          8693 non-null   object 
 12  Num           8693 non-null   int64  
 13  Side          8693 non-null   object 
dtypes: float64(6), int64(4), object(4)
memory usage: 950.9+ KB


In [20]:
# List the distinct values of every object-typed column before they are
# label-encoded.
object_columns = ['HomePlanet','Destination','Deck','Side']
for column_name in object_columns:
    print(df[column_name].unique())

['Europa' 'Earth' 'Mars']
['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e']
['B' 'F' 'A' 'G' 'E' 'D' 'C' 'T']
['P' 'S']


In [21]:
import copy

from sklearn.preprocessing import LabelEncoder

# Convert the object-typed columns to integer codes.
# A single LabelEncoder is refit on every column, so on its own it only
# remembers the classes of the last column. A fitted snapshot is therefore
# kept per column in `label_encoders`, which allows the same value -> code
# mapping to be applied to new data (`label_encoders[name].transform(...)`)
# or inverted later. `encoder` itself ends up in the same state as before
# (fitted on the last column of `object_columns`).
encoder = LabelEncoder()
label_encoders = {}
for column_name in object_columns:
    df[column_name] = encoder.fit_transform(df[column_name])
    label_encoders[column_name] = copy.deepcopy(encoder)

In [22]:
# Preview the frame after label encoding.
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side
0,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,1,5,0,1
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,0,1
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0,0,1
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,1,5,1,1


In [23]:
# Check that every column is numeric after label encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   int32  
 1   CryoSleep     8693 non-null   int64  
 2   Destination   8693 non-null   int32  
 3   Age           8693 non-null   float64
 4   VIP           8693 non-null   int64  
 5   RoomService   8693 non-null   float64
 6   FoodCourt     8693 non-null   float64
 7   ShoppingMall  8693 non-null   float64
 8   Spa           8693 non-null   float64
 9   VRDeck        8693 non-null   float64
 10  Transported   8693 non-null   int64  
 11  Deck          8693 non-null   int32  
 12  Num           8693 non-null   int64  
 13  Side          8693 non-null   int32  
dtypes: float64(6), int32(4), int64(4)
memory usage: 815.1 KB


# Splitting the dataset for training the model

In [24]:
# Separate the label column from the feature columns.
target = 'Transported'
y = df[target]
X = df.drop(columns=target)

In [25]:
# Hold out 20% of the rows for validation. The split is seeded so that the
# partition, and therefore the reported accuracies, are reproducible on a
# fresh kernel run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=62
)

# Training the Machine Learning Model

In [26]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed, fitted on the training split.
random_forest = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features="sqrt",
    max_depth=None,
    random_state=62,
)
random_forest.fit(X_train, y_train)

In [27]:
# Accuracy on the rows the model was fitted on.
predictions = random_forest.predict(X_train)
accuracy = accuracy_score(y_true=y_train, y_pred=predictions)
print(accuracy)

0.8987633016968651


In [28]:
# Accuracy on the held-out 20% split.
test_prediction = random_forest.predict(X_test)
test_score = accuracy_score(y_true=y_test, y_pred=test_prediction)
print(test_score)

0.7993099482461185


# Applying the Model to the Test Dataset

In [53]:
# Read the unlabeled test data from 'test.csv' and preview its first rows.
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [56]:
# Keep the passenger ids aside; the PassengerId column is dropped from
# test_data in the next step.
passenger_id = test_data["PassengerId"]

In [57]:
# Split Cabin on '/' into Deck, Num and Side, then remove the identifier
# columns together with the original Cabin column.
test_cabin_parts = test_data['Cabin'].str.split('/', expand=True)
test_data[['Deck', 'Num', 'Side']] = test_cabin_parts
test_data = test_data.drop(columns=['PassengerId', 'Name', 'Cabin'])

In [58]:
# Number of missing values in each column of the test frame.
test_data.isna().sum()

HomePlanet       87
CryoSleep        93
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Deck            100
Num             100
Side            100
dtype: int64

In [59]:
# Encode the boolean-like columns as integers: True -> 1, anything else
# (False or missing) -> 0.
columns_to_convert = ['CryoSleep','VIP']
for flag_column in columns_to_convert:
    test_data[flag_column] = test_data[flag_column].eq(True).astype('int64')

In [60]:
# Replace missing HomePlanet / Destination values in the test frame with the
# column's most frequent value.
# The result is assigned back because fillna(inplace=True) on `test_data[name]`
# is chained assignment: it warns on pandas 2.x and does not modify
# `test_data` under Copy-on-Write.
col = ["HomePlanet","Destination"]

for i in col:
    test_data[i] = test_data[i].fillna(test_data[i].mode().iloc[0])

In [61]:
# Replace missing numeric values in the test frame with the column median.
# Assigning the filled Series back avoids the chained in-place fillna, which
# leaves `test_data` untouched under pandas Copy-on-Write.
columns_to_fill = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
for col in columns_to_fill:
    test_data[col] = test_data[col].fillna(test_data[col].median())

In [62]:
# Fill the missing cabin parts with each column's most frequent value and
# store the cabin number as int64.
# Results are assigned back to the columns: fillna(inplace=True) on
# `test_data[name]` is chained assignment and does not update the frame under
# pandas Copy-on-Write (it also warns on pandas 2.x).
cols=["Deck","Side"]
for i in cols:
    test_data[i] = test_data[i].fillna(test_data[i].mode().iloc[0])
test_data['Num'] = test_data['Num'].fillna(test_data['Num'].mode().iloc[0])
test_data['Num'] = test_data['Num'].astype('int64')

In [63]:
# List the distinct values of every object-typed test column before encoding.
object_columns = ['HomePlanet','Destination','Deck','Side']
for column_name in object_columns:
    print(test_data[column_name].unique())

['Earth' 'Europa' 'Mars']
['TRAPPIST-1e' '55 Cancri e' 'PSO J318.5-22']
['G' 'F' 'C' 'B' 'D' 'E' 'A' 'T']
['S' 'P']


In [64]:
# Convert the object-typed test columns to integer codes, one column at a time.
# NOTE(review): the encoder is refit on the test values here, so the codes
# agree with the training codes only while each column holds the same set of
# values in both files (the printed unique values suggest it does) - confirm
# whenever the data changes.
for column_name in object_columns:
    test_data[column_name] = encoder.fit_transform(test_data[column_name])

In [65]:
# Check that the test frame is fully numeric with no missing values left.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4277 non-null   int32  
 1   CryoSleep     4277 non-null   int64  
 2   Destination   4277 non-null   int32  
 3   Age           4277 non-null   float64
 4   VIP           4277 non-null   int64  
 5   RoomService   4277 non-null   float64
 6   FoodCourt     4277 non-null   float64
 7   ShoppingMall  4277 non-null   float64
 8   Spa           4277 non-null   float64
 9   VRDeck        4277 non-null   float64
 10  Deck          4277 non-null   int32  
 11  Num           4277 non-null   int64  
 12  Side          4277 non-null   int32  
dtypes: float64(6), int32(4), int64(3)
memory usage: 367.7 KB


In [66]:
# Predict the target for every row of the prepared test frame.
final_predictions = random_forest.predict(test_data)

In [67]:
# Number of predictions produced (one per test row).
print(len(final_predictions))

4277


In [68]:
# Load the submission template from 'sample_submission.csv'.
submission = pd.read_csv("sample_submission.csv")

In [70]:
# The predictions are attached by position, so first verify that the template
# lists the same passengers, in the same order, as the test file. Without this
# check a reordered or truncated template would silently pair predictions with
# the wrong passenger ids.
if submission["PassengerId"].tolist() != passenger_id.tolist():
    raise ValueError(
        "PassengerId values in sample_submission.csv do not match test.csv"
    )
submission["Transported"] = final_predictions
submission.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,1
1,0018_01,0
2,0019_01,1
3,0021_01,1
4,0023_01,1


In [72]:
# Turn the 0/1 predictions back into booleans (1 -> True, anything else -> False).
submission["Transported"] = submission["Transported"].eq(1)
submission.Transported

0        True
1       False
2        True
3        True
4        True
        ...  
4272     True
4273    False
4274     True
4275     True
4276    False
Name: Transported, Length: 4277, dtype: bool

In [73]:
# Write the final predictions to 'submission.csv' without the index column.
submission.to_csv("submission.csv",index=False)