**Importing Helper Libraries** 

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import  train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

**Importing Training and Test Files** 

In [None]:
# Locations of the competition CSV files, relative to the notebook.
datapath = "./train.csv"
testDatapath = "./test.csv"

# Read the training file first, then the test file.
ogData, ogTestData = (pd.read_csv(path) for path in (datapath, testDatapath))

**Checking if data has NULL values or not** 

In [None]:
# Per-column count of missing values, training frame first, then test frame.
for frame in (ogData, ogTestData):
    print(frame.isnull().sum())

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64
PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64


**Setting Original Data into New Variable, so that Original Data can't be Altered** 

In [None]:
# Work on independent copies. Plain assignment would only bind a second name
# to the same DataFrame object, so every later in-place edit of data/testData
# would also modify ogData/ogTestData.
data = ogData.copy()
testData = ogTestData.copy()

**Printing Unique Values of Data** 

In [None]:
# Show the distinct values found in every column of the training frame.
for column in data.columns:
    print(column, data[column].unique())

PassengerId ['0001_01' '0002_01' '0003_01' ... '9279_01' '9280_01' '9280_02']
HomePlanet ['Europa' 'Earth' 'Mars' nan]
CryoSleep [False True nan]
Cabin ['B/0/P' 'F/0/S' 'A/0/S' ... 'G/1499/S' 'G/1500/S' 'E/608/S']
Destination ['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]
Age [39. 24. 58. 33. 16. 44. 26. 28. 35. 14. 34. 45. 32. 48. 31. 27.  0.  1.
 49. 29. 10.  7. 21. 62. 15. 43. 47.  2. 20. 23. 30. 17. 55.  4. 19. 56.
 nan 25. 38. 36. 22. 18. 42. 37. 13.  8. 40.  3. 54.  9.  6. 64. 67. 61.
 50. 41. 57. 11. 52. 51. 46. 60. 63. 59.  5. 79. 68. 74. 12. 53. 65. 71.
 75. 70. 76. 78. 73. 66. 69. 72. 77.]
VIP [False True nan]
RoomService [   0.  109.   43. ... 1569. 8586.  745.]
FoodCourt [   0.    9. 3576. ... 3208. 6819. 4688.]
ShoppingMall [   0.   25.  371. ... 1085.  510. 1872.]
Spa [   0.  549. 6715. ... 2868. 1107. 1643.]
VRDeck [   0.   44.   49. ... 1164.  971. 3235.]
Name ['Maham Ofracculy' 'Juanna Vines' 'Altark Susent' ... 'Fayey Connon'
 'Celeon Hontichre' 'Propsh Hontichre']

**Defining a function which replaces the NULL values of a column with its median** 

In [None]:
def ReplaceMissingMedian(col,dataset):
  """Fill the missing values of a numeric column with its rounded median.

  Args:
    col: Label of the column in ``dataset`` to impute.
    dataset: DataFrame holding the column; the filled column is assigned
      back onto this object.

  Returns:
    None. ``dataset`` is updated in place.
  """
  median = dataset[col].median()
  # A column with no observed values has no median; there is nothing to
  # impute with (and round(NaN) would raise), so leave it untouched.
  if pd.isna(median):
    return
  # Assign the result back instead of calling fillna(inplace=True) on the
  # selected column: that selection can be a temporary copy, in which case
  # the in-place fill never reaches ``dataset``.
  dataset[col] = dataset[col].fillna(round(median))


**Defining a function which replaces the NULL values of a column with its mode** 

In [None]:
def ReplaceMissingMode(col,dataset):
  """Fill the missing values of a column with its most frequent value.

  Args:
    col: Label of the column in ``dataset`` to impute.
    dataset: DataFrame holding the column; the filled column is assigned
      back onto this object.

  Returns:
    None. ``dataset`` is updated in place.
  """
  modes = dataset[col].mode()
  # mode() yields no entries when the column has no observed values; there is
  # nothing to impute with, so leave the column untouched.
  if modes.empty:
    return
  # mode() returns a Series (there can be ties). Passing that Series to
  # fillna would align it on the index and fill only the rows whose labels
  # match, so take the first mode as a scalar to fill every missing row.
  # Assign back rather than using inplace=True on the selected column, which
  # may operate on a temporary copy.
  dataset[col] = dataset[col].fillna(modes.iloc[0])


**Calling ReplaceMissingMedian and ReplaceMissingMode to Replace NULL Values of Data** 

In [None]:
# Categorical-like columns are imputed with the mode, numeric ones with the median.
modeColumns = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
medianColumns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Apply the same imputation to the training frame and then to the test frame;
# each frame is filled from its own statistics.
for frame in (data, testData):
    for column in modeColumns:
        ReplaceMissingMode(column, frame)
    for column in medianColumns:
        ReplaceMissingMedian(column, frame)


**Fitting the LabelEncoder** 

In [None]:
# Cabin values differ between the two files, so its encoder is fitted on the
# union of the cabins seen in the training and test frames.
trainCabins = data['Cabin'].unique()
testCabins = testData['Cabin'].unique()
allCabins = pd.Series(np.concatenate((trainCabins, testCabins))).unique()
cabinLabel = LabelEncoder().fit(allCabins)

# The remaining encoders are fitted on the training frame only.
homePlanetLabel, cryoSleepLabel, destinationLabel, vipLabel, transportedLabel = (
    LabelEncoder().fit(data[column])
    for column in ('HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported')
)


**Replacing the data with LabelEncoder** 

In [None]:
# Encoder to use for each feature column shared by both frames.
featureEncoders = {
    'HomePlanet': homePlanetLabel,
    'CryoSleep': cryoSleepLabel,
    'Cabin': cabinLabel,
    'Destination': destinationLabel,
    'VIP': vipLabel,
}

# Training frame: encode the features, then the target column.
for column, encoder in featureEncoders.items():
    data[column] = encoder.transform(data[column])
data['Transported'] = transportedLabel.transform(data['Transported'])

# Test frame: it has no target column, so only the features are encoded.
for column, encoder in featureEncoders.items():
    testData[column] = encoder.transform(testData[column])


**Checking if there are any Remaining NULL values** 

In [None]:
# Inspect the working frames that the imputation and encoding cells modified,
# rather than the originally loaded frames.
print(data.isnull().sum())
print(testData.isnull().sum())

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin             0
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
dtype: int64
PassengerId      0
HomePlanet       0
CryoSleep        0
Cabin            0
Destination      0
Age              0
VIP              0
RoomService      0
FoodCourt        0
ShoppingMall     0
Spa              0
VRDeck           0
Name            94
dtype: int64


**Splitting Data into Features and Target Variable** 

In [None]:
"""['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin',
    'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt',
    'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported']"""

# Model inputs (PassengerId and Name are left out) and the column to predict.
features = [
    'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
]
target = ['Transported']

# Keep only the modelling columns, with the target as the last column.
filterData = data[features + target]

# X holds every feature column, Y the single target column.
X = filterData[features]
Y = filterData[target[0]]
testFileData = testData[features]

X.head()


Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,1,0,208,2,39.0,0,0.0,0.0,0.0,0.0,0.0
1,0,0,3241,2,24.0,0,109.0,9.0,25.0,549.0,44.0
2,1,0,1,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0
3,1,0,1,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0
4,0,0,3243,2,16.0,0,303.0,70.0,151.0,565.0,2.0


**Splitting Train Data into Train and Test Variables** 

In [None]:
# Hold out 20% of the labelled rows for validation; the seed keeps the split repeatable.
train_X, val_X, train_Y, val_Y = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Report how many labelled rows ended up in each part.
for caption, labels in (("Training Data: ", train_Y), ("Validation Data: ", val_Y)):
    print(caption, labels.count(), end="\n\n")

train_X.head()

Training Data:  6954

Validation Data:  1739



Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
4278,1,0,1006,0,54.0,0,0.0,559.0,0.0,15238.0,2799.0
5971,0,0,3921,2,20.0,0,0.0,20.0,1.0,696.0,0.0
464,2,0,6657,2,43.0,0,1821.0,0.0,47.0,29.0,0.0
4475,0,0,6647,2,24.0,0,185.0,0.0,476.0,1810.0,53.0
8469,1,1,1348,0,25.0,0,0.0,0.0,0.0,0.0,0.0


**Predicting target variable using RandomForestClassifier Algorithm** 

In [None]:
# Random forest with 256 trees, fitted on the training split (seeded for repeatability).
forestModel = RandomForestClassifier(n_estimators=256, random_state=0).fit(train_X, train_Y)

# Score the model on the held-out validation split.
forestPredict = forestModel.predict(val_X)
for report in (confusion_matrix, classification_report):
    print(report(val_Y, forestPredict))

[[701 162]
 [203 673]]
              precision    recall  f1-score   support

           0       0.78      0.81      0.79       863
           1       0.81      0.77      0.79       876

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



**Predicting target variable using DecisionTreeClassifier Algorithm** 

In [None]:
# Single decision tree, fitted on the training split (seeded for repeatability).
treeModel = DecisionTreeClassifier(random_state=0).fit(train_X, train_Y)

# Score the model on the held-out validation split.
treePredict = treeModel.predict(val_X)
for report in (confusion_matrix, classification_report):
    print(report(val_Y, treePredict))

[[601 262]
 [222 654]]
              precision    recall  f1-score   support

           0       0.73      0.70      0.71       863
           1       0.71      0.75      0.73       876

    accuracy                           0.72      1739
   macro avg       0.72      0.72      0.72      1739
weighted avg       0.72      0.72      0.72      1739



**Predicting target variable using KNeighborsClassifier Algorithm** 

In [None]:
# 7-nearest-neighbours classifier, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=7).fit(train_X, train_Y)

# Score the model on the held-out validation split.
knnPredict = knn.predict(val_X)
for report in (confusion_matrix, classification_report):
    print(report(val_Y, knnPredict))

[[633 230]
 [176 700]]
              precision    recall  f1-score   support

           0       0.78      0.73      0.76       863
           1       0.75      0.80      0.78       876

    accuracy                           0.77      1739
   macro avg       0.77      0.77      0.77      1739
weighted avg       0.77      0.77      0.77      1739



**Predicting target variable using KMeans Algorithm** 

In [None]:
# Cluster the training features into two groups (seeded for repeatability).
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(train_X)

# KMeans is unsupervised: its cluster ids are arbitrary and have no fixed
# relation to the encoded Transported labels, so scoring the raw ids against
# val_Y is meaningless. Map every cluster to the most frequent training label
# among its members before evaluating.
trainLabels = train_Y.to_numpy()
clusterToLabel = np.array([
    np.bincount(trainLabels[kmeans.labels_ == cluster],
                minlength=trainLabels.max() + 1).argmax()
    for cluster in range(kmeans.n_clusters)
])

# Translate the predicted cluster ids of the validation rows into class labels.
kmeansPredict = clusterToLabel[kmeans.predict(val_X)]

print(confusion_matrix(val_Y,kmeansPredict))
print(classification_report(val_Y,kmeansPredict))

[[456 407]
 [435 441]]
              precision    recall  f1-score   support

           0       0.51      0.53      0.52       863
           1       0.52      0.50      0.51       876

    accuracy                           0.52      1739
   macro avg       0.52      0.52      0.52      1739
weighted avg       0.52      0.52      0.52      1739



**Predicting target variable using LogisticRegression Algorithm** 

In [None]:
# Logistic regression with a raised iteration cap, fitted on the training split.
logModel = LogisticRegression(random_state=0, max_iter=512).fit(train_X, train_Y)

# Score the model on the held-out validation split.
logPredict = logModel.predict(val_X)
for report in (confusion_matrix, classification_report):
    print(report(val_Y, logPredict))

[[636 227]
 [167 709]]
              precision    recall  f1-score   support

           0       0.79      0.74      0.76       863
           1       0.76      0.81      0.78       876

    accuracy                           0.77      1739
   macro avg       0.77      0.77      0.77      1739
weighted avg       0.77      0.77      0.77      1739



**Predicting target variable using VotingClassifier Algorithm** 

In [None]:
# Ensemble that votes across the forest, tree and logistic-regression models.
baseEstimators = [('forest', forestModel), ('tree', treeModel), ('log', logModel)]
voteModel = VotingClassifier(estimators=baseEstimators).fit(train_X, train_Y)

# Score the ensemble on the held-out validation split.
votePredict = voteModel.predict(val_X)
for report in (confusion_matrix, classification_report):
    print(report(val_Y, votePredict))

[[674 189]
 [190 686]]
              precision    recall  f1-score   support

           0       0.78      0.78      0.78       863
           1       0.78      0.78      0.78       876

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



# Submission

**As the RandomForestClassifier algorithm performed better than the other algorithms, we will retrain the model on the whole training dataset** 

In [None]:
# Final model: same random-forest settings as the validation run, but fitted
# on every labelled row (X, Y) instead of the training split only.
model = RandomForestClassifier(
    n_estimators=256,
    random_state=0,
)
model.fit(X, Y)

VotingClassifier(estimators=[('forest',
                              RandomForestClassifier(n_estimators=256,
                                                     random_state=0)),
                             ('tree', DecisionTreeClassifier(random_state=0)),
                             ('log',
                              LogisticRegression(max_iter=512,
                                                 random_state=0))])

**Predicting the Values of the Test Data using the previously Trained Model and Storing them in a DataFrame** 

In [None]:
# Passenger ids come from the test file as loaded; predictions come from the
# encoded feature frame.
passengerID = ogTestData['PassengerId']
predictedValue = model.predict(testFileData)

# Turn the encoded predictions back into the original Transported values.
ans = transportedLabel.inverse_transform(predictedValue)

# Two-column submission frame: id and predicted label.
output = pd.DataFrame({'PassengerId': passengerID, 'Transported': ans})

output

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


**Saving the Output DataFrame into a CSV File** 

In [None]:
# Write the submission without the DataFrame index column.
output.to_csv(path_or_buf='final_17.csv', index=False)