# ML Final Project

### Tanay, Vishal, Nikshita, Garv

In [32]:
!pip install scikit-learn matplotlib numpy pandas



In [33]:
import pandas as pd
import matplotlib as plt
import numpy as np
import sklearn 
from pandas import DataFrame

%matplotlib inline

df = pd.read_csv("train.csv")

labels = df['Transported']
# features = df.drop(columns=['Transported'])
features = df

In [34]:
# Data Exploration

# Checking how many NaNs there are 
rows_with_nan = df.isnull().any(axis=1).sum()
rows_without_nan = len(df) - rows_with_nan

print(f"Rows with NaN: {rows_with_nan}")
print(f"Rows without NaN: {rows_without_nan}")

# Checking what the data looks like
df.head()

''' Based on the results of our data exploration, we have decided to process the 
    data in the following ways:
    We will drop the 2087 records with NaNs as there would still be 6606 records 
    left, which seems sufficient to train a model with. We will revisit this if 
    necessary.
    We will one hot encode the HomePlanet and Destination fields as they are 
    categorical. 
    We will drop the Name field since it is unique (or near unique) for each passenger,
    and it seems unlikely it could provide useful information.
    As the Cabin field essentially has three pieces of information (deck, number,
    and side), we have elected to break it down into three fields.
    Similarly, as the Passenger_Id field has two pieces of information (group number
    and passenger number), we will break it down into two fields.
    We will one hot encode the deck as it has only a handful of options.
    We will convert the new side feature from P or S into True or False.
    For all numeric features (RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
    Age, Room, Group, PassengerNumber), we will standardize the values so that 
    we can conduct PCA.
    Lastly, we will conduct PCA on the data.
'''

Rows with NaN: 2087
Rows without NaN: 6606


' Based on the results of our data exploration, we have decided to process the \n    data in the following ways:\n    We will drop the 2087 records with NaNs as there would still be 6606 records \n    left, which seems sufficient to train a model with. We will revisit this if \n    necessary.\n    We will one hot encode the HomePlanet and Destination fields as they are \n    categorical. \n    We will drop the Name field since it is unique (or near unique) for each passenger,\n    and it seems unlikely it could provide useful information.\n    As the Cabin field essentially has three pieces of information (deck, number,\n    and side), we have elected to break it down into three fields.\n    Similarly, as the Passenger_Id field has two pieces of information (group number\n    and passenger number), we will break it down into two fields.\n    We will one hot encode the deck as it has only a handful of options.\n    We will convert the new side feature from P or S into True or False.\n  

In [35]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

print("Columns with NaNs: ", features.isnull().any())

# fill NaNs in HomePlanet with random values based on distribution
value_counts = features['HomePlanet'].value_counts(normalize=True)
features["HomePlanet"] = features["HomePlanet"].fillna(lambda: 
                                                       np.random.choice(value_counts.index, p=value_counts.values)) 

# fill NaNs in Destination with random values based on distribution
value_counts = features['Destination'].value_counts(normalize=True)
features["Destination"] = features["Destination"].fillna(lambda: 
                                                         np.random.choice(value_counts.index, p=value_counts.values))

# Impute RoomService with the mean
features["RoomService"] = features["RoomService"].fillna(features["RoomService"].mean())

# Impute FoodCourt with the mean
features["FoodCourt"] = features["FoodCourt"].fillna(features["FoodCourt"].mean())

# Impute ShoppingMall with the mean
features["ShoppingMall"] = features["ShoppingMall"].fillna(features["ShoppingMall"].mean())

# Impute Spa with the mean
features["Spa"] = features["Spa"].fillna(features["Spa"].mean())

# Impute VRDeck with the mean
features["VRDeck"] = features["VRDeck"].fillna(features["VRDeck"].mean())

# Impute Age with the mean
features['Age'] = features['Age'].fillna(features['Age'].mean())

print("Columns with NaNs: ", features.columns[features.isnull().any()].tolist())

# drop NaNs
processed_features = features.dropna()

print("Number of Dropped Records: ", len(features) - len(processed_features))
print("Number of Records Left: ", len(processed_features))

# Drop Name values
processed_features = processed_features.drop(columns=['Name'])

# Split Cabin values into three columns
processed_features[["Deck", "Room", "Side"]] = processed_features['Cabin'].str.split("/", expand=True)
processed_features = processed_features.drop(columns=['Cabin'])
processed_features.head()

# Split Passenger values into two columns
processed_features[["Group", "Passenger_Number"]] = processed_features['PassengerId'].str.split("_", expand=True)
processed_features = processed_features.drop(columns=['PassengerId'])
processed_features.head()

# One hot encode the HomePlanet
processed_features = pd.get_dummies(processed_features, columns=["HomePlanet"])

# One hot encode the DestinationPlanet
processed_features = pd.get_dummies(processed_features, columns=["Destination"])

# One hot encode the Deck
processed_features = pd.get_dummies(processed_features, columns=["Deck"])

# convert Side to T or F
processed_features["Side"] = processed_features['Side'].map({'P': True, 'S' : False})

# normalizing numeric features
scaler = StandardScaler()
processed_features['RoomService'] = scaler.fit_transform(processed_features[['RoomService']])
processed_features['FoodCourt'] = scaler.fit_transform(processed_features[['FoodCourt']])
processed_features['ShoppingMall'] = scaler.fit_transform(processed_features[['ShoppingMall']])
processed_features['Spa'] = scaler.fit_transform(processed_features[['Spa']])
processed_features['VRDeck'] = scaler.fit_transform(processed_features[['VRDeck']])
processed_features['Age'] = scaler.fit_transform(processed_features[['Age']])
processed_features['Group'] = scaler.fit_transform(processed_features[['Group']])
processed_features['Passenger_Number'] = scaler.fit_transform(processed_features[['Passenger_Number']])

labels = processed_features["Transported"]
processed_features = processed_features.drop(labels = "Transported",axis="columns")

pca = PCA(n_components=0.95, svd_solver='full')
pca_data = pca.fit_transform(processed_features)
pca_df = DataFrame(pca_data)

print("Original data shape:", processed_features.shape)
print("Transformed data shape:", pca_df.shape)

processed_features.head()

Columns with NaNs:  PassengerId     False
HomePlanet       True
CryoSleep        True
Cabin            True
Destination      True
Age              True
VIP              True
RoomService      True
FoodCourt        True
ShoppingMall     True
Spa              True
VRDeck           True
Name             True
Transported     False
dtype: bool
Columns with NaNs:  ['CryoSleep', 'Cabin', 'VIP', 'Name']
Number of Dropped Records:  786
Number of Records Left:  7907
Original data shape: (7907, 28)
Transformed data shape: (7907, 1)


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Room,Side,...,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
0,False,0.708808,False,-0.352522,-0.285832,-0.314414,-0.274772,-0.274206,0,True,...,False,True,False,True,False,False,False,False,False,False
1,False,-0.340738,False,-0.178372,-0.280207,-0.269673,0.213549,-0.23443,0,False,...,False,True,False,False,False,False,False,True,False,False
2,False,2.038234,True,-0.28382,1.949265,-0.314414,5.698051,-0.22991,0,False,...,False,True,True,False,False,False,False,False,False,False
3,False,0.28899,False,-0.352522,0.516078,0.349543,2.686289,-0.099733,0,False,...,False,True,True,False,False,False,False,False,False,False
4,False,-0.900496,False,0.131582,-0.24208,-0.044178,0.227781,-0.272398,1,False,...,False,True,False,False,False,False,False,True,False,False


In [39]:
# Decision Tree Model
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score

param_grid = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_leaf': [5, 10, 15, 20],
    'max_features': [5, 10, 15],
    'criterion': ['gini', 'entropy']
}
clf = DecisionTreeClassifier()

# runs the nested cross validation
acc = cross_val_score(GridSearchCV(clf, param_grid, cv=5), X=processed_features, y=labels, cv=10)
print(acc.mean() * 100)


74.99660740290291


In [37]:
# KNN Pipeline + Model
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

ss = StandardScaler()
pca = PCA()
knn = KNeighborsClassifier()

pipeline = Pipeline([
    ('scaler', ss),
    ('pca', pca),
    ('knn', knn),
])

param_grid = {
    'pca__n_components': list(range(1, 11)),
    'knn__n_neighbors': list(range(1, 10))
}

inner_cv = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
acc = cross_val_score(inner_cv, X=processed_features, y=labels, cv=5)

print(acc.mean() * 100)

73.11277008662444


In [38]:
# with a support vector machine
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
import sklearn as sk


pipeline = Pipeline([
    ('scaler', ss),
    ('pca', pca),
    ('svc', SVC())
])

params_grid = {
    'pca__n_components': list(range(5, 19)),
    'svc__kernel': ['linear', 'rbf', 'poly']
}

inner_cv = GridSearchCV(pipeline, params_grid, cv=5, scoring='accuracy')
label_preds = cross_val_predict(inner_cv, X=processed_features, y=labels, cv=10)

class_report = sk.metrics.classification_report(labels, label_preds)
print("\nClassification Report:\n", class_report)
# Accuracy is in the classification report

KeyboardInterrupt: 