# ML Final Project

### Tanay, Vishal, Nikshita, Garv

In [1]:
!pip install scikit-learn matplotlib numpy pandas



In [2]:
import pandas as pd
import matplotlib as plt
import numpy as np
import sklearn 
from pandas import DataFrame

%matplotlib inline

df = pd.read_csv("train.csv")

labels = df['Transported']
features = df.drop(columns=['Transported'])

In [None]:
# Data Exploration

# Checking how many NaNs there are 
rows_with_nan = features.isnull().any(axis=1).sum()
rows_without_nan = len(features) - rows_with_nan

print(f"Rows with NaN: {rows_with_nan}")
print(f"Rows without NaN: {rows_without_nan}")

# Checking what the data looks like
features.head()

''' Based on the results of our data exploration, we have decided to process the 
    data in the following ways:
    We will drop the 2087 records with NaNs as there would still be 6606 records 
    left, which seems sufficient to train a model with. We will revisit this if 
    necessary.
    We will one hot encode the HomePlanet and Destination fields as they are 
    categorical. 
    We will drop the Name field since it is unique (or near unique) for each passenger,
    and it seems unlikely it could provide useful information.
    As the Cabin field essentially has three pieces of information (deck, number,
    and side), we have elected to break it down into three fields.
    Similarly, as the Passenger_Id field has two pieces of information (group number
    and passenger number), we will break it down into two fields.
    We will one hot encode the deck as it has only a handful of options.
    We will convert the new side feature from P or S into True or False.
    For all numeric features (RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
    Age, Room, Group, PassengerNumber), we will standardize the values so that 
    we can conduct PCA.
    Lastly, we will conduct PCA on the data.
'''

Rows with NaN: 2087
Rows without NaN: 6606


' Based on the results of our data exploration, we have decided to process the \n    data in the following ways:\n    We will drop the 2087 records with NaNs as there would still be 6606 records \n    left, which seems sufficient to train a model with. We will revisit this if \n    necessary.\n    We will one hot encode the HomePlanet and Destination fields as they are \n    categorical. \n    We will drop the Name field since it is unique (or near unique) for each passenger,\n    and it seems unlikely it could provide useful information.\n    As the Cabin field essentially has three pieces of information (deck, number,\n    and side), we have elected to break it down into three fields.\n    Similarly, as the Passenger_Id field has two pieces of information (group number\n    and passenger number), we will break it down into two fields.\n    We will one hot encode the deck as it has\n'

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# drop NaNs
processed_features = features.dropna()
print("Number of Dropped Records: ", len(features) - len(processed_features))
print("Number of Records Left: ", len(processed_features))

# Drop Name values
processed_features = processed_features.drop(columns=['Name'])

# Split Cabin values into three columns
processed_features[["Deck", "Room", "Side"]] = processed_features['Cabin'].str.split("/", expand=True)
processed_features = processed_features.drop(columns=['Cabin'])
processed_features.head()

# Split Passenger values into two columns
processed_features[["Group", "Passenger_Number"]] = processed_features['PassengerId'].str.split("_", expand=True)
processed_features = processed_features.drop(columns=['PassengerId'])
processed_features.head()

# One hot encode the HomePlanet
processed_features = pd.get_dummies(processed_features, columns=["HomePlanet"])

# One hot encode the DestinationPlanet
processed_features = pd.get_dummies(processed_features, columns=["Destination"])

# One hot encode the Deck
processed_features = pd.get_dummies(processed_features, columns=["Deck"])

# convert Side to T or F
processed_features["Side"] = processed_features['Side'].map({'P': True, 'S' : False})

# normalizing numeric features
scaler = StandardScaler()
processed_features['RoomService'] = scaler.fit_transform(processed_features[['RoomService']])
processed_features['FoodCourt'] = scaler.fit_transform(processed_features[['FoodCourt']])
processed_features['ShoppingMall'] = scaler.fit_transform(processed_features[['ShoppingMall']])
processed_features['Spa'] = scaler.fit_transform(processed_features[['Spa']])
processed_features['VRDeck'] = scaler.fit_transform(processed_features[['VRDeck']])
processed_features['Age'] = scaler.fit_transform(processed_features[['Age']])
processed_features['Room'] = scaler.fit_transform(processed_features[['Room']])
processed_features['Group'] = scaler.fit_transform(processed_features[['Group']])
processed_features['Passenger_Number'] = scaler.fit_transform(processed_features[['Passenger_Number']])

pca = PCA(n_components=0.95, svd_solver='full')
pca_data = pca.fit_transform(processed_features)
pca_df = DataFrame(pca_data)

print("Original data shape:", processed_features.shape)
print("Transformed data shape:", pca_df.shape)

processed_features.head()
# pca_df.head()

Number of Dropped Records:  2087
Number of Records Left:  6606
Original data shape: (6606, 26)
Transformed data shape: (6606, 14)


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Room,Side,...,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
0,False,0.695413,False,-0.345756,-0.285355,-0.309494,-0.273759,-0.269534,-1.167051,True,...,False,True,False,True,False,False,False,False,False,False
1,False,-0.336769,False,-0.176748,-0.279993,-0.266112,0.206165,-0.230494,-1.167051,False,...,False,True,False,False,False,False,False,True,False,False
2,False,2.002842,True,-0.279083,1.845163,-0.309494,5.596357,-0.226058,-1.167051,False,...,False,True,True,False,False,False,False,False,False,False
3,False,0.28254,False,-0.345756,0.479034,0.334285,2.636384,-0.098291,-1.167051,False,...,False,True,True,False,False,False,False,False,False,False
4,False,-0.887266,False,0.124056,-0.24365,-0.04747,0.220152,-0.267759,-1.165103,False,...,False,True,False,False,False,False,False,True,False,False
