In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier, RandomForestClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file found under the Kaggle input directory and print its path.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Spaceship Titanic !

Our goal here is to determine whether a passenger of the spaceship was transported to another dimension or not. As we already have a labelled train dataset, this is a supervised classification problem, which we will treat in this exercise.       
Let's not forget that we already know our target variable: it is **"Transported"**.

# I - Pre-processing

## I.1 - A glance at our data

In [None]:
# The data_* frames hold the raw CSV content; the df_* frames are independent
# working copies that the rest of the notebook modifies.
data_test, data_sample, data_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/spaceship-titanic/test.csv",
        "../input/spaceship-titanic/sample_submission.csv",
        "../input/spaceship-titanic/train.csv",
    )
)
df_test, df_sample, df_train = (
    raw_frame.copy() for raw_frame in (data_test, data_sample, data_train)
)

Let's take a look at our dataframes !

In [None]:
# Preview the first rows of the training working copy.
df_train.head()

- PassengerId : A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
- HomePlanet : The planet the passenger departed from, typically their planet of permanent residence.
- CryoSleep : Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- Cabin : The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- Destination : The planet the passenger will be debarking to.
- Age : The age of the passenger.
- VIP : Whether the passenger has paid for special VIP service during the voyage.
- RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- Name : The first and last names of the passenger.
- Transported : Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

As the name is unlikely to be an indicator of whether someone was transported or not, we may drop it.

In [None]:
# The passenger name is not used as a feature, so remove the column.
df_train = df_train.drop(columns='Name')

## I.2 - Handling missing values

In [None]:
# Report the frame's dimensions, then the percentage of missing values per
# column, sorted in ascending order.
print("shape of the dataframe :", df_train.shape)
print("% of NaN in the dataframe :")
n_rows = len(df_train)
missing_counts = df_train.isna().sum()
(missing_counts / n_rows * 100).sort_values()

In [None]:
# Heatmap of the "value is present" mask: one row per passenger, one column per feature.
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(df_train.notna(), cbar=False, ax=ax)
plt.show()

The graphic above can be understood as : "If something is black, it's a missing value"       
As we can see, not that many values are missing and there doesn't seem to be any pattern in the missing values. However, what would happen if we dropped every row containing at least one missing value? 

In [None]:
# Keep only the fully observed rows to measure how much data a plain dropna would cost.
df_train_without_missing = df_train.dropna().copy()

n_rows_before = df_train.shape[0]
n_rows_after = df_train_without_missing.shape[0]
kept_percent = round(n_rows_after / n_rows_before * 100, 3)

print("shape of the dataframe :", df_train.shape)
print("The number of rows went from : {} to {} so we still have {}% of our data".format(
    n_rows_before, n_rows_after, kept_percent))

Basically we lost around 22% of our data, which is quite a lot. In this kind of scenario, we could say two pretty opposite things :     
   - 1) We lost 22%, that's big, but we still have 6764 rows of EXACT data so we can still work with this      
   - 2) We lost 22%, that's way too big. Furthermore, what would we do if the sample dataset has the same issue and we still need to predict these incomplete rows ? What do we say to our employer ? "I won't do it" ?       
We can clearly understand that we ought to find a way to handle these missing values.

In [None]:
# Number of neighbours used by the KNN imputer (reused later for the test set).
opt_knn_imputer_neighbor = 10

# Columns imputed from their nearest neighbours. The target "Transported" is
# deliberately NOT part of this list: using it to find neighbours would make the
# imputed features depend on the label, which is not available at prediction time.
columns_Knn_imputer = ['CryoSleep','Age','VIP', 'RoomService',
                       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Categorical columns, filled with their most frequent value.
columns_Simple_imputer = ['Destination','HomePlanet']

Knn_imputer = KNNImputer(n_neighbors= opt_knn_imputer_neighbor, weights = 'uniform')
df_train[columns_Knn_imputer] = Knn_imputer.fit_transform(df_train[columns_Knn_imputer])

# Keep the target numerically encoded (0.0 / 1.0), as it was when it went through
# the imputer, so downstream dtype-based column selections behave the same.
df_train["Transported"] = df_train["Transported"].astype(float)

Simple_imputer = SimpleImputer(missing_values = np.nan, strategy='most_frequent')
df_train[columns_Simple_imputer] = Simple_imputer.fit_transform(df_train[columns_Simple_imputer])

An issue with the KNN imputer is that, for boolean variables, it fills missing entries with floats that are not just 0 or 1 but values such as 0.2 or 0.9. It's a big issue for such features, so we will simply say that anything at or above the mean of the feature is a 1 and anything under it is a 0.

In [None]:
# Thresholds come from the raw training data and are reused for the test set.
data_train_CryoSleep_mean = np.mean(data_train["CryoSleep"])
data_train_VIP_mean = np.mean(data_train["VIP"])

# Turn the imputed flags back into 0/1: a value at or above its column's raw mean becomes 1.
flag_thresholds = {"CryoSleep": data_train_CryoSleep_mean, "VIP": data_train_VIP_mean}
for flag_name, threshold in flag_thresholds.items():
    df_train[flag_name] = (df_train[flag_name] >= threshold).astype("int64")

Let's see if there are any duplicates in our dataframe. As every passenger is supposed to have a unique PassengerId, having two entries with the same PassengerId would tell us that there are duplicates (or clones, if you are more familiar with the sci-fi stuff).

In [None]:
# A PassengerId that occurs more than once means the frame holds duplicated passengers.
max_id_occurrences = df_train["PassengerId"].value_counts().max()
duplicate_report = "no duplicates, we are good" if max_id_occurrences == 1 else "There are duplicates"
print(duplicate_report)

# II - Data Analysis : Looking for relationships

In this section, our goal is to find if there are any interesting relationship and which given features are interesting.

In [None]:
# Count how many columns there are of each dtype.
df_train.dtypes.value_counts()

In [None]:
# List the numeric columns (integers first, then floats), then the object columns.
print("The quantitative variables are : ")
for numeric_dtype in ("int64", "float64"):
    for column_name in df_train.select_dtypes(numeric_dtype):
        print("- {}".format(column_name))

print("\n")

print("The qualitative variables are : ")
for column_name in df_train.select_dtypes("object"):
    print("- {}".format(column_name))

## II.1 - Quantitative Variables

In [None]:
def plot_hist_by_transported(frame, col, bins=20):
    """Draw overlaid histograms of `col` for transported and non-transported rows.

    Parameters
    ----------
    frame : DataFrame holding `col` and a "Transported" column.
    col : name of the column to plot.
    bins : number of bins used for each of the two histograms.
    """
    fig, ax = plt.subplots(figsize=(9, 6))
    for transported in (True, False):
        values = frame.loc[frame['Transported'] == transported, col]
        ax.hist(values, bins=bins, label=str(transported), alpha=0.6)
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    ax.grid()
    ax.legend(title="transported")
    plt.show()


# One figure per numeric column: integer columns first, then float columns.
for numeric_dtype in ("int64", "float64"):
    for col in df_train.select_dtypes(numeric_dtype):
        plot_hist_by_transported(df_train, col)

In [None]:
# Density of Age split by the target, with a dashed marker at age 17.
fig, ax = plt.subplots(figsize=(8, 5))
sns.kdeplot(data=df_train, x='Age', hue="Transported", alpha=.6, fill=True, palette="Pastel1", ax=ax)

age_marker = 17
_, y_top = ax.get_ylim()
ax.axvline(age_marker, color='k', linestyle='dashed', linewidth=2)
# The label is placed to the left of the marker, near the top of the axes.
ax.text(age_marker * 0.4, y_top * 0.9, 'x = 17')

ax.grid()
ax.set_title("displot of Transported people against their age")
plt.show()

Looking at this graphic, we can see that after a specific age there is no difference in distribution between the transported and non-transported people, but before this age (which might be around 17) there is a difference! Age seems to have an influence on Transported.

In [None]:
# Summary statistics of the float columns.
df_train.select_dtypes("float64").describe()

As we can see, for "RoomService", "FoodCourt", "ShoppingMall", "Spa" and "VRDeck" most of our values are 0. 
However the standard deviation of each of these features is quite big (the lowest being 638 for "ShoppingMall" and the biggest being 1676 for "FoodCourt"). Furthermore, the distributions are not quite the same when we split each feature according to the "Transported" feature.
Therefore we may consider all of these features as useful for our future model.

## II.2 - Bool Variables

In [None]:
# List the columns stored as integers, then those stored as booleans.
print("The bool variables are : ")
for flag_dtype in ("int", "bool"):
    for column_name in df_train.select_dtypes(flag_dtype):
        print("- {}".format(column_name))

In [None]:
# Contingency table of CryoSleep (rows) against the target (columns).
pd.crosstab(df_train["CryoSleep"],df_train["Transported"])

CryoSleep seems to play a big role, as if you are in cryosleep you have more chance to be transported !

In [None]:
# Contingency table of VIP (rows) against the target (columns).
pd.crosstab(df_train["VIP"],df_train["Transported"])

## II.3 - Object Variables

In [None]:
# List the columns stored with the object dtype.
print("The object variables are : ")
object_columns = df_train.select_dtypes("object").columns
for column_name in object_columns:
    print("- {}".format(column_name))

In [None]:
# Summary statistics of the HomePlanet column.
df_train["HomePlanet"].describe()

In [None]:
# Passenger counts per home planet, with side-by-side bars for each target value.
fig, ax = plt.subplots()
sns.histplot(
    data=df_train,
    x="HomePlanet",
    hue="Transported",
    multiple="dodge",
    alpha=.9,
    palette="Pastel1",
    ax=ax,
)
ax.grid()
plt.show()

There are different distributions according to the planet someone is from. Indeed, if you are from Europa you have more chance of being transported, whereas if you are from Earth it's the opposite, and from Mars it's like flipping a coin.

In [None]:
# Passenger counts per destination, with side-by-side bars for each target value.
fig, ax = plt.subplots()
sns.histplot(
    data=df_train,
    x="Destination",
    hue="Transported",
    multiple="dodge",
    alpha=.9,
    palette="Pastel1",
    ax=ax,
)
ax.grid()
ax.set_title("histogram of destination")
plt.show()

Looking at the histplot, we can see that there are three possible Destination : "TRAPPIST-1e", "PSO J318.5-22" and "55 Cancri e".
Separating each destination according to "Transported", the distributions are not the same: for "TRAPPIST-1e" you have more chance of not being transported, whereas it is the total opposite for "55 Cancri e", and the odds are even for "PSO J318.5-22".

Due to the format of the Cabin feature, we will have to slightly change it in order to see things. Indeed, Cabin contains the deck and the side of each passenger. As this information might be useful, we should try to take a look at it.

In [None]:
# Cabin is split on "/": the first piece is kept as the deck and the third as
# the side; the middle piece and the original column are discarded.
cabin_parts = df_train['Cabin'].str.split('/', expand=True)
df_train = (
    df_train
    .assign(Cabin_deck=cabin_parts[0], Cabin_side=cabin_parts[2])
    .drop(columns="Cabin")
)

In [None]:
# Passenger counts per cabin deck, with side-by-side bars for each target value.
sns.histplot(data = df_train,
             x = "Cabin_deck",
             hue = "Transported",
             multiple= "dodge",
             alpha=.9,
             palette="Pastel1")
plt.grid()
# The plotted variable is the deck, not the cabin number.
plt.title("histogram of Cabin deck")
plt.show()

In [None]:
# Passenger counts per cabin side, with side-by-side bars for each target value.
sns.histplot(data = df_train,
             x = "Cabin_side",
             hue = "Transported",
             multiple= "dodge",
             alpha=.9,
             palette="Pastel1")
plt.grid()
# The plotted variable is the side, not the cabin number.
plt.title("histogram of Cabin side")
plt.show()

As our models might not be able to take non-numeric features into account, we will have to slightly change our dataframe accordingly. 

In [None]:
# One-hot encode HomePlanet and Destination: one 0/1 column per observed value.
unique_HomePlanet = df_train["HomePlanet"].unique()
unique_destination = df_train["Destination"].unique()

categories_by_column = {
    "HomePlanet": unique_HomePlanet,
    "Destination": unique_destination,
}
for source_column, categories in categories_by_column.items():
    for category in categories:
        df_train[source_column + "_" + category] = df_train[source_column].eq(category).astype(int)

# The original text columns are no longer needed once the indicators exist.
df_train = df_train.drop(columns=["HomePlanet", "Destination"])

# Popping and re-assigning moves the target to the last column position.
df_train["Transported"] = df_train.pop("Transported")

## II.4 - Summary

In [None]:
# Only numeric/boolean columns can be correlated. The frame still holds string
# columns (PassengerId, Cabin_deck, Cabin_side), which make DataFrame.corr()
# raise on recent pandas versions, so they are filtered out explicitly.
numeric_columns = df_train.select_dtypes(include=["number", "bool"])

fig, ax = plt.subplots(figsize=(11, 10))
sns.heatmap(numeric_columns.corr().abs(), annot=True, ax=ax)
ax.set_title("Matrix of correlation (absolute value) of the different features")
plt.show()

Here we choose to show the absolute value of the correlation because we don't care if it's positive or negative, we just want to recap if a feature may or may not have an impact on "Transported". Therefore having only light colors for this is a better way to display it.

As we can see, CryoSleep is the most correlated variable followed by RoomService,SPA and VRDeck.     

**Remark**: We can see two groups of very correlated data, the "HomePlanet group" and the "Destination group". The variables inside these groups are very correlated because (for example) if you are from Mars, you are not from Europa or Earth, so it's pretty normal that the correlation is so strong.

In [None]:
# Model inputs, grouped by origin: passenger attributes, amenity spending,
# then the one-hot home-planet and destination indicators.
passenger_features = ['CryoSleep', 'Age', 'VIP']
spending_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
home_planet_features = ['HomePlanet_Europa', 'HomePlanet_Earth', 'HomePlanet_Mars']
destination_features = ['Destination_TRAPPIST-1e', 'Destination_PSO J318.5-22',
                        'Destination_55 Cancri e']

usefull_features = (
    passenger_features
    + spending_features
    + home_planet_features
    + destination_features
)

# III - Model

In [None]:
# Handling missing values
# The test set goes through the same preprocessing steps as the training set.
df_test.drop(labels = 'Name', axis = 1, inplace = True)

columns_Knn_imputer = ['CryoSleep','Age','VIP', 'RoomService',
                       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
columns_Simple_imputer = ['Destination','HomePlanet']

# Both imputers are fitted on the raw TRAINING data and only applied to the test
# set, so test rows are filled with statistics learned from train. The KNN
# imputer uses the same 'uniform' weighting as the one used for the training set.
Knn_imputer = KNNImputer(n_neighbors= opt_knn_imputer_neighbor, weights = 'uniform')
Knn_imputer.fit(data_train[columns_Knn_imputer])
df_test[columns_Knn_imputer] = Knn_imputer.transform(df_test[columns_Knn_imputer])

Simple_imputer = SimpleImputer(missing_values = np.nan, strategy='most_frequent')
Simple_imputer.fit(data_train[columns_Simple_imputer])
df_test[columns_Simple_imputer] = Simple_imputer.transform(df_test[columns_Simple_imputer])

# Turn the imputed flags back into 0/1 using the thresholds computed on the training data.
df_test["CryoSleep"] = df_test["CryoSleep"].apply(lambda x: 1*(x >= data_train_CryoSleep_mean))
df_test["VIP"] = df_test["VIP"].apply(lambda x: 1*(x >= data_train_VIP_mean))

## handling HomePlanet and Destination
# The indicator columns are derived from the categories seen in the TRAINING
# data, so every one-hot column the models were trained on exists in df_test
# even when a category does not occur in the test set.
unique_HomePlanet = data_train["HomePlanet"].dropna().unique()
unique_destination = data_train["Destination"].dropna().unique()

for home in unique_HomePlanet:
    df_test["HomePlanet_" + home] = (df_test["HomePlanet"] == home).astype(int)

for dest in unique_destination:
    df_test["Destination_" + dest] = (df_test["Destination"] == dest).astype(int)

df_test.drop(["HomePlanet","Destination"],axis = 1, inplace =True)

## III.1 - Logistic Regression

The first model we can think of is a logistic regression model.

In [None]:
# Feature matrix and target shared by the models below.
X_train = df_train[usefull_features]
y_train = df_train["Transported"]

# Mean score over 5 cross-validation folds.
logistic_estimator = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)
logistic_fold_scores = cross_val_score(logistic_estimator, X_train, y_train, cv=5)
logistic_score = logistic_fold_scores.mean()
print("The accuracy of the model is : {}".format(logistic_score))

The accuracy of the model is OK. Nothing really good, but at least we can use it as a baseline to judge whether our other models are good enough or not (depending on whether their scores are better than this one).

In [None]:
# Fit the logistic regression on the whole training set, with the same settings as in the cross-validation.
model_logistic = LogisticRegression(random_state=0,solver='lbfgs',max_iter = 1000).fit(X_train, y_train)

In [None]:
# Store the logistic predictions on the test frame, then export them cast to
# the dtype used by the sample submission's "Transported" column.
logistic_column = "Transported logistic"
sample_target_type = type(df_sample["Transported"][0])

df_test[logistic_column] = model_logistic.predict(df_test[usefull_features])
submission_logistic = (
    df_test[["PassengerId", logistic_column]]
    .astype({logistic_column: sample_target_type})
)
submission_logistic.to_csv("submission_logistic.csv", index=False)

## III.2 - KNN

The 2nd model we can think of is the KNN algorithm.       
Indeed we want to predict a target variable so we are doing classification and as we have some data one can have the idea : **"To the same causes the same consequences"** so the most similar person will have the biggest odds of having the same "Target" value.        
That's the idea behind the KNN algorithm. 

However, the KNN algorithm has 2 hyperparameters. The first one is the number of neighbors and the second one is the metric used to say whether someone is close to someone else.       
The default metric of the KNN is the Euclidean distance; we won't bother about it.        
About the number of neighbors, one idea to find the optimal number of neighbors is to iterate in a range of possible number of neighbors. We will train our model on each number of neighbors and then compare the accuracy. The optimal number of neighbors will be the one giving the maximum accuracy. 

In [None]:
X_train = df_train[usefull_features]
y_train = df_train["Transported"]

# Candidate neighbour counts: 1 to 49.
k_neighbors = np.arange(1, 50)

# Mean 5-fold cross-validation score for each candidate.
accuracy_knn = np.array([
    cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5).mean()
    for k in k_neighbors
])

In [None]:
# Cross-validated accuracy as a function of the number of neighbours.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_neighbors, accuracy_knn)
ax.set_xlabel("number of neighbors")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy of KNN algorithm according to the number of neighbors")
ax.grid()
plt.show()

In [None]:
# Keep the neighbour count whose cross-validated accuracy is the highest.
best_k_position = np.argmax(accuracy_knn)
optimal_neighbors = k_neighbors[best_k_position]
print("The maximum of accuracy is obtained at {} for k_neighbors = {}".format(
    accuracy_knn[best_k_position], optimal_neighbors))

In [None]:
# Fit the KNN classifier on the whole training set with the selected neighbour count.
model_KNN = KNeighborsClassifier(n_neighbors = optimal_neighbors).fit(X_train,y_train)

In [None]:
# Store the KNN predictions on the test frame, then export them cast to the
# dtype used by the sample submission's "Transported" column.
knn_column = "Transported KNN"
sample_target_type = type(df_sample["Transported"][0])

df_test[knn_column] = model_KNN.predict(df_test[usefull_features])
submission_KNN = (
    df_test[["PassengerId", knn_column]]
    .astype({knn_column: sample_target_type})
)
submission_KNN.to_csv("submission_KNN.csv", index=False)

## III.3 - Random Forest

In [None]:
X_train = df_train[usefull_features]
y_train = df_train["Transported"]

# Candidate maximum tree depths: 1 to 29.
lst_max_depth = np.arange(1,30)

# Mean 5-fold cross-validation score for each depth. The forest is seeded so
# that the scores, and therefore the selected depth, are the same on every run.
accuracy_RandomForestTree = np.array([
    cross_val_score(RandomForestClassifier(max_depth = depth, random_state = 0),
                    X_train, y_train, cv=5).mean()
    for depth in lst_max_depth
])

In [None]:
# Cross-validated accuracy as a function of the maximum tree depth.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(lst_max_depth, accuracy_RandomForestTree)
ax.set_xlabel("depth of the trees")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy of Random Forest algorithm according to the depths")
ax.grid()
plt.show()

In [None]:
# Keep the depth whose cross-validated accuracy is the highest.
best_depth_position = np.argmax(accuracy_RandomForestTree)
optimal_depth = lst_max_depth[best_depth_position]
print("The maximum of accuracy is obtained at {} for a depth of = {}".format(
    accuracy_RandomForestTree[best_depth_position], optimal_depth))

The accuracy is a bit better than KNN, it might be worth a try ! 

In [None]:
# Fit the random forest on the whole training set with the selected depth.
# The seed makes the fitted forest, and its predictions, reproducible across runs.
model_RF = RandomForestClassifier(max_depth = optimal_depth, random_state = 0).fit(X_train,y_train)

In [None]:
# Store the random-forest predictions on the test frame, then export them cast
# to the dtype used by the sample submission's "Transported" column.
rf_column = "Transported RF"
sample_target_type = type(df_sample["Transported"][0])

df_test[rf_column] = model_RF.predict(df_test[usefull_features])
submission_RF = (
    df_test[["PassengerId", rf_column]]
    .astype({rf_column: sample_target_type})
)
submission_RF.to_csv("submission_RF.csv", index=False)

So far our models accuracy on the test dataset are the following :
   - logistic : 0.7858051695649325
   - KNN : 0.7945482735140693
   - RF : 0.7996113661343934         
Seems like the random forest model is the best one 

## III.4 - Vote Model

In [None]:
# Hard-voting ensemble combining the logistic, KNN and random-forest estimators.
model_vote = VotingClassifier([('logistic',model_logistic),('KNN',model_KNN),('RF',model_RF)], voting = 'hard')

In [None]:
# Mean score of the voting ensemble over 5 cross-validation folds.
vote_fold_scores = cross_val_score(model_vote, X_train, y_train, cv=5)
score_vote = vote_fold_scores.mean()
print("The vote model has an accuracy of {}".format(score_vote))

In [None]:
# Fit the voting ensemble on the training features and target.
model_vote.fit(X_train,y_train)

In [None]:
# Store the ensemble predictions on the test frame, then export them cast to
# the dtype used by the sample submission's "Transported" column.
vote_column = "Transported vote"
sample_target_type = type(df_sample["Transported"][0])

df_test[vote_column] = model_vote.predict(df_test[usefull_features])
submission_vote = (
    df_test[["PassengerId", vote_column]]
    .astype({vote_column: sample_target_type})
)
submission_vote.to_csv("submission_vote.csv", index=False)

In [None]:
# 2-D projection used only for visualisation.
model_viz = PCA(n_components = 2)
# The principal axes are learned on the training features...
df_reduced_train = model_viz.fit_transform(df_train[usefull_features].values)
# ...and the test features are projected onto those SAME axes. Re-fitting on the
# test set would replace the axes learned above and make the two projections
# incomparable.
df_reduced_test = model_viz.transform(df_test[usefull_features].values)

In [None]:
# One panel per model: the 2-D projection of the test set, coloured by that model's predictions.
fig, axes = plt.subplots(1, 4, figsize=(20, 6))

panels = [
    ("Transported KNN", "KNN | PCA of df_test"),
    ("Transported RF", "Random Forest | PCA of df_test"),
    ("Transported logistic", "logistic | PCA of df_test"),
    ("Transported vote", "vote | PCA of df_test"),
]

for ax, (prediction_column, panel_title) in zip(axes, panels):
    ax.scatter(x=df_reduced_test[:, 0], y=df_reduced_test[:, 1], c=df_test[prediction_column])
    ax.set_title(panel_title)
    ax.grid()

plt.show()

In [None]:
# The final submission is the voting ensemble's output, with its prediction
# column renamed to "Transported" before being written out.
submission_vote = submission_vote.rename(columns={"Transported vote": "Transported"})
submission_vote.to_csv("submission.csv", index=False)