## Loading the data:

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [3]:
# Read the training and test files from the working directory, in that order.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

## Understanding the data first:

In [None]:
# Column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Summary statistics of the training frame.
train_data.describe()

In [None]:
# Number of missing values in each column.
train_data.isna().sum()

## Data Preparation:

In [3]:
def _encode_with_median_fill(frame, column):
    """Label-encode ``frame[column]`` in place and return the fitted encoder.

    The encoder is fitted on the column as-is (missing values included).
    Rows that were missing *before* encoding are then overwritten with the
    integer median of the encoded column.
    """
    encoder = LabelEncoder()
    encoder.fit(frame[column])
    was_missing = frame[column].isna()
    frame[column] = encoder.transform(frame[column])
    frame.loc[was_missing, column] = int(frame[column].median())
    return encoder


# Target as 0/1 instead of booleans.
train_data['Transported'] = train_data['Transported'].apply(lambda x: 1 if x else 0)

# Collapse the five spending columns into one total, then cut it into 7 equal-width bands.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train_data['Total_Spent'] = train_data[spending_columns].sum(axis=1, numeric_only=True)
train_data = train_data.drop(columns=spending_columns)
train_data['Total_Spent_Band'] = pd.cut(train_data['Total_Spent'], 7)
train_data = train_data.drop(columns=['Total_Spent'])
label_encoder_total_spent_band = _encode_with_median_fill(train_data, 'Total_Spent_Band')

# Integer taken from the part of PassengerId after the underscore.
# NOTE(review): if the id has the form <group>_<number within group>, this is
# the within-group number rather than the group id; confirm which one is intended.
train_data['Group'] = train_data['PassengerId'].apply(lambda x: int(x.split('_')[1]))

# Fill missing ages with the (truncated) mean age, then cut into 7 equal-width bands.
# Assign the result back instead of `train_data['Age'].fillna(..., inplace=True)`:
# the chained in-place form acts on a temporary Series and does not update the
# frame when pandas Copy-on-Write is active.
train_data['Age'] = train_data['Age'].fillna(int(train_data['Age'].mean(skipna=True)))
train_data['Age_Band'] = pd.cut(train_data['Age'], 7)
train_data = train_data.drop(columns=['Age'])
label_encoder_age_band = _encode_with_median_fill(train_data, 'Age_Band')

# Cabin is split on '/' into three parts; the raw column is dropped afterwards.
train_data[['Deck', 'Num', 'Side']] = train_data['Cabin'].str.split('/', expand=True)
train_data = train_data.drop(columns=['Cabin'])

# Encode the remaining categorical columns; each encoder keeps its own name so
# it can be reused later.
label_encoder_deck = _encode_with_median_fill(train_data, 'Deck')
label_encoder_side = _encode_with_median_fill(train_data, 'Side')
label_encoder_home_planet = _encode_with_median_fill(train_data, 'HomePlanet')
label_encoder_vip = _encode_with_median_fill(train_data, 'VIP')
label_encoder_cryo_sleep = _encode_with_median_fill(train_data, 'CryoSleep')
label_encoder_cryo_destination = _encode_with_median_fill(train_data, 'Destination')

# n=1 splits on the first space only, so a name containing more than one space
# still yields exactly two columns (first name, rest of the name).
train_data[['First_Name', 'Last_Name']] = train_data['Name'].str.split(' ', n=1, expand=True)

train_data.to_csv('clean_data.csv', index=False)

In [4]:
def dataprocessing(data, spent_bins=7, age_bins=7):
    """Turn a raw passenger frame into a model-ready, one-hot encoded frame.

    The input frame is modified in place (columns are dropped and added on
    ``data`` itself); the returned frame is the new object produced by
    ``pd.get_dummies``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain Name, PassengerId, Cabin, Age, Destination, CryoSleep,
        VIP and the five spending columns. Any other column is passed through
        to ``pd.get_dummies`` untouched.
    spent_bins, age_bins : int or sequence, default 7
        Forwarded to ``pd.cut`` for the total-spend and age bands. With an
        integer the band edges are derived from ``data`` itself, so two
        different frames get different bands; pass explicit edges to obtain
        identical dummy columns on train and test.

    Returns
    -------
    pandas.DataFrame
        The processed frame after ``pd.get_dummies``.
    """
    spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    data.drop(columns=['Name'], inplace=True)

    # One total-spend figure replaces the five spending columns, then is banded.
    data['Total_Spent'] = data[spending_columns].sum(axis=1, numeric_only=True)
    data.drop(columns=spending_columns, inplace=True)
    data['Total_Spent_Band'] = pd.cut(data['Total_Spent'], spent_bins)
    data.drop(columns=['Total_Spent'], inplace=True)

    # Missing ages get the (truncated) mean age. The result is assigned back:
    # `data['Age'].fillna(..., inplace=True)` works on a temporary Series and
    # does not update `data` when pandas Copy-on-Write is active.
    data['Age'] = data['Age'].fillna(int(data['Age'].mean(skipna=True)))
    data['Age_Band'] = pd.cut(data['Age'], age_bins)
    data.drop(columns=['Age'], inplace=True)

    # Integer taken from the part of PassengerId after the underscore.
    # NOTE(review): if the id has the form <group>_<number within group>, this
    # is the within-group number rather than the group id; confirm the intent.
    data['Group'] = data['PassengerId'].apply(lambda x: int(x.split('_')[1]))
    data.drop(columns=['PassengerId'], inplace=True)

    # Cabin is split on '/' into three parts; only the first and last are kept.
    data[['Deck', 'Num', 'Side']] = data['Cabin'].str.split('/', expand=True)
    data.drop(columns=['Num', 'Cabin'], inplace=True)
    data['Deck'] = data['Deck'].fillna('Missing')
    data['Side'] = data['Side'].fillna('Missing')

    data['Destination'] = data['Destination'].fillna('Missing')

    # Encode the two flags from THIS frame's own columns. The previous version
    # read the global `train_data` and built VIP from CryoSleep, so VIP was a
    # copy of CryoSleep and any other frame received the training rows' values.
    # Only an explicit True maps to 1; a missing value maps to 0 (it used to
    # become 1 because NaN is truthy).
    data['CryoSleep'] = data['CryoSleep'].eq(True).astype(int)
    data['VIP'] = data['VIP'].eq(True).astype(int)

    data = pd.get_dummies(data)

    return data

In [8]:
# Reload the raw files so this cell starts from unprocessed data.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

clean_train_data = dataprocessing(train_data)

# The label is `Transported`, not `PassengerId`. It is popped out of the feature
# matrix so the models cannot see the answer among their inputs, and converted
# to 0/1 so the binary metrics below treat 1 as the positive class.
target_train = clean_train_data.pop('Transported').apply(lambda x: 1 if x else 0)

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    clean_train_data,
    target_train,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed seed makes the forest, and therefore the scores printed below,
# reproducible from one run to the next.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print('Accuracy:',accuracy_score(y_test, y_pred)*100,'%')
print('Recall:', recall_score(y_test, y_pred)*100,'%')
print('Precision:', precision_score(y_test, y_pred)*100,'%')
print('F1 Score:', f1_score(y_test, y_pred)*100,'%')

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression (iteration cap raised to 1000) and score it on the held-out rows.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Recall:', recall_score),
    ('Precision:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred)*100, '%')

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
# Store the predictions under the name the metrics read. They used to be saved
# as `y_predict`, so the scores below were computed on the `y_pred` left behind
# by an earlier model cell.
y_pred = knn.predict(X_test)
print('Accuracy:',accuracy_score(y_test, y_pred)*100,'%')
print('Recall:', recall_score(y_test, y_pred)*100,'%')
print('Precision:', precision_score(y_test, y_pred)*100,'%')
print('F1 Score:', f1_score(y_test, y_pred)*100,'%')

## First Assumptions:

### Understand how each feature is correlated with the Transported variable:

At first glance no single variable stands out as clearly correlated with `Transported`, so an exploratory analysis is necessary:

For the first step, let's start studying the groups they were in, the home planet, their cryosleep status, their location in the ship (based on the cabin deck, number and side), their destination and age ranges and VIP status.

#### Group variable:

In [None]:
# Row count for every (Group, Transported) combination.
transported = [0,1]
for group_value in train_data['Group'].unique():
    for flag in transported:
        matching_rows = train_data.query(f'Group == {group_value} & Transported == {flag}')
        print('Group', group_value, 'with transported:', flag, '. Had a total of:', len(matching_rows))
    
    

This indicates that there is no correlation between the groups and the number of transportations, so it's discarded.

#### Cabin variables:

In [None]:
# Row count for every (Deck, Transported) combination, with a separator after each deck.
transported = [0,1]
for deck_value in train_data['Deck'].unique():
    for flag in transported:
        matching_rows = train_data.query(f'Deck == {deck_value} & Transported == {flag}')
        print('Deck', deck_value, 'with transported:', flag, '. Had a total of:', len(matching_rows))
    print('-------------------------------------------------')
    

The most populated decks were 1, 2 and 3. The decks 1, 4 and 2 had a vast difference when it came to transported passengers.

In [None]:
# Row count for every (Side, Transported) combination, with a separator after each side.
transported = [0,1]
for side_value in train_data['Side'].unique():
    for flag in transported:
        matching_rows = train_data.query(f'Side == {side_value} & Transported == {flag}')
        print('Side', side_value, 'with transported:', flag, '. Had a total of:', len(matching_rows))
    print('-------------------------------------------------')

The side variable also doesn't show any indications of a different distribution.

#### Money spent aboard variables:

#### Age variable:

In [None]:
# Number of rows in each age band, over all rows.
train_data['Age_Band'].value_counts()

In [None]:
# Age-band counts restricted to rows with Transported == 1.
train_data.loc[train_data['Transported'] == 1, 'Age_Band'].value_counts()

In [None]:
# Age-band counts restricted to rows with Transported == 0.
train_data.loc[train_data['Transported'] == 0, 'Age_Band'].value_counts()

#### VIP variable

In [None]:
# VIP value counts restricted to rows with Transported == 0.
train_data.loc[train_data['Transported'] == 0, 'VIP'].value_counts()

In [None]:
# VIP value counts restricted to rows with Transported == 1.
train_data.loc[train_data['Transported'] == 1, 'VIP'].value_counts()

#### HomePlanet and Destination variables

In [None]:
# HomePlanet value counts restricted to rows with Transported == 0.
train_data.loc[train_data['Transported'] == 0, 'HomePlanet'].value_counts()

In [None]:
# HomePlanet value counts restricted to rows with Transported == 1.
train_data.loc[train_data['Transported'] == 1, 'HomePlanet'].value_counts()

Passengers from Europa were transported at a proportionally higher rate than passengers from the other home planets.

#### CryoSleep variable

In [4]:
# CryoSleep value counts restricted to rows with Transported == 0.
train_data.loc[train_data['Transported'] == 0, 'CryoSleep'].value_counts()

CryoSleep
0    3761
1     554
Name: count, dtype: int64

In [5]:
# CryoSleep value counts restricted to rows with Transported == 1.
train_data.loc[train_data['Transported'] == 1, 'CryoSleep'].value_counts()

CryoSleep
1    2483
0    1895
Name: count, dtype: int64

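The counts above show a strong but not absolute association: most passengers coded CryoSleep = 0 were not transported (3761 of 5656, about 66%), while most passengers coded CryoSleep = 1 were transported (2483 of 3037, about 82%).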

#### Destination variable

In [None]:
# Destination value counts restricted to rows with Transported == 0.
train_data.loc[train_data['Transported'] == 0, 'Destination'].value_counts()

In [None]:
# Destination value counts restricted to rows with Transported == 1.
train_data.loc[train_data['Transported'] == 1, 'Destination'].value_counts()

#### Name

In [None]:
# Read the two name lists, one name per line. The `with` blocks close each file
# even if reading raises, which the manual open()/close() pairs did not.
with open('female.txt', 'r') as file_female:
    female_names = file_female.read().splitlines()
with open('male.txt', 'r') as file_male:
    male_names = file_male.read().splitlines()

# Gender code used from here on: 0 = name from female.txt, 1 = name from male.txt.
female_dataset = pd.DataFrame({'Name':female_names, 'Gender':0})
male_dataset = pd.DataFrame({'Name':male_names, 'Gender':1})
names_dataset1 = pd.concat([male_dataset, female_dataset])

Names Corpus, Version 1.3 (1994-03-29)
Copyright (C) 1991 Mark Kantrowitz
Additions by Bill Ross

This corpus contains 5001 female names and 2943 male names, sorted
alphabetically, one per line.

You may use the lists of names for any purpose, so long as credit is
given in any published work. You may also redistribute the list if you
provide the recipients with a copy of this README file. The lists are
not in the public domain (I retain the copyright on the lists) but are
freely redistributable.  If you have any additions to the lists of
names, I would appreciate receiving them.

Mark Kantrowitz <mkant+@cs.cmu.edu>
http://www-2.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/


In [None]:
#https://archive.ics.uci.edu/dataset/591/gender+by+name
# Keep only the name and gender columns, then recode gender: 'F' -> 0, anything else -> 1.
names_dataset2 = pd.read_csv('name_gender_dataset.csv').loc[:, ['Name', 'Gender']]
names_dataset2['Gender'] = (names_dataset2['Gender'] != 'F').astype('int64')

In [None]:
# Stack both name sources and drop rows that are exact duplicates.
names_dataset = pd.concat(
    [names_dataset1, names_dataset2],
).drop_duplicates()
# The Name column on its own, as iterated by assignGender.
names_list = names_dataset.loc[:, 'Name']

In [None]:
# Boolean mask of rows without a first name, and the first names that are present.
nan_names_dataset_indexes = train_data['First_Name'].isnull()
non_nan_names_dataset = train_data['First_Name'][~nan_names_dataset_indexes]

In [None]:
other_name = "Aamir"
# Take the first matching row by POSITION. `[0]` is a label lookup, and the
# filtered Series keeps the index labels inherited from the concatenated
# sources, so the match is generally not labelled 0.
names_dataset.query('Name == "'+other_name+'"')['Gender'].iloc[0]

In [None]:
def assignGender(name, names=None, verbose=False):
    """Guess a gender code for ``name`` from a reference table of names.

    The table is scanned in order. A case-insensitive exact match returns that
    row's gender immediately. Otherwise every reference name gets a similarity
    score and the gender of the best-scoring one is returned (the first one
    wins ties):

    * ``name`` is a substring of the reference name:
      ``len(name) / len(reference)``;
    * otherwise: number of positions holding the same character, divided by
      ``len(reference)``.

    Parameters
    ----------
    name : str
        Name to classify. Must be a string; callers are expected to filter out
        missing values first.
    names : pandas.DataFrame, optional
        Reference table with ``Name`` and ``Gender`` columns. Defaults to the
        notebook-level ``names_dataset``.
    verbose : bool, default False
        Print ``name`` and the gender found on an exact match. Off by default
        so applying the function to a whole column does not flood the output.

    Returns
    -------
    The ``Gender`` value of the chosen reference row, or 0 when no reference
    name scores above zero.
    """
    if names is None:
        names = names_dataset

    target = name.lower()
    best_gender, best_score = 0, 0

    # Walk names and genders together rather than running a DataFrame query
    # for every candidate: one pass over the table instead of one scan per row.
    # The outcome is unchanged, because a repeated name scores the same as its
    # first occurrence and only a strictly higher score replaces the best match.
    for other_name, gender in zip(names['Name'].tolist(), names['Gender'].tolist()):
        if not isinstance(other_name, str):
            continue  # missing/NaN entries in the reference table
        candidate = other_name.lower()

        if target == candidate:
            if verbose:
                print(name, gender)
            return gender
        if not candidate:
            continue  # an empty reference name would divide by zero below

        if target in candidate:
            score = len(name) / len(other_name)
        else:
            same_position = sum(a == b for a, b in zip(target, candidate))
            score = same_position / len(other_name)

        if score > best_score:
            best_gender, best_score = gender, score

    return best_gender


# Only rows that have a first name are looked up; the remaining rows keep a missing Sex.
non_nan_first_names_index = train_data['First_Name'].notna()
non_nan_first_names_data = train_data.loc[train_data['First_Name'].notna()]

train_data.loc[non_nan_first_names_index, 'Sex'] = non_nan_first_names_data['First_Name'].apply(assignGender)

In [None]:
# Number of rows per inferred gender code.
train_data['Sex'].value_counts()

In [None]:
# Inferred-gender counts restricted to rows with Transported == 1. The original
# expression `query('T')` was truncated and refers to a name that does not exist.
# NOTE(review): the filter is assumed to mirror the other per-variable cells; confirm.
train_data.query('Transported == 1')['Sex'].value_counts()