## Loading the data:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
train_data = pd.read_csv('train.csv')

## Understanding the data first:

In [19]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   PassengerId   8693 non-null   object  
 1   HomePlanet    8693 non-null   int64   
 2   CryoSleep     8693 non-null   int64   
 3   Destination   8511 non-null   object  
 4   Age           8693 non-null   float64 
 5   VIP           8693 non-null   int64   
 6   RoomService   8512 non-null   float64 
 7   FoodCourt     8510 non-null   float64 
 8   ShoppingMall  8485 non-null   float64 
 9   Spa           8510 non-null   float64 
 10  VRDeck        8505 non-null   float64 
 11  Name          8493 non-null   object  
 12  Transported   8693 non-null   int64   
 13  Group         8693 non-null   int64   
 14  Deck          8494 non-null   object  
 15  Num           8494 non-null   object  
 16  Side          8494 non-null   object  
 17  Age_Band      8693 non-null   category
dtypes: categ

In [64]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,Deck,Num,Side,Age_Band
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,1,1,1,0,0,"(33.857, 45.143]"
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,0,1,5,0,1,"(22.571, 33.857]"
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,1,1,0,0,1,"(56.429, 67.714]"
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,1,2,0,0,1,"(22.571, 33.857]"
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,0,1,5,1,1,"(11.286, 22.571]"


In [None]:
train_data.describe()

In [None]:
train_data.isna().sum()

## Data Preparation:

In [3]:
train_data['Transported'] = train_data['Transported'].apply(lambda x: 1 if x else 0)

train_data['Total_Spent'] = train_data[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1, numeric_only=True)
train_data.drop(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], axis = 1,inplace = True)

train_data['Total_Spent_Band'] = pd.cut(train_data['Total_Spent'], 7)
train_data.drop(['Total_Spent'], axis = 1, inplace = True)

label_encoder_total_spent_band = LabelEncoder()
label_encoder_total_spent_band.fit(train_data['Total_Spent_Band'])
nan_index = train_data['Total_Spent_Band'].isna()
train_data['Total_Spent_Band'] = label_encoder_total_spent_band.transform(train_data['Total_Spent_Band'])
train_data.loc[nan_index,'Total_Spent_Band'] = int(train_data['Total_Spent_Band'].median()) 

train_data['Group'] = train_data['PassengerId'].apply(lambda x: int(x.split('_')[1]))

train_data['Age'].fillna(int(train_data['Age'].mean(skipna = True)), inplace = True)
train_data['Age_Band'] = pd.cut(train_data['Age'], 7)
train_data.drop(['Age'], axis = 1, inplace = True)

label_encoder_age_band = LabelEncoder()
label_encoder_age_band.fit(train_data['Age_Band'])
nan_index = train_data['Age_Band'].isna()
train_data['Age_Band'] = label_encoder_age_band.transform(train_data['Age_Band'])
train_data.loc[nan_index,'Age_Band'] = int(train_data['Age_Band'].median())  

train_data[['Deck', 'Num', 'Side']] = train_data['Cabin'].str.split('/', expand = True)
train_data.drop('Cabin', axis = 1, inplace = True)

label_encoder_deck = LabelEncoder()
label_encoder_deck.fit(train_data['Deck'])
nan_index = train_data['Deck'].isna()
train_data['Deck'] = label_encoder_deck.transform(train_data['Deck'])
train_data.loc[nan_index,'Deck'] = int(train_data['Deck'].median())   

label_encoder_side = LabelEncoder()
label_encoder_side.fit(train_data['Side'])
nan_index = train_data['Side'].isna()
train_data['Side'] = label_encoder_side.transform(train_data['Side'])
train_data.loc[nan_index,'Side'] = int(train_data['Side'].median()) 

label_encoder_home_planet = LabelEncoder()
label_encoder_home_planet.fit(train_data['HomePlanet'])
nan_index = train_data['HomePlanet'].isna()
train_data['HomePlanet'] = label_encoder_home_planet.transform(train_data['HomePlanet'])
train_data.loc[nan_index,'HomePlanet'] = int(train_data['HomePlanet'].median())

label_encoder_vip = LabelEncoder()
label_encoder_vip.fit(train_data['VIP'])
nan_index = train_data['VIP'].isna()
train_data['VIP'] = label_encoder_vip.transform(train_data['VIP'])
train_data.loc[nan_index,'VIP'] = int(train_data['VIP'].median())  

label_encoder_cryo_sleep = LabelEncoder()
label_encoder_cryo_sleep.fit(train_data['CryoSleep'])
nan_index = train_data['CryoSleep'].isna()
train_data['CryoSleep'] = label_encoder_cryo_sleep.transform(train_data['CryoSleep'])
train_data.loc[nan_index,'CryoSleep'] = int(train_data['CryoSleep'].median())  

label_encoder_cryo_destination = LabelEncoder()
label_encoder_cryo_destination.fit(train_data['Destination'])
nan_index = train_data['Destination'].isna()
train_data['Destination'] = label_encoder_cryo_destination.transform(train_data['Destination'])
train_data.loc[nan_index,'Destination'] = int(train_data['Destination'].median())

train_data[['First_Name', 'Last_Name']] = train_data['Name'].str.split(' ', expand=True)

train_data.to_csv('clean_data.csv', index = False)

## First Assumptions:

### Understand how each feature is correlated with the Transported variable:

In this case, it doesn't seem to be any variable that right away might be correlated, so an exploratory analysis will be necessary:

For the first step, let's start studying the groups they were in, the home planet, their cryosleep status, their location in the ship (based on the cabin deck, number and side), their destination and age ranges and VIP status.

#### Group variable:

In [100]:
transported = [0,1]
for value in train_data['Group'].unique():
    for i in transported:
        print('Group', value, 'with transported:',i,'. Had a total of:',len(train_data.query('Group == '+str(value)+' & Transported == '+str(i))))
    
    

Group 1 with transported: 0 . Had a total of: 3258
Group 1 with transported: 1 . Had a total of: 2959
Group 2 with transported: 0 . Had a total of: 623
Group 2 with transported: 1 . Had a total of: 789
Group 3 with transported: 0 . Had a total of: 218
Group 3 with transported: 1 . Had a total of: 353
Group 4 with transported: 0 . Had a total of: 94
Group 4 with transported: 1 . Had a total of: 137
Group 5 with transported: 0 . Had a total of: 58
Group 5 with transported: 1 . Had a total of: 70
Group 6 with transported: 0 . Had a total of: 34
Group 6 with transported: 1 . Had a total of: 41
Group 7 with transported: 0 . Had a total of: 23
Group 7 with transported: 1 . Had a total of: 23
Group 8 with transported: 0 . Had a total of: 7
Group 8 with transported: 1 . Had a total of: 6


This indicates that there is no correlation between the groups and the number of transportations, so it's discarded.

#### Cabin variables:

In [78]:
transported = [0,1]
for value in train_data['Deck'].unique():
    for i in transported:
        print('Deck', value, 'with transported:',i,'. Had a total of:',len(train_data.query('Deck == '+str(value)+' & Transported == '+str(i))))
    print('-------------------------------------------------')
    

Deck 1 with transported: 0 . Had a total of: 207
Deck 1 with transported: 1 . Had a total of: 572
-------------------------------------------------
Deck 5 with transported: 0 . Had a total of: 1664
Deck 5 with transported: 1 . Had a total of: 1329
-------------------------------------------------
Deck 0 with transported: 0 . Had a total of: 129
Deck 0 with transported: 1 . Had a total of: 127
-------------------------------------------------
Deck 6 with transported: 0 . Had a total of: 1238
Deck 6 with transported: 1 . Had a total of: 1321
-------------------------------------------------
Deck 4 with transported: 0 . Had a total of: 563
Deck 4 with transported: 1 . Had a total of: 313
-------------------------------------------------
Deck 3 with transported: 0 . Had a total of: 271
Deck 3 with transported: 1 . Had a total of: 207
-------------------------------------------------
Deck 2 with transported: 0 . Had a total of: 239
Deck 2 with transported: 1 . Had a total of: 508
----------

The most populated decks were 1, 2 and 3. The decks 1, 4 and 2 had a vast difference when it came to transported passengers.

In [79]:
transported = [0,1]
for value in train_data['Side'].unique():
    for i in transported:
        print('Side', value, 'with transported:',i,'. Had a total of:',len(train_data.query('Side == '+str(value)+' & Transported == '+str(i))))
    print('-------------------------------------------------')

Side 0 with transported: 0 . Had a total of: 2308
Side 0 with transported: 1 . Had a total of: 1898
-------------------------------------------------
Side 1 with transported: 0 . Had a total of: 2007
Side 1 with transported: 1 . Had a total of: 2480
-------------------------------------------------


The side variable also doesn't show any indications of a different distribution.

#### Money spent aboard variables:

#### Age variable:

In [7]:
train_data['Age_Band'].value_counts()

Age_Band
(22.571, 33.857]    2824
(11.286, 22.571]    2272
(33.857, 45.143]    1681
(45.143, 56.429]     769
(-0.079, 11.286]     765
(56.429, 67.714]     318
(67.714, 79.0]        64
Name: count, dtype: int64

In [88]:
train_data.query('Transported == 1')['Age_Band'].value_counts()

Age_Band
(22.571, 33.857]    1336
(11.286, 22.571]    1121
(33.857, 45.143]     824
(-0.079, 11.286]     535
(45.143, 56.429]     370
(56.429, 67.714]     163
(67.714, 79.0]        29
Name: count, dtype: int64

In [84]:
train_data.query('Transported == 0')['Age_Band'].value_counts()

Age_Band
(22.571, 33.857]    1488
(11.286, 22.571]    1151
(33.857, 45.143]     857
(45.143, 56.429]     399
(-0.079, 11.286]     230
(56.429, 67.714]     155
(67.714, 79.0]        35
Name: count, dtype: int64

#### VIP variable

In [90]:
train_data.query('Transported == 0')['VIP'].value_counts()

VIP
0    4093
1     222
Name: count, dtype: int64

In [91]:
train_data.query('Transported == 1')['VIP'].value_counts()

VIP
0    4198
1     180
Name: count, dtype: int64

#### HomePlanet and Destination variables

In [9]:
train_data.query('Transported == 0')['HomePlanet'].value_counts()

HomePlanet
0    2749
2     839
1     727
Name: count, dtype: int64

In [10]:
train_data.query('Transported == 1')['HomePlanet'].value_counts()

HomePlanet
0    2054
1    1404
2     920
Name: count, dtype: int64

Europa travellers had more proportional cases in comparison to other home planets

#### CryoSleep variable

In [40]:
train_data.query('Transported == 0')['CryoSleep'].value_counts()

CryoSleep
False    3650
True      554
Name: count, dtype: int64

In [37]:
train_data.query('Transported == 1')['CryoSleep'].value_counts()

CryoSleep
True     2483
False    1789
Name: count, dtype: int64

For now it's safe to say that those who were not in cryosleep were not transported

#### Destination variable

In [45]:
train_data.query('Transported == 0')['Destination'].value_counts()

Destination
TRAPPIST-1e      3128
55 Cancri e       702
PSO J318.5-22     395
Name: count, dtype: int64

In [46]:
train_data.query('Transported == 1')['Destination'].value_counts()

Destination
TRAPPIST-1e      2787
55 Cancri e      1098
PSO J318.5-22     401
Name: count, dtype: int64

#### Name

In [5]:
file_female = open('female.txt','r')
female_names = file_female.read().splitlines()
file_female.close()
file_male = open('male.txt','r')
male_names = file_male.read().splitlines()
file_male.close()

female_dataset = pd.DataFrame({'Name':female_names, 'Gender':0})
male_dataset = pd.DataFrame({'Name':male_names, 'Gender':1})
names_dataset1 = pd.concat([male_dataset, female_dataset])

Names Corpus, Version 1.3 (1994-03-29)
Copyright (C) 1991 Mark Kantrowitz
Additions by Bill Ross

This corpus contains 5001 female names and 2943 male names, sorted
alphabetically, one per line.

You may use the lists of names for any purpose, so long as credit is
given in any published work. You may also redistribute the list if you
provide the recipients with a copy of this README file. The lists are
not in the public domain (I retain the copyright on the lists) but are
freely redistributable.  If you have any additions to the lists of
names, I would appreciate receiving them.

Mark Kantrowitz <mkant+@cs.cmu.edu>
http://www-2.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/


In [6]:
#https://archive.ics.uci.edu/dataset/591/gender+by+name
names_dataset2 = pd.read_csv('name_gender_dataset.csv')
names_dataset2 = names_dataset2[['Name','Gender']]
names_dataset2['Gender'] = names_dataset2['Gender'].apply(lambda x: 0 if x == 'F' else 1)

In [7]:
names_dataset = pd.concat([names_dataset1, names_dataset2]).drop_duplicates()
names_list = names_dataset['Name']

In [45]:
nan_names_dataset_indexes = train_data['First_Name'].isna()
non_nan_names_dataset = train_data['First_Name'].dropna()

In [None]:
def assignGender(name):
    aux_similar_gender = (0,0)
    print(name)
    for other_name in names_list:
        if name.lower() in other_name.lower():
            if name.lower() == other_name.lower():
                gender = names_dataset.query('Name == "'+other_name+'"')['Gender']
                #print("Name: "+name, "Gender:",gender, "Score: 1")
                return gender
            else:
                score = len(name)/len(other_name)
                if aux_similar_gender[1] < score:
                    aux_similar_gender = (names_dataset.query('Name == "'+other_name+'"')['Gender'], score)
        else:
            aux_counter = 0
            i = 0
            while i < len(name) and i < len(other_name):
                if name.lower()[i] == other_name.lower()[i]:
                    aux_counter += 1
                i += 1
            score = aux_counter/len(other_name)
            if score > aux_similar_gender[1]:
                aux_similar_gender = (names_dataset.query('Name == "'+other_name+'"')['Gender'], score)
                
    #print('Name: '+name, 'Gender:',aux_similar_gender[0], 'Score:',aux_similar_gender[1])        
    return aux_similar_gender[0]


non_nan_first_names_index = train_data['First_Name'].notna()
non_nan_first_names_data = train_data[train_data['First_Name'].notna()]


train_data.loc[non_nan_first_names_index,'First_Name'] = non_nan_first_names_data['First_Name'].apply(lambda x: assignGender(x))



Maham
Juanna
Altark
Solam
Willy
Sandie
Billex
Candra
Andona
Erraiam
Altardr
Wezena
Berers
Reney
Elle
Justie
Flats
Carry
Alus
Lyde
Philda
Almary
Glendy
Mollen
Breney
Mael
Terta
Penton
Karard
Anyoni
Ceros
Ginia
Coobix
Cinets
Dontch
Ziba
Luse
Marina
Loise
Jorgie
Margia
Ankalik
Jodye
Kayne
Cassa
Zelowl
Mass
Sony
Vivia
Elaney
Elson
Okulas
Instab
Zinoces
Warry
Shanya
Sterry
Colatz
Diandy
Ninaha
Celine
Velyne
Cinst
Meremy
Nelly
Thell
Gorn
Aldibah
Conk
Pon
Spuri
Dellie
Totse
Eaturs
Coren
Furudah
Jodye
Stmeal
Heremy
Deanne
Tinez
Gracy
Stald
Tiney
Alchium
Doria
Leence
Aliey
Thewis
Book
Ritany
Arlen
Shaele
Fanne
Carry
Coobix
Natald
Oline
Graviph
Aldeba
Alaratz
Izark
Aldera
Moth
Idary
Maura
Rohs
Alhenah
Batan
Kleeiak
Bees
Shelle
Iree
Milly
Miloss
Grohs
Barave
Race
Flynx
Egrasp
Camina
Fayene
Monah
Charda
Evaley
Verney
Pyrohs
Guadae
Verly
Samie
Corsh
Billya
Shanie
Andan
Gory
Kabiton
Scharab
Fany
Ianya
Graven
Miten
Rios
Wandy
Mollie
Coracy
Hanna
Hilip
Doryn
Toperon
Wees
Elaney
Raque
Sallyl
Nancis
Kso