<h1 align='center'>Titanic Data Set  Pre-Processing

## Importing Packages

In [35]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import preprocessing

# import sys
# sys.setrecursionlimit(10000)
# import sys 
# print(sys.getrecursionlimit()) 

---

## Loading Dataset

In [36]:
data = pd.read_csv('https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv', usecols=['Age','Sex','Pclass','SibSp','Parch','Cabin','Fare','Embarked','Survived'])

In [37]:
data = data[['Age','Sex','Pclass','SibSp','Parch','Cabin','Fare','Embarked','Survived']]
data.head()

Unnamed: 0,Age,Sex,Pclass,SibSp,Parch,Cabin,Fare,Embarked,Survived
0,22.0,male,3,1,0,,7.25,S,0
1,38.0,female,1,1,0,C85,71.2833,C,1
2,26.0,female,3,0,0,,7.925,S,1
3,35.0,female,1,1,0,C123,53.1,S,1
4,35.0,male,3,0,0,,8.05,S,0


In [38]:
data.groupby(['Survived' ,'Sex', 'Embarked' ]).mean()['Age']

Survived  Sex     Embarked
0         female  C           25.583333
                  Q           28.100000
                  S           24.698113
          male    C           34.744444
                  Q           31.066667
                  S           31.176667
1         female  C           28.645455
                  Q           21.571429
                  S           28.996241
          male    C           29.725833
                  Q           29.000000
                  S           26.386029
Name: Age, dtype: float64

In [39]:
data.dtypes

Age         float64
Sex          object
Pclass        int64
SibSp         int64
Parch         int64
Cabin        object
Fare        float64
Embarked     object
Survived      int64
dtype: object

## About Data

In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Sex       891 non-null    object 
 2   Pclass    891 non-null    int64  
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Cabin     204 non-null    object 
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
 8   Survived  891 non-null    int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [41]:
data.describe()

Unnamed: 0,Age,Pclass,SibSp,Parch,Fare,Survived
count,714.0,891.0,891.0,891.0,891.0,891.0
mean,29.699118,2.308642,0.523008,0.381594,32.204208,0.383838
std,14.526497,0.836071,1.102743,0.806057,49.693429,0.486592
min,0.42,1.0,0.0,0.0,0.0,0.0
25%,20.125,2.0,0.0,0.0,7.9104,0.0
50%,28.0,3.0,0.0,0.0,14.4542,0.0
75%,38.0,3.0,1.0,0.0,31.0,1.0
max,80.0,3.0,8.0,6.0,512.3292,1.0


### Checking - Null Values

In [42]:
null_values_df = data.isnull().sum().to_frame()
null_val_df = pd.DataFrame(null_values_df)

null_values_df['% missing'] = null_val_df[0] / data.shape[0]
null_values_df

Unnamed: 0,0,% missing
Age,177,0.198653
Sex,0,0.0
Pclass,0,0.0
SibSp,0,0.0
Parch,0,0.0
Cabin,687,0.771044
Fare,0,0.0
Embarked,2,0.002245
Survived,0,0.0


### PREPROCESSING

#### Dropping the Cabin Column; As 77% of the data in the column were missing

In [43]:
data.drop(['Cabin'], axis=1, inplace=True)
data.head()

Unnamed: 0,Age,Sex,Pclass,SibSp,Parch,Fare,Embarked,Survived
0,22.0,male,3,1,0,7.25,S,0
1,38.0,female,1,1,0,71.2833,C,1
2,26.0,female,3,0,0,7.925,S,1
3,35.0,female,1,1,0,53.1,S,1
4,35.0,male,3,0,0,8.05,S,0


#### Converting the Embarked Categorical Values in Numerical Values

In [44]:
data.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [45]:
data['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [46]:
# Replaced the Nan Values with the S Values
data.Embarked.fillna(data.Embarked.mode()[0], inplace = True)

In [47]:
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
 
# Encode labels in column 'Embarked'.
data['Embarked_encoded']= label_encoder.fit_transform(data.Embarked)
data['Embarked_encoded'].unique()

r = \
'''
S --> 2
C --> 0
Q --> 1
'''

In [48]:
data.drop(['Embarked'], axis=1, inplace=True)

#### Computing the Age Column

In [49]:
# data['Age'].fillna(value=data['Age'].median, inplace=True)

In [51]:
data["Age"].fillna(round(data["Age"].mean(),1) , inplace = True)

In [68]:
# data['Age'].values

In [53]:
# data['Age'].isnull().sum()

0

In [None]:
# data['Age'] = data['Age'].astype(str).astype(int)

In [None]:
# data['Age'] = data['Age'].apply(lambda x: int(x))
# data.Age

In [None]:
# data['Age']=list(map(lambda x: int(x),data['Age']))

In [None]:
# type(data[['Age']])

In [None]:
# data['Age'].head()

In [54]:
data['Age'].dtypes

dtype('float64')

#### Computing the Sex Column 
<pre> Converting the Categorical Values to Numerical Values

In [55]:
# Encode labels in column 'Embarked'.
data['Sex_encoded']= label_encoder.fit_transform(data.Sex)
data['Sex_encoded'].unique()

r = \
'''
Male   --> 1
Female --> 0
'''

data.drop(['Sex'], axis=1, inplace= True)

In [56]:
data.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Fare,Survived,Embarked_encoded,Sex_encoded
0,22.0,3,1,0,7.25,0,2,1
1,38.0,1,1,0,71.2833,1,0,0
2,26.0,3,0,0,7.925,1,2,0
3,35.0,1,1,0,53.1,1,2,0
4,35.0,3,0,0,8.05,0,2,1


#### Computing the SibSp and Parch Column 

In [57]:
data['Family'] = data.Parch + data.SibSp
data['Is_Alone'] = data.Family == 0

In [58]:
data.drop(['SibSp','Parch'], axis=1, inplace=True)

In [59]:
# Encode labels in column 'Embarked'.
data['Is_Alone_encoded']= label_encoder.fit_transform(data.Is_Alone)
data['Is_Alone_encoded'].unique()

r = \
'''
True  --> 1
False --> 0
'''

data.drop(['Is_Alone'], axis=1, inplace=True)

#### DROPPING THE UNWANTED COLUMNS

In [61]:
# data.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

In [62]:
# Re-Arranging the Columns
final_data = data[['Age','Sex_encoded','Pclass', 'Embarked_encoded', 'Family', 'Is_Alone_encoded', 'Survived']]

In [63]:
# Re-Naming the Columns
col_name = ['Age', 'Sex', 'Pclass', 'Embarked', 'Family', 'Is_Alone', 'Survived']
final_data.columns = col_name

In [64]:
%store -r final_data 

no stored variable or alias final_data


In [67]:
final_data.to_csv('titanic_processed_data.csv')