## Data Processing

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib


In [7]:
# load the raw Titanic CSV; the path is relative to the notebook's working directory
df = pd.read_csv('../data/raw/Titanic.csv')

In [8]:
# preview the first rows of the freshly loaded data
df.head()

Unnamed: 0,sex,age,sibsp,parch,fare,embarked,class,who,alone,survived
0,male,22.0,1,0,7.25,S,Third,man,False,0
1,female,38.0,1,0,71.2833,C,First,woman,False,1
2,female,26.0,0,0,7.925,S,Third,woman,True,1
3,female,35.0,1,0,53.1,S,First,woman,False,1
4,male,35.0,0,0,8.05,S,Third,man,True,0


In [9]:
# preview the last rows of the freshly loaded data
df.tail()

Unnamed: 0,sex,age,sibsp,parch,fare,embarked,class,who,alone,survived
886,male,27.0,0,0,13.0,S,Second,man,True,0
887,female,19.0,0,0,30.0,S,First,woman,True,1
888,female,,1,2,23.45,S,Third,woman,False,0
889,male,26.0,0,0,30.0,C,First,man,True,1
890,male,32.0,0,0,7.75,Q,Third,man,True,0


In [None]:
df.info() # brief description of the dataset: column dtypes and non-null counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sex       891 non-null    object 
 1   age       714 non-null    float64
 2   sibsp     891 non-null    int64  
 3   parch     891 non-null    int64  
 4   fare      891 non-null    float64
 5   embarked  889 non-null    object 
 6   class     891 non-null    object 
 7   who       891 non-null    object 
 8   alone     891 non-null    bool   
 9   survived  891 non-null    int64  
dtypes: bool(1), float64(2), int64(3), object(4)
memory usage: 63.6+ KB


In [11]:
# statistical overview: count, mean, std, min, quartiles and max per column
df.describe()

Unnamed: 0,age,sibsp,parch,fare,survived
count,714.0,891.0,891.0,891.0,891.0
mean,29.699118,0.523008,0.381594,32.204208,0.383838
std,14.526497,1.102743,0.806057,49.693429,0.486592
min,0.42,0.0,0.0,0.0,0.0
25%,20.125,0.0,0.0,7.9104,0.0
50%,28.0,0.0,0.0,14.4542,0.0
75%,38.0,1.0,0.0,31.0,1.0
max,80.0,8.0,6.0,512.3292,1.0


In [13]:
# report each column's cardinality: columns with at most 5 distinct values
# get those values listed, the others only get their count
# (the listed values come from unique(), which also shows NaN when present)
for col in df.columns.tolist():
    distinct_count = df[col].nunique()
    if distinct_count <= 5:
        print(f"Col {col} uniques values: {df[col].unique()}")
    else:
        print(f"Col {col} has {distinct_count} unique values")

Col sex uniques values: ['male' 'female']
Col age has 88 unique values
Col sibsp has 7 unique values
Col parch has 7 unique values
Col fare has 248 unique values
Col embarked uniques values: ['S' 'C' 'Q' nan]
Col class uniques values: ['Third' 'First' 'Second']
Col who uniques values: ['man' 'woman' 'child']
Col alone uniques values: [False  True]
Col survived uniques values: [0 1]


## Data Cleaning

In [14]:
# count the missing values in each column
df.isnull().sum()

sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
class         0
who           0
alone         0
survived      0
dtype: int64

In [21]:
# filling the missing values: age gets the column median,
# embarked gets its most frequent value (first mode)

df = df.fillna(
    value={
        'age': df['age'].median(),
        'embarked': df['embarked'].mode()[0],
    }
)

In [22]:
# preview the first rows after filling the missing values
df.head()

Unnamed: 0,sex,age,sibsp,parch,fare,embarked,class,who,alone,survived
0,male,22.0,1,0,7.25,S,Third,man,False,0
1,female,38.0,1,0,71.2833,C,First,woman,False,1
2,female,26.0,0,0,7.925,S,Third,woman,True,1
3,female,35.0,1,0,53.1,S,First,woman,False,1
4,male,35.0,0,0,8.05,S,Third,man,True,0


In [26]:
# converting age into an integer column
# astype is the vectorized equivalent of calling int() on every element:
# it drops the fractional part (truncation toward zero, e.g. 0.42 -> 0)
# and yields an int64 column. Missing ages were filled in an earlier cell,
# so no NaN is left to break the cast.

df['age'] = df['age'].astype('int64')

In [28]:
# converting alone to integer
# vectorized cast of the boolean flag: False -> 0, True -> 1 (int64),
# replacing the per-element int() call

df['alone'] = df['alone'].astype('int64')

In [31]:
# changing column name who to identity
# (columns= targets the column labels explicitly instead of axis=1)

df = df.rename(columns={'who': 'identity'})

In [32]:
# preview the first rows after the integer casts and the who -> identity rename
df.head()

Unnamed: 0,sex,age,sibsp,parch,fare,embarked,class,identity,alone,survived
0,male,22,1,0,7.25,S,Third,man,0,0
1,female,38,1,0,71.2833,C,First,woman,0,1
2,female,26,0,0,7.925,S,Third,woman,1,1
3,female,35,1,0,53.1,S,First,woman,0,1
4,male,35,0,0,8.05,S,Third,man,1,0


In [34]:
# round every fare to two decimal places with Python's built-in round
df['fare'] = df['fare'].apply(lambda x: round(x, 2))

In [35]:
# preview the first rows to check the rounded fare values
df.head()

Unnamed: 0,sex,age,sibsp,parch,fare,embarked,class,identity,alone,survived
0,male,22,1,0,7.25,S,Third,man,0,0
1,female,38,1,0,71.28,C,First,woman,0,1
2,female,26,0,0,7.92,S,Third,woman,1,1
3,female,35,1,0,53.1,S,First,woman,0,1
4,male,35,0,0,8.05,S,Third,man,1,0


In [36]:
# lowercase the values of every object-dtype (text) column;
# columns of any other dtype are skipped untouched
for col in list(df.columns):
    if not pd.api.types.is_object_dtype(df[col]):
        continue
    df[col] = df[col].str.lower()

In [37]:
# preview the first rows to confirm the text columns are now lowercase
df.head()

Unnamed: 0,sex,age,sibsp,parch,fare,embarked,class,identity,alone,survived
0,male,22,1,0,7.25,s,third,man,0,0
1,female,38,1,0,71.28,c,first,woman,0,1
2,female,26,0,0,7.92,s,third,woman,1,1
3,female,35,1,0,53.1,s,first,woman,0,1
4,male,35,0,0,8.05,s,third,man,1,0
