<center>

### Name:  Huzaifa Dad
### Email: huzaifadad.abt@gmail.com
### Github: https://github.com/HuzaifaDad 
### Purpose of Document: Data Preprocessing on titanic data file
</center>

# Installing Library

In [1]:
# Install matplotlib into the environment used by the running kernel.
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


# Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading titanic csv file

In [3]:
# Load the Titanic CSV file into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='huzaifa_titanic_data.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Task 01:
# Remove the Name, Ticket and Cabin columns

In [4]:
# Task 01: drop the Name, Ticket and Cabin columns.
df = df.drop(columns=['Name', 'Ticket', 'Cabin'])


In [5]:
# Display the frame after the three columns were dropped.
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


# Task 02:
# Data Wrangling

# 1. Fill null values

In [6]:
# Count the missing values in every column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
dtype: int64

In [7]:
# Count the missing values in the Age column only.
df['Age'].isna().sum()

np.int64(177)

In [8]:
# Replace missing ages with the mean of the ages that are present.
df['Age'] = df['Age'].fillna(value=df['Age'].mean())
df['Age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [9]:
# Re-count missing ages after the fill.
df['Age'].isnull().sum()

np.int64(0)

In [31]:
# Fill missing Embarked values with the most frequent Embarked value.
# The mode must come from Embarked itself: taking it from another column
# (previously Fare) would write a fare amount into this categorical column.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [11]:
# Re-count missing Embarked values after the fill.
df['Embarked'].isnull().sum()

np.int64(0)

In [12]:
# Re-count missing values across all columns after both fills.
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [13]:
# Flag each row that exactly repeats an earlier row (True = duplicate).
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Length: 891, dtype: bool

# Task 03:
# Encode the Sex column as 0 (male) and 1 (female), then rename it to Gender

In [14]:
# Inspect the Sex column before encoding it.
df['Sex']

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object

In [15]:
# Count missing values in the Sex column.
df['Sex'].isna().sum()

np.int64(0)

In [16]:
# List the distinct labels present in the Sex column.
print(df['Sex'].unique())

['male' 'female']


In [17]:
# Encode the labels numerically: male -> 0, female -> 1.
df['Sex'] = df['Sex'].map(
    {
        'male': 0,
        'female': 1,
    }
)
df['Sex']

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    1
889    0
890    0
Name: Sex, Length: 891, dtype: int64

In [18]:
# Rename the encoded Sex column to Gender.
df = df.rename({'Sex': 'Gender'}, axis='columns')


In [19]:
# Display the column under its new name.
df['Gender']

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    1
889    0
890    0
Name: Gender, Length: 891, dtype: int64

In [20]:
# Display the whole frame after the encoding and rename.
df

Unnamed: 0,PassengerId,Survived,Pclass,Gender,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,0,22.000000,1,0,7.2500,S
1,2,1,1,1,38.000000,1,0,71.2833,C
2,3,1,3,1,26.000000,0,0,7.9250,S
3,4,1,1,1,35.000000,1,0,53.1000,S
4,5,0,3,0,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.000000,0,0,13.0000,S
887,888,1,1,1,19.000000,0,0,30.0000,S
888,889,0,3,1,29.699118,1,2,23.4500,S
889,890,1,1,0,26.000000,0,0,30.0000,C


# Task 04:
# Make a new Comparison column that combines Age and Fare

In [28]:
# Convert Age to integers. The cast drops the fractional part, so the
# mean-filled 29.699118 becomes 29.
df['Age'] = df['Age'].astype(dtype=int)
df['Age']

0      22
1      38
2      26
3      35
4      35
       ..
886    27
887    19
888    29
889    26
890    32
Name: Age, Length: 891, dtype: int64

In [27]:
# Convert Fare to integers. The cast drops the fractional part, so a fare
# of 7.25 becomes 7.
df['Fare'] = df['Fare'].astype(dtype=int)
df['Fare']

0       7
1      71
2       7
3      53
4       8
       ..
886    13
887    30
888    23
889    30
890     7
Name: Fare, Length: 891, dtype: int64

In [29]:
# Build one text label per row of the form "Age:<age>,Fare:<fare>".
df['Comparison'] = ('Age:' + df['Age'].astype(str)).str.cat(
    df['Fare'].astype(str), sep=',Fare:'
)
df['Comparison']

0       Age:22,Fare:7
1      Age:38,Fare:71
2       Age:26,Fare:7
3      Age:35,Fare:53
4       Age:35,Fare:8
            ...      
886    Age:27,Fare:13
887    Age:19,Fare:30
888    Age:29,Fare:23
889    Age:26,Fare:30
890     Age:32,Fare:7
Name: Comparison, Length: 891, dtype: object

In [30]:
# Display the frame including the new Comparison column.
df

Unnamed: 0,PassengerId,Survived,Pclass,Gender,Age,SibSp,Parch,Fare,Embarked,Comparison
0,1,0,3,0,22,1,0,7,S,"Age:22,Fare:7"
1,2,1,1,1,38,1,0,71,C,"Age:38,Fare:71"
2,3,1,3,1,26,0,0,7,S,"Age:26,Fare:7"
3,4,1,1,1,35,1,0,53,S,"Age:35,Fare:53"
4,5,0,3,0,35,0,0,8,S,"Age:35,Fare:8"
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,27,0,0,13,S,"Age:27,Fare:13"
887,888,1,1,1,19,0,0,30,S,"Age:19,Fare:30"
888,889,0,3,1,29,1,2,23,S,"Age:29,Fare:23"
889,890,1,1,0,26,0,0,30,C,"Age:26,Fare:30"


# Task 05:
# Make a user-defined function that applies Tasks 01–04

In [32]:
def process_titanic_data(df):
    """Apply the preprocessing steps of Tasks 01-04 to a Titanic DataFrame.

    Every step is skipped when the column it needs is absent, so the
    function also accepts frames that are already partly processed.
    The work is done on the new frame returned by ``drop``; the caller's
    frame is not modified.

    Steps:
      1. Drop the 'Name', 'Ticket' and 'Cabin' columns (those present).
      2. Fill missing 'Age' values with the column mean, and missing
         'Embarked' values with the column's most frequent value.
      3. Map 'Sex' from 'male'/'female' to 0/1; any other value becomes NaN.
      4. Add a 'Comparison' text column: "Age: <int>, Fare: <int>".

    Parameters
    ----------
    df : pandas.DataFrame
        Passenger data to process.

    Returns
    -------
    pandas.DataFrame
        A new frame with the steps above applied.
    """
    # Task 01: Remove Name, Ticket, Cabin.
    # errors='ignore' matches the guarded steps below: a frame that already
    # lacks some of these columns no longer raises KeyError.
    df = df.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

    # Task 2: Fill null values in 'Age' and 'Embarked'
    if 'Age' in df.columns:
        df['Age'] = df['Age'].fillna(df['Age'].mean())
    if 'Embarked' in df.columns:
        embarked_mode = df['Embarked'].mode()
        # mode() is empty when every value is null; in that case there is
        # nothing to fill with, so the column is left as it is.
        if not embarked_mode.empty:
            df['Embarked'] = df['Embarked'].fillna(embarked_mode[0])

    # Task 3: Convert 'Sex' column values
    if 'Sex' in df.columns:
        df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    # Task 4: Create 'Comparison' column
    # The integer casts only affect the label text; the Age and Fare columns
    # keep their original values. The cast raises if either column still
    # contains NaN (Fare is not imputed above).
    if 'Age' in df.columns and 'Fare' in df.columns:
        age_text = df['Age'].astype(int).astype(str)
        fare_text = df['Fare'].astype(int).astype(str)
        df['Comparison'] = "Age: " + age_text + ", Fare: " + fare_text

    return df
    

In [33]:
# Read the raw CSV again into a separate frame for the function to process.
titanic_df = pd.read_csv(filepath_or_buffer='huzaifa_titanic_data.csv')
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [34]:
# Run the preprocessing function on the freshly loaded frame.
process_df = process_titanic_data(df=titanic_df)

In [35]:
# Display the frame returned by process_titanic_data.
process_df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Comparison
0,1,0,3,0,22.000000,1,0,7.2500,S,"Age: 22, Fare: 7"
1,2,1,1,1,38.000000,1,0,71.2833,C,"Age: 38, Fare: 71"
2,3,1,3,1,26.000000,0,0,7.9250,S,"Age: 26, Fare: 7"
3,4,1,1,1,35.000000,1,0,53.1000,S,"Age: 35, Fare: 53"
4,5,0,3,0,35.000000,0,0,8.0500,S,"Age: 35, Fare: 8"
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.000000,0,0,13.0000,S,"Age: 27, Fare: 13"
887,888,1,1,1,19.000000,0,0,30.0000,S,"Age: 19, Fare: 30"
888,889,0,3,1,29.699118,1,2,23.4500,S,"Age: 29, Fare: 23"
889,890,1,1,0,26.000000,0,0,30.0000,C,"Age: 26, Fare: 30"


In [36]:
# Count missing values per column in the processed frame.
process_df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Comparison     0
dtype: int64

In [37]:
# Display the Comparison column produced by the function.
process_df['Comparison']

0       Age: 22, Fare: 7
1      Age: 38, Fare: 71
2       Age: 26, Fare: 7
3      Age: 35, Fare: 53
4       Age: 35, Fare: 8
             ...        
886    Age: 27, Fare: 13
887    Age: 19, Fare: 30
888    Age: 29, Fare: 23
889    Age: 26, Fare: 30
890     Age: 32, Fare: 7
Name: Comparison, Length: 891, dtype: object