In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the rest of the
# notebook, including deprecation notices raised by pandas/seaborn calls
# below; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

### Import the Data
Let's load our **Titanic data** from the .csv file into a pandas DataFrame and look at the available indicators:

- ***Survived***: Outcome of survival (0 = No; 1 = Yes)
- ***Pclass***: Socio-economic class (1 = Upper class; 2 = Middle class; 3 = Lower class)
- ***Name***: Name of passenger
- ***Sex***: Sex of the passenger
- ***Age***: Age of the passenger (Some entries contain NaN)
- ***SibSp***: Number of siblings and spouses of the passenger aboard
- ***Parch***: Number of parents and children of the passenger aboard
- ***Ticket***: Ticket number of the passenger
- ***Fare***: Fare paid by the passenger
- ***Cabin***: Cabin number of the passenger (Some entries contain NaN)
- ***Embarked***: Port of embarkation of the passenger (C = Cherbourg; Q = Queenstown; S = Southampton)

In [None]:
# Read the passenger records from the CSV file (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv("titanic_dataset.csv")
# Bare expression: renders the frame as this cell's output.
df

In [None]:
# Size of the frame as (number of rows, number of columns).
df.shape

In [None]:
# Preview the first rows (head() defaults to five).
df.head()

In [None]:
# Column names, dtypes and non-null counts; columns whose non-null count is
# below the row count contain missing values.
df.info()

## Distribution plots

In [None]:
# Parch: number of parents and children of the passenger aboard.
df['Parch']

In [None]:
# Distinct Parch values that occur in the data.
df['Parch'].unique()

In [None]:
# Histogram of Parch with a kernel-density curve overlaid (kde=True).
sns.histplot(df['Parch'],kde=True)
plt.show()

**As we can see, most passengers have neither parents nor children aboard.**

In [None]:
# Age of each passenger; some entries are NaN.
df['Age']

In [None]:
# Age distribution: density-scaled histogram with a KDE curve.
# sns.distplot is deprecated and scheduled for removal from seaborn; the
# documented replacement is histplot with kde=True and stat='density'
# (cut=3 lets the KDE curve extend past the data range as distplot did).
sns.histplot(df['Age'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()

**As we can see, most passengers are between 20 and 40 years old.**

In [None]:
# Same Age distribution on a larger 8x8 inch canvas.
# sns.distplot is deprecated and scheduled for removal from seaborn; the
# documented replacement is histplot with kde=True and stat='density'
# (cut=3 lets the KDE curve extend past the data range as distplot did).
plt.figure(figsize=(8,8))
sns.histplot(df['Age'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Visual map of missing values: df.isnull() is True wherever a value is
# missing, and the heatmap colours those cells differently.
sns.heatmap(df.isnull())
# Render explicitly, like the other plotting cells, so the Axes repr is not
# echoed as the cell's output.
plt.show()

In [None]:
# Missing-value heatmap with the row tick labels and the colour bar switched
# off, so only the pattern of gaps per column is shown.
missing_ax = sns.heatmap(
    df.isnull(),
    cmap='tab20c_r',
    cbar=False,
    yticklabels=False,
)
missing_ax.set_title('Missing Data')
plt.show()

In [None]:
# Median age (pandas skips NaN entries by default).
df['Age'].median()

In [None]:
# Mean age (pandas skips NaN entries by default).
df['Age'].mean()

In [None]:
# Box plot of Age to eyeball outliers.
# The Series is passed as x= explicitly: a bare positional argument is read
# as `x` by older seaborn releases and as `data` by newer ones, so the
# keyword keeps the plot the same across versions.
sns.boxplot(x=df['Age'])
# Render explicitly, like the other plotting cells, so the Axes repr is not
# echoed as the cell's output.
plt.show()

In [None]:
# First and third quartile of Age, and the interquartile range between them.
Q1, Q3 = (df["Age"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
# One value per line: Q1, Q3, IQR.
print(Q1, Q3, IQR, sep="\n")

In [None]:
# Outlier fences: 1.5 * IQR below the first and above the third quartile.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
# One value per line: lower bound, then upper bound.
print(lower_bound, upper_bound, sep="\n")

In [None]:
# Keep only rows whose Age lies inside [lower_bound, upper_bound] (both ends
# inclusive). Rows with a missing Age fail the comparison and are dropped too.
df_filtered = df[df["Age"].between(lower_bound, upper_bound)]

print(f"Original Rows: {df.shape[0]}, Filtered Rows: {df_filtered.shape[0]}")

In [None]:
# Number of rows the Age filter removed, computed from the frames instead of
# hard-coded counts so it stays correct if the data changes. This includes
# rows with a missing Age, because NaN fails both comparisons.
len(df) - len(df_filtered)

In [None]:
# Display the rows that passed the Age filter.
df_filtered

In [None]:
# Index labels of the rows that were kept.
df_filtered.index

In [None]:
# Index of the full frame, for comparison with the filtered index.
df.index

In [None]:
# Rows of df that the Age filter excluded: drop every index label that
# survived in df_filtered and show what is left.
df.drop(index=df_filtered.index)

In [None]:
# Distinct passenger classes present in the data.
df['Pclass'].unique()

In [None]:
# Distinct passenger classes present in the data.
df['Pclass'].unique()

In [None]:
# Age distribution per passenger class, drawn on an explicit 10x7 inch axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=df, x='Pclass', y='Age', palette='GnBu_d', ax=ax)
ax.set_title('Age by Passenger Class')
plt.show()

In [None]:
# Imputation function
def impute_age(cols, class_ages=None, default_age=24):
    """Return the passenger's age, filling a missing value from the class.

    Intended for row-wise use, e.g.
    ``df[['Age', 'Pclass']].apply(impute_age, axis=1)``.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Age, Pclass). They are read by position,
        so the labels of a Series do not matter.
    class_ages : dict, optional
        Mapping of passenger class to the age used when Age is missing.
        Defaults to ``{1: 37, 2: 29}``.
        NOTE(review): these defaults are presumably read off the
        'Age by Passenger Class' box plot - confirm against the data.
    default_age : number, optional
        Age used when Age is missing and the class is not in ``class_ages``
        (24 by default, which covers class 3 and any unexpected value).

    Returns
    -------
    The original age when it is present, otherwise the fallback age for the
    passenger's class.
    """
    if class_ages is None:
        class_ages = {1: 37, 2: 29}

    # Read by position explicitly. Integer keys on a label-indexed Series
    # (cols[0]) rely on a deprecated positional fallback in pandas, so use
    # .iloc for Series and plain indexing for lists/tuples/arrays.
    if isinstance(cols, pd.Series):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if pd.isnull(age):
        # 1.0 and 1 hash and compare equal, so float-typed classes coming
        # from a mixed Age/Pclass row still match the integer keys.
        return class_ages.get(pclass, default_age)

    return age


In [None]:
# The two columns, in the (Age, Pclass) order that impute_age reads them.
df[['Age','Pclass']]

In [None]:
# Apply the function to the Age column: impute_age is called once per row
# (axis=1) with that row's Age and Pclass, and the result overwrites
# df['Age'].
df['Age']=df[['Age','Pclass']].apply(impute_age, axis =1)

In [None]:
# Count of ages that are still missing after imputation.
df['Age'].isnull().sum()

In [None]:
# Missing-value map after the Age imputation: df.isnull() is True wherever a
# value is still missing.
sns.heatmap(df.isnull())
# Render explicitly, like the other plotting cells, so the Axes repr is not
# echoed as the cell's output.
plt.show()

In [None]:
# Re-apply the Age range filter to the current df. lower_bound and
# upper_bound are reused as-is; this cell does not recompute them.
df_filtered = df.loc[df["Age"].ge(lower_bound) & df["Age"].le(upper_bound)]

print(f"Original Rows: {df.shape[0]}, Filtered Rows: {df_filtered.shape[0]}")

In [None]:
# Number of rows the Age filter removed, computed from the frames instead of
# hard-coded counts so it stays correct if the data changes.
len(df) - len(df_filtered)

In [None]:
# Box plot of Age as it currently stands in df.
# The Series is passed as x= explicitly: a bare positional argument is read
# as `x` by older seaborn releases and as `data` by newer ones, so the
# keyword keeps the plot the same across versions.
sns.boxplot(x=df['Age'])
# Render explicitly, like the other plotting cells, so the Axes repr is not
# echoed as the cell's output.
plt.show()

In [None]:
# Missing values per column at this point in the notebook.
df.isnull().sum()

In [None]:
# Remove Cabin feature
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution no longer raises KeyError
# because 'Cabin' is already gone.
df = df.drop(columns='Cabin', errors='ignore')

In [None]:
# Missing-value map of the current df: df.isnull() is True wherever a value
# is still missing.
sns.heatmap(df.isnull())
# Render explicitly, like the other plotting cells, so the Axes repr is not
# echoed as the cell's output.
plt.show()

In [None]:
# Re-check columns, dtypes and non-null counts of the current df.
df.info()

In [None]:
# Toy example, independent of the Titanic frame: a short list in which 100
# and 102 sit far above the other values, to show how a box plot marks
# outliers.
out = [1,2,5,2,9,10,100,11,2,3.4,8,102]
# Passed as x= explicitly: a bare positional argument is read as `x` by
# older seaborn releases and as `data` by newer ones.
sns.boxplot(x=out)
# Render explicitly, like the other plotting cells, so the Axes repr is not
# echoed as the cell's output.
plt.show()

In [None]:
# Drop every row that still has a missing value in any column; df is
# rebound to the result, so the earlier frame is no longer reachable under
# this name.
df = df.dropna()

In [None]:
# Final check: every column should now report the same non-null count.
df.info()