In [67]:
import pandas as pd
import seaborn as sns

In [68]:
# Download the Titanic competition archive (titanic.zip) with the Kaggle CLI.
# NOTE(review): presumably needs Kaggle API credentials configured on this machine — confirm.
!kaggle competitions download -c titanic

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [69]:
import zipfile

# Unpack the downloaded archive into the working directory, then read the
# training split into a DataFrame.
with zipfile.ZipFile('titanic.zip', mode='r') as archive:
    archive.extractall(path='.')

titanic_dataset = pd.read_csv('train.csv')

# 🌟 Exercise 1: Duplicate Detection and Removal  
Instructions  
Objective: Identify and remove duplicate entries in the Titanic dataset.  

Load the Titanic dataset.  
Identify if there are any duplicate rows based on all columns.  
Remove any duplicate rows found in the dataset.  
Verify the removal of duplicates by checking the number of rows before and after the duplicate removal.  
Hint: Use the duplicated() and drop_duplicates() functions in Pandas.  

In [70]:
# (rows, columns) of the freshly loaded frame; the row count is the baseline
# for the duplicate-removal check below.
titanic_dataset.shape

(891, 12)

In [71]:
# Preview the first rows to see which columns the CSV provides.
titanic_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [72]:
# Number of rows that exactly repeat an earlier row (all columns compared).
titanic_dataset.duplicated().sum()

0

In [73]:
# Drop exact duplicate rows (compared across all columns) and verify the effect
# by comparing the row count before and after, as the exercise asks.
# Reassigning instead of using inplace=True keeps the cell explicit about which
# frame later cells work with.
rows_before = len(titanic_dataset)
titanic_dataset = titanic_dataset.drop_duplicates()
print(f'Rows before: {rows_before}, rows after: {len(titanic_dataset)}')
titanic_dataset.duplicated().sum()

0

# 🌟 Exercise 2: Handling Missing Values  
Instructions  
Identify columns in the Titanic dataset with missing values.  
Explore different strategies for handling missing data, such as removal, imputation, and filling with a constant value.  
Apply each strategy to different columns based on the nature of the data.  
Hint: Review methods like dropna(), fillna(), and SimpleImputer from scikit-learn.  

In [74]:
# Count missing values per column to decide which columns need handling.
titanic_dataset.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [75]:
# Column dtypes: separates numeric columns from object columns before picking
# an imputation strategy for each.
titanic_dataset.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [76]:
from sklearn.impute import SimpleImputer

Let's fill the missing Age values with the column average (mean).

In [77]:
# Mean imputation: each missing Age is replaced by the average of the ages
# that are present.
mean_imputer = SimpleImputer(strategy='mean')
age_filled = mean_imputer.fit_transform(titanic_dataset[['Age']])

titanic_dataset['Age'] = age_filled

In [78]:
# Re-check the missing-value counts after imputing Age.
titanic_dataset.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

The remaining columns with missing values (Embarked and Cabin) will be filled with their most frequent value.

In [79]:
# Mode imputation for the categorical columns: each missing entry receives the
# most common value of its own column. The imputer is re-fitted per column.
# NOTE(review): Cabin is missing in 687 of 891 rows, so this assigns a single
# cabin to most passengers; a constant placeholder may be a better fit — confirm.
mfreaquent_imputer = SimpleImputer(strategy='most_frequent')

for col_name in ['Embarked', 'Cabin']:
    titanic_dataset[col_name] = mfreaquent_imputer.fit_transform(titanic_dataset[[col_name]]).ravel()

titanic_dataset.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

# 🌟 Exercise 3: Feature Engineering  
Instructions  
Create new features, such as Family Size from SibSp and Parch, and Title extracted from the Name column.  
Convert categorical variables into numerical form using techniques like one-hot encoding or label encoding.  
Normalize or standardize numerical features if required.  
Hint: Utilize Pandas for data manipulation and scikit-learn’s preprocessing module for encoding.  

In [80]:
# The title is the run of letters that follows a space and ends with a period
# (e.g. "Mr" in "Braund, Mr. Owen Harris"); only the first match is kept.
title_pattern = r' ([A-Za-z]+)\.'
titanic_dataset['Title'] = titanic_dataset['Name'].str.extract(title_pattern, expand=False)
titanic_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S,Mr


In [81]:
# Relatives travelling with the passenger: parents/children plus
# siblings/spouses. The passenger is not counted, so someone alone gets 0.
titanic_dataset['Family_size'] = titanic_dataset['Parch'].add(titanic_dataset['SibSp'])
titanic_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Family_size
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S,Mr,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S,Miss,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S,Mr,0


We'll one-hot encode the Sex and Title columns.

In [82]:
# One-hot encode the nominal columns. drop_first=True omits the first category
# of each column, which is then represented by all of its indicators being False.
one_hot_columns = ['Sex', 'Title']
titanic_dataset = pd.get_dummies(
    titanic_dataset,
    columns=one_hot_columns,
    drop_first=True,
)
titanic_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,B96 B98,...,False,False,False,False,False,True,False,False,False,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,...,False,False,False,False,False,False,True,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,B96 B98,...,False,False,True,False,False,False,False,False,False,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,...,False,False,False,False,False,False,True,False,False,False
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,B96 B98,...,False,False,False,False,False,True,False,False,False,False


And we label-encode Embarked, treating the ports as ordered by the sequence in which the ship called at them (S → C → Q).

In [83]:
# Integer code for each embarkation port; any value missing from the mapping
# would become NaN in the new column.
embarked_mapping = {
    'S': 0,
    'C': 1,
    'Q': 2,
}
embarked_codes = titanic_dataset['Embarked'].map(embarked_mapping)
titanic_dataset['Embarked_Label'] = embarked_codes
titanic_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Embarked_Label
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,B96 B98,...,False,False,False,False,True,False,False,False,False,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,...,False,False,False,False,False,True,False,False,False,1
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,B96 B98,...,False,True,False,False,False,False,False,False,False,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,...,False,False,False,False,False,True,False,False,False,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,B96 B98,...,False,False,False,False,True,False,False,False,False,0


In [84]:
# List every column now present, including the engineered and encoded ones.
titanic_dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Family_size', 'Sex_male',
       'Title_Col', 'Title_Countess', 'Title_Don', 'Title_Dr',
       'Title_Jonkheer', 'Title_Lady', 'Title_Major', 'Title_Master',
       'Title_Miss', 'Title_Mlle', 'Title_Mme', 'Title_Mr', 'Title_Mrs',
       'Title_Ms', 'Title_Rev', 'Title_Sir', 'Embarked_Label'],
      dtype='object')

# Dealing with outliers (Ex4)

In [85]:
import numpy as np

# 1. Detect Outliers Using IQR for Age and Fare

def detect_outliers_iqr(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of a numeric column in ``df``.
    multiplier : float, default 1.5
        Width of the fences in units of the IQR.

    Returns
    -------
    pandas.DataFrame
        The subset of rows (original index preserved) whose value is strictly
        below the lower fence or strictly above the upper fence. Rows where the
        value is NaN are never selected, because both comparisons are False.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return df[is_outlier]


# 2. Detect Outliers Using Z-score for Age and Fare

def detect_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` value has an extreme z-score.

    The z-score of a value is ``(value - mean) / std``, using the column's mean
    and its standard deviation as computed by ``np.std`` (default ``ddof=0``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of a numeric column in ``df``.
    threshold : float, default 3
        A row is selected when the absolute z-score is strictly greater than
        this value.

    Returns
    -------
    pandas.DataFrame
        The subset of rows (original index preserved) flagged as outliers.
    """
    values = df[column]
    mean = np.mean(values)
    std = np.std(values)
    z_scores = (values - mean) / std
    return df[np.abs(z_scores) > threshold]

# Capping extreme values at the IQR boundaries
def cap_outliers(df, column, multiplier=1.5):
    """Return a copy of ``df`` with ``column`` capped at its IQR fences.

    Values below ``Q1 - multiplier * IQR`` are replaced by that lower fence and
    values above ``Q3 + multiplier * IQR`` by that upper fence. The number of
    affected values is printed.

    The input frame is left untouched; the capped data is written to a copy so
    that re-running a cell or keeping a reference to the original frame does
    not silently see modified values.

    Note: when Q1 equals Q3 the IQR is 0 and both fences collapse to that single
    value, so every value different from it is capped and the column becomes
    constant (apart from NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to cap.
    column : str
        Name of a numeric column in ``df``.
    multiplier : float, default 1.5
        Width of the fences in units of the IQR.

    Returns
    -------
    pandas.DataFrame
        A new frame, identical to ``df`` except for the capped ``column``.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    below = values < lower_bound
    above = values > upper_bound
    num_outliers_before = (below | above).sum()

    capped = df.copy()
    # NaN compares False on both sides, so missing values pass through unchanged.
    capped[column] = np.where(below, lower_bound, np.where(above, upper_bound, values))
    print(f'{num_outliers_before} outliers were capped in {column} column')
    return capped


# Report how many rows each detector flags before anything is modified. A bare
# call in the middle of a cell is not displayed, so the counts are printed.
for column in ['Age', 'Fare']:
    n_iqr = len(detect_outliers_iqr(titanic_dataset, column))
    n_zscore = len(detect_outliers_zscore(titanic_dataset, column))
    print(f'{column}: {n_iqr} IQR outliers, {n_zscore} z-score outliers')

# Cap each numeric column at its IQR fences, in the same order as before.
# NOTE(review): in the recorded run Parch_Normalized is 0.0 for every displayed
# row after capping — check whether Parch has an IQR of 0 and should be capped at all.
for column in ['Age', 'Fare', 'SibSp', 'Parch', 'Family_size']:
    titanic_dataset = cap_outliers(titanic_dataset, column)



66 outliers were capped in Age column
116 outliers were capped in Fare column
46 outliers were capped in SibSp column
213 outliers were capped in Parch column
91 outliers were capped in Family_size column


|      Column    |   Transformation|
|-------------|---|
| Age  | Standardization. Age is approximately normally distributed. After standardizing, Age has a mean of 0 and a standard deviation of 1. |
| SibSp       | Normalization. We'll have data scaled between 0 and 1  |   
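| Parch       | Normalization (scaled between 0 and 1) |
| Fare        | Normalization (scaled between 0 and 1) |
| Family_size | Normalization (scaled between 0 and 1) |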
| Pclass      | Leave as is. It's ordinal feature.  |   


In [86]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Age -> z-scores, stored in a new column so the capped values stay available.
standard_scaler = StandardScaler()
age_column = titanic_dataset[['Age']]
titanic_dataset['Age_Standardized'] = standard_scaler.fit_transform(age_column)

# Fare and the count columns -> rescaled by one MinMaxScaler, each column
# independently, and written to new *_Normalized columns.
min_max_scaler = MinMaxScaler()
minmax_sources = ['Fare', 'SibSp', 'Parch', 'Family_size']
minmax_targets = ['Fare_Normalized', 'SibSp_Normalized', 'Parch_Normalized', 'FS_Normalized']
titanic_dataset[minmax_targets] = min_max_scaler.fit_transform(titanic_dataset[minmax_sources])

In [87]:
# Preview the frame with the standardized and normalized columns appended.
titanic_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Embarked_Label,Age_Standardized,Fare_Normalized,SibSp_Normalized,Parch_Normalized,FS_Normalized
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1.0,0.0,A/5 21171,7.25,B96 B98,...,False,False,False,False,0,-0.611917,0.11046,0.4,0.0,0.4
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1.0,0.0,PC 17599,65.6344,C85,...,True,False,False,False,1,0.715304,1.0,0.4,0.0,0.4
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0.0,0.0,STON/O2. 3101282,7.925,B96 B98,...,False,False,False,False,0,-0.280111,0.120745,0.0,0.0,0.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1.0,0.0,113803,53.1,C123,...,True,False,False,False,0,0.46645,0.809027,0.4,0.0,0.4
4,5,0,3,"Allen, Mr. William Henry",35.0,0.0,0.0,373450,8.05,B96 B98,...,False,False,False,False,0,0.46645,0.122649,0.0,0.0,0.0


# Exercise 4: Outlier Detection and Handling
Instructions  
Use statistical methods to detect outliers in columns like Fare and Age.  
Decide on a strategy to handle the identified outliers, such as capping, transformation, or removal.  
Implement the chosen strategy and assess its impact on the dataset.  
Hint: Explore methods like IQR (Interquartile Range) and Z-score for outlier detection.  