In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw passenger table from disk and preview its first three rows.
csv_path = "Titanic-Dataset.csv"
df = pd.read_csv(csv_path)
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


# 2. Feature engineering:
- Feature engineering involves creating or modifying features to improve model performance. In the context of the Titanic dataset, this involves several steps:

### Handling missing values

- Age: The 'Age' column often has missing values. You can impute these missing values using the mean, median, or a more sophisticated method like K-Nearest Neighbors imputation.




- Embarked: This categorical feature may also have missing values. You can impute these by using the most frequent value (mode) of the column.




- Cabin: This column has a large number of missing values and might be best dropped or used to create a new feature representing the deck.


In [3]:
# Fill missing ages with the column mean.
# NOTE: the mean is computed over the whole dataset, i.e. before the
# train/test split performed later in the notebook.
df["Age"] = df["Age"].fillna(df["Age"].mean())
# Fill missing ports of embarkation with the most frequent value (mode).
# Without this, the later one-hot encoding with drop_first=True turns a
# missing 'Embarked' into all-False dummy columns, which is indistinguishable
# from the dropped baseline category.
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
# Remove columns that are not used as features. Rebinding instead of
# inplace=True keeps the previously displayed frame object untouched.
df = df.drop(columns=["PassengerId", "Cabin", "Ticket"])
# Encode sex as a binary indicator: male -> 0, female -> 1.
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",0,22.000000,1,0,7.2500,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.000000,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",1,26.000000,0,0,7.9250,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.000000,1,0,53.1000,S
4,0,3,"Allen, Mr. William Henry",0,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",0,27.000000,0,0,13.0000,S
887,1,1,"Graham, Miss. Margaret Edith",1,19.000000,0,0,30.0000,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,29.699118,1,2,23.4500,S
889,1,1,"Behr, Mr. Karl Howell",0,26.000000,0,0,30.0000,C


### Creating new features
- Family Size: Combine 'SibSp' (number of siblings/spouses aboard) and 'Parch' (number of parents/children aboard) to create a new feature called 'FamilySize'. This can provide more insight into the impact of family on survival chances.
- IsAlone: Create a binary feature indicating whether a passenger was traveling alone or not, based on 'FamilySize'.
- Age Group: Categorize the 'Age' feature into groups (e.g., child, young adult, adult, elderly). This can capture non-linear relationships with survival and might be more robust to outliers.
Transforming features


In [4]:
# Family size = siblings/spouses + parents/children + the passenger themself.
df["FamilySize"] = df["SibSp"].add(df["Parch"]).add(1)
# Flag passengers whose family size is exactly one (travelling alone).
df["IsAlone"] = (df["FamilySize"] == 1).astype("int64")
# Bucket ages into labelled intervals delimited by the edges in `year`.
year = [0, 9, 17, 39, 59, 120]
labels = ["child", "Teen", "y_adult", "m_adult", "senior"]
df["AgeGroup"] = pd.cut(df["Age"], bins=year, labels=labels)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,AgeGroup
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,S,2,0,y_adult
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C,2,0,y_adult
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,S,1,1,y_adult
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,S,2,0,y_adult
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,S,1,1,y_adult


- Title: Extract titles from the 'Name' column (e.g., Mr., Mrs., Miss, Master). Titles can reveal social status and, consequently, have an impact on survival rates.

In [5]:
# The title is the text between the first comma and the following period,
# e.g. "Braund, Mr. Owen Harris" -> "Mr".
after_surname = df["Name"].apply(lambda full_name: full_name.split(",")[1])
df["Title"] = after_surname.apply(lambda rest: rest.split(".")[0].strip())
# Distinct raw titles, kept for inspection.
unique_title = df["Title"].unique()

def simplify_title(title):
    """Collapse a raw passenger title into a coarser group.

    'Mme' is folded into 'Mrs'; 'Mlle' and 'Ms' into 'Miss'. Noble, military
    and professional titles map to 'Royalty', 'Military' and 'Professional'
    respectively. Any other title (e.g. Mr, Mrs, Miss, Master) is returned
    unchanged.
    """
    # Checked in order; the first group containing the title wins.
    title_groups = (
        (('Mme',), 'Mrs'),
        (('Mlle', 'Ms'), 'Miss'),
        (('Don', 'Sir', 'Lady', 'the Countess', 'Jonkheer', 'Prince'), 'Royalty'),
        (('Major', 'Col', 'Capt'), 'Military'),
        (('Dr', 'Rev'), 'Professional'),
    )
    for members, group_name in title_groups:
        if title in members:
            return group_name
    # Not a special case: keep the title as-is.
    return title

# Map every raw title onto its simplified group and preview the result.
title_groups = df["Title"].map(simplify_title)
df["TitleGroup"] = title_groups
df.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,AgeGroup,Title,TitleGroup
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,S,2,0,y_adult,Mr,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C,2,0,y_adult,Mrs,Mrs
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,S,1,1,y_adult,Miss,Miss


### Transforming Features
- Encoding categorical features: Machine learning algorithms require numerical input, so categorical features like 'Sex', 'Embarked', 'Pclass', and the newly created 'Title' need to be converted to numerical representations. One-hot encoding is a common method for this, creating new binary columns for each category.


In [6]:
# Preview the first two rows of the frame as it stands before one-hot encoding.
df.head(2)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,AgeGroup,Title,TitleGroup
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,S,2,0,y_adult,Mr,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C,2,0,y_adult,Mrs,Mrs


In [7]:
# One-hot encode the categorical columns in a single pass. drop_first=True
# omits the first level of each column so the dummies are not perfectly
# collinear; the generated columns keep the order of this list.
categorical_columns = ["TitleGroup", "Embarked", "AgeGroup"]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df.head(3)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,FamilySize,IsAlone,...,TitleGroup_Mr,TitleGroup_Mrs,TitleGroup_Professional,TitleGroup_Royalty,Embarked_Q,Embarked_S,AgeGroup_Teen,AgeGroup_y_adult,AgeGroup_m_adult,AgeGroup_senior
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,2,0,...,True,False,False,False,False,True,False,True,False,False
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,2,0,...,False,True,False,False,False,False,False,True,False,False
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,1,1,...,False,False,False,False,False,True,False,True,False,False


#### Feature selection
- Dropping irrelevant features: Features like 'PassengerId', 'Name', 'Ticket', and potentially 'Cabin' might not be directly useful for predicting survival and can be dropped.
- Correlation analysis: Use correlation analysis to identify features that are strongly correlated with the target variable ('Survived') and consider removing features that have low correlation or are redundant.

In [8]:
# 'Name' and 'Title' are free text already summarised by the TitleGroup
# dummies; 'SibSp' and 'Parch' are folded into 'FamilySize'.
redundant_columns = ["Name", "Title", "SibSp", "Parch"]
df = df.drop(columns=redundant_columns)
df.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,FamilySize,IsAlone,TitleGroup_Military,TitleGroup_Miss,TitleGroup_Mr,TitleGroup_Mrs,TitleGroup_Professional,TitleGroup_Royalty,Embarked_Q,Embarked_S,AgeGroup_Teen,AgeGroup_y_adult,AgeGroup_m_adult,AgeGroup_senior
0,0,3,0,22.0,7.25,2,0,False,False,True,False,False,False,False,True,False,True,False,False
1,1,1,1,38.0,71.2833,2,0,False,False,False,True,False,False,False,False,False,True,False,False



- Dropping irrelevant features: Features like 'PassengerId', 'Name', 'Ticket', and potentially 'Cabin' might not be directly useful for predicting survival and can be dropped.
- Correlation analysis: Use correlation analysis to identify features that are strongly correlated with the target variable ('Survived') and consider removing features that have low
correlation or are redundant.
Conclusion.

### Data Normalization for:
  - Models: LR, KNN, XGB
  - Features: Pclass, Age, Fare, FamilySize

In [9]:
# Apply logarithmic transformation to the 'Fare' column
# Compress the 'Fare' column with log(1 + x).
# NOTE: this overwrites df['Fare'], so running the cell a second time applies
# the transform again.
log_fare = np.log1p(df['Fare'])
df['Fare'] = log_fare
# Continue on an independent copy of the frame.
dfNorm = df.copy()
display(dfNorm.head(3))

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,FamilySize,IsAlone,TitleGroup_Military,TitleGroup_Miss,TitleGroup_Mr,TitleGroup_Mrs,TitleGroup_Professional,TitleGroup_Royalty,Embarked_Q,Embarked_S,AgeGroup_Teen,AgeGroup_y_adult,AgeGroup_m_adult,AgeGroup_senior
0,0,3,0,22.0,2.110213,2,0,False,False,True,False,False,False,False,True,False,True,False,False
1,1,1,1,38.0,4.280593,2,0,False,False,False,True,False,False,False,False,False,True,False,False
2,1,3,1,26.0,2.188856,1,1,False,True,False,False,False,False,False,True,False,True,False,False


## Variance Inflation Factor (VIF)
 - VIF is a diagnostic tool that helps detect multicollinearity in regression models.

In [10]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Numeric predictors to check for multicollinearity, plus an intercept column
# added by add_constant.
vif_features = ['Pclass', 'Age', 'Fare', 'FamilySize']
X = add_constant(dfNorm[vif_features])

# One VIF per column of the design matrix (including the constant).
vif = pd.DataFrame({
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
    "Features": X.columns,
})
vif

Unnamed: 0,VIF,Features
0,77.881381,const
1,2.40328,Pclass
2,1.193825,Age
3,2.636869,Fare
4,1.540034,FamilySize


# Splitting the dataset into train and test sets

In [11]:
from sklearn.model_selection import train_test_split

# Separate predictors from the target, then hold out 20% of the rows for
# testing; random_state pins the shuffle so the split is reproducible.
features = dfNorm.drop(columns=["Survived"])
target = dfNorm["Survived"]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [12]:
# Preview the first three training rows and their matching labels.
display(X_train.head(3), y_train.head(3))

Unnamed: 0,Pclass,Sex,Age,Fare,FamilySize,IsAlone,TitleGroup_Military,TitleGroup_Miss,TitleGroup_Mr,TitleGroup_Mrs,TitleGroup_Professional,TitleGroup_Royalty,Embarked_Q,Embarked_S,AgeGroup_Teen,AgeGroup_y_adult,AgeGroup_m_adult,AgeGroup_senior
331,1,0,45.5,3.38439,1,1,False,False,True,False,False,False,False,True,False,False,True,False
733,2,0,23.0,2.639057,1,1,False,False,True,False,False,False,False,True,False,True,False,False
382,3,0,32.0,2.188856,1,1,False,False,True,False,False,False,False,True,False,True,False,False


Unnamed: 0,Survived
331,0
733,0
382,0


## Scaling dataset after split to prevent data leakage

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Columns to standardise.
scaled_columns = ['Pclass', 'Age', 'Fare', 'FamilySize']
scaler = StandardScaler()
# Fit the scaler on the training rows only, then scale them.
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
# Apply the transformation fitted on the training rows to the test rows.
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

- Feature scaling: Numerical features like 'Age' and 'Fare' may have different scales, which can bias certain models. Standardization (rescaling to zero mean and unit variance) or Min-Max scaling (rescaling to a fixed range, e.g., 0 to 1) puts these features on a comparable scale.
- Logarithmic transformation: If a numerical feature like 'Fare' is skewed, a logarithmic transformation can help normalize the distribution and reduce the impact of outliers.

In [14]:
# Show the training features after scaling.
X_train

Unnamed: 0,Pclass,Sex,Age,Fare,FamilySize,IsAlone,TitleGroup_Military,TitleGroup_Miss,TitleGroup_Mr,TitleGroup_Mrs,TitleGroup_Professional,TitleGroup_Royalty,Embarked_Q,Embarked_S,AgeGroup_Teen,AgeGroup_y_adult,AgeGroup_m_adult,AgeGroup_senior
331,-1.614136,0,1.229207,0.437404,-0.554666,1,False,False,True,False,False,False,False,True,False,False,True,False
733,-0.400551,0,-0.503505,-0.322935,-0.554666,1,False,False,True,False,False,False,False,True,False,True,False,False
382,0.813034,0,0.189580,-0.782201,-0.554666,1,False,False,True,False,False,False,False,True,False,True,False,False
704,0.813034,0,-0.272477,-0.790325,0.040096,0,False,False,True,False,False,False,False,True,False,True,False,False
813,0.813034,1,-1.812666,0.529116,3.013909,0,False,True,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.813034,1,-0.657524,-0.814128,-0.554666,1,False,True,False,False,False,False,False,True,False,True,False,False
270,-1.614136,0,0.012390,0.520387,-0.554666,1,False,False,True,False,False,False,False,True,False,True,False,False
860,0.813034,0,0.882665,-0.245215,0.634859,0,False,False,True,False,False,False,False,True,False,False,True,False
435,-1.614136,1,-1.196590,1.877220,1.229621,0,False,True,False,False,False,False,False,True,True,False,False,False



By thoroughly applying EDA, you gain a deep understanding of the Titanic dataset, uncovering relationships between features and survival. This understanding then guides the feature engineering process, where you create and refine features to improve the performance of your machine learning model. These two crucial steps are essential for building effective and accurate models for predicting Titanic passenger survival.