## Import libraries


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

## Load dataset (adjust the path if needed)

In [2]:
df = pd.read_csv('titanic.csv')

In [14]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Fill missing Age with median


In [10]:
import random

# Randomly select 10 row indices
random_indices = random.sample(list(df.index), 10)

# Set Age to NaN in those rows
df.loc[random_indices, 'Age'] = np.nan

In [11]:
# Show count of missing (NaN) values in each column
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             10
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [12]:
df['Age'].fillna(df['Age'].median(), inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


In [13]:
# Show count of missing (NaN) values in each column
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


## Fill missing Embarked with mode

In [15]:
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [16]:
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


In [17]:
# Randomly select 10 row indices
random_indices = random.sample(list(df.index), 10)

# Set Fare to NaN in those rows
df.loc[random_indices, 'Fare'] = np.nan

In [19]:
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare            10
Cabin          687
Embarked         0
dtype: int64


## Fill missing Fare with median

In [20]:
df['Fare'].fillna(df['Fare'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(), inplace=True)


In [21]:
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64


## Label Encoding for Sex

In [22]:
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

## One-hot encoding for Embarked

In [25]:
df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked')


## Normalize numerical values

In [26]:
scaler = MinMaxScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

## Extract title from Name

In [27]:
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


## Simplify rare titles

In [28]:
rare_titles = df['Title'].value_counts()[df['Title'].value_counts() < 10].index
df['Title'] = df['Title'].replace(rare_titles, 'Rare')

## Encode Title

In [29]:
df['Title'] = le.fit_transform(df['Title'])


## Create FamilySize feature

In [30]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1


## Final preview

In [31]:
print(df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex       Age  SibSp  \
0                            Braund, Mr. Owen Harris    1  0.271174      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    0  0.472229      1   
2                             Heikkinen, Miss. Laina    0  0.321438      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  0.434531      1   
4                           Allen, Mr. William Henry    1  0.434531      0   

   Parch            Ticket      Fare Cabin  Embarked_C  Embarked_Q  \
0      0         A/5 21171  0.014151   NaN       False       False   
1      0          PC 17599  0.139136   C85        True       False   
2      0  STON/O2. 3101282  0.015469   NaN       False       False   
3      0            113803  0.1036