<a href="https://colab.research.google.com/github/Reuvenb29/di_bootcamp/blob/main/dibootcamp_w8_d2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

# Load Titanic dataset from Google Drive
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Di Bootcamp files/titanic/train.csv')

# Preview the shape
print("Original shape:", df.shape)


Original shape: (891, 12)


In [3]:
# Check for duplicates
duplicates = df.duplicated()

# Count how many duplicate rows there are
print("Number of duplicate rows:", duplicates.sum())


Number of duplicate rows: 0


In [4]:
# Drop duplicates (harmless even if there are none)
df = df.drop_duplicates()

# Confirm the shape is still the same
print("Final shape after duplicate removal (should be unchanged):", df.shape)


Final shape after duplicate removal (should be unchanged): (891, 12)


In [5]:
# Check for missing values in each column
df.isnull().sum()


Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [7]:
df.drop(columns=['Cabin'], inplace=True)


KeyError: "['Cabin'] not found in axis"

In [9]:
df.drop(columns=['Cabin'], inplace=True, errors='ignore')


In [10]:
# See most common value in 'Embarked'
print(df['Embarked'].value_counts())

# Fill missing values
df['Embarked'].fillna('S', inplace=True)


Embarked
S    644
C    168
Q     77
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna('S', inplace=True)


In [11]:
# Show median age
print("Median Age:", df['Age'].median())

# Fill missing values
df['Age'].fillna(df['Age'].median(), inplace=True)


Median Age: 28.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


In [12]:
# prompt: make sure no missing values remain

# Check for missing values again
print(df.isnull().sum())


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [13]:
# Create FamilySize feature
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1  # +1 includes the passenger themself
df[['SibSp', 'Parch', 'FamilySize']].head()


Unnamed: 0,SibSp,Parch,FamilySize
0,1,0,2
1,1,0,2
2,0,0,1
3,1,0,2
4,0,0,1


In [14]:
# Extract title from Name using regex
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
df['Title'].value_counts()


Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
Mr,517
Miss,182
Mrs,125
Master,40
Dr,7
Rev,6
Col,2
Mlle,2
Major,2
Ms,1


In [16]:
# Replace rare titles with 'Rare'
rare_titles = df['Title'].value_counts()[df['Title'].value_counts() < 10].index
df['Title'] = df['Title'].replace(rare_titles, 'Rare')


In [17]:
# prompt: i want to see the new range of titles after combining the rare ones into a dedxicated rare class

# Display the updated title counts after combining rare titles
print(df['Title'].value_counts())


Title
Mr        517
Miss      182
Mrs       125
Master     40
Rare       27
Name: count, dtype: int64


In [18]:
# One-hot encoding (drop_first=True avoids redundancy)
df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Title'], drop_first=True)

# Check new structure
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,2,True,False,True,False,True,False,False
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,2,False,False,False,False,False,True,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,1,False,False,True,True,False,False,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,2,False,False,True,False,False,True,False
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,1,True,False,True,False,True,False,False


In [19]:
# IQR for Fare
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print("Fare outlier bounds:", lower_bound, upper_bound)

# Count how many are outliers
outliers = df[(df['Fare'] < lower_bound) | (df['Fare'] > upper_bound)]
print("Number of Fare outliers:", outliers.shape[0])


Fare outlier bounds: -26.724 65.6344
Number of Fare outliers: 116


In [21]:
# Cap Fare values to remove outlier effect
df['Fare'] = df['Fare'].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))


In [23]:
# prompt: i want to know if there is any other data i shoud do a cap for other value to remove the outlier affect and if there is please apply

# Check for outliers in 'Age'
Q1_age = df['Age'].quantile(0.25)
Q3_age = df['Age'].quantile(0.75)
IQR_age = Q3_age - Q1_age

lower_bound_age = Q1_age - 1.5 * IQR_age
upper_bound_age = Q3_age + 1.5 * IQR_age

print("Age outlier bounds:", lower_bound_age, upper_bound_age)

# Count how many are outliers
outliers_age = df[(df['Age'] < lower_bound_age) | (df['Age'] > upper_bound_age)]
print("Number of Age outliers:", outliers_age.shape[0])

# Cap Age values to remove outlier effect
df['Age'] = df['Age'].apply(lambda x: upper_bound_age if x > upper_bound_age else (lower_bound_age if x < lower_bound_age else x))


# Check for outliers in 'FamilySize'
Q1_family = df['FamilySize'].quantile(0.25)
Q3_family = df['FamilySize'].quantile(0.75)
IQR_family = Q3_family - Q1_family

lower_bound_family = Q1_family - 1.5 * IQR_family
upper_bound_family = Q3_family + 1.5 * IQR_family

print("FamilySize outlier bounds:", lower_bound_family, upper_bound_family)

# Count how many are outliers
outliers_family = df[(df['FamilySize'] < lower_bound_family) | (df['FamilySize'] > upper_bound_family)]
print("Number of FamilySize outliers:", outliers_family.shape[0])

# Cap FamilySize values to remove outlier effect
df['FamilySize'] = df['FamilySize'].apply(lambda x: upper_bound_family if x > upper_bound_family else (lower_bound_family if x < lower_bound_family else x))


Age outlier bounds: 2.5 54.5
Number of Age outliers: 0
FamilySize outlier bounds: -0.5 3.5
Number of FamilySize outliers: 91


In [24]:
# prompt: i want to standardise and normalise the data. can you import the releveant scalers to do so. i also dont want to loose the original data

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

# Assuming 'df' is your DataFrame from the previous code

# Create copies to avoid modifying the original DataFrame
df_standardized = df.copy()
df_normalized = df.copy()

# Select numerical columns for scaling
numerical_cols = ['Age', 'Fare', 'FamilySize', 'Parch', 'SibSp']


# Standardization
scaler = StandardScaler()
df_standardized[numerical_cols] = scaler.fit_transform(df_standardized[numerical_cols])


# Normalization
min_max_scaler = MinMaxScaler()
df_normalized[numerical_cols] = min_max_scaler.fit_transform(df_normalized[numerical_cols])

# Now you have two new DataFrames:
# - df_standardized: with standardized numerical features
# - df_normalized: with normalized numerical features
# - df: your original DataFrame remains unchanged

print("Standardized Data:")
print(df_standardized.head())
print("\nNormalized Data:")
print(df_normalized.head())
print("\nOriginal Data:")
print(df.head())


Standardized Data:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name       Age     SibSp  \
0                            Braund, Mr. Owen Harris -0.583432  0.432793   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  0.742685  0.432793   
2                             Heikkinen, Miss. Laina -0.251903 -0.474545   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  0.494038  0.432793   
4                           Allen, Mr. William Henry  0.494038 -0.474545   

      Parch            Ticket      Fare  FamilySize  Sex_male  Embarked_Q  \
0 -0.473674         A/5 21171 -0.820552    0.366685      True       False   
1 -0.473674          PC 17599  2.031623    0.366685     False       False   
2 -0.473674  STON/O2. 3101282 -0.787578   -0.727841     False       False   

In [25]:
df[['Sex', 'Embarked', 'Title']].head()


KeyError: "None of [Index(['Sex', 'Embarked', 'Title'], dtype='object')] are in the [columns]"

In [26]:
df.columns


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'FamilySize', 'Sex_male', 'Embarked_Q', 'Embarked_S',
       'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare'],
      dtype='object')

In [27]:
# Define age bins and labels
age_bins = [0, 12, 18, 35, 60, 100]
age_labels = ['Child', 'Teen', 'Adult', 'Senior', 'Elder']

# Create AgeGroup column
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels)

# Preview
df[['Age', 'AgeGroup']].head()


Unnamed: 0,Age,AgeGroup
0,22.0,Adult
1,38.0,Senior
2,26.0,Adult
3,35.0,Adult
4,35.0,Adult


In [28]:
# One-hot encode AgeGroup
df = pd.get_dummies(df, columns=['AgeGroup'], drop_first=True)

# Preview
df.filter(like='AgeGroup_').head()


Unnamed: 0,AgeGroup_Teen,AgeGroup_Adult,AgeGroup_Senior,AgeGroup_Elder
0,False,True,False,False
1,False,False,True,False
2,False,True,False,False
3,False,True,False,False
4,False,True,False,False
