In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Lift the column cap so wide frames are rendered without column truncation.
pd.options.display.max_columns = None

In [None]:
# Read the training CSV (path is relative to the notebook's working directory)
# into the DataFrame that every later cell operates on.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# --- Peek at both ends of the table ---
print("## First 10 Records ##")
display(df.head(n=10))

print("\n## Last 15 Records ##")
display(df.tail(n=15))

# --- Summary statistics for every column (numeric and non-numeric) ---
print("\n## Statistics for All Attributes ##")
summary_stats = df.describe(include='all')
display(summary_stats)

# --- Column dtypes and non-null counts; info() writes straight to stdout ---
print("\n## Data Types of All Attributes ##")
df.info()

# --- Missing values: build the boolean mask once and reuse it below ---
missing_mask = df.isnull()

print("\n## Boolean Mask of Missing Values (First 5 Rows) ##")
display(missing_mask.head())

print("\n## Count of Missing Values per Attribute ##")
print(missing_mask.sum())

# Note: The Titanic dataset uses standard NaN for missing values,
# so there are no non-standard values like '?' or 'N/A' to replace.

## First 10 Records ##


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C



## Last 15 Records ##


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
876,877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20.0,0,0,7534,9.8458,,S
877,878,0,3,"Petroff, Mr. Nedelio",male,19.0,0,0,349212,7.8958,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0,,S
881,882,0,3,"Markun, Mr. Johann",male,33.0,0,0,349257,7.8958,,S
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q



## Statistics for All Attributes ##


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Dooley, Mr. Patrick",male,,,,347082.0,,G6,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,



## Data Types of All Attributes ##
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

## Boolean Mask of Missing Values (First 5 Rows) ##


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False



## Count of Missing Values per Attribute ##
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [None]:
# Handle the three columns that contain missing values: Age, Embarked, Cabin.
#
# Each fill is written as an explicit column assignment. The previous form,
# df[col].fillna(value, inplace=True), mutates an intermediate Series; pandas
# warns that this will stop updating the DataFrame in pandas 3.0.

# Use median to fill missing Age values.
# Median is more robust to outliers (e.g., a few very old passengers) than the mean.
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)
print(f"Missing 'Age' values after median imputation: {df['Age'].isnull().sum()}")

# Use mode to fill missing Embarked values.
# Mode is the standard method for imputing missing categorical data.
# mode() returns a Series (ties are possible); [0] takes the first entry.
mode_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)
print(f"Missing 'Embarked' values after mode imputation: {df['Embarked'].isnull().sum()}")

# Use removal of column for Cabin.
# The 'Cabin' column is missing too many values (over 77%) to be useful.
# errors='ignore' keeps this cell re-runnable after 'Cabin' is already gone.
df = df.drop(columns='Cabin', errors='ignore')
print("\n'Cabin' column removed from DataFrame.")
print("Current columns:", df.columns)

Missing 'Age' values after median imputation: 0
Missing 'Embarked' values after mode imputation: 0

'Cabin' column removed from DataFrame.
Current columns: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(median_age, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(mode_embarked, inplace=True)


In [None]:
# Discretize Age in two ways: a two-way split at the mean, and fixed
# life-stage boundaries.

# Apply bin by mean
# right=False makes the intervals left-closed: [min-1, mean) and [mean, max+1).
# An age exactly equal to the mean therefore falls in the second bin, which is
# what the 'Above or Equal to Mean Age' label promises. (With pd.cut's default
# right=True such a value would be labelled 'Below Mean Age'.)
# The +/-1 padding keeps the minimum and maximum ages strictly inside the edges.
mean_age = df['Age'].mean()
df['Age_Bin_Mean'] = pd.cut(df['Age'],
                            bins=[df['Age'].min()-1, mean_age, df['Age'].max()+1],
                            labels=['Below Mean Age', 'Above or Equal to Mean Age'],
                            right=False)

print("## Age Binned by Mean ##")
display(df[['Age', 'Age_Bin_Mean']].head())

# Apply bin by boundaries
# Default right-closed intervals: (0, 12], (12, 19], (19, 59], (59, max+1].
df['Age_Bin_Boundary'] = pd.cut(df['Age'],
                                bins=[0, 12, 19, 59, df['Age'].max()+1],
                                labels=['Child', 'Teen', 'Adult', 'Senior'])

print("\n## Age Binned by Custom Boundaries ##")
display(df[['Age', 'Age_Bin_Boundary']].head())

## Age Binned by Mean ##


Unnamed: 0,Age,Age_Bin_Mean
0,22.0,Below Mean Age
1,38.0,Above or Equal to Mean Age
2,26.0,Below Mean Age
3,35.0,Above or Equal to Mean Age
4,35.0,Above or Equal to Mean Age



## Age Binned by Custom Boundaries ##


Unnamed: 0,Age,Age_Bin_Boundary
0,22.0,Adult
1,38.0,Adult
2,26.0,Adult
3,35.0,Adult
4,35.0,Adult


In [None]:
# Show, cell by cell, whether each value in the frame is missing (True = null).
print("Boolean format (is null?):")
null_mask = df.isnull()
display(null_mask)

Boolean format (is null?):


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
# Rescale Fare three ways, each stored in its own new column.

# Min-max normalization (scales to a [0, 1] range)
min_max_scaler = MinMaxScaler()
df['Fare_MinMax'] = min_max_scaler.fit_transform(df[['Fare']])
print("## Fare after Min-Max Normalization ##")
display(df[['Fare', 'Fare_MinMax']].head())

# Z-score normalization (scales to a mean of 0 and std dev of 1)
z_score_scaler = StandardScaler()
df['Fare_ZScore'] = z_score_scaler.fit_transform(df[['Fare']])
print("\n## Fare after Z-Score Normalization ##")
display(df[['Fare', 'Fare_ZScore']].head())

# Decimal scaling (moves decimal so max absolute value is < 1)
# k is the smallest integer with max|Fare| / 10**k < 1, i.e. floor(log10(max)) + 1.
# ceil(log10(max)) is off by one when max is an exact power of ten (100 -> k=2,
# so the scaled max would be exactly 1.0), and log10 is undefined for 0, so a
# non-positive maximum falls back to k = 0 (no scaling).
max_abs_fare = df['Fare'].abs().max()
k = int(np.floor(np.log10(max_abs_fare))) + 1 if max_abs_fare > 0 else 0
df['Fare_Decimal'] = df['Fare'] / (10**k)
print("\n## Fare after Decimal Scaling ##")
display(df[['Fare', 'Fare_Decimal']].head())

## Fare after Min-Max Normalization ##


Unnamed: 0,Fare,Fare_MinMax
0,7.25,0.014151
1,71.2833,0.139136
2,7.925,0.015469
3,53.1,0.103644
4,8.05,0.015713



## Fare after Z-Score Normalization ##


Unnamed: 0,Fare,Fare_ZScore
0,7.25,-0.502445
1,71.2833,0.786845
2,7.925,-0.488854
3,53.1,0.42073
4,8.05,-0.486337



## Fare after Decimal Scaling ##


Unnamed: 0,Fare,Fare_Decimal
0,7.25,0.00725
1,71.2833,0.071283
2,7.925,0.007925
3,53.1,0.0531
4,8.05,0.00805


In [None]:
# ---- Nominal attributes -> indicator columns ----
# drop_first=True drops the first level of each column, so the result has one
# indicator for Sex and two for Embarked. The encoded copy lives in
# df_one_hot; df itself is not modified by this step.
nominal_cols = ['Sex', 'Embarked']
df_one_hot = pd.get_dummies(data=df, columns=nominal_cols, drop_first=True)

print("## DataFrame after One-Hot Encoding (showing new columns) ##")
shape_before, shape_after = df.shape, df_one_hot.shape
print(f"Original shape: {shape_before}")
print(f"Shape after one-hot encoding: {shape_after}")
display(df_one_hot.head())

# ---- Ordinal attribute -> z-score ----
# Pclass is ordered, so it is first re-coded to integers whose magnitude
# follows the ordering, then standardized.

# Re-code so that a better class gets a larger number: 1st -> 3, 2nd -> 2, 3rd -> 1.
pclass_map = {1: 3, 2: 2, 3: 1}
pclass_encoded = df['Pclass'].map(pclass_map)
df['Pclass_Encoded'] = pclass_encoded

# Standardize the re-coded column.
scaler = StandardScaler()
pclass_scaled = scaler.fit_transform(df[['Pclass_Encoded']])
df['Pclass_ZScore'] = pclass_scaled

print("\n## Ordinal Attributes after Encoding and Z-Score Normalization ##")
ordinal_view = df[['Pclass', 'Pclass_Encoded', 'Pclass_ZScore']]
display(ordinal_view.head())

## DataFrame after One-Hot Encoding (showing new columns) ##
Original shape: (891, 16)
Shape after one-hot encoding: (891, 17)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Age_Bin_Mean,Age_Bin_Boundary,Fare_MinMax,Fare_ZScore,Fare_Decimal,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,Below Mean Age,Adult,0.014151,-0.502445,0.00725,True,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,Above or Equal to Mean Age,Adult,0.139136,0.786845,0.071283,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,Below Mean Age,Adult,0.015469,-0.488854,0.007925,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,Above or Equal to Mean Age,Adult,0.103644,0.42073,0.0531,False,False,True
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,Above or Equal to Mean Age,Adult,0.015713,-0.486337,0.00805,True,False,True



## Ordinal Attributes after Encoding and Z-Score Normalization ##


Unnamed: 0,Pclass,Pclass_Encoded,Pclass_ZScore
0,3,1,-0.827377
1,1,3,1.566107
2,3,1,-0.827377
3,1,3,1.566107
4,3,1,-0.827377


In [None]:
# Tally the null entries in each column of the current frame and print them.
print("Count of missing values per attribute:")
missing_counts = df.isnull().sum()
print(missing_counts)

Count of missing values per attribute:
Employee_Name                   0
EmpID                           0
MarriedID                       0
MaritalStatusID                 0
GenderID                        0
EmpStatusID                     0
DeptID                          0
PerfScoreID                     0
FromDiversityJobFairID          0
Salary                          0
Termd                           0
PositionID                      0
Position                        0
State                           0
Zip                             0
DOB                             0
Sex                             0
MaritalDesc                     0
CitizenDesc                     0
HispanicLatino                  0
RaceDesc                        0
DateofHire                      0
DateofTermination             207
TermReason                    207
EmploymentStatus                0
Department                      0
ManagerName                     0
ManagerID                       8
Recruitme