In [1]:
!pip install pandas matplotlib seaborn openpyxl




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Titanic Cleaning & Feature Importance (v1)
This notebook focuses on cleaning the Titanic dataset and training a simple model to evaluate feature importance.

In [2]:
import pandas as pd

df = pd.read_csv("Titanic_dataset.csv") 
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# 1. View first rows of the dataset
df.head()

# 2. General information about columns and data types
df.info()

# 3. Descriptive statistics of numeric columns
df.describe()

# 4. Count of null values per column
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# --- Step 1: Rename columns to lowercase for consistency ---
df.columns = df.columns.str.lower()
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [5]:
# --- Step 2: Drop irrelevant columns or columns with too many nulls ---
df = df.drop(['name', 'ticket', 'cabin'], axis=1)
df.head()

Unnamed: 0,passengerid,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [6]:
# --- Step 3: Fill missing values ---
# Fill Age with median
df['age'].fillna(df['age'].median(), inplace=True)

# Fill Embarked with the most frequent value
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)

# Verify no nulls remain
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)


passengerid    0
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
dtype: int64

In [7]:
# Convert 'sex' to binary variable: male=0, female=1
df['sex'] = df['sex'].map({'male': 0, 'female': 1})

# Create dummy variables for 'embarked'
df = pd.get_dummies(df, columns=['embarked'], drop_first=True)

# Verify the result
df.head()

Unnamed: 0,passengerid,survived,pclass,sex,age,sibsp,parch,fare,embarked_Q,embarked_S
0,1,0,3,0,22.0,1,0,7.25,False,True
1,2,1,1,1,38.0,1,0,71.2833,False,False
2,3,1,3,1,26.0,0,0,7.925,False,True
3,4,1,1,1,35.0,1,0,53.1,False,True
4,5,0,3,0,35.0,0,0,8.05,False,True


In [8]:
from sklearn.preprocessing import StandardScaler

# 1️⃣ Target variable
y = df['survived']   # what we want to predict

# 2️⃣ Features (drop 'survived' so it does not mix)
X = df.drop(columns=['survived'])

# 3️⃣ Scale ONLY numeric columns
scaler = StandardScaler()
X_scaled = X.copy()  # to not overwrite in case we want to compare
num_cols = ['pclass', 'age', 'sibsp', 'parch', 'fare']
X_scaled[num_cols] = scaler.fit_transform(X_scaled[num_cols])

# Check first rows to verify changes
X_scaled.head()

Unnamed: 0,passengerid,pclass,sex,age,sibsp,parch,fare,embarked_Q,embarked_S
0,1,0.827377,0,-0.565736,0.432793,-0.473674,-0.502445,False,True
1,2,-1.566107,1,0.663861,0.432793,-0.473674,0.786845,False,False
2,3,0.827377,1,-0.258337,-0.474545,-0.473674,-0.488854,False,True
3,4,-1.566107,1,0.433312,0.432793,-0.473674,0.42073,False,True
4,5,0.827377,0,0.433312,-0.474545,-0.473674,-0.486337,False,True


In [9]:
from sklearn.model_selection import train_test_split

# Split data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Check dataset sizes
print(f"Train: {X_train.shape}, Test: {X_test.shape}")


Train: (712, 9), Test: (179, 9)


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1️⃣ Create the model
model = RandomForestClassifier(random_state=42)

# 2️⃣ Train the model with training data
model.fit(X_train, y_train)

# 3️⃣ Predict with test data
y_pred = model.predict(X_test)

# 4️⃣ Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8379888268156425

Confusion Matrix:
 [[94 11]
 [18 56]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87       105
           1       0.84      0.76      0.79        74

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179



# --- Note on Random Forest results ---
# Accuracy: 0.838 → The model predicts correctly about 83.8% of the time
# Confusion Matrix:
# [[94 11]
#  [18 56]]
# Rows = actual values, Columns = predictions
# Most errors are survivors predicted as non-survivors
#
# Classification Report:
# - Precision: percentage of correct predictions per class
# - Recall: percentage of true positives identified
# - F1-score: balance between precision and recall
# In our case:
# - Class 0 (Did not survive): precision 0.84, recall 0.90 → the model identifies non-survivors well
# - Class 1 (Survived): precision 0.84, recall 0.76 → some survivors are misclassified

In [12]:
# Feature importance
importances = model.feature_importances_

# Associate with column names
feature_importances = pd.Series(importances, index=X_train.columns)

# Sort from highest to lowest importance
feature_importances.sort_values(ascending=False)


sex            0.264303
fare           0.196071
passengerid    0.190691
age            0.167558
pclass         0.080570
sibsp          0.041881
parch          0.028616
embarked_S     0.021430
embarked_Q     0.008880
dtype: float64

In [14]:
df.to_csv("titanic_cleaned.csv", index=False)