In [3]:
# titanic_ml.py

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

# -----------------------------
# 1. Load Dataset
# -----------------------------
data = pd.read_csv("titanic.csv")

print("Dataset Shape:", data.shape)
print(data.head())

# -----------------------------
# 2. Data Preprocessing
# -----------------------------

# Drop unnecessary columns
# Normalize column names to match the rest of the notebook, then drop unused ones
data.rename(columns={
    'passenger_id': 'PassengerId',
    'name': 'Name',
    'p_class': 'Pclass',
    'sex': 'Sex',
    'age': 'Age',
    'sib_sp': 'SibSp',
    'parch': 'Parch',
    'ticket': 'Ticket',
    'fare': 'Fare',
    'cabin': 'Cabin',
    'embarked': 'Embarked',
    'survived': 'Survived'
}, inplace=True)

data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# Handle missing values
data['Age'].fillna(data['Age'].median(), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])
data['Embarked'] = label_encoder.fit_transform(data['Embarked'])

# -----------------------------
# 3. Feature Selection
# -----------------------------
X = data.drop('Survived', axis=1)
y = data['Survived']

# -----------------------------
# 4. Train-Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 5. Model Training
# -----------------------------
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

model.fit(X_train, y_train)

# -----------------------------
# 6. Prediction
# -----------------------------
y_pred = model.predict(X_test)

# -----------------------------
# 7. Evaluation
# -----------------------------
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Dataset Shape: (891, 12)
   passenger_id                                               name  p_class  \
0             1                            Braund, Mr. Owen Harris        3   
1             2  Cumings, Mrs. John Bradley (Florence Briggs Th...        1   
2             3                             Heikkinen, Miss. Laina        3   
3             4       Futrelle, Mrs. Jacques Heath (Lily May Peel)        1   
4             5                           Allen, Mr. William Henry        3   

      sex   age  sib_sp  parch            ticket     fare cabin embarked  \
0    male  22.0       1      0         A/5 21171   7.2500   NaN        S   
1  female  38.0       1      0          PC 17599  71.2833   C85        C   
2  female  26.0       0      0  STON/O2. 3101282      NaN   NaN        S   
3  female  35.0       1      0            113803  53.1000  C123        S   
4    male  35.0       0      0            373450   8.0500   NaN        S   

   survived  
0         0  
1         1  
2

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
