In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Reached only if every import in this cell succeeded.
print("Libraries imported successfully!")

In [None]:
# Read the Titanic passenger CSV directly from its raw GitHub URL.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Report (rows, columns), then preview the first five records.
print(f"Dataset Shape: {df.shape}")
df.head(n=5)

In [None]:
# Fill gaps in 'Age' with the column median and in 'Embarked' with its most
# frequent value.
# NOTE(review): both statistics are computed on the full frame, before the
# train/test split performed later in the notebook.
age_median = df['Age'].median()
embarked_mode = df['Embarked'].mode()[0]

df['Age'] = df['Age'].fillna(age_median)
df['Embarked'] = df['Embarked'].fillna(embarked_mode)

print("Missing values filled.")
# Per-column count of values that are still missing.
print(df.isnull().sum())

In [None]:
# Per-column count of missing values (isna is the canonical name for isnull).
df.isna().sum()

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on
# the df['Age'] selection: the in-place form acts on an intermediate object,
# raises a FutureWarning on pandas 2.x, and leaves df untouched under
# Copy-on-Write. Assignment is also safe to re-run (idempotent).
df['Age'] = df['Age'].fillna(df['Age'].median())
print("Age fixed.")

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on
# the df['Embarked'] selection: the in-place form acts on an intermediate
# object, raises a FutureWarning on pandas 2.x, and leaves df untouched under
# Copy-on-Write. mode()[0] picks the most frequent value.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
print("Embarked fixed.")

In [None]:
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger).
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)
df.loc[:, ['SibSp', 'Parch', 'FamilySize']].head()

In [None]:
# Binary flag: 1 when FamilySize is exactly 1, otherwise 0.
df['IsAlone'] = df['FamilySize'].eq(1).astype(int)
df.loc[:, ['FamilySize', 'IsAlone']].head()

In [None]:
# Capture the run of letters that follows a space and ends with a period in
# 'Name' (e.g. "Mr", "Mrs"). The pattern is a raw string so that `\.` reaches
# the regex engine as an escaped dot instead of being an invalid Python string
# escape (DeprecationWarning, SyntaxWarning on Python 3.12+).
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Frequency of each extracted title.
df['Title'].value_counts()

In [None]:
# Single lookup table for every title rewrite: uncommon titles collapse into
# 'Rare', and spelling variants are folded into 'Miss' / 'Mrs'. No target
# value is also a source value, so one pass equals the sequential rewrites.
title_groups = {
    **dict.fromkeys(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don',
         'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
        'Rare',
    ),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}
df['Title'] = df['Title'].replace(title_groups)
print("Titles grouped.")

In [None]:
# Columns that are one-hot encoded by the preprocessing step.
categorical_cols = [
    'Sex',
    'Embarked',
    'Title',
]
# Columns that are standardised by the preprocessing step.
numerical_cols = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'FamilySize',
    'IsAlone',
]

In [None]:
# Feature matrix: categorical columns first, then numerical ones.
feature_cols = [*categorical_cols, *numerical_cols]
X = df[feature_cols]
# Target vector.
y = df['Survived']

In [None]:
# Hold out 20% of the rows for testing, stratified on y, with a fixed seed
# so the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts
print("Data split done.")

In [None]:
# Standardise the numerical columns.
numeric_step = ('num', StandardScaler(), numerical_cols)
# One-hot encode the categorical columns; categories not seen during fit are
# ignored at transform time rather than raising.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Logistic-regression pipeline: preprocessing followed by the classifier.
# Pipeline does not copy its steps, so passing `preprocessor` directly would
# make every pipeline built from it share (and refit) one transformer object.
# clone() gives this pipeline its own unfitted copy with the same parameters.
model_lr = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

In [None]:
# Random-forest pipeline: preprocessing followed by the classifier.
# Pipeline does not copy its steps, so passing `preprocessor` directly would
# make every pipeline built from it share (and refit) one transformer object.
# clone() gives this pipeline its own unfitted copy with the same parameters.
model_rf = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

In [None]:
# Fit the logistic-regression pipeline on the training split.
# fit() returns the pipeline itself, so rebinding the name changes nothing.
print("Training Logistic Regression...")
model_lr = model_lr.fit(X=X_train, y=y_train)
print("Done!")

In [None]:
# Fit the random-forest pipeline on the training split.
# fit() returns the pipeline itself, so rebinding the name changes nothing.
print("Training Random Forest...")
model_rf = model_rf.fit(X=X_train, y=y_train)
print("Done!")

In [None]:
# Score the logistic-regression pipeline on the held-out split.
y_pred_lr = model_lr.predict(X_test)
acc_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print(f"Logistic Regression Accuracy: {acc_lr*100:.1f}%")

In [None]:
# Score the random-forest pipeline on the held-out split.
y_pred_rf = model_rf.predict(X_test)
acc_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Accuracy: {acc_rf*100:.1f}%")

In [None]:
# Pull the fitted steps out of the random-forest pipeline.
pre_step = model_rf.named_steps['preprocessor']
rf_step = model_rf.named_steps['classifier']

# Rebuild the transformed column names in the order the ColumnTransformer
# was declared: numerical columns first, then the one-hot encoded ones.
cat_names = pre_step.named_transformers_['cat'].get_feature_names_out(categorical_cols)
all_features = np.concatenate([numerical_cols, cat_names])
importances = rf_step.feature_importances_

# Pair each feature name with its importance and keep the ten largest.
feat_df = pd.DataFrame({'Feature': all_features, 'Importance': importances})
top_10 = feat_df.sort_values(by='Importance', ascending=False).iloc[:10]

In [None]:
# Horizontal bar chart of the ten most important features.
fig, ax = plt.subplots(figsize=(10, 6))
# seaborn 0.13+ deprecates `palette` without `hue`, so colour the bars by
# mapping `hue` to the same column as `y`; dodge=False keeps one full-width
# bar per feature.
sns.barplot(
    x='Importance', y='Feature', hue='Feature',
    data=top_10, palette='viridis', dodge=False, ax=ax,
)
# Older seaborn versions add a legend for the hue mapping; it only repeats
# the y-axis labels, so remove it when present.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title('Top 10 Features for Survival')
plt.show()