In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Load Titanic dataset from seaborn
data = sns.load_dataset("titanic")

# Drop columns with too many missing values or irrelevant info
data.drop(['deck', 'embark_town', 'alive', 'class', 'who'], axis=1, inplace=True)

# Drop rows with missing target (survived)
data.dropna(subset=['survived'], inplace=True)

# Fill missing values
data['age'].fillna(data['age'].median(), inplace=True)
data['embarked'].fillna(data['embarked'].mode()[0], inplace=True)

# Encode categorical variables
label_cols = ['sex', 'embarked', 'alone']
le = LabelEncoder()
for col in label_cols:
    data[col] = le.fit_transform(data[col])

# Define features and target
X = data.drop(['survived'], axis=1)
y = data['survived']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVC": SVC(probability=True)
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-score": f1_score(y_test, y_pred)
    }

results_df = pd.DataFrame(results).T.sort_values("F1-score", ascending=False)
print("Model performance:")
print(results_df)

# Hyperparameter Tuning for best models (Random Forest and Gradient Boosting)
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None]
}

param_grid_gb = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2]
}

grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring='f1')
grid_rf.fit(X_train, y_train)

rand_gb = RandomizedSearchCV(GradientBoostingClassifier(), param_grid_gb, cv=5, n_iter=5, scoring='f1', random_state=42)
rand_gb.fit(X_train, y_train)

print("\nBest Random Forest Parameters:", grid_rf.best_params_)
print("Best Gradient Boosting Parameters:", rand_gb.best_params_)

# Evaluate best models
y_pred_rf = grid_rf.predict(X_test)
y_pred_gb = rand_gb.predict(X_test)

print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb))


Model performance:
                     Accuracy  Precision    Recall  F1-score
Gradient Boosting    0.832402   0.833333  0.743243  0.785714
Logistic Regression  0.815642   0.797101  0.743243  0.769231
Random Forest        0.810056   0.777778  0.756757  0.767123
SVC                  0.659218   0.760000  0.256757  0.383838

Best Random Forest Parameters: {'max_depth': 5, 'n_estimators': 100}
Best Gradient Boosting Parameters: {'n_estimators': 150, 'learning_rate': 0.2}

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       105
           1       0.80      0.70      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179


Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       105
           