In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def MachineLearning(file_path):
    """Train and compare an SVM and a Random Forest on a Titanic-style CSV.

    The CSV at ``file_path`` is loaded, cleaned, feature-engineered, one-hot
    encoded, split 80/20 into train/test sets, and used to fit a linear-kernel
    ``SVC`` and a 100-tree ``RandomForestClassifier``. Accuracy, the
    classification report and the confusion matrix of each model on the test
    split are printed to stdout.

    The code reads the columns ``Age``, ``Embarked``, ``SibSp``, ``Parch``,
    ``Name``, ``Sex``, ``PassengerId``, ``Ticket``, ``Cabin`` and
    ``Survived``; a file lacking any of them will raise from pandas.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        None. If the file does not exist, an error message naming the path is
        printed and the function returns early without training anything.
    """
    try:
        # Load the dataset; only the read itself is guarded.
        titanic_data = pd.read_csv(file_path)
    except FileNotFoundError:
        # f-string so the offending path is actually shown to the user.
        print(f"Error: file '{file_path}' not found. ")
        return

    # --- Missing values -------------------------------------------------
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: the chained in-place form is deprecated and,
    # with pandas Copy-on-Write enabled, does not modify the frame at all.
    titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())
    titanic_data['Embarked'] = titanic_data['Embarked'].fillna(
        titanic_data['Embarked'].mode()[0]
    )

    # Address outliers (optional step, depending on the analysis from previous tasks)

    # --- Feature engineering --------------------------------------------
    # Family size = siblings/spouses + parents/children + the passenger.
    titanic_data['FamilySize'] = titanic_data['SibSp'] + titanic_data['Parch'] + 1
    # Title is the text between the first comma and the following period
    # of the Name field.
    titanic_data['Title'] = titanic_data['Name'].apply(
        lambda x: x.split(',')[1].split('.')[0].strip()
    )
    # value_counts() sorts by descending frequency, so the last four index
    # entries are the four least common titles; collapse them into 'Rare'.
    rare_titles = titanic_data['Title'].value_counts().index[-4:]
    titanic_data['Title'] = titanic_data['Title'].apply(
        lambda x: 'Rare' if x in rare_titles else x
    )
    # Drop columns that are not used as model features.
    titanic_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

    # One-hot encode the categorical variables; drop_first drops one level
    # per variable.
    titanic_data = pd.get_dummies(
        titanic_data, columns=['Sex', 'Embarked', 'Title'], drop_first=True
    )

    # --- Features / target ----------------------------------------------
    X = titanic_data.drop('Survived', axis=1)
    y = titanic_data['Survived']

    # 80/20 train/test split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # --- Models ---------------------------------------------------------
    # Model 1: Support Vector Machine (SVM) with a linear kernel.
    svm_model = SVC(kernel='linear', random_state=42)
    svm_model.fit(X_train, y_train)

    # Model 2: Random Forest Classifier.
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # --- Evaluation -----------------------------------------------------
    def evaluate_model(model, X_test, y_test):
        """Print accuracy, classification report and confusion matrix."""
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print("Accuracy:", accuracy)
        print("Classification Report:")
        print(classification_report(y_test, y_pred))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

    # Evaluate SVM model
    print("SVM Model Evaluation:")
    evaluate_model(svm_model, X_test, y_test)

    # Evaluate Random Forest model
    print("\nRandom Forest Model Evaluation:")
    evaluate_model(rf_model, X_test, y_test)

# Script entry point: run the full pipeline on the default dataset when this
# file is executed directly (not when it is imported).
if __name__ == "__main__":
    file_path = "titanic.csv"
    MachineLearning(file_path=file_path)



SVM Model Evaluation:
Accuracy: 0.8100558659217877
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       105
           1       0.77      0.77      0.77        74

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

Confusion Matrix:
[[88 17]
 [17 57]]

Random Forest Model Evaluation:
Accuracy: 0.8324022346368715
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       105
           1       0.80      0.80      0.80        74

    accuracy                           0.83       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.83      0.83      0.83       179

Confusion Matrix:
[[90 15]
 [15 59]]
