In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib


def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Anything ``pd.read_csv`` accepts as its first argument
            (the script passes a path string).

    Returns:
        The parsed DataFrame, exactly as ``pd.read_csv`` produces it with
        default options.
    """
    frame = pd.read_csv(file_path)
    return frame


def preprocess_data(df):
    """Clean, encode and scale the passenger table for modelling.

    Steps, in order:
      1. Drop the "PassengerId", "Name", "Ticket" and "Cabin" columns
         (silently skipping any that are absent).
      2. Fill missing "Age" values with the column median and missing
         "Fare" values with the column mean.
      3. Drop every row that still contains a missing value.
      4. Integer-encode "Sex" and "Embarked" with a label encoder.
      5. Standardise "Age" and "Fare" with a standard scaler.

    Args:
        df: DataFrame containing at least the "Age", "Fare", "Sex" and
            "Embarked" columns.

    Returns:
        A new DataFrame with the transformations applied; ``drop`` yields
        a fresh frame, so the caller's object is left as it was.

    NOTE(review): every transformer here is fitted on the full frame that
    is passed in. When this runs before a train/test split, statistics
    from the held-out rows feed into the training features — confirm that
    is acceptable for the intended use.
    """
    unused_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
    df = df.drop(columns=unused_columns, errors='ignore')

    # Per-column imputation strategy: median for Age, mean for Fare.
    for column, strategy in (("Age", "median"), ("Fare", "mean")):
        imputer = SimpleImputer(strategy=strategy)
        df[column] = imputer.fit_transform(df[[column]])

    # Whatever is still missing after imputation is removed row-wise.
    df.dropna(inplace=True)

    # One encoder object is re-fitted for each categorical column in turn.
    encoder = LabelEncoder()
    for column in ("Sex", "Embarked"):
        df[column] = encoder.fit_transform(df[column])

    # Put the two continuous features on a common scale.
    continuous = ["Age", "Fare"]
    df[continuous] = StandardScaler().fit_transform(df[continuous])

    return df

# Train Model
def train_model(X_train, y_train):
    """Fit a random forest classifier on the training split.

    The forest uses 100 trees and a fixed ``random_state`` of 42 so that
    repeated runs on the same data build the same model.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    # ``fit`` hands back the estimator itself, so it can be returned directly.
    return forest.fit(X_train, y_train)

# Evaluate Model
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        A dict with the keys "Accuracy", "Precision", "Recall" and
        "F1 Score" (in that order), each holding the corresponding metric
        computed with its default arguments.
    """
    predictions = model.predict(X_test)

    # (label, metric function) pairs; order fixes the order of the result dict.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    return {label: score(y_test, predictions) for label, score in scorers}

if __name__ == "__main__":
    # The dataset is expected to sit in the working directory.
    file_path = "tested.csv"

    # Load the raw CSV and run it straight through preprocessing.
    df = preprocess_data(load_data(file_path))

    # Separate the target column from the features, then hold out 20% of
    # the rows for evaluation (fixed seed for a reproducible split).
    y = df["Survived"]
    X = df.drop(columns=["Survived"])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit on the training split, score on the held-out split.
    model = train_model(X_train, y_train)
    results = evaluate_model(model, X_test, y_test)

    # NOTE(review): if every metric comes out as exactly 1.0, check the
    # dataset for a feature that fully determines the label.
    print("Model Evaluation:")
    for name, score in results.items():
        print(f"{name}: {score:.4f}")

    # Persist the fitted model to disk for later reuse.
    joblib.dump(model, "titanic_model.pkl")
    print("Model saved as titanic_model.pkl")


Model Evaluation:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Model saved as titanic_model.pkl
