# Exploratory Data Analysis

### Data Dictionary

| Variable   | Definition                            | Key                            |
|------------|---------------------------------------|--------------------------------|
| survival   | Survival                              | 0 = No, 1 = Yes                |
| pclass     | Ticket class                          | 1 = 1st, 2 = 2nd, 3 = 3rd      |
| sex        | Sex                                   |                                |
| age        | Age in years                          |                                |
| sibsp      | # of siblings/spouses aboard Titanic  |                                |
| parch      | # of parents/children aboard Titanic  |                                |
| ticket     | Ticket number                         |                                |
| fare       | Passenger fare                        |                                |
| cabin      | Cabin number                          |                                |
| embarked   | Port of Embarkation                   | C = Cherbourg, Q = Queenstown, S = Southampton |


# 1. Problem Definition

### Use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.
### Specific Task: find patterns in train.csv that help predict whether the passengers in test.csv survived.

# 2. Data Collection and Exploration

In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Step 1: Read the training and test splits from disk (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train/train.csv', 'data/test/test.csv')
)
# Both frames gathered in a single list
combine_df = [train_df, test_df]


In [None]:
# Column names available in the training data
print(f"Features:\n{train_df.columns.values}")
print()

# Display first few rows of train and test data.
# A bare `df.head()` is only rendered by the notebook when it is the last
# expression of a cell, so print explicitly to make both previews visible.
print(train_df.head())
print(test_df.head())

# Summary statistics of the numeric columns
print(train_df.describe())
print(test_df.describe())

# Number of missing values per column
print(train_df.isnull().sum())
print(test_df.isnull().sum())


### Visualize Survival Rates by Gender

In [None]:
sns.set_theme(style="whitegrid")

# Survivor / non-survivor passenger counts per gender, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_df, x='Sex', hue='Survived', palette='Set1', ax=ax)
ax.set_title("Survival Rates by Gender")
ax.set_ylabel("Count")
plt.show()


### Visualize Survival Rates by Passenger Class

In [None]:
# Survivor / non-survivor passenger counts per ticket class
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=train_df,
    x='Pclass',
    hue='Survived',
    palette='Set1',
    legend=True,
    ax=ax,
)
ax.set_title("Survival Rates by Passenger Class")
ax.set_ylabel("Count")
plt.show()


### Visualize Survival Rates by Age

In [None]:
# Create a plot that shows survival based on age
plt.figure(figsize=(8, 6))
sns.histplot(data=train_df, x='Age', hue='Survived', multiple='stack', bins=30, palette='Set1')
plt.title("Survival Rates by Age")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()


### Visualize Age Distribution of Passengers

In [None]:
# Histogram (30 bins) with a KDE overlay of the ages that are actually recorded
known_ages = train_df['Age'].dropna()
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(known_ages, bins=30, kde=True, color='red', ax=ax)
ax.set_title("Age Distribution of Passengers")
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
plt.show()


# 3. Feature Engineering

In [None]:
def preprocess_data(df):
    """Impute, engineer, encode and scale one Titanic passenger table.

    Steps, in order:
      1. Fill missing ``Age`` and ``Fare`` with that column's median and
         missing ``Embarked`` with its most frequent value.
      2. Add ``CabinAvailable`` (1 when ``Cabin`` is present, else 0),
         ``FamilySize`` (``SibSp + Parch + 1``) and ``Title`` (the text
         between the comma and the first period of ``Name``).
      3. Drop ``Name``, ``Ticket``, ``Cabin`` and ``PassengerId``.
      4. One-hot encode ``Sex``, ``Embarked`` and ``Title``, dropping the
         first level of each.
      5. Standardise ``Fare`` and ``Age`` to zero mean and unit variance.

    Every statistic (median, mode, mean, standard deviation) is computed
    from ``df`` itself, so calling this separately on two tables scales each
    one with its own statistics.

    Args:
        df: DataFrame containing the columns ``Age``, ``Embarked``, ``Fare``,
            ``Cabin``, ``SibSp``, ``Parch``, ``Name`` and ``Sex``.

    Returns:
        A new, preprocessed DataFrame. ``df`` itself is left unmodified.

    Raises:
        IndexError: If a ``Name`` value contains no comma.
    """
    # Work on a copy so the caller's DataFrame is never modified in place
    df = df.copy()

    # Fill missing age values with the median
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # Fill missing embarked values with the most common value; mode() is
    # empty when the whole column is missing, in which case nothing is filled
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    # Fill missing fare values with the median (important for test data)
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # 'CabinAvailable' flags whether cabin info is recorded for the passenger
    df['CabinAvailable'] = df['Cabin'].notna().astype('int64')

    # Family size: siblings/spouses + parents/children + the passenger
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # Extract title from the Name column (e.g., Mr, Mrs, Miss)
    df['Title'] = df['Name'].apply(
        lambda name: name.split(',')[1].split('.')[0].strip()
    )

    # Drop columns that are not used as features; absent ones are ignored
    df = df.drop(
        columns=['Name', 'Ticket', 'Cabin', 'PassengerId'], errors='ignore'
    )

    # Encode categorical variables (Sex, Embarked, Title)
    df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Title'], drop_first=True)

    # Scale continuous variables (Fare and Age) to zero mean / unit variance.
    # The population standard deviation (ddof=0) is the one scikit-learn's
    # StandardScaler uses; computing it here avoids depending on that class,
    # which this notebook never imports.
    scaled_cols = ['Fare', 'Age']
    values = df[scaled_cols].astype('float64')
    std = values.std(ddof=0)
    # A constant column has std 0; divide by 1 instead so it maps to 0.0
    std = std.where(std != 0, 1.0)
    df[scaled_cols] = (values - values.mean()) / std

    return df

# Re-read the raw CSVs so preprocessing starts from unprocessed data
train_df = pd.read_csv('data/train/train.csv')
test_df = pd.read_csv('data/test/test.csv')

# Step 2: Keep the test PassengerId values; preprocessing drops that column
passenger_id_test = test_df['PassengerId'].copy()

# Run the same preprocessing over each split (train first, then test)
train_df, test_df = preprocess_data(train_df), preprocess_data(test_df)

# Step 3: Separate the target from the training features
y_train = train_df['Survived']
X_train = train_df.drop(columns='Survived')

# Give the test matrix exactly the training columns, in the same order:
# columns absent from the test split are created and filled with 0, and
# columns that exist only in the test split are discarded
train_columns = X_train.columns
X_test = test_df.reindex(columns=train_columns, fill_value=0)


# 4. Model Building

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# cross_val_score is called below, so it has to be imported in this cell
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Select and initialize the candidate models
logreg = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_model = SVC()

# Each model is scored with 5-fold cross-validation on the training data;
# the reported figure is the mean accuracy over the five folds.

# Cross-validate Logistic Regression
cv_scores_logreg = cross_val_score(logreg, X_train, y_train, cv=5, scoring='accuracy')
logreg_accuracy = cv_scores_logreg.mean()

# Cross-validate Random Forest
cv_scores_rf = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')
rf_accuracy = cv_scores_rf.mean()

# Cross-validate SVM
cv_scores_svm = cross_val_score(svm_model, X_train, y_train, cv=5, scoring='accuracy')
svm_accuracy = cv_scores_svm.mean()

# Output cross-validation accuracies
print(f"Logistic Regression Cross-Validation Accuracy: {logreg_accuracy:.4f}")
print(f"Random Forest Cross-Validation Accuracy: {rf_accuracy:.4f}")
print(f"SVM Cross-Validation Accuracy: {svm_accuracy:.4f}")


### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Step 1: Candidate hyperparameter values for each model

# Logistic Regression: inverse regularisation strength `C` and solver
param_grid_logreg = dict(
    C=[0.1, 1, 10, 100],
    solver=['liblinear', 'lbfgs'],
)

# Random Forest: forest size, tree depth limit and minimum split size
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# SVM: penalty `C`, kernel type and kernel coefficient `gamma`
param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

# Step 2: Exhaustive grid search per model, 5-fold CV scored by accuracy

# Logistic Regression tuning
grid_search_logreg = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid_logreg,
    scoring='accuracy',
    cv=5,
)
grid_search_logreg.fit(X_train, y_train)

# Random Forest tuning
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
)
grid_search_rf.fit(X_train, y_train)

# SVM tuning
grid_search_svm = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid_svm,
    scoring='accuracy',
    cv=5,
)
grid_search_svm.fit(X_train, y_train)


# 5. Model Evaluation - Accuracy, Cross Validation

In [None]:
# Step 3: Report the best parameter set and its cross-validation accuracy
# for each tuned model, in the order the searches were run
tuned_searches = (
    ("Logistic Regression", grid_search_logreg),
    ("Random Forest", grid_search_rf),
    ("SVM", grid_search_svm),
)
for model_label, search in tuned_searches:
    print(f"Best parameters for {model_label}:", search.best_params_)
    print(f"Best cross-validation accuracy for {model_label}: {search.best_score_:.4f}")

# 6. Generate Predictions Using the Best SVM Model

In [None]:
# Select best model: the SVM estimator with the best parameters found by the
# grid search is used as the final model
best_model = grid_search_svm.best_estimator_

# Make predictions on the test set using the best model; X_test must carry
# the same columns, in the same order, as the training features
y_pred_final = best_model.predict(X_test)

# 7. Prepare Submission Dataframe

In [None]:
# Pair each PassengerId saved from the test split with its predicted label
submission = passenger_id_test.to_frame(name="PassengerId").assign(
    Survived=y_pred_final
)

# Write the CSV without the DataFrame index column
submission.to_csv('titanic_submission.csv', index=False)

print("Submission file 'titanic_submission.csv' generated successfully.")