# Titanic Survival Prediction
This notebook uses the Titanic dataset to predict passenger survival from features such as ticket class, sex, age, and fare. The workflow follows the CRISP-DM methodology.

## Step 1: Importing Libraries and Loading Data

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Location of the Titanic dataset CSV -- placeholder, edit to match your environment
file_path = '/path_to_your_data/train.csv'

# Read the CSV into a DataFrame
titanic_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows of the dataset
titanic_data.head(n=5)
    

## Step 2: Data Preprocessing
In this step, we handle missing values, encode categorical variables, and scale numerical features.

In [None]:

# Handle missing values and encode `Sex`.
# A column-level `fillna(..., inplace=True)` acts on a temporary Series under
# pandas Copy-on-Write and can leave the DataFrame unchanged, so the cleaned
# frame is built with explicit, non-inplace operations instead.
age_median = titanic_data['Age'].median()
embarked_mode = titanic_data['Embarked'].mode()[0]  # most frequent port

titanic_data = (
    titanic_data
    .drop(columns='Cabin')  # Cabin is removed entirely rather than imputed
    .assign(
        Age=lambda df: df['Age'].fillna(age_median),
        Embarked=lambda df: df['Embarked'].fillna(embarked_mode),
        Sex=lambda df: df['Sex'].map({'male': 0, 'female': 1}),
    )
)

# One-hot encode the port of embarkation; drop_first=True omits the first
# category so the remaining dummy columns are not perfectly collinear.
titanic_data = pd.get_dummies(titanic_data, columns=['Embarked'], drop_first=True)

# Select relevant features.
# `.copy()` makes `titanic_prepared` an independent frame, so the scaling
# assignment below cannot trigger SettingWithCopyWarning or touch `titanic_data`.
features_to_keep = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_Q', 'Embarked_S']
titanic_prepared = titanic_data[features_to_keep].copy()

# Scale numerical features to zero mean / unit variance.
# NOTE(review): the scaler is fit on every row, before the train/test split
# later in the notebook, so test-set statistics influence the scaling.
# Consider fitting on the training rows only.
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
scaler = StandardScaler()
# Whole-column assignment replaces the columns, so the integer SibSp/Parch
# columns become float without a dtype-incompatibility warning.
titanic_prepared[numeric_features] = scaler.fit_transform(titanic_prepared[numeric_features])

# Display the first few rows of the preprocessed data
titanic_prepared.head()
    

## Step 3: Splitting Data and Training Models
Now we split the data into training and testing sets, train a Random Forest model, and evaluate it on the held-out test set.

In [None]:

# Target is the `Survived` column; features are the preprocessed frame
y = titanic_data['Survived']
X = titanic_prepared

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a Random Forest with default hyperparameters; fit() returns the estimator
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out predictions with each metric, in display order
y_pred = rf_clf.predict(X_test)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

accuracy, precision, recall, f1
    

## Step 4: Saving the Model
Finally, we save the trained Random Forest model for deployment.

In [None]:

# Persist the fitted Random Forest to disk in the current working directory
model_filename = 'final_random_forest_model.pkl'
joblib.dump(value=rf_clf, filename=model_filename)

# When running in Google Colab, the saved file can be downloaded with:
# from google.colab import files
# files.download('final_random_forest_model.pkl')
    