# Predicting the survival of Titanic passengers based on features like age, cabin, gender, etc.

The goal of this project is to build a machine learning model that predicts
whether a Titanic passenger survived, based on features such as age, gender, ticket class, fare, and other personal attributes.

## Problem Definition or Business Understanding

## Data Acquisition

### Import libraries

In [1]:
import os

import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the training data; 'train.csv' is a relative path, so it is resolved
# against the notebook's current working directory.
df = pd.read_csv('train.csv')

## Data Preprocessing

In [3]:
# Discard rows with a missing 'Embarked' value, then renumber the remaining
# rows 0..n-1 so that frames built later with a default index line up with df.
df = df.dropna(subset=['Embarked']).reset_index(drop=True)

In [4]:
# Impute missing ages with the median of the observed ages. The median is kept
# in its own variable because it is stored with the model artifacts later on.
# NOTE(review): the median is computed over the whole dataset, before the
# train/test split further down, so held-out rows contribute to the fill value.
age_median = df['Age'].median()
df.loc[df['Age'].isna(), 'Age'] = age_median

In [5]:
# Derive 'Deck' from the first character of the 'Cabin' value; rows without a
# usable cabin value get the placeholder 'U' (presumably "unknown").
cabin_initial = df['Cabin'].str.get(0)
df['Deck'] = cabin_initial.fillna('U')

In [6]:
# All three categorical features share one encoder configuration:
#   drop='first'            -> omit the first category's column (the baseline)
#   sparse_output=False     -> return a dense array
#   handle_unknown='ignore' -> do not raise on categories unseen during fit
#   dtype=int               -> integer indicator values
# NOTE: per the scikit-learn docs, combining drop with handle_unknown='ignore'
# encodes an unseen category as all zeros, i.e. the same row as the baseline.
# Each feature gets its own instance because each is fitted on its own column
# and saved as a separate artifact.
encoder_options = dict(drop='first', sparse_output=False, handle_unknown='ignore', dtype=int)

encoder_sex = OneHotEncoder(**encoder_options)
encoder_embarked = OneHotEncoder(**encoder_options)
encoder_deck = OneHotEncoder(**encoder_options)

In [7]:
def _fit_encode(encoder, column):
    """Fit `encoder` on one column of `df` and return the encoded columns.

    Args:
        encoder: The encoder to fit; it is fitted in place on `df[[column]]`.
        column: Name of the categorical column in `df` to encode.

    Returns:
        A DataFrame whose columns are named by the encoder's
        `get_feature_names_out` and whose index is `df.index`.
    """
    encoded = encoder.fit_transform(df[[column]])
    # Reuse df's index explicitly: the column-wise concat that follows aligns
    # on index labels, so a default 0..n-1 index would only match by luck of
    # df having been renumbered earlier.
    return pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out([column]),
        index=df.index,
    )


encoded_sex = _fit_encode(encoder_sex, 'Sex')
encoded_embarked = _fit_encode(encoder_embarked, 'Embarked')
encoded_deck = _fit_encode(encoder_deck, 'Deck')

In [8]:
# Columns passed to the model as-is, without any encoding step.
numeric_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
numerical = df[numeric_columns]

In [9]:
# Assemble the feature matrix by placing the pass-through columns and the
# three one-hot blocks side by side; the target is the 'Survived' column.
feature_blocks = [numerical, encoded_sex, encoded_embarked, encoded_deck]
X = pd.concat(feature_blocks, axis='columns')
y = df['Survived']

# Record the exact column order the model is trained on; it is saved with the
# other artifacts at the end of the notebook.
feature_order = X.columns.tolist()

In [10]:
# Hold out a fifth of the rows for evaluation. Stratifying on y keeps the
# class proportions alike in both parts; the fixed seed makes the split
# repeatable across runs.
test_fraction = 0.2
split_seed = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=split_seed,
    stratify=y,
)

## Model Building and Evaluation

In [11]:
# Logistic regression with the solver's iteration cap set to 500; every other
# setting is left at the library default.
classifier = LogisticRegression(max_iter=500)

In [12]:
# Train on the training portion only; the test rows stay unseen until evaluation.
classifier.fit(X_train,y_train)

In [13]:
# Predict labels for the held-out test rows.
y_pred = classifier.predict(X_test)

In [14]:
# Share of test rows whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.8258426966292135

In [15]:
# Counts of true vs. predicted labels; scikit-learn places true classes on the
# rows and predicted classes on the columns.
confusion_matrix(y_test, y_pred)

array([[99, 11],
       [20, 48]])

## Saving for Deployment

In [16]:
# Preprocessing values bundled for saving alongside the model: the median used
# to fill missing ages and the column order of the training matrix.
constants = dict(age_median=age_median, feature_order=feature_order)

In [17]:
# Make sure the output directory exists before writing: dumping into a missing
# directory fails, which would break a run from a fresh checkout.
os.makedirs('../model', exist_ok=True)

# Persist everything needed to reproduce preprocessing and prediction:
# the fitted classifier, one fitted encoder per categorical feature, and the
# preprocessing constants.
joblib.dump(classifier,'../model/model.pkl')
joblib.dump(encoder_sex,'../model/encoder_sex.pkl')
joblib.dump(encoder_embarked,'../model/encoder_embarked.pkl')
joblib.dump(encoder_deck,'../model/encoder_deck.pkl')
joblib.dump(constants, '../model/constants.pkl')

['../model/constants.pkl']