<a href="https://colab.research.google.com/github/Swathi1309/Data_Analytics/blob/main/EE4708_Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic Regression for Predicting Survival using the Titanic Dataset

## Importing libraries

In [188]:
import pandas as pd
import numpy as np

import random
import re

import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

## Data cleaning

In [189]:
# Data Cleaning
# Load the training set, discard the columns that are not used as features,
# and remove the rows whose port of embarkation is missing.
to_drop = ['Name', 'Ticket', 'PassengerId']
data = (
    pd.read_csv("train.csv")
    .drop(columns=to_drop)
    .dropna(subset=['Embarked'])
)

# One-hot encode the categorical columns and append the indicator columns
# (one column per distinct value of Sex and of Embarked) to the frame.
sex = pd.get_dummies(data["Sex"])
embarked = pd.get_dummies(data["Embarked"])
data = pd.concat([data, sex, embarked], axis=1)

In [None]:
# Data imputation for Age feature
# Plot the age density per passenger class, one panel per sex.
sb.displot(data, x='Age', hue='Pclass', kind='kde', col='Sex', palette='flare');

# Fill each missing age with the median age of the passenger's class.
# `transform` returns a Series aligned to `data`'s own index, so the fill works
# on every pandas version. `groupby(...).apply(...)` prepends the group key to
# the index on pandas >= 2.0, and that result cannot be assigned back to the column.
age_median_by_class = data.groupby('Pclass')['Age'].transform('median')
data['Age'] = data['Age'].fillna(age_median_by_class)

In [None]:
# Creating categorical variables from continuous variables, and creating one-hot encodings for the same

# Distribution of the raw continuous columns before binning
for column in ("Age", "Fare"):
    sb.displot(data, x=column, palette='flare')

# Bin edges: the outermost edges are padded by 1 so that the column's own
# minimum and maximum fall inside the first and last interval.
age_edges = [data['Age'].min() - 1, 5, 18, 25, 50, data['Age'].max() + 1]
age_labels = ['age <5', 'age 5-18', 'age 18-25', 'age 25-50', 'age >50']
data['Age_bins'] = pd.cut(data['Age'], bins=age_edges, labels=age_labels)

fare_edges = [data['Fare'].min() - 1, 10, 20, 30, 100, data['Fare'].max() + 1]
fare_labels = ['fare <10', 'fare 10-20', 'fare 20-30', 'fare 30-100', 'fare >100']
data['Fare_bins'] = pd.cut(data['Fare'], bins=fare_edges, labels=fare_labels)

# Distribution of the binned columns
for column in ("Age_bins", "Fare_bins"):
    sb.displot(data, x=column, palette='flare')

# Append one indicator column per bin label
age_bins = pd.get_dummies(data['Age_bins'])
fare_bins = pd.get_dummies(data['Fare_bins'])
data = pd.concat([data, age_bins, fare_bins], axis=1)

In [192]:
# Min-max scale Age and Fare: (value - column minimum) / (column maximum - column minimum)
age_min, age_max = data['Age'].min(), data['Age'].max()
normalized_age = (data['Age'] - age_min) / (age_max - age_min)
data['Age'] = normalized_age

fare_min, fare_max = data['Fare'].min(), data['Fare'].max()
normalized_fare = (data['Fare'] - fare_min) / (fare_max - fare_min)
data['Fare'] = normalized_fare

## Data Visualization and Exploratory Analysis

In [None]:
# Mean survival per passenger class, split by sex
sb.catplot(data=data, x="Pclass", y="Survived", hue='Sex', kind="bar", saturation=.5, palette='pastel')

# Age distribution (one panel per outcome) and fare-bracket counts coloured by outcome
sb.displot(data=data, x='Age', kde=True, col='Survived')
sb.displot(data=data, x='Fare_bins', hue='Survived')

# Reduce every cabin code to its first character (presumably the deck letter);
# missing cabins are left missing.
data['Cabin'] = data['Cabin'].map(lambda cabin: cabin if pd.isnull(cabin) else cabin[0])
cabin_order = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']
sb.catplot(data=data, x='Cabin', y='Survived', kind='bar', order=cabin_order)

# Mean survival against the remaining discrete columns
for column in ("SibSp", "Parch", "Embarked"):
    sb.catplot(x=column, y="Survived", data=data, kind="bar")

## Building and Comparing Regression Models

In [194]:
# Cabin is not used by either model, so remove it before selecting features
data = data.drop(columns=['Cabin'])

# Feature set 1: Age and Fare as (scaled) continuous columns
continuous_columns = ['Pclass', 'male', 'Age', 'SibSp', 'Parch', 'Fare', 'C', 'Q']
X_continous = data[continuous_columns]

# Feature set 2: Age and Fare replaced by their bin indicators
# ('age >50' and 'fare >100' are not included in the selection)
binned_columns = [
    'Pclass', 'male',
    'age <5', 'age 5-18', 'age 18-25', 'age 25-50',
    'fare <10', 'fare 10-20', 'fare 20-30', 'fare 30-100',
    'SibSp', 'Parch', 'C', 'Q',
]
X_classes = data[binned_columns]

# Target column shared by both models
Y = data['Survived']

In [195]:
def print_coeff(model, features, scores):
  """Print a heading for `model`, then one "name  : value" line per feature.

  model: label printed as the heading.
  features: sequence of feature names, in the same order as the columns of `scores`.
  scores: 2-D array indexed as scores[0, i]; row 0 supplies the value for feature i.
  """
  print (model, ": \n")
  for idx, name in enumerate(features):
    print (name, " :", scores[0, idx])

In [None]:
# Model 1 - Using continuous values for age and fare
x_train, x_test, y_train, y_test = train_test_split(X_continous, Y, test_size=0.3, random_state=10)
model1 = LogisticRegression(max_iter=5000)
model1.fit(x_train, y_train)
# Take the feature names from the training frame so they always match the
# column order of model1.coef_.
print_coeff("Model1", list(x_train.columns), model1.coef_)

# Report the same metrics on the training split and on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order: swapping them
# exchanges precision with recall and transposes the confusion matrix.
# With the correct order, confusion-matrix rows are true labels and columns
# are predicted labels.
splits = [("Training data:", x_train, y_train), ("Validation data:", x_test, y_test)]
for position, (title, x_split, y_true) in enumerate(splits):
    if position:
        # Blank separator between the two reports
        print ("\n")
    y_pred = model1.predict(x_split)
    print (title)
    print ("Confusion matrix:")
    print (confusion_matrix(y_true, y_pred))
    print ("Accuracy: ", accuracy_score(y_true, y_pred))
    print ("Precision: ", precision_score(y_true, y_pred))
    print ("Recall: ", recall_score(y_true, y_pred))
    print ("F1_score: ", f1_score(y_true, y_pred))

In [None]:
# Model 2 - Using binned values for age and fare
x_train, x_test, y_train, y_test = train_test_split(X_classes, Y, test_size=0.3, random_state=10)
model2 = LogisticRegression(max_iter=5000)
model2.fit(x_train, y_train)

print ("Model 2")

# Report the same metrics on the training split and on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order: swapping them
# exchanges precision with recall and transposes the confusion matrix.
# With the correct order, confusion-matrix rows are true labels and columns
# are predicted labels.
splits = [("Training data:", x_train, y_train), ("Validation data:", x_test, y_test)]
for title, x_split, y_true in splits:
    # Blank separator before each report (after the heading, and between reports)
    print ("\n")
    y_pred = model2.predict(x_split)
    print (title)
    print ("Confusion matrix:")
    print (confusion_matrix(y_true, y_pred))
    print ("Accuracy: ", accuracy_score(y_true, y_pred))
    print ("Precision: ", precision_score(y_true, y_pred))
    print ("Recall: ", recall_score(y_true, y_pred))
    print ("F1_score: ", f1_score(y_true, y_pred))

# Predicting Survival from the Test Dataset

In [None]:
test_data = pd.read_csv("test.csv")

# Data cleaning as done for training data
to_drop = ['Name', 'Ticket', 'PassengerId']
test_data = test_data.drop(columns=to_drop)

# Fill missing Age / Fare values with the median of the passenger's class.
# `transform` returns a Series aligned to the frame's own index, so this works
# on every pandas version. `groupby(...).apply(...)` prepends the group key to
# the index on pandas >= 2.0, and that result cannot be assigned back to the column.
for column in ('Age', 'Fare'):
    class_median = test_data.groupby('Pclass')[column].transform('median')
    test_data[column] = test_data[column].fillna(class_median)

sex = pd.get_dummies(test_data["Sex"])
embarked = pd.get_dummies(test_data["Embarked"])
test_data = pd.concat([test_data, sex, embarked], axis=1)

# Scale Age and Fare with the TRAINING set's min/max, not the test set's own:
# the model was fitted on features scaled with the training extremes, so the
# test features must go through the same mapping to mean the same thing.
# The extremes are re-read from train.csv with the same row filter the training
# pipeline applies (rows without Embarked removed). Median imputation cannot
# move a column's min or max, so these equal the values used during training.
# Scaled test values may fall slightly outside [0, 1]; that is expected.
train_reference = pd.read_csv("train.csv").dropna(subset=['Embarked'])
for column in ('Age', 'Fare'):
    train_min = train_reference[column].min()
    train_max = train_reference[column].max()
    test_data[column] = (test_data[column] - train_min) / (train_max - train_min)

test_data = test_data.drop(columns=['Cabin'])
# Remaining missing values per column (displayed as the cell output)
test_data.isnull().sum()

In [214]:
# Predict with model1 using the same columns, in the same order, that it was fitted on
model1_columns = ['Pclass', 'male', 'Age', 'SibSp', 'Parch', 'Fare', 'C', 'Q']
x = test_data[model1_columns]
y = model1.predict(x)

# Re-read the raw test file so the exported rows keep every original column,
# then attach the predicted label as a new "Survived" column.
test = pd.read_csv("test.csv")
survived = pd.DataFrame({"Survived": y})
predictions = pd.concat([test, survived], axis=1)
predictions.to_csv('predictions.csv')

# Count of predicted labels per class (displayed as the cell output)
survived["Survived"].value_counts()