# Step 1: Importing required libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Step 2: Importing the Titanic dataset


In [None]:
# Load the Titanic passenger records into a DataFrame (the path is relative to the notebook's working directory)
df= pd.read_csv('../input/titanicdataset/titanic.csv')

In [None]:
df.info()

The table below describes the variables in the dataset.

Variable           | Definition                                              | Key
-------------------|---------------------------------------------------------|------------------
PassengerId        | Passenger Id                                            | 
Survived           | Survival                                                | 0 = No, 1 = Yes
Pclass             | Ticket class                                            | 1 = 1st, 2 = 2nd, 3 = 3rd
Name               | Name                                                    |
Sex                | Sex                                                     |
Age                | Age                                                     |
SibSp              | # of siblings / spouses aboard the Titanic              |
Parch              | # of parents / children aboard the Titanic              |
Ticket             | Ticket number                                           |
Fare               | Passenger fare                                          |
Cabin              | Cabin number                                            |
Embarked           | Port of Embarkation                                     | C = Cherbourg, Q = Queenstown, S = Southampton


### Variable Notes
pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, it is in the form xx.5

sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

# Step 3: Data Preparation

In [None]:
# Drop the columns that are not used in the analysis
df = df.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [None]:
def missing_percentage(df):
    """Return the percentage of missing values per column, sorted ascending.

    Columns that have no missing values are left out of the result.
    """
    missing_share = df.isnull().sum() / len(df)
    pct = 100 * missing_share
    has_missing = pct > 0
    return pct[has_missing].sort_values()
# Compute the per-column missing-value percentages for the current frame and show them
nan_percent= missing_percentage(df)
print(nan_percent)

In [None]:
# Bar chart of the missing-value percentage per column (nan_percent only holds columns with missing data)
plt.figure(figsize=(12,6))
sns.barplot(x=nan_percent.index, y=nan_percent, color=(0.2, 0.4, 0.6, 0.6), edgecolor='blue')
# Rotate the column names so they stay readable along the x-axis
plt.xticks(rotation=90)

In [None]:
# Drop the Cabin feature because of its high rate of missing values
df = df.drop(columns='Cabin')

In [None]:
# Drop the rows where Embarked or Fare is missing
df = df.dropna(subset=["Embarked", "Fare"])

In [None]:
# Fill missing Age values with the mean Age of the passengers of the same Sex.
# transform('mean') returns a Series aligned to df's own index, so it can be used directly
# as the fill value. groupby(...).apply(...) prepends the group key to the index on
# pandas >= 2.0, and that result can no longer be assigned back to the column.
df['Age'] = df['Age'].fillna(df.groupby('Sex')['Age'].transform('mean'))

In [None]:
# Fare against the survival flag, to spot extreme fares
sns.scatterplot(data=df,x='Fare', y='Survived')
# Vertical reference line at Fare = 400
plt.axvline(x=400,color='r')

In [None]:
# Eliminating the outliers: drop the rows whose Fare exceeds 300 and whose Survived flag is 1
# NOTE(review): the reference line in the plots is drawn at Fare = 400 while the cut-off here is 300,
# and the filter also depends on the target (Survived == 1) — confirm both are intended.

index_drop=df[(df['Fare']>300) & (df['Survived']==1)].index
df = df.drop(index_drop, axis=0)

In [None]:

sns.scatterplot(data=df,x='Fare', y='Survived')
plt.axvline(x=400,color='r')

In [None]:
# The Correlation overview
# Restrict to numeric columns first: on pandas >= 2.0, DataFrame.corr() no longer skips
# non-numeric columns by default and raises on them instead.
df.select_dtypes(include='number').corr()['Survived'].sort_values()

In [None]:
# Correlation of every numeric column with Survived (numeric columns only, so that
# non-numeric columns do not make DataFrame.corr() raise on pandas >= 2.0)
df.select_dtypes(include='number').corr()['Survived'].sort_values()

In [None]:
# Remove the SibSp feature from the frame
df = df.drop(columns='SibSp')

In [None]:
df.info()

# Step 4: Exploratory Data Analysis

In [None]:
df['Survived'].value_counts()

In [None]:
sns.countplot(data=df, x='Survived')



As can be observed, passengers who survived were generally younger than those who did not.

In [None]:
sns.boxplot(data=df, x='Survived', y='Pclass')     

The plot shows that passengers holding a 1st-class ticket were more likely to survive.

In [None]:
# pairplot is a figure-level function: it creates and returns its own grid of axes, so no
# plt.figure() call is needed beforehand (one would only leave an empty figure behind).
sns.pairplot(df, hue='Survived')

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# Non-numeric columns are excluded up front because DataFrame.corr() raises on them
# on pandas >= 2.0 instead of silently skipping them.
fig= plt.figure(figsize=(4,4), dpi=300)
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

# Step 5: Splitting features into object and numeric categories

In [None]:
# Separate the object-dtype columns from all the remaining columns
df_obj = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

# Step 6: Determining the Features & Target Variable

In [None]:
# Target: the Survived flag; features: every other non-object column
y = df_num['Survived']
X = df_num.drop(columns='Survived')

# Step 7: Splitting the Dataset into Train & Test Sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Step 8: Scaling the Features

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling parameters from the training split only, then apply them to both splits
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Step 9: Training the Model (Logistic Regression)

Logistic regression turns a linear regression into a classification model by passing its output through the sigmoid function:

$\sigma (x) = 1/(1 + e^{-x})$

Hence, the output always lies between 0 and 1.

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings on the scaled training split
log_model = LogisticRegression().fit(scaled_X_train, y_train)
# Show the fitted estimator as the cell output
log_model

In [None]:
#Model Coeficient:
log_model.coef_

# Step 10: Predicting Test Data

In [None]:
y_pred= log_model.predict(scaled_X_test)

# Step 11: Evaluating the Model

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

# plot_confusion_matrix was removed from sklearn.metrics in scikit-learn 1.2, so importing it
# directly fails on current releases. ConfusionMatrixDisplay.from_estimator takes the same
# (estimator, X, y) arguments, so bind the old name to it when it is available and only fall
# back to the legacy helper on releases that predate from_estimator.
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
else:
    from sklearn.metrics import plot_confusion_matrix

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
plot_confusion_matrix(log_model, scaled_X_test, y_test)

In [None]:
print(classification_report(y_test, y_pred))

# Step 12: Training the Model (Logistic Regression using cross validation)

In [None]:
# Scaling the Features: re-fit the scaler on the full feature matrix
# (this re-binds `scaler`, replacing the one fitted on the training split)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
scaled_X = scaler.transform(X)

In [None]:
from sklearn.linear_model import LogisticRegressionCV
# Fit a logistic regression with 5-fold internal cross-validation on the full scaled data set
# NOTE(review): the predictions below are made on the same rows the model was fitted on, so any
# metric computed from y and y_pred is in-sample and not comparable with the held-out test
# scores of log_model.
# y_pred is re-bound here, replacing the test-set predictions of log_model.
log_model2= LogisticRegressionCV(cv=5, random_state=101).fit(scaled_X, y) 
y_pred= log_model2.predict(scaled_X)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

# plot_confusion_matrix no longer exists in scikit-learn >= 1.2; use the equivalent
# ConfusionMatrixDisplay.from_estimator under the old name when it is available, and import
# the legacy helper only on older releases.
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
else:
    from sklearn.metrics import plot_confusion_matrix

In [None]:
accuracy_score(y, y_pred)

In [None]:
confusion_matrix(y, y_pred)

In [None]:
plot_confusion_matrix(log_model2, scaled_X, y)

In [None]:
print(classification_report(y, y_pred))

### Comparing LogisticRegression function with LogisticRegressionCV function

In [None]:
from sklearn.metrics import RocCurveDisplay

# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator accepts the
# same (estimator, X, y, ax=...) arguments and returns a display exposing .ax_. Fall back to
# the legacy helper only on releases that predate from_estimator.
if hasattr(RocCurveDisplay, "from_estimator"):
    plot_roc_curve = RocCurveDisplay.from_estimator
else:
    from sklearn.metrics import plot_roc_curve

# Set the figure defaults BEFORE plotting: rcParams only apply to figures created afterwards,
# so setting them after the plot calls left this figure at the previous size and dpi.
plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = [5, 3]

# NOTE(review): log_model2 is scored on the same data it was fitted on (scaled_X, y), while
# log_model is scored on the held-out test split, so the two curves are not like-for-like.
figure1 = plot_roc_curve(log_model2, scaled_X, y)
# Draw the second curve on the axes of the first so both models share one plot
figure2 = plot_roc_curve(log_model, scaled_X_test, y_test, ax=figure1.ax_)