In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Description
In this notebook we use a logistic regression model for a classification problem: we want to predict whether a passenger survived or died in the Titanic disaster. The notebook shows all the steps of a typical machine learning process. Its purpose is to get familiar with logistic regression and to understand it in the simplest possible way. 


In [None]:
#import libraries for data visualisation 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Read the input file

In [None]:
# Load the training CSV file into a pandas DataFrame
file_path = '../input/titanic-machine-learning-from-disaster/train.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first 5 rows of the DataFrame
data.head(n=5)

# 1. Explore data 

In [None]:
# Build a boolean mask of `data`: True wherever a value is missing
missing_data = pd.isnull(data)
missing_data

The `isnull()` method returns `True` wherever a value is missing (NaN) and `False` otherwise.

In [None]:
# Draw the boolean missing-value mask as a heatmap, without a color bar
sns.heatmap(data=missing_data, cbar=False)

In the heatmap, every white dash marks a `True` value, i.e. a missing entry. We can see that a lot of data is missing in the **"Cabin"** column and that some values are also missing in the **"Age"** column.

Now we want to visualize the survival of passengers by gender. 

In [None]:
# Count passengers per survival outcome, split by sex.
# Columns are referenced by name through x=/hue= together with data=. Recent
# seaborn releases take `data` as the first positional parameter, so passing
# the Series positionally alongside data=data raises a TypeError.
sns.countplot(x="Survived", hue="Sex", data=data, palette='RdBu_r')

The plot tells us that more women survived than men. Next we are going to explore whether rich passengers survived more often than poor ones. 

In [None]:
# Count passengers per survival outcome, split by passenger class.
# Columns are referenced by name through x=/hue= together with data=. Recent
# seaborn releases take `data` as the first positional parameter, so passing
# the Series positionally alongside data=data raises a TypeError.
sns.countplot(x="Survived", hue="Pclass", data=data, palette='RdBu_r')

A lot of people from the third class did not survive. Is it because money matters in a life-or-death situation, or simply because there were more passengers in the third class than in the others? We want to see the distribution of passengers across the classes.  

In [None]:
# Count passengers in each class.
# The column is referenced by name through x= together with data=. Recent
# seaborn releases take `data` as the first positional parameter, so passing
# the Series positionally alongside data=data raises a TypeError.
sns.countplot(x="Pclass", data=data, palette='RdBu_r')

The share of passengers from the third class is the highest. That helps explain why more people from this class died. Well, it's time to stop guessing and use logistic regression to predict death among passengers. 

# 2. Cleaning data

We are going to replace each missing age value with the average age of the passenger's class.

In [None]:
# Average age of the passengers within each class
data["Age"].groupby(data["Pclass"]).mean()

We are going to use a function that fills a missing age value with 38 for class 1, 29 for class 2 and 25 for class 3.

In [None]:
# Replacement ages for classes 1 and 2, used when a passenger's age is missing
MEAN_AGE_BY_PCLASS = {1: 38, 2: 29}
# Replacement age for every other class value (class 3 in the notebook's text)
DEFAULT_MEAN_AGE = 25


def fill_missing(cols):
    """Return the age to use for a single passenger row.

    Parameters
    ----------
    cols : sequence
        Row-like object whose first value is the passenger's age and whose
        second value is the passenger's class (for example one row of
        ``data[["Age", "Pclass"]]``, or a plain list/tuple).

    Returns
    -------
    The original age when it is present; otherwise 38 for class 1, 29 for
    class 2 and 25 for any other class.
    """
    # Unpack by position rather than with cols[0] / cols[1]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    age, pclass = tuple(cols)[:2]
    if pd.isnull(age):  # the age value is missing
        return MEAN_AGE_BY_PCLASS.get(pclass, DEFAULT_MEAN_AGE)
    return age
    
# Row by row, pass (Age, Pclass) to fill_missing and store the result as Age
data["Age"] = data.loc[:, ["Age", "Pclass"]].apply(fill_missing, axis="columns")

In [None]:
# Visual check that the missing Age values have been filled in
sns.heatmap(pd.isnull(data), cbar=False)

For the second column with missing values, "Cabin", we are simply going to drop the column and, for simplicity, not use it as a feature in our model.

In [None]:
# Remove the Cabin column from `data` itself: inplace=True modifies the
# existing DataFrame; without it the Cabin column would still exist in `data`
data.drop(columns=["Cabin"], inplace=True)

# 4. Create a dummy variable 

In [None]:
# Heatmap of the values still missing in the data
sns.heatmap(data=data.isnull(), cbar=False)

We use `get_dummies` to transform a categorical variable into numerical values. For example, for the "Sex" column, we transform male to 1 and female to 0.

In [None]:
# One indicator column per category of the Sex column
pd.get_dummies(data.loc[:, "Sex"])

Notice that if a passenger is not male, the passenger is female, which means one column perfectly predicts the other. We don't want this behaviour, called multicollinearity, so we fix the issue by setting the parameter `drop_first` to `True`.

In [None]:
# Create dummy variables for Sex and Embarked.
# drop_first=True removes the first category of each variable, so the
# remaining indicator columns are not perfectly predictable from each other.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 returns booleans by default).
# NOTE: get_dummies ignores missing values by default, so a row with a missing
# Embarked value (if any) gets 0 in every Embarked indicator column.
sex = pd.get_dummies(data["Sex"], drop_first=True, dtype=int)
embarked = pd.get_dummies(data["Embarked"], drop_first=True, dtype=int)
# Add these two sets of indicator columns to our data
data = pd.concat([data, sex, embarked], axis=1)
# Check the first row of our data
data.head(1)

Again, we drop the columns we no longer need: Sex, Embarked, Name and Ticket. 


In [None]:
# Remove, in place, the original categorical/text columns that are not used as features
data.drop(columns=["Sex", "Embarked", "Name", "Ticket"], inplace=True)

In [None]:
# Remove the passenger ID column in place
data.drop(columns=["PassengerId"], inplace=True)

# 5. Train the model 

In [None]:
# Preview the first 5 rows of the prepared data
data.head(n=5)

In [None]:
# Target: the Survived column
y = data.loc[:, "Survived"]
# Predictor columns used as model inputs
features = [
    "Pclass",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "male",
    "Q",
    "S",
]
X = data.loc[:, features]

1. Import the scikit-learn libraries
1. Create an instance of a logistic regression model 
1. Fit the model to our data 
1. Predict 

In [None]:
#Import scikit learn libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Split our data into a train set (70%) and a test set (30%);
# random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Create an instance of a logistic regression model.
# The features are not scaled, so the solver may need more than the default
# 100 iterations to converge; a larger max_iter avoids stopping early
# (which scikit-learn reports as a ConvergenceWarning).
lgmodel = LogisticRegression(max_iter=1000)

# Train the model on the training set
lgmodel.fit(X_train, y_train)

In [None]:
# Predict the class label of every row in the test set
predictions = lgmodel.predict(X=X_test)

# 6. Evaluating model 

Import the libraries for evaluating the model. 

In [None]:
from sklearn.metrics import confusion_matrix
# Print the confusion matrix of the true labels vs. the predicted labels
print(confusion_matrix(y_true=y_test, y_pred=predictions))

In [None]:
from sklearn.metrics import classification_report 
# Print the full classification report for the test-set predictions
print(classification_report(y_true=y_test, y_pred=predictions))

# Conclusion
We have now finished all the steps, but we still want to improve our model. So in the next notebook we are going to make some changes in order to explore whether we can do better.