# Jupyter Notebook for Logistic Regression

A short example of how to perform logistic regression in Python using the Titanic data set.

We want to predict the probability of survival using sklearn, and we will see how to transform categorical columns into dummy variables along the way.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns   # to check correlation and statistical tools
import matplotlib.pyplot as plt
%matplotlib inline
import math

In [2]:
# Load the Titanic data set (put the CSV file in the ../Dataset folder first).
# Every cell below relies on `titanic_data`, so the read must actually run.
titanic_data = pd.read_csv("../Dataset/Titanic.csv")
print(titanic_data.shape)
titanic_data.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Report how many passengers (rows) the data set contains
print('number of passenger:' + str(titanic_data.shape[0]))

# Let's plot our data to have a first idea

Plotting the data gives us a first idea of which variables can be useful for predicting our target variable: whether a person survived or not.

In [None]:
# Count plot: number of passengers in each 'Survived' class
sns.countplot(data=titanic_data, x='Survived')
plt.show()

In [None]:
# Same count plot, with the bars split by the 'Sex' column
sns.countplot(data=titanic_data, x='Survived', hue='Sex')
plt.show()

In [None]:
# Same count plot, with the bars split by the 'Pclass' column
sns.countplot(data=titanic_data, x='Survived', hue='Pclass')
plt.show()

In [None]:
# Histograms of the "Age" and "Fare" columns, each drawn on its own figure
# https://matplotlib.org/api/_as_gen/matplotlib.pyplot.subplots.html
for column in ("Age", "Fare"):
    fig, ax = plt.subplots(1, 1)
    titanic_data[column].plot.hist(ax=ax)
plt.show()

In [None]:
# Summary information about the data (columns, non-null counts, dtypes)
titanic_data.info()

In [None]:
# Count plot of the "SibSp" column.
# The column is selected with x= / data=, like the other count plots in this
# notebook: recent seaborn releases only accept `data` as a positional argument,
# so passing the Series positionally no longer plots it as the x variable.
sns.countplot(x = "SibSp", data = titanic_data)
plt.show()

## Clean our data: get rid of NaN and unnecessary columns

In [None]:
titanic_data.isnull()   # boolean frame: True wherever the value is NaN

In [None]:
# We want to know the exact number of NaN that we have in each column.
titanic_data.isnull().sum()

In [None]:
# Heat map of the missing-value mask, to see where the NaN are located.
# Other colour maps to try: viridis, hot, rainbow, jet, winter, autumn...
nan_mask = titanic_data.isnull()
sns.heatmap(nan_mask, cmap="autumn")
plt.show()

In [None]:
# Box plot of "Age" for each value of "Pclass"
sns.boxplot(data=titanic_data, x="Pclass", y="Age")
plt.show()

In [None]:
# Look at the first rows again before cleaning the columns
titanic_data.head()

We can see that some columns, like Survived, Pclass, Sex and Embarked, are categorical, so we can encode them as dummy variables in our logistic model (PassengerId only identifies a row and is dropped later). Logistic regression is the appropriate model here because our target is a categorical variable and not a continuous one.

In [None]:
# The "Cabin" column is mostly NaN and is not needed for our evaluation,
# so we remove it from the data frame.
titanic_data.drop(columns=["Cabin"], inplace=True)

In [None]:
# Check the shape and the first rows after dropping "Cabin"
print(titanic_data.shape)
titanic_data.head()

In [None]:
# Drop every row that still contains a NaN. Alternatively the NaN could be
# filled with the mean, the median or another value; here we make no guess
# and simply remove the incomplete rows.
titanic_data.dropna(inplace=True)
print(titanic_data.shape)
titanic_data.head()

# To substitute the NaN instead of removing them:
#titanic_data.fillna() replaces the NaN

In [None]:
# Verify that all the NaN values were removed: plot the missing-value mask
# and print the number of NaN left in each column.
remaining_nan = titanic_data.isnull()
sns.heatmap(remaining_nan, cmap="viridis")
print(remaining_nan.sum())

## Convert the string variables (male/female) into dummy variables

In [None]:
# Preview the dummy encoding of "Sex": one indicator column per category
pd.get_dummies(titanic_data["Sex"])
# one of the two columns is enough; keeping both would make them collinear

In [None]:
# We create the sex dummy; drop_first=True drops the first category column
# so only one indicator column is kept
sex = pd.get_dummies(titanic_data["Sex"], drop_first = True)

In [None]:
# "Embarked" has 3 values, not 2: we get 3 categories and keep only two of
# them (drop_first=True removes the first one)
embark = pd.get_dummies(titanic_data["Embarked"], drop_first = True)
embark.head()

In [None]:
# Pclass is encoded the same way as Embarked: one dummy per class, first one dropped.
# Pclass values are integers, so without a prefix the dummy columns would be
# named with integers (2, 3). Mixed with the string column names of the rest
# of the frame, recent scikit-learn versions refuse to fit on such data.
# prefix="Pclass" gives string names ("Pclass_2", "Pclass_3") instead.
Pcl = pd.get_dummies(titanic_data["Pclass"], prefix = "Pclass", drop_first = True)
Pcl.head()

In [None]:
# Concatenate the new dummy columns to the data frame (column-wise);
# the old categorical columns are still present at this point
titanic_data = pd.concat([titanic_data,sex, embark, Pcl], axis = 1)
titanic_data.head()

In [None]:
# Remove the original categorical columns (now dummy-encoded) together with
# the columns that are not used as features.
unused_columns = ['Sex', 'Embarked', 'PassengerId', 'Name', 'Ticket', 'Pclass']
titanic_data.drop(columns=unused_columns, inplace=True)

In [None]:
# Final look at the cleaned, dummy-encoded data
titanic_data.head()

# Determine X and Y

In [None]:
# Target: the column we want to predict
y = titanic_data["Survived"]
# Features: every remaining column
X = titanic_data.drop(columns=["Survived"])

In [None]:
# Number of rows that make up 80% of the data (used by the manual split below).
# The builtin int replaces np.int, an alias that was removed from NumPy.
train_set = int(np.floor(X.shape[0]*0.8))

# 1. Split using sklearn

In [None]:
# We can use train_test_split from sklearn.model_selection to split the data
from sklearn.model_selection import train_test_split
train_test_split  # with the cursor on the name, shift+tab opens the documentation

In [None]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit (train) the model on the training set to determine the value of its parameters.
# NOTE(review): LogisticRegression is used with its default settings; if a
# ConvergenceWarning shows up, consider raising max_iter or scaling the features.
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)

In [None]:
# Predict with the parameters determined by the fit,
# using the test features that the model has not seen
predictions = logmodel.predict(X_test)

In [None]:
# Let us evaluate the performance of our model
from sklearn.metrics import classification_report
# from sklearn.metrics import accuracy_score

# compare the predictions with the real y_test values
print(classification_report(y_test, predictions))

Compute precision, recall, F-measure and support for each class

The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.

The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.

The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0

In [None]:
# We can also use the confusion matrix
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test,predictions)
# confusion_matrix orders the classes by sorted label value, so row/column 0
# is Survived == 0 ('no') and row/column 1 is Survived == 1 ('yes').
# Rows are the true classes, columns are the predicted classes.
cm = pd.DataFrame(conf_mat, index = ['no','yes'], columns = ['no', 'yes'])
cm

In [None]:
from sklearn.metrics import accuracy_score
print('accuracy score using the function: ', accuracy_score(y_test, predictions))

# Manual check: accuracy = correctly classified samples / all samples.
# The correctly classified samples are the entries on the main diagonal
# of the confusion matrix.
numer = conf_mat[0,0] + conf_mat[1,1]
denom = conf_mat.sum()
print('accuracy score computed manuallay: ', numer/denom)

# 2. Split the data without using sklearn

In [None]:
# Manual split without sklearn.
# This split is sequential, not random: the first train_set rows form the
# training set and the remaining rows form the test set.
# To add randomness we can cut X into several blocks and then combine
# those blocks in random order.
X_train2, X_test2 = X.iloc[:train_set], X.iloc[train_set:]
y_train2, y_test2 = y.iloc[:train_set], y.iloc[train_set:]
for subset in (X_train2, y_train2, X_test2, y_test2):
    print(subset.shape)

In [None]:
# Split in blocks: cut the data into max_int consecutive blocks, then randomly
# assign whole blocks to the train set and to the test set.
max_int = 10          # number of blocks
n_train_blocks = 8    # blocks used for training; the remaining ones are for testing
# builtin int instead of np.int (alias removed from NumPy)
size_block = int(np.floor(X.shape[0]/max_int))
XX_tot = []
yy_tot = []
for i in range(max_int):
    start = i*size_block
    # the last block also takes the rows left over by the integer division
    stop = None if i == max_int-1 else (i+1)*size_block
    XX_tot.append(X.iloc[start:stop])
    yy_tot.append(y.iloc[start:stop])

# Draw ONE random permutation of the block ids and slice it, so that the train
# and test blocks are disjoint (two independent draws could overlap and leak
# training rows into the test set).
random_idx = np.random.choice(max_int, max_int, replace=False)
random_idx_train = random_idx[:n_train_blocks]
random_idx_test = random_idx[n_train_blocks:]

# concatenate the selected blocks into the train set
XX_train = pd.concat([XX_tot[k] for k in random_idx_train], axis = 0)
yy_train = pd.concat([yy_tot[k] for k in random_idx_train], axis = 0)

# the test set is built from the test block ids (not the train ones)
XX_test = pd.concat([XX_tot[k] for k in random_idx_test], axis = 0)
yy_test = pd.concat([yy_tot[k] for k in random_idx_test], axis = 0)

In [None]:
# Split using random draws: shuffle the row labels of X, use the first 80%
# for training and the remaining 20% for testing.
# builtin int instead of np.int (alias removed from NumPy)
n_train_rows = int(np.floor(X.shape[0]*0.8))
idx_tot = np.random.choice(X.index,X.shape[0], replace=False)
idx_train = idx_tot[:n_train_rows]
idx_test = idx_tot[n_train_rows:]

# train set
XX_train2 = X.loc[idx_train]
yy_train2 = y.loc[idx_train]

# test set
XX_test2 = X.loc[idx_test]
yy_test2 = y.loc[idx_test]

In [None]:
# ... then we can do the same operations as before...
# try to do them by yourself!