In [1]:
# import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# load dataset

data = pd.read_csv("titanic.csv")

In [3]:
# print first few rows

data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Logistic Regression on the 'Survived' categorical variable

In [4]:
# getting the frequency table for 'Survived' variable

data['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

So, here we have 549 people did not survive and 342 survived.

In [5]:
# assigning the dummy variables for the string variable values in the dataset

data = pd.get_dummies(data)    # numeric features from the categorical features

In [6]:
# filling the missing values in the dataset with '0'

data.fillna(0,inplace=True)

In [32]:
# printing the dimensions of the dataset after the data exploration

data.shape

(891, 1731)

In [33]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,"Name_Abbing, Mr. Anthony","Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)",...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,1,3,26.0,0,0,7.925,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [34]:
# making the train and test variable datasets

train = data[0:699]
test = data[700:]

### Creating the model and target variable x_train & y_train for model training (create the data required for the logistic regression)

In [35]:
# creating the independent variable dataset

x_train = train.drop('Survived',axis=1)

In [36]:
# creating the target variable dataset

y_train = train['Survived']

In [37]:
# creating the variable containing the features used for making the predictions

x_test = test.drop('Survived',axis=1)

In [38]:
# creating the correct/true values for the test dataset

true_p = test['Survived']

So, now we can go ahead and perform our logistic regression task

In [39]:
# import logistic regression package

from sklearn.linear_model import LogisticRegression

In [65]:
# creating an object and performing all operations using that object

logreg = LogisticRegression(solver='liblinear',multi_class='ovr')    

# for small datasets, ‘liblinear’ is a good choice
# ‘liblinear’ is limited to one-versus-rest schemes
# if the option chosen is ‘ovr’, then a binary problem is fit for each label
# select ‘ovr’ if the data is binary, or if solver=’liblinear’

Now we train our model using the .fit() function here as well

In [66]:
# train/fit our model (independent,dependent)

logreg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

#### Now we make some predictions

In [68]:
# storing the predictions in a variable and using .predict() function

pred = logreg.predict(x_test)    # predictions stored inside pred variable

In [69]:
# print the predictions

pred

array([1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0], dtype=int64)

Since this is a classification problem, this is what we expect

#### Now we see the accuracy of our model using the score function
> Here the score function returns the accuracy whereas in linear regression it returned the R-Square value

In [70]:
logreg.score(x_test,true_p)    # .score(independent test feature,true value of target)

0.8219895287958116

This is the accuracy over the test dataset

We can also check the accuracy over the training dataset

In [72]:
logreg.score(x_train,y_train)

0.9227467811158798

#### Here we can see that, there is a 92% accuracy on the train dataset, whereas accuracy over the test dataset is 82%
> Again, our test sample might not be a representative of the train sample
> Making the test sample representation of train is an entirely different concept called validation

Also, Logistic regression can be used on 3-class classification problem. For example, if we consider one class as 1 and other two classes as 2 , it becomes a 2-class classification problem. This is called one vs all classification.