In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import scipy
from scipy.stats import spearmanr

import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sb

import sklearn
from sklearn.preprocessing import scale 
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import preprocessing

%matplotlib inline
rcParams['figure.figsize'] = 5, 4
sb.set_style('whitegrid')



### Logistic Regression Assumptions:

### 1. Data should be free of missing values
### 2. The predictant variable is binary or categorical(a categorical variable with ordered values)
### 3. All predictors are independent of each other

In [2]:
titanic=pd.read_csv('train.csv')

In [3]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
titanic.shape


(891, 12)

In [7]:
titanic.isnull().sum()
# Also, check for independence between features using spearman co-relation or by using scatterplots
# We can also use phi corelation for checking co-relation between two variables

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
# Here, in our logistic regression model we are not going to use the column cabin, 
# However, we are going to use the column Age as our predictor and it contains null values so we will drop those which has null values



In [9]:
titanic_data_required=pd.DataFrame()

In [11]:
# Here, we are filtering out only those columns which we are going to use in our logistic regression model

titanic_data_required['sex']=titanic['Sex']
titanic_data_required['Age']=titanic['Age']
titanic_data_required['Survived']=titanic['Survived']

In [12]:
# Since sex is a categorical variable, we need to create a dummy value for the column 'sex'
titanic_data_required['sex']=pd.get_dummies(titanic_data_required.sex)['female']

In [14]:
titanic_data_required.head()

Unnamed: 0,sex,Age,Survived
0,0,22.0,0
1,1,38.0,1
2,1,26.0,1
3,1,35.0,1
4,0,35.0,0


### It is very essential in logistic regression that the variables are not dependent
### For finding co-relation between variables in logistic regression, we use non parametric methods for corelation between categorical, non linearly, distributed variables

In [15]:
sex = titanic_data_required['sex']
Age = titanic_data_required['Age']

spearmanr_coefficient, p_value =  spearmanr(sex, Age)
print('Spearman Rank Correlation Coefficient %0.3f' % (spearmanr_coefficient))

Spearman Rank Correlation Coefficient -0.092


In [38]:
titanic_data_required.head()

Unnamed: 0,sex,Age,Survived
0,0,22.0,0
1,1,38.0,1
2,1,26.0,1
3,1,35.0,1
4,0,35.0,0


In [39]:
titanic_data_required=titanic_data_required.dropna(axis=0)

In [40]:
titanic_data_required.isnull().sum()

sex         0
Age         0
Survived    0
dtype: int64

In [42]:
titanic_data_required.head()

Unnamed: 0,sex,Age,Survived
0,0,22.0,0
1,1,38.0,1
2,1,26.0,1
3,1,35.0,1
4,0,35.0,0


In [53]:
titanic_data_required.head()

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [54]:
titan_data=pd.read_csv('train.csv')

In [55]:
X['sex']=titan_data['Sex']
X['Age']=titan_data['Age']
X['Survived']=titan_data['Survived']

In [56]:
X=X.dropna(axis=0)

In [57]:
X['sex']=pd.get_dummies(X.sex)['female']

In [58]:
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler

In [59]:
y=X['Survived']

In [60]:
X=X.drop(['Survived'],axis=1)

In [61]:
# Now, X feature is our feature vector set
# Before applying linear or logistic regression, we should scale our feature variables
scaler=StandardScaler()
X=scaler.fit_transform(X)

In [62]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [63]:
model = LogisticRegression(penalty="l2",C=1)

In [64]:
model.fit(X_train,y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [65]:
from sklearn.metrics import accuracy_score

In [66]:
print("The accuracy score is %1.1f"%accuracy_score(y_test,model.predict(X_test)))

The accuracy score is 0.7


In [69]:
from sklearn.metrics import roc_auc_score

In [70]:
logit_roc_auc=roc_auc_score(y_test,model.predict(X_test))

In [71]:
print("The Logistics AUC is %1.1f"%logit_roc_auc)

The Logistics AUC is 0.7


In [72]:
from sklearn.metrics import classification_report

In [73]:
print(classification_report(y_test,model.predict(X_test)))

             precision    recall  f1-score   support

          0       0.78      0.79      0.78        87
          1       0.67      0.64      0.65        56

avg / total       0.73      0.73      0.73       143



- In statistical hypothesis testing, a type I error is the incorrect rejection of a true null hypothesis (a false "positive")
- While a type II error is incorrectly retaining a false null hypothesis 

- Precision is number of relevant items found amongst all the items found
- Recall is the number of relevant items found amongst the total number of items found 



In [75]:
# Import the necessary module
from sklearn.model_selection import cross_val_score

# Compute 5-fold cross-validation scores: cv_scores
cv_scores = cross_val_score(model, X, y, cv=5)

# Print the 5-fold cross-validation scores
print(cv_scores)

# find the mean of our cv scores here
print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores)))

[ 0.78321678  0.81818182  0.77622378  0.72727273  0.79577465]
Average 5-Fold CV Score: 0.7801339505564858


In [None]:
# We conclude that we have not violated any model assumptions since the data is free of missing values and the variables are
# independent of each other as seen from the Spearman co-relation
# After cross validation, the model has somewhat similar score in the range of 0/78 to .81
# There seems to be a significant relationship between the predictants and the predictors
