## Using Logistic Regression to Predict Titanic Survivors

In [210]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [211]:
# Load data
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [212]:
# Merge Train and Test data, creating a variable to denot which rows belong to which dataset
train['train_test'] = 1
test['train_test'] = 0
test['Survived'] = np.NaN
titanic_data = pd.concat([train,test])

In [213]:
#titanic_data.head()
#titanic_data.index
#titanic_data.columns
#titanic_data.info()
titanic_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,train_test
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0,1309.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479,0.680672
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668,0.466394
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958,0.0
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275,1.0
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292,1.0


In [214]:
# Countplot
#sns.countplot(x='Survived',data=titanic_data)

# Clustered Countplot
#sns.countplot(x='Survived',data=titanic_data,hue='Sex')

# Histogram
#sns.displot(x='Age',data=train)



In [215]:
print(titanic_data.isna().sum())

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
train_test        0
dtype: int64


In [216]:
# Impute missing age values using median
titanic_data['Age'].fillna(titanic_data['Age'].median(),inplace=True)
# Impute missing fare values using median
titanic_data['Fare'].fillna(titanic_data['Fare'].median(),inplace=True)
# Impute missing Embarked values with most common value
titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace = True)

In [217]:
# Create title variable using Name variable

titanic_data['Title'] = titanic_data.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())

titanic_data['Title'] = titanic_data['Title'].replace(
    ['Don', 'Rev', 'Dr', 'Major', 'Lady', 'Sir','Col', 'Capt', 'the Countess','Jonkheer', 'Dona']
    , 'Rare')
titanic_data['Title'] = titanic_data['Title'].replace(['Ms', 'Mlle'], 'Miss')
titanic_data['Title'] = titanic_data['Title'].replace('Mme', 'Mrs')

In [218]:
# Create Family Size Variable

titanic_data['Family_Size'] = titanic_data['Parch'] + titanic_data['SibSp']

In [219]:
# Create Dummy Variable for Sex, Class & Title
# Represent most common value with a 0 for ordinal variables, exclude most common from Dummy variables

# Gender
titanic_data = pd.get_dummies(titanic_data,columns=['Sex'])

# Class
titanic_data.loc[ titanic_data['Pclass'] == 1, 'Class'] = "FirstClass"
titanic_data.loc[ titanic_data['Pclass'] == 2, 'Class'] = "SecondClass"
titanic_data.loc[ titanic_data['Pclass'] == 3, 'Class'] = "ThirdClass"

titanic_data = pd.get_dummies(titanic_data,columns=['Class'])

# Title

titanic_data = pd.get_dummies(titanic_data,columns=['Title'])

# Embarked

titanic_data = pd.get_dummies(titanic_data,columns=['Embarked'])


In [220]:
# Calculate most common value in dummies to know which one to drop from model
sum_of_columns = titanic_data.sum(axis=0)
sum_of_columns

  sum_of_columns = titanic_data.sum(axis=0)


PassengerId                                                     857395
Survived                                                         342.0
Pclass                                                            3004
Name                 Braund, Mr. Owen HarrisCumings, Mrs. John Brad...
Age                                                           38619.67
SibSp                                                              653
Parch                                                              504
Ticket               A/5 21171PC 17599STON/O2. 31012821138033734503...
Fare                                                        43564.9411
train_test                                                         891
Family_Size                                                       1157
Sex_female                                                         466
Sex_male                                                           843
Class_FirstClass                                                   323
Class_

In [222]:
# Drop unrequired columns
titanic_data.drop(['Name','Ticket','Cabin'],axis=1,inplace=True)

In [223]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,train_test,Family_Size,Sex_female,...,Class_SecondClass,Class_ThirdClass,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,Embarked_C,Embarked_Q,Embarked_S
0,1,0.0,3,22.0,1,0,7.25,1,1,0,...,0,1,0,0,1,0,0,0,0,1
1,2,1.0,1,38.0,1,0,71.2833,1,1,1,...,0,0,0,0,0,1,0,1,0,0
2,3,1.0,3,26.0,0,0,7.925,1,0,1,...,0,1,0,1,0,0,0,0,0,1
3,4,1.0,1,35.0,1,0,53.1,1,1,1,...,0,0,0,0,0,1,0,0,0,1
4,5,0.0,3,35.0,0,0,8.05,1,0,0,...,0,1,0,0,1,0,0,0,0,1


In [224]:
new_train = titanic_data[titanic_data['train_test']==1]
new_test = titanic_data[titanic_data['train_test']==0]


In [225]:
new_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'train_test', 'Family_Size', 'Sex_female', 'Sex_male',
       'Class_FirstClass', 'Class_SecondClass', 'Class_ThirdClass',
       'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare',
       'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [226]:
# Seperate Dependent and Independent Variables
# Model 1 - All variables (no dummy on Class), dummys = k
X_model1=new_train[['PassengerId','Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Title_Rare', 'Embarked_C', 'Embarked_Q','Embarked_S']]
y_model1=new_train['Survived']
X_testing_model1=new_test[['PassengerId','Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Title_Rare', 'Embarked_C', 'Embarked_Q','Embarked_S']]
y_testing_model1=new_test['Survived']

# Model 2 - All variables (no dummy on Class), dummys = k-1
X_model2=new_train[['PassengerId','Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Title_Master', 'Title_Miss',
                    'Title_Mrs', 'Title_Rare', 'Embarked_C', 'Embarked_Q']]
y_model2=new_train['Survived']
X_testing_model2=new_test[['PassengerId','Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Title_Master', 'Title_Miss',
                    'Title_Mrs', 'Title_Rare', 'Embarked_C', 'Embarked_Q']]
y_testing_model2=new_test['Survived']

# Model 2 - All variables, dummys = k-1
X_model3=new_train[['PassengerId','Pclass', 'Age', 'Family_Size', 'Fare', 'Sex_female', 'Title_Master', 'Title_Miss',
                    'Title_Mrs', 'Title_Rare', 'Embarked_C', 'Embarked_Q']]
y_model3=new_train['Survived']
X_testing_model3=new_test[['PassengerId','Pclass', 'Age', 'Family_Size', 'Fare', 'Sex_female', 'Title_Master', 'Title_Miss',
                    'Title_Mrs', 'Title_Rare', 'Embarked_C', 'Embarked_Q']]
y_testing_model3=new_test['Survived']


In [227]:
# Import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Fit a Logistic Regression Model and calculate accuracy against test data
# logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# predictions = logreg.predict(X_test)
# from sklearn.metrics import accuracy_score
# accuracy_score(y_test, predictions)

NameError: name 'X_train' is not defined

In [230]:
from sklearn.model_selection import cross_val_score

# model_list = ['model1','model2','model3']
# d={}
# for i in model_list:
#     d["{0}accuracy".format(i)] = []
#     d["model1accuracy"].append(cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X_model1, y_model1,cv=10))
#     d["{0}avgaccuracy".format(i)] = np.average(d["model1accuracy"])

# d


model1_accuracy = []
model2_accuracy = []
model3_accuracy = []
model1_accuracy.append(cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X_model1, y_model1,cv=10))
model2_accuracy.append(cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X_model2, y_model2,cv=10))
model3_accuracy.append(cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X_model3, y_model3,cv=10))
model1_avg_accuracy = np.average(model1_accuracy)
model2_avg_accuracy = np.average(model2_accuracy)
model3_avg_accuracy = np.average(model3_accuracy)
model1_sd = np.std(model1_accuracy)
model2_sd = np.std(model2_accuracy)
model3_sd = np.std(model3_accuracy)
print("Model 1 Average Accuracy: ",round(model1_avg_accuracy,4)*100,"%"," +/- ",round(model1_sd,4)*100,"%")
print("Model 2 Average Accuracy: ",round(model2_avg_accuracy,4)*100,"%"," +/- ",round(model2_sd,4)*100,"%")
print("Model 3 Average Accuracy: ",round(model3_avg_accuracy,4)*100,"%"," +/- ",round(model3_sd,4)*100,"%")

Model 1 Average Accuracy:  82.83 %  +/-  3.49 %
Model 2 Average Accuracy:  82.83 %  +/-  3.38 %
Model 3 Average Accuracy:  82.71 %  +/-  3.3099999999999996 %


In [209]:
# Create Submission File

# Input most accurate model
logreg = LogisticRegression(max_iter=1000).fit(X_model1, y_model1)

#set ids as PassengerId and predict survival 
ids = X_testing_model1['PassengerId']
predictions = logreg.predict(X_testing_model1)
# X_testing['Survived'] = logreg.predict(X_testing)
# X_testing

#set the output as a dataframe and convert to csv file named submission.csv
output = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions })
output['Survived'] = output['Survived'].astype('int')
output.to_csv('submission.csv', index=False)
#output

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
