Titanic dataset is one of the datasets available in sklearn.


You are given:

1. A Training dataset csv file with X train and Y train data

2. A X test File and you have to predict and submit predictions for this file.


Your task is to:

1. Use Logistic Regression and come with predictions.


Read Instructions carefully -

1. Use Logistic Regression as a training algorithm and submit results predicted.

2. Files are in csv format.

3. Submit a csv file with only predictions for X test data. File should not have any headers and should only have one column i.e. predictions.

4. Your score is based on number of accurate predictions.

In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore", category = FutureWarning)

## Data Loading

In [2]:
# Loading the training dataset
import pandas as pd
training_data = pd.read_csv("training_titanic.csv").copy()
training_data

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0000,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.0500,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0000,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.0750,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
663,2,"Ilett, Miss. Bertha",female,17.0,0,0,SO/C 14885,10.5000,,S,1
664,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.7500,,Q,0
665,3,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S,1
666,3,"Strandberg, Miss. Ida Sofia",female,22.0,0,0,7553,9.8375,,S,0


In [3]:
training_data.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
count,668.0,536.0,668.0,668.0,668.0,668.0
mean,2.296407,29.70056,0.528443,0.407186,32.064552,0.402695
std,0.831638,14.240257,1.080327,0.854695,45.320835,0.490808
min,1.0,0.67,0.0,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.925,0.0
50%,3.0,29.0,0.0,0.0,14.75,0.0
75%,3.0,38.25,1.0,0.0,31.275,1.0
max,3.0,80.0,8.0,6.0,512.3292,1.0


## Training Data Preprocessing and Cleaning

In [4]:
# Preparing a column based on the title (Mr/Mrs/Miss) of the column "Name". It is a column (classification type) 
import numpy as np

# Creating a list to store the title (Mr/Mrs/Miss) of the column "Name"
title = []

# Extracting the title (Mr/Mrs/Miss) from the column "Name"
for i in training_data['Name']:
    title.append(i.split(',')[1].split('.')[0].strip())
    
# Converting the list to a Numpy array (datatype of elements is string) for further operations
title = np.array(title)

# Classifying the title "Master" as class = 0 and replacing it with the current value (string)
title[title == 'Master'] = 0

# Classifying the title "Miss" as class = 1 and replacing it with the current value (string)
title[title == 'Miss'] = 1

# Classifying the title "Mr" as class = 2 and replacing it with the current value (string)
title[title =='Mr'] = 2

# Classifying the title "Mrs" as class = 3 and replacing it with the current value (string)
title[title == 'Mrs'] = 3

# Classifying any other title as class = 4 and replacing it with the current value (string)
title[(title != '0') & (title != '1') & (title != '2') & (title != '3')] = 4

# Converting the Numpy array (string elements) to integer elements so that operations can be performed on the elements
title = np.array(title, dtype='int')

In [5]:
# Dropping the column "Name"
training_data = training_data.drop('Name', axis = 1)

# Dropping the column "Ticket"
training_data = training_data.drop('Ticket', axis = 1)

# Classifying Sex = Female as class = 0
training_data.loc[training_data['Sex'] == 'female', 'Sex'] = 0

# Classifying Sex = Male as class = 1
training_data.loc[training_data['Sex'] == 'male', 'Sex'] = 1

# Replacing the Null values of the column "Age" with the median value
training_data.loc[training_data['Age'].isnull(), 'Age'] = training_data.loc[training_data['Age'].notnull(), 'Age'].median()

# Replacing the Null values of the column "Embarked" with the value "S" (Southamptom)
training_data.loc[training_data['Embarked'].isnull(), 'Embarked'] = 'S'

# Classifying Embarked = "S" as class = 1
training_data.loc[training_data['Embarked'] == 'S', 'Embarked'] = 1

# Classifying Embarked = "C" as class = 2
training_data.loc[training_data['Embarked'] == 'C', 'Embarked'] = 2

# Classifying Embarked = "Q" as class = 3
training_data.loc[training_data['Embarked'] == 'Q', 'Embarked'] = 3

# Replacing the Null values of the column "Cabin" with the class = 0
training_data.loc[training_data['Cabin'].isnull(), 'Cabin'] = 0

# Replacing the Notnull values of the column "Cabin" with the class = 1
training_data.loc[training_data['Cabin'] != 0, 'Cabin'] = 1

# Extracting the column values
training_data = training_data.values

In [6]:
# Preparing the training input features
X_train = training_data[:, :-1]
X_train = np.append(X_train, title.reshape(-1, 1), axis = 1)

# Preparing the training output features
Y_train = training_data[:, -1]
Y_train = Y_train.astype(int)

In [7]:
# Shape of the input features (it's a numpy array). Output : (rows, columns/features)
X_train.shape

(668, 9)

In [8]:
testing_data = pd.read_csv("testing_titanic.csv").copy()
testing_data

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,C.A. 33112,36.7500,,S
1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S
2,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
3,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Chr...",female,24.0,2,1,243847,27.0000,,S
4,1,"McGough, Mr. James Robert",male,36.0,0,0,PC 17473,26.2875,E25,S
...,...,...,...,...,...,...,...,...,...,...
218,3,"Lindqvist, Mr. Eino William",male,20.0,1,0,STON/O 2. 3101285,7.9250,,S
219,1,"Butt, Major. Archibald Willingham",male,45.0,0,0,113050,26.5500,B38,S
220,1,"Penasco y Castellana, Mrs. Victor de Satode (M...",female,17.0,1,0,PC 17758,108.9000,C65,C
221,3,"Holm, Mr. John Fredrik Alexander",male,43.0,0,0,C 7075,6.4500,,S


In [9]:
testing_data.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,223.0,178.0,223.0,223.0,223.0
mean,2.345291,29.694775,0.506726,0.304933,32.622551
std,0.850047,15.398053,1.1697,0.634108,61.062047
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,19.25,0.0,0.0,7.8792
50%,3.0,27.0,0.0,0.0,12.475
75%,3.0,37.75,1.0,0.0,30.0354
max,3.0,71.0,8.0,2.0,512.3292


## Testing Data Preprocessing and Cleaning

In [10]:
# Creating a list to store the title (Mr/Mrs/Miss) of the column "Name"
title = []

# Extracting the title (Mr/Mrs/Miss) from the column "Name"
for i in testing_data['Name']:
    title.append(i.split(',')[1].split('.')[0].strip())
    
# Converting the list to a Numpy array (datatype of elements is string) for further operations
title = np.array(title)

# Classifying the title "Master" as class = 0 and replacing it with the current value (string)
title[title == 'Master'] = 0

# Classifying the title "Miss" as class = 1 and replacing it with the current value (string)
title[title == 'Miss'] = 1

# Classifying the title "Mr" as class = 2 and replacing it with the current value (string)
title[title =='Mr'] = 2

# Classifying the title "Mrs" as class = 3 and replacing it with the current value (string)
title[title == 'Mrs'] = 3

# Classifying any other title as class = 4 and replacing it with the current value (string)
title[(title != '0') & (title != '1') & (title != '2') & (title != '3')] = 4

# Converting the Numpy array (string elements) to integer elements so that operations can be performed on the elements
title = np.array(title, dtype='int')

In [11]:
# Dropping the column "Name"
testing_data = testing_data.drop('Name', axis = 1)

# Dropping the column "Ticket"
testing_data = testing_data.drop('Ticket', axis = 1)

# Classifying Sex = Female as class = 0
testing_data.loc[testing_data['Sex'] == 'female', 'Sex'] = 0

# Classifying Sex = Male as class = 1
testing_data.loc[testing_data['Sex'] == 'male', 'Sex'] = 1

# Replacing the Null values of the column "Age" with the median value
testing_data.loc[testing_data['Age'].isnull(), 'Age'] = testing_data.loc[testing_data['Age'].notnull(), 'Age'].median()

# Replacing the Null values of the column "Embarked" with the value "S" (Southamptom)
testing_data.loc[testing_data['Embarked'].isnull(), 'Embarked'] = 'S'

# Classifying Embarked = "S" as class = 1
testing_data.loc[testing_data['Embarked'] == 'S', 'Embarked'] = 1

# Classifying Embarked = "C" as class = 2
testing_data.loc[testing_data['Embarked'] == 'C', 'Embarked'] = 2

# Classifying Embarked = "Q" as class = 3
testing_data.loc[testing_data['Embarked'] == 'Q', 'Embarked'] = 3

# Replacing the Null values of the column "Cabin" with the class = 0
testing_data.loc[testing_data['Cabin'].isnull(), 'Cabin'] = 0

# Replacing the Notnull values of the column "Cabin" with the class = 1
testing_data.loc[testing_data['Cabin'] != 0, 'Cabin'] = 1

# Extracting the column values
testing_data = testing_data.values
testing_data = np.append(testing_data, title.reshape(-1, 1), axis = 1)

In [12]:
# Shape of the input features (it's a numpy array). Output : (rows, columns/features)
testing_data.shape

(223, 9)

In [13]:
training_data

array([[2, 0, 29.0, ..., 0, 1, 1],
       [3, 1, 29.0, ..., 0, 1, 0],
       [2, 1, 39.0, ..., 0, 1, 0],
       ...,
       [3, 1, 32.0, ..., 0, 1, 1],
       [3, 0, 22.0, ..., 0, 1, 0],
       [3, 0, 29.0, ..., 0, 3, 1]], dtype=object)

In [14]:
testing_data

array([[2, 1, 8.0, ..., 0, 1, 0],
       [1, 0, 49.0, ..., 1, 1, 4],
       [3, 1, 27.0, ..., 0, 3, 2],
       ...,
       [1, 0, 17.0, ..., 1, 2, 3],
       [3, 1, 43.0, ..., 0, 1, 2],
       [2, 1, 36.5, ..., 1, 1, 2]], dtype=object)

In [15]:
# Using sklearn to import the classifier/algorithm
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Creating the algorithm object that we can use to train and then test the data
clf = LogisticRegression(solver = 'saga', max_iter = 10000, tol = 0.0001)

In [16]:
# Fitting/training the data
clf.fit(X_train, Y_train)

LogisticRegression(max_iter=10000, solver='saga')

In [17]:
# Predicting the output for the test data
y_pred = clf.predict(testing_data)

In [18]:
# Dumping the output obtained from the evaluation data into a "CSV" file
np.savetxt('Titanic Prediction.csv', y_pred, fmt = '%.0f')