In [46]:
import numpy as np
import pandas as pd
#import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#import matplotlib.pyplot as plt

In [48]:
titanic_data = pd.read_csv(r'C:\Users\Dupe\Downloads\titanic\data\train.csv')

In [49]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Handling Missing Values

In [50]:
#checking how many cells are left empty in the table
titanic_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [51]:
#Dropping  the “Cabin” column from the data frame as it won’t be of much importance
titanic_data = titanic_data.drop(columns='Cabin', axis=1)

In [52]:
#Replacing the missing values in the “Age” column with the mean value
titanic_data['Age'].fillna(titanic_data['Age'].mean(), inplace=True)

In [53]:
#Finding the mode value of the “Embarked” column as it will have occurred the maximum number of times
print(titanic_data['Embarked'].mode())

0    S
dtype: object


In [54]:
#Replacing the missing values in the “Embarked” column with mode value
titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace=True)

In [55]:
#check again if there are still any cells remaining empty.
titanic_data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

## Transformation into a categorical column.

In [56]:
#convert that into integer type values, and transform it into a categorical column:
titanic_data.replace({'Sex':{'male':0,'female':1}, 'Embarked':{'S':0,'C':1,'Q':2}}, inplace=True)

### Let’s split the data into the target and feature variables.

In [57]:
X = titanic_data.drop(columns = ['PassengerId','Name','Ticket','Survived'],axis=1)
Y = titanic_data['Survived']

In [58]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=2)

### Logistic Regression

In [59]:
#create a model named model
model = LogisticRegression()

In [60]:
#train the model, with our training values(X_train , Y_train)
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

### Checking the accuracy

In [61]:
#name a variable X_train_prediction, which will store all the predictive outputs of the values X_train.
X_train_prediction = model.predict(X_train)

In [62]:
#compare the values of X_train_prediction with Y_train, which was the original real-life data.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of training data : ', training_data_accuracy)

Accuracy score of training data :  0.8075842696629213


In [63]:
#Let’s try it again with X_test and Y_test:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of test data : ', test_data_accuracy)

Accuracy score of test data :  0.7821229050279329


### Checking for a Random Person:

In [64]:
input_data = (3,0,35,0,0,8.05,0)  
# Note that these datas exclude the Survived data, as it is to be determined from the model itself

In [20]:
#prediction = model.predict(input_data)

In [65]:
#changing these values to a NumPy array
input_data_as_numpy_array = np.asarray(input_data)

In [66]:
#reshaping to our target dimensions.
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

In [67]:
#predicting using our model:
prediction = model.predict(input_data_reshaped)
#print(prediction)
if prediction[0]==0:
    print("Dead")
if prediction[0]==1:
    print("Alive")

Dead


## Working on the test set now

In [68]:
ttc_data = pd.read_csv(r'C:\Users\Dupe\Downloads\titanic\data\test.csv')
ttc_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [69]:
#checking how many cells are left empty in the table
ttc_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [70]:
#Replacing the missing values in the “Age” and "Fare" columns with the mean value
ttc_data['Age'].fillna(ttc_data['Age'].mean(), inplace=True)
ttc_data['Fare'].fillna(ttc_data['Fare'].mean(), inplace=True)

In [71]:
#dropping Cabin and Fare columns because they are not needed
ttc_data = ttc_data.drop(columns='Cabin', axis=1)

In [72]:
#convert that into integer type values, and transform it into a categorical column:
ttc_data.replace({'Sex':{'male':0,'female':1}, 'Embarked':{'S':0,'C':1,'Q':2}}, inplace=True)

In [73]:
X = ttc_data.drop(columns = ['PassengerId','Name','Ticket'],axis=1)

In [74]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,34.5,0,0,7.8292,2
1,3,1,47.0,1,0,7.0,0
2,2,0,62.0,0,0,9.6875,2
3,3,0,27.0,0,0,8.6625,0
4,3,1,22.0,1,1,12.2875,0


In [75]:
# predicting on actual test data
y_result = model.predict(X)
#y_result[1:10]

In [77]:
#create a new column 'Survived' to save predictions
ttc_data['Survived'] = y_result

prediction = pd.DataFrame(ttc_data, columns= ['PassengerId', 'Survived'])
export_csv = prediction.to_csv (r'C:\Users\Dupe\Downloads\titanic_survival.csv', index = None, header=True) 
export_csv
print('Successful!')

Successful!
