In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import math


In [3]:
dataset = pd.read_csv("C:/Users/shive/Desktop/CSV Files/titanic.csv")

In [4]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
print("No of passengers travel in titanic:" + str(len(dataset.index)))

No of passengers travel in titanic:891


# Data Wrangling
Data wrangling is a process to clean the Nan value's data and unnecessary columns from the dataset.

In [6]:
dataset.isnull()
#False shows the data is not null and True means data is null

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


In [7]:
dataset.isnull().sum()
#It shows no. of null values in each column

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
#As we seen the cabin column has lots  of null values , so that it might be true to remove the cabin column.
dataset.drop("Cabin" , axis = 1 , inplace=True)


In [9]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


As we can see that cabin column is removed and we have to remove only missing values of Age column. So Let's do that.

In [10]:
#Now we have to drop the Nan value.
dataset.dropna(inplace=True)

Now we can clearly see that there is no any missing or data value . 
Means we can complete our data wrangling part successfully.

In [11]:
dataset.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [12]:
dataset.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


As we see we have many categorical data in our dataset , but this data is not acceptible in the analysis , so we have to convert those data in the numerical data.And we can do it using pandas.

In [13]:
pd.get_dummies(dataset['Sex'])

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
885,1,0
886,0,1
887,1,0
889,0,1


In [14]:
sex=pd.get_dummies(dataset['Sex'],drop_first=True)
sex.head()

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [15]:
Embark=pd.get_dummies(dataset['Embarked'],drop_first=True)
Embark.head()

Unnamed: 0,Q,S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [16]:
Pcl=pd.get_dummies(dataset['Pclass'],drop_first=True)
Pcl.head()

Unnamed: 0,2,3
0,0,1
1,0,0
2,0,1
3,0,0
4,0,1


In [17]:
#Now we have to add these columns in the dataset.
dataset = pd.concat([dataset , sex , Embark , Pcl], axis=1)

In [18]:
dataset.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,male,Q,S,2,3
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1,0,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,0,0,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,0,1,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0,0,1,0,0


In [19]:
dataset.drop(['PassengerId' , 'Sex' , 'Pclass' , 'Embarked'] , axis=1 , inplace=True)

In [20]:
dataset.head()

Unnamed: 0,Survived,Name,Age,SibSp,Parch,Ticket,Fare,male,Q,S,2,3
0,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,1,0,1,0,1
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,0,0,0,0,0
2,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,0,0,1,0,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,0,0,1,0,0
4,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,1,0,1,0,1


# Training Model

In [21]:
X = dataset.drop(["Survived" , "Name" , "Ticket"] ,axis=1)
y = dataset.iloc[:,0].values
X

Unnamed: 0,Age,SibSp,Parch,Fare,male,Q,S,2,3
0,22.0,1,0,7.2500,1,0,1,0,1
1,38.0,1,0,71.2833,0,0,0,0,0
2,26.0,0,0,7.9250,0,0,1,0,1
3,35.0,1,0,53.1000,0,0,1,0,0
4,35.0,0,0,8.0500,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...
885,39.0,0,5,29.1250,0,1,0,0,1
886,27.0,0,0,13.0000,1,0,1,1,0
887,19.0,0,0,30.0000,0,0,1,0,0
889,26.0,0,0,30.0000,1,0,0,0,0


In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
lig = LogisticRegression() 
lig.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
predict = lig.predict(X_test)

In [27]:
from sklearn.metrics import classification_report

In [28]:
classification_report(y_test , predict)

'              precision    recall  f1-score   support\n\n           0       0.79      0.86      0.82       125\n           1       0.77      0.67      0.72        89\n\n    accuracy                           0.78       214\n   macro avg       0.78      0.77      0.77       214\nweighted avg       0.78      0.78      0.78       214\n'

In [29]:
from sklearn.metrics import confusion_matrix as cm

In [30]:
cm(y_test , predict)

array([[107,  18],
       [ 29,  60]], dtype=int64)

In [31]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test , predict)

0.780373831775701

In [32]:
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set , y_set =  X_train , y_train
X1 , X2 = np.meshgrid(np.arange(start = X_set[:,0].min() - 1  , stop = X_set[:,0].max() + 1 , step = 0.01) , 
                     np.arange(start = X_set[:,1].min() - 1  , stop = X_set[:,1].max() + 1 , step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

                      

TypeError: '(slice(None, None, None), 0)' is an invalid key