In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [2]:
df= sns.load_dataset("titanic")

In [3]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [5]:
df.isnull().sum()
#deck, age, embarked, embarked_town are containing null values

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


In [6]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [7]:
df.drop(["deck","embark_town","alive","class","who","adult_male"],axis=1,inplace=True)  #removing null and unnecessary columns

In [8]:
df["age"].fillna(df["age"].mean(),inplace=True)  #filling null values with mean

In [9]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone
0,0,3,male,22.0,1,0,7.25,S,False
1,1,1,female,38.0,1,0,71.2833,C,False
2,1,3,female,26.0,0,0,7.925,S,True
3,1,1,female,35.0,1,0,53.1,S,False
4,0,3,male,35.0,0,0,8.05,S,True


In [10]:
df.isnull().sum() #again checking null after some processing
#now we have only one embarked columns as a null in 2 rows so we will drop that 2 row fromt he table...

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
embarked,2
alone,0


In [11]:
df.dropna(subset=["embarked"],inplace=True) #this code drop the null values present in any row of the embarked columns...

In [12]:
df.isnull().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
embarked,0
alone,0


In [13]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone
0,0,3,male,22.000000,1,0,7.2500,S,False
1,1,1,female,38.000000,1,0,71.2833,C,False
2,1,3,female,26.000000,0,0,7.9250,S,True
3,1,1,female,35.000000,1,0,53.1000,S,False
4,0,3,male,35.000000,0,0,8.0500,S,True
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S,True
887,1,1,female,19.000000,0,0,30.0000,S,True
888,0,3,female,29.699118,1,2,23.4500,S,False
889,1,1,male,26.000000,0,0,30.0000,C,True


In [14]:
#now we can see that our 2 columns sex and embarked are in string form so we will do label encoding to convert everything into numeric form..
from sklearn.preprocessing import LabelEncoder
le= LabelEncoder()
df['sex']= le.fit_transform(df['sex']) #male:1 and female:0
df['embarked']= le.fit_transform(df['embarked']) # S=2,C=0, Q=1


In [15]:
df
#now everything is numeric except alone column which is boolean

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone
0,0,3,1,22.000000,1,0,7.2500,2,False
1,1,1,0,38.000000,1,0,71.2833,0,False
2,1,3,0,26.000000,0,0,7.9250,2,True
3,1,1,0,35.000000,1,0,53.1000,2,False
4,0,3,1,35.000000,0,0,8.0500,2,True
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,2,True
887,1,1,0,19.000000,0,0,30.0000,2,True
888,0,3,0,29.699118,1,2,23.4500,2,False
889,1,1,1,26.000000,0,0,30.0000,0,True


In [16]:
df= df.astype(int)
df  #whole cleaned data is ready

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone
0,0,3,1,22,1,0,7,2,0
1,1,1,0,38,1,0,71,0,0
2,1,3,0,26,0,0,7,2,1
3,1,1,0,35,1,0,53,2,0
4,0,3,1,35,0,0,8,2,1
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27,0,0,13,2,1
887,1,1,0,19,0,0,30,2,1
888,0,3,0,29,1,2,23,2,0
889,1,1,1,26,0,0,30,0,1


In [17]:
# now we will divide the data with input and output feature into 2 different variable
x= df.drop(["survived"],axis=1)
y=df["survived"]

In [18]:
#Logistic Regression....

In [19]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=0.2,random_state=42)

In [20]:
from sklearn.linear_model import LogisticRegression
model= LogisticRegression()
model.fit(x_train,y_train)

In [21]:
y_pred= model.predict(x_test)

In [22]:
#now we have to evaluate our model and in classification models we use confusion matrix...

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [23]:
accuracy_score(y_test,y_pred) #accuracy of our model using logistic regression

0.8033707865168539

In [24]:
confusion_matrix(y_test,y_pred) #displaying confusion  matrix of our model

array([[90, 19],
       [16, 53]])

In [25]:
print(classification_report(y_test,y_pred)) #this will show each and every evaluation metrics of our model

              precision    recall  f1-score   support

           0       0.85      0.83      0.84       109
           1       0.74      0.77      0.75        69

    accuracy                           0.80       178
   macro avg       0.79      0.80      0.79       178
weighted avg       0.81      0.80      0.80       178



In [26]:
#KNN MODEL

In [27]:
 #we will do feature scaling of x_train x_test data because we are using knn model now
 from sklearn.preprocessing import StandardScaler
 scaler = StandardScaler()
 x_train_scaled= scaler.fit_transform(x_train)
 x_test_scaled= scaler.fit_transform(x_test)


In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [29]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(x_train_scaled,y_train)

In [30]:
y_pred_knn= knn_model.predict(x_test_scaled)

In [31]:
accuracy_score(y_pred_knn,y_test)  #accuracy score using knn model

0.7752808988764045

In [32]:
confusion_matrix(y_pred_knn,y_test)

array([[89, 20],
       [20, 49]])

In [33]:
print(classification_report(y_pred_knn,y_test))

              precision    recall  f1-score   support

           0       0.82      0.82      0.82       109
           1       0.71      0.71      0.71        69

    accuracy                           0.78       178
   macro avg       0.76      0.76      0.76       178
weighted avg       0.78      0.78      0.78       178



In [34]:
#Naive Bayes....

In [35]:
# now we are having two data set x_trained and x_trained_scaled
# but we will use normal x_trained data in naive bayes

from sklearn.naive_bayes import GaussianNB

In [36]:
model_naive= GaussianNB()

In [37]:
model_naive.fit(x_train,y_train)

In [38]:
y_pred_naive= model_naive.predict(x_test)

In [39]:
accuracy_score(y_test,y_pred_naive)

0.7752808988764045

In [40]:
confusion_matrix(y_test,y_pred_naive)

array([[84, 25],
       [15, 54]])

In [41]:
print(classification_report(y_test,y_pred_naive))

              precision    recall  f1-score   support

           0       0.85      0.77      0.81       109
           1       0.68      0.78      0.73        69

    accuracy                           0.78       178
   macro avg       0.77      0.78      0.77       178
weighted avg       0.78      0.78      0.78       178



In [42]:
#Decision Tree

In [43]:
from sklearn.tree import DecisionTreeClassifier


In [44]:
model_DT= DecisionTreeClassifier(random_state=42)

In [45]:
model_DT.fit(x_train_scaled,y_train)

In [46]:
y_pred_DT= model_DT.predict(x_test_scaled)

In [47]:
accuracy_score(y_test,y_pred_DT)

0.7696629213483146

In [48]:
confusion_matrix(y_test,y_pred_DT)

array([[88, 21],
       [20, 49]])

In [49]:
print(classification_report(y_test,y_pred_DT))

              precision    recall  f1-score   support

           0       0.81      0.81      0.81       109
           1       0.70      0.71      0.71        69

    accuracy                           0.77       178
   macro avg       0.76      0.76      0.76       178
weighted avg       0.77      0.77      0.77       178



In [50]:
 #SUPPORT VECTOR MACHINE

In [51]:
from sklearn.svm import SVC

In [52]:
model_SVM= SVC(kernel='rbf')

In [53]:
model_SVM.fit(x_train_scaled,y_train)

In [54]:
y_pred_svm= model_SVM.predict(x_test_scaled)

In [55]:
accuracy_score(y_test,y_pred_svm)

0.8258426966292135

In [56]:
confusion_matrix(y_test,y_pred_svm)

array([[96, 13],
       [18, 51]])

In [57]:
print(classification_report(y_test,y_pred_svm))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86       109
           1       0.80      0.74      0.77        69

    accuracy                           0.83       178
   macro avg       0.82      0.81      0.81       178
weighted avg       0.82      0.83      0.82       178



In [58]:
#Now we can conclude that out of all models support vector machine gives most
# accurate survival prediction of titanic event...