#### Load the essential libraries

In [2]:
import pandas as pd
import numpy as np

#### Load the dataset

In [3]:
data = pd.read_csv(r"C:\Users\ADMIN\Downloads\titanic_dataset .csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Informations about the dataset

In [4]:
#shape of data
data.shape

(891, 12)

In [5]:
# Basic information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Statistical analysis
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


#### Pre-processing

In [7]:
# 1.Remove the unneccessary columns

data.PassengerId.nunique()

891

Which means the passenger id does not provide any information about the dataset.So we drop that column.

In [8]:
data.drop(['PassengerId'],axis=1,inplace=True)

In [9]:
# 2.Check for null values
data.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

The columns 'Age','Cabin' & 'Embarked' contain missing values.So we fill them using Imputation method.

In [10]:
# Imputation

mean_age = data.Age.mean()
mode_cabin = data.Cabin.mode()
mode_embarked = data.Embarked.mode()

print('mean_age: ',mean_age)
print('mode_cabin: ',mode_cabin)
print('mode_embarked: ',mode_embarked)

mean_age:  29.69911764705882
mode_cabin:  0        B96 B98
1    C23 C25 C27
2             G6
Name: Cabin, dtype: object
mode_embarked:  0    S
Name: Embarked, dtype: object


In [11]:
data['Age'] = data['Age'].fillna(29.69911764705882)
data['Cabin'] = data['Cabin'].fillna('G6')
data['Embarked'] = data['Embarked'].fillna('S')

data.isna().sum()

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [12]:
# 3.Check for duplicates
data[data.duplicated()]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


No duplicates.

In [13]:
# 4.Check for Outliers

def outliers(data,col,threshold):
    z_score = ((col-col.mean())/col.std())
    outlier_val = (abs(z_score)>threshold)
    outlier_data = data[outlier_val]
    return outlier_data
    

In [14]:
data.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [15]:
age_outlier = outliers(data,data.Age,2.5)
fare_outlier = outliers(data,data.Fare,2.5)


In [16]:
# Remove outliers

def remove_outliers(data,column,threshold):
    z_scr = (column-column.mean())/column.std()
    non_outliers = (abs(z_scr)<=threshold)
    df_no_outliers = data[non_outliers]
    return df_no_outliers

In [17]:
cleaned_age = remove_outliers(data,data.Age,2.5)
data = remove_outliers(cleaned_age,data.Fare,2.5)

  df_no_outliers = data[non_outliers]


In [18]:
data.shape

(855, 11)

In [19]:
# 5.Encoding

data = pd.get_dummies(data)

#### Creating feature and target variables

In [20]:
X = data.drop(['Survived'],axis=1)
y = data['Survived']

print(X.shape)
print(y.shape)

(855, 1656)
(855,)


In [21]:
#Scaling

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X = scaler.fit_transform(X)

#### Creating Training and Testing dataset

In [22]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state=42)

#### Building the models

In [23]:
# 1. K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier

model_1 = KNeighborsClassifier()
knn_model = model_1.fit(X_train,y_train)
knn_pred = knn_model.predict(X_test)

In [24]:
# 2. Support Vector Machine
from sklearn.svm import SVC

model_2 = SVC()
sv_classifier = model_2.fit(X_train,y_train)
svc_pred = model_2.predict(X_test)

#### Model Evaluation

In [25]:
from sklearn.metrics import accuracy_score,classification_report

knn_accuracy = accuracy_score(y_test,knn_pred)
knn_report = classification_report(y_test,knn_pred)
print('KNN Accuracy :\n',knn_accuracy)
print('KNN Classification Report :\n',knn_report)

svc_accuracy = accuracy_score(y_test,svc_pred)
svc_report = classification_report(y_test,svc_pred)
print('SVM Accuracy :\n',svc_accuracy)
print('SVM Classification Report :\n',svc_report)


KNN Accuracy :
 0.45794392523364486
KNN Classification Report :
               precision    recall  f1-score   support

           0       0.59      0.14      0.23       121
           1       0.44      0.87      0.58        93

    accuracy                           0.46       214
   macro avg       0.51      0.51      0.40       214
weighted avg       0.52      0.46      0.38       214

SVM Accuracy :
 0.5794392523364486
SVM Classification Report :
               precision    recall  f1-score   support

           0       0.57      1.00      0.73       121
           1       1.00      0.03      0.06        93

    accuracy                           0.58       214
   macro avg       0.79      0.52      0.40       214
weighted avg       0.76      0.58      0.44       214



#### Cross Validation

In [26]:
# 1.KFold

from sklearn.model_selection import KFold,StratifiedKFold,cross_val_score

kfold = KFold(n_splits=5,shuffle=True,random_state=0)

knn_score = cross_val_score(knn_model,X,y,cv=kfold)
svc_score = cross_val_score(sv_classifier,X,y,cv=kfold)
mean_knn_scr = knn_score.mean()
mean_svc_scr = svc_score.mean()

print('KNN Mean Cross Validation Score :',mean_knn_scr)
print('SVM Mean Cross Validation Score :',mean_svc_scr)

KNN Mean Cross Validation Score : 0.44795321637426894
SVM Mean Cross Validation Score : 0.6456140350877193


In [27]:
# 2.Stratified KFold

skfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=0)

knn_strat_score = cross_val_score(knn_model,X,y,cv=skfold)
svc_strat_score = cross_val_score(sv_classifier,X,y,cv=skfold)
avg_knn_scr = knn_strat_score.mean()
avg_svc_scr = svc_strat_score.mean()

print('KNN Mean Cross Validation Score :',avg_knn_scr)
print('SVM Mean Cross Validation Score :',avg_svc_scr)

KNN Mean Cross Validation Score : 0.40701754385964917
SVM Mean Cross Validation Score : 0.6432748538011696


Inference :
    Using Holdout,KFold and Stratified KFold techniques,the accuracy scores of Support Vector Machine is higher than that of K-Nearest Neighbors.So,its clear that the Support Vector Machine has a better performance than the K-Nearest Neighbors.