In [5]:
# Support Vector Machine (SVM) is a supervised machine learning algorithm used for classification, regression, and outlier detection.
# When the data is linearly separable, SVM finds the optimal hyperplane that maximizes the margin between the two classes.
# Hyperplane: A decision boundary that separates different classes.
# Support Vectors: The data points closest to the hyperplane. These points influence the position and orientation of the hyperplane.
# Margin: The distance between the hyperplane and the closest data points (support vectors). SVM aims to maximize this margin.
# Kernel: Determines how the data is mapped into a higher-dimensional space.

# C (Regularization Parameter):
# Large C → less margin, focuses on correctly classifying all training examples.
# Small C→ larger margin, allows more misclassifications for better generalization.
# Gamma (γ): Defines the influence of a single training example.
# Low γ → larger influence, smooth decision boundary.
# High γ → smaller influence, more complex decision boundary.

In [6]:
# Importing libraries
import numpy as n
import pandas as pd
import matplotlib.pyplot as plt                         
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [7]:
# Load dataset
df = pd.read_csv("C:/Users/KIIT/Downloads/cleaned_titanic_data.csv")
print(df.columns)
X=df[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Gender']]
y=df[['Survived']]

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Gender'], dtype='object')


In [8]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train)
print(X_test)

     Pclass        Age  SibSp  Parch      Fare  Gender
445       1   4.000000      0      2   81.8583       1
650       3  29.699118      0      0    7.8958       1
172       3   1.000000      1      1   11.1333       0
450       2  36.000000      1      2   27.7500       1
314       2  43.000000      1      1   26.2500       1
..      ...        ...    ...    ...       ...     ...
106       3  21.000000      0      0    7.6500       0
270       1  29.699118      0      0   31.0000       1
860       3  41.000000      2      0   14.1083       1
435       1  14.000000      1      2  120.0000       0
102       1  21.000000      0      1   77.2875       1

[623 rows x 6 columns]
     Pclass        Age  SibSp  Parch     Fare  Gender
709       3  29.699118      1      1  15.2458       1
439       2  31.000000      0      0  10.5000       1
840       3  20.000000      0      0   7.9250       1
720       2   6.000000      0      1  33.0000       0
39        3  14.000000      1      0  11.2417 

In [9]:
# SVM model building
model = SVC(kernel='linear', C=1.0, gamma='scale')
model.fit(X_train,y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print(y_train)
print(y_pred_train)

  y = column_or_1d(y, warn=True)


     Survived
445         1
650         0
172         1
450         0
314         0
..        ...
106         1
270         0
860         0
435         1
102         0

[623 rows x 1 columns]
[0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0
 0 0 1 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 1 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0
 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1
 0 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 1 1 0 1 0 1 1 1 0 1 1 0 0 1 1 0 1
 1 0 0 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 0 0 0 0 1 1 0
 1 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0
 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0
 1 0 0 1 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 1 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1

In [10]:
#SVM model evaluation
#Training
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("Confusion Matrix:/n", confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train,y_pred_train))

Accuracy: 0.7849117174959872
Confusion Matrix:/n [[334  58]
 [ 76 155]]
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       392
           1       0.73      0.67      0.70       231

    accuracy                           0.78       623
   macro avg       0.77      0.76      0.77       623
weighted avg       0.78      0.78      0.78       623



In [11]:
#Test
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Confusion Matrix:/n", confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test,y_pred_test))

Accuracy: 0.7910447761194029
Confusion Matrix:/n [[134  23]
 [ 33  78]]
              precision    recall  f1-score   support

           0       0.80      0.85      0.83       157
           1       0.77      0.70      0.74       111

    accuracy                           0.79       268
   macro avg       0.79      0.78      0.78       268
weighted avg       0.79      0.79      0.79       268



In [12]:
# K-Nearest Neighbors (KNN) is a supervised machine learning algorithm used for classification and regression tasks.
# It is one of the simplest and most intuitive algorithms in machine learning.
# KNN is based on the assumption that similar points are located near each other, making it a distance-based algorithm.

In [13]:
# Load dataset
df = pd.read_csv("C:/Users/KIIT/Downloads/cleaned_titanic_data.csv")
print(df.columns)
X=df[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Gender']]
y=df[['Survived']]


Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Gender'], dtype='object')


In [14]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
# Fit the KNN model
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

  return self._fit(X, y)


In [17]:
# KNN model building
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print(y_pred_train)
print(y_pred_test)

[0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0
 0 0 1 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 0 1 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0
 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1
 0 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 1 1 0 1 0 1 1 1 0 1 1 0 0 1 1 0 1
 1 0 0 0 1 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 0 0 0 0 1 1 0
 1 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0
 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0
 1 0 0 1 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 1 1 1 1 1 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 1 0 1
 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0
 0 0 0 1 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0
 1 0 0 1 1 1 0 1 1 1 0 0 



In [18]:
# KNN model evaluation
#Training
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print(f"Confusion Matrix: \n {confusion_matrix(y_train, y_pred_train)}")
print(classification_report(y_train,y_pred_train))

Accuracy: 0.7849117174959872
Confusion Matrix: 
 [[334  58]
 [ 76 155]]
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       392
           1       0.73      0.67      0.70       231

    accuracy                           0.78       623
   macro avg       0.77      0.76      0.77       623
weighted avg       0.78      0.78      0.78       623



In [19]:
#Test
print("Accuracy:", accuracy_score(y_test, y_pred_test))
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test,y_pred_test))

Accuracy: 0.7910447761194029
Confusion Matrix: 
 [[134  23]
 [ 33  78]]
              precision    recall  f1-score   support

           0       0.80      0.85      0.83       157
           1       0.77      0.70      0.74       111

    accuracy                           0.79       268
   macro avg       0.79      0.78      0.78       268
weighted avg       0.79      0.79      0.79       268



The Voting in sci-kit-learn (Sklearn) allows us to combine multiple machine-learning modules and use a majority vote or a weighted vote to make predictions.

Voting Strategies:
Hard Voting - The class that receives the majority of votes is selected as the final prediction. It is commonly used in classification problems. In regression, it predicts the average of the individual predictions.

Soft Voting - Weighted average of predicted probabilities is used to make the final prediction. It is suitable when classifiers provide probability estimates. In other words, for each class, it sums the predicted probabilities and predicts the class with the highest sum.


In [None]:
# Creating a VotingClassifier with soft voting

voting_classifier = VotingClassifier(estimators=[('SVC', model), ('KNeighborsClassifier', knn)], voting='soft')
voting_classifier.fit(X_train, y_train)

y_pred_test = voting_classifier.predict(X_test)

print('Classification report \n', classification_report(y_test, y_pred_test))

print('Confusion Matrix \n', confusion_matrix(y_test, y_pred_test))

print('Accuracy \n', accuracy_score(y_test, y_pred_test))

Classification report 
               precision    recall  f1-score   support

           0       0.80      0.87      0.83       157
           1       0.79      0.70      0.74       111

    accuracy                           0.80       268
   macro avg       0.80      0.78      0.79       268
weighted avg       0.80      0.80      0.80       268

Confusion Matrix 
 [[136  21]
 [ 33  78]]
Accuracy 
 0.7985074626865671


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [None]:
# Creating a VotingClassifier with hard voting

voting_classifier = VotingClassifier(estimators=[('SVC', model), ('KNeighborsClassifier', knn)], voting='hard')
voting_classifier.fit(X_train, y_train)

y_pred_test = voting_classifier.predict(X_test)

print('Classification report \n', classification_report(y_test, y_pred_test))

print('Confusion Matrix \n', confusion_matrix(y_test, y_pred_test))

print('Accuracy \n', accuracy_score(y_test, y_pred_test))

Classification report 
               precision    recall  f1-score   support

           0       0.77      0.89      0.82       157
           1       0.79      0.62      0.70       111

    accuracy                           0.78       268
   macro avg       0.78      0.75      0.76       268
weighted avg       0.78      0.78      0.77       268

Confusion Matrix 
 [[139  18]
 [ 42  69]]
Accuracy 
 0.7761194029850746


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
