**Name :** Muhammad Saad Ullah Khan Haidri

**Reg no:** 2022420

**Faculty :** Cyber Security

**Lab 4 Task**

**Part 1: Data Exploration and Preprocessing**

Load the dataset:





In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the Titanic training data; the relative path is resolved against the
# current working directory.
titanic_data = pd.read_csv('train.csv')

# Preview the first rows as a quick sanity check on the load.
preview_rows = titanic_data.head()
print(preview_rows)


**Explore the dataset:**

You can check the distribution of the key features and visualize the data using seaborn or matplotlib:


In [None]:
# One figure per feature: count plots for Pclass and Sex, and a histogram
# with a KDE overlay for Age. Each figure is shown before the next is drawn.
feature_plots = [
    (sns.countplot, 'Pclass', {}),
    (sns.histplot, 'Age', {'kde': True}),
    (sns.countplot, 'Sex', {}),
]
for draw, column, extra_kwargs in feature_plots:
    draw(data=titanic_data, x=column, **extra_kwargs)
    plt.show()


**Check for missing values:**

Use the following to count the missing values in each column:

In [None]:
# Count the missing entries in each column and print the per-column totals.
missing_per_column = titanic_data.isnull().sum()
print(missing_per_column)


**Data Preprocessing:**

Handle missing values:



In [None]:
# Fill missing age values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on titanic_data['Age']: that form is chained
# assignment, which emits a FutureWarning on pandas 2.x and no longer
# modifies titanic_data once Copy-on-Write is enabled.
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())

# Drop rows with missing Embarked values (operates on the frame itself, so
# inplace=True is safe here).
titanic_data.dropna(subset=['Embarked'], inplace=True)


**Encode categorical variables:**


In [None]:
# Integer codes for the two categorical columns. Series.map turns any value
# that is absent from its mapping into NaN.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

# Replace each column with its encoded version, 'Sex' first.
for column, codes in (('Sex', sex_codes), ('Embarked', embarked_codes)):
    titanic_data[column] = titanic_data[column].map(codes)


**Normalize (standardize) numerical features:**

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the Age and Fare columns, overwriting them in titanic_data.
# NOTE(review): the scaler is fitted on every row before the train/test split
# performed later in this file, so test rows influence the scaling
# statistics. Consider fitting on the training rows only.
numeric_columns = ['Age', 'Fare']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(titanic_data[numeric_columns])
titanic_data[numeric_columns] = scaled_values


**Part 2: Implementing k-NN and Decision Trees**

**Model Training:**

Split the dataset:

Separate the features (X) from the target variable (Survived), then split the data into training (70%) and test (30%) sets.



In [None]:
from sklearn.model_selection import train_test_split

# Model inputs (five feature columns) and the Survived target.
feature_columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
X = titanic_data[feature_columns]
y = titanic_data['Survived']

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts


**k-NN Algorithm:**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Build a 5-neighbour k-NN classifier and fit it on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Class predictions for the held-out test rows.
y_pred_knn = knn.predict(X_test)


**Decision Tree Algorithm:**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Initialize Decision Tree model.
# The seed is fixed to the same value as the train/test split: the tree
# permutes candidate features at random while searching for splits, so an
# unseeded tree can produce different predictions (and metrics) on each run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Predict using Decision Tree
y_pred_dt = dt.predict(X_test)


**Model Evaluation:**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(y_test, y_pred, model_name):
    """Print accuracy, precision, recall and F1 for one model's predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels to score against ``y_test``.
        model_name: Name shown in the header line of the report.
    """
    print(f"Performance of {model_name}:")

    # Label/scorer pairs, in the order the lines are printed.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, scorer in scorers:
        print(f"{label}: {scorer(y_test, y_pred):.2f}")

# Print the same metric report for each model's test predictions, k-NN first.
for model_name, predictions in (("k-NN", y_pred_knn), ("Decision Tree", y_pred_dt)):
    evaluate_model(y_test, predictions, model_name)


**Part 3: Visualization**

Decision Boundaries:

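You can visualize decision boundaries by selecting two features (e.g., Age and Fare) and plotting the regions that k-NN and the Decision Tree assign to each class. Because the plot is two-dimensional, each model being plotted must be fitted on only those two features.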


In [None]:
import numpy as np
from matplotlib.colors import ListedColormap

def plot_decision_boundaries(X, y, model, title, h=0.02):
    """Plot a fitted classifier's decision regions over two features.

    A grid covering the range of the two features (padded by 1 on each side)
    is classified by ``model``; the predicted regions are drawn as filled
    contours with the points of ``X`` scattered on top, coloured by ``y``.

    Args:
        X: DataFrame whose first two columns are the features to plot
            (first column on the x-axis, second on the y-axis).
        y: Class labels for the rows of ``X``; used to colour the points.
        model: Fitted classifier exposing ``predict``. It must have been
            fitted on exactly the two plotted features.
        title: Title of the figure.
        h: Step of the sampling grid along both axes. Defaults to 0.02, the
            value that was previously hard-coded.

    Raises:
        ValueError: If ``model`` reports through ``n_features_in_`` that it
            was fitted on a number of features other than two.
    """
    # Fail early with a clear message instead of building the whole grid and
    # hitting a feature-count error inside predict().
    fitted_on = getattr(model, "n_features_in_", 2)
    if fitted_on != 2:
        raise ValueError(
            f"plot_decision_boundaries needs a model fitted on 2 features, "
            f"but this one was fitted on {fitted_on}."
        )

    x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
    y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    grid = np.c_[xx.ravel(), yy.ravel()]
    # A model fitted on a DataFrame records its column names; hand the grid
    # over with the same names so predict() does not warn about missing
    # feature names.
    if hasattr(model, "feature_names_in_"):
        grid = pd.DataFrame(grid, columns=X.columns[:2])
    Z = model.predict(grid).reshape(xx.shape)

    # One shared colormap so regions and points use the same class colours.
    class_colors = ListedColormap(('red', 'green'))
    plt.contourf(xx, yy, Z, cmap=class_colors, alpha=0.8)
    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, edgecolors='k', cmap=class_colors)
    plt.title(title)
    plt.show()

from sklearn.base import clone

# `knn` and `dt` were fitted on all five feature columns, so they cannot
# predict on the two-column (Age, Fare) grid that plot_decision_boundaries
# builds; predict() raises a ValueError about the feature count. Fit unfitted
# copies with the same hyperparameters on just these two features and plot
# those instead.
boundary_features = ['Age', 'Fare']
knn_2d = clone(knn).fit(X_train[boundary_features], y_train)
dt_2d = clone(dt).fit(X_train[boundary_features], y_train)

# Plot decision boundaries for k-NN
plot_decision_boundaries(X_test[boundary_features], y_test, knn_2d, 'k-NN Decision Boundaries')

# Plot decision boundaries for Decision Tree
plot_decision_boundaries(X_test[boundary_features], y_test, dt_2d, 'Decision Tree Decision Boundaries')

# Test-set scores for each model, computed in the same order as the labels
# in `metrics`.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
score_functions = (accuracy_score, precision_score, recall_score, f1_score)
knn_scores = [score(y_test, y_pred_knn) for score in score_functions]
dt_scores = [score(y_test, y_pred_dt) for score in score_functions]

# Grouped bar chart: for every metric, a k-NN bar next to a Decision Tree bar.
bar_width = 0.35
opacity = 0.8
index = np.arange(len(metrics))
fig, ax = plt.subplots()

# k-NN bars sit at the group positions; Decision Tree bars are shifted right
# by one bar width so the two stand side by side.
rects1 = plt.bar(index, knn_scores, width=bar_width, alpha=opacity, color='b', label='k-NN')
rects2 = plt.bar(index + bar_width, dt_scores, width=bar_width, alpha=opacity, color='g', label='Decision Tree')

# Put each metric label midway between the two bars of its group.
tick_positions = index + bar_width / 2
plt.xticks(tick_positions, metrics)

plt.title('Model Performance Comparison')
plt.xlabel('Metrics')
plt.ylabel('Scores')
plt.legend()
plt.tight_layout()
plt.show()
