<a href="https://colab.research.google.com/github/Tharanidissanayake/Machine-Learning/blob/main/Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import urllib.request

# Load the Dataset

In [9]:
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data', header=None)

# Load the names from names folder

In [13]:
# Load the names
url_names = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names'
names = pd.read_csv(url_names, skiprows=31, header=None, sep=':', names=['feature_name', 'description'])

# Extract the names of the columns
column_names = list(names['feature_name'])

# Add the column names to the data
data.columns = column_names

# View the combined data and names
print(data.head())

   1, 0.    | spam, non-spam classes  word_freq_make  word_freq_address  \
0                               0.00            0.64               0.64   
1                               0.21            0.28               0.50   
2                               0.06            0.00               0.71   
3                               0.00            0.00               0.00   
4                               0.00            0.00               0.00   

   word_freq_all  word_freq_3d  word_freq_our  word_freq_over  \
0            0.0          0.32           0.00            0.00   
1            0.0          0.14           0.28            0.21   
2            0.0          1.23           0.19            0.19   
3            0.0          0.63           0.00            0.31   
4            0.0          0.63           0.00            0.31   

   word_freq_remove  word_freq_internet  word_freq_order  ...  \
0              0.00                0.00             0.00  ...   
1              0.07         

# Shuffle the Dataset

In [14]:
data = data.sample(frac=1, random_state=42)

# Split the Data into Training and Testing Sets

In [15]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalize the Data

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Check for Missing Values

In [17]:
print(data.isnull().sum())

1, 0.    | spam, non-spam classes    0
word_freq_make                       0
word_freq_address                    0
word_freq_all                        0
word_freq_3d                         0
word_freq_our                        0
word_freq_over                       0
word_freq_remove                     0
word_freq_internet                   0
word_freq_order                      0
word_freq_mail                       0
word_freq_receive                    0
word_freq_will                       0
word_freq_people                     0
word_freq_report                     0
word_freq_addresses                  0
word_freq_free                       0
word_freq_business                   0
word_freq_email                      0
word_freq_you                        0
word_freq_credit                     0
word_freq_your                       0
word_freq_font                       0
word_freq_000                        0
word_freq_money                      0
word_freq_hp             

# Feature Engineering

In [18]:
data['email_length'] = data.iloc[:, :-1].sum(axis=1)


# Train the KNN Model

In [19]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict the test set labels

In [20]:
y_pred_knn = knn.predict(X_test)

# Evaluate the Performance of the KNN model

In [21]:
accuracy = accuracy_score(y_test, y_pred_knn)
precision = precision_score(y_test, y_pred_knn)
recall = recall_score(y_test, y_pred_knn)
f1 = f1_score(y_test, y_pred_knn)

print("KNN Model Performance:")
print("Accuracy: {:.3f}".format(accuracy))
print("Precision: {:.3f}".format(precision))
print("Recall: {:.3f}".format(recall))
print("F1 Score: {:.3f}".format(f1))

KNN Model Performance:
Accuracy: 0.912
Precision: 0.914
Recall: 0.865
F1 Score: 0.889


# Train the Decision Tree model

In [22]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Predict the test set labels

In [23]:
y_pred_dt = dt.predict(X_test)

# Evaluate the Performance of the Decision Tree model

In [24]:
accuracy = accuracy_score(y_test, y_pred_dt)
precision = precision_score(y_test, y_pred_dt)
recall = recall_score(y_test, y_pred_dt)
f1 = f1_score(y_test, y_pred_dt)

print("Decision Tree Model Performance:")
print("Accuracy: {:.3f}".format(accuracy))
print("Precision: {:.3f}".format(precision))
print("Recall: {:.3f}".format(recall))
print("F1 Score: {:.3f}".format(f1))

Decision Tree Model Performance:
Accuracy: 0.912
Precision: 0.896
Recall: 0.888
F1 Score: 0.892
