**Spam Mail Classification Using KNN**

In [318]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [319]:
# Importing the dataset
# Header labels for the data file, in column order: 48 word-frequency
# columns, 6 character-frequency columns, 3 capital-run-length columns and
# the target column 'class'.
column_names = [
    # word frequencies
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet',
    'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
    'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
    'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money',
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
    'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
    'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology',
    'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
    'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project',
    'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference',
    # character frequencies (names keep their percent-encoded form)
    'char_freq_%3B', 'char_freq_%28', 'char_freq_%5B',
    'char_freq_%21', 'char_freq_%24', 'char_freq_%23',
    # capital-letter run statistics
    'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total',
    # target label
    'class',
]
# The file has no header row, so the labels come from column_names.
dataset = pd.read_csv(
    '/content/spambase.data',
    sep=',',
    header=None,
    names=column_names,
)

Understanding the data

In [None]:
# Preview the first rows of the loaded frame.
dataset.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

In [None]:
# Summary statistics for each column.
dataset.describe()

In [None]:
# Column labels, as assigned from column_names when the file was read.
dataset.columns

In [None]:
# How many rows fall under each value of the target column.
dataset['class'].value_counts()

In [None]:
dataset.nunique() # number of distinct values in each column (a count, not the values themselves)

Cleaning the data

In [None]:
# Data cleaning: discard every row that holds at least one missing value,
# then show the per-column count of nulls that remain.
dataset.dropna(axis=0, how='any', inplace=True)
dataset.isna().sum()

Data Preprocessing

In [327]:
# Separate the predictors (every column except 'class') from the label.
X = dataset.drop(columns='class')
y = dataset['class'].to_numpy()

In [None]:
# Shapes of the feature frame and the label array, one per line.
print(X.shape, y.shape, sep='\n')

Standardize the features

In [329]:
# Feature scaling: standardize every feature column.
# NOTE: the scaler is fitted on every row of X, i.e. before any
# train/test split has been made.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_X.fit(X)
X_scaled = sc_X.transform(X)

In [330]:
# Wrap the scaled array in a frame that reuses the original feature labels.
scaled_data = pd.DataFrame(X_scaled, columns=X.columns)

In [None]:
# Summary statistics of the standardized features.
scaled_data.describe()

Relationship Analysis

In [332]:
# Pairwise Pearson correlation between the standardized features.
corelation = scaled_data.corr(method='pearson')

In [None]:
# Annotated heatmap of the feature correlation matrix; the large canvas
# leaves room for one annotated cell per feature pair.
plt.figure(figsize=(60, 60))
sns.heatmap(
    data=corelation,
    annot=True,
    xticklabels=corelation.columns,
    yticklabels=corelation.columns,
)

Applying PCA for scaled data

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component so the full variance profile is visible.
pca = PCA().fit(scaled_data)
explained_var = pca.explained_variance_ratio_

# Entry i of the cumulative sum is the variance captured by the first i + 1
# components, so the x axis is numbered from 1 rather than from the default 0;
# otherwise the "Number of components" axis reads one component too few.
component_counts = np.arange(1, len(explained_var) + 1)

plt.plot(component_counts, np.cumsum(explained_var))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.title('Cumulative explained variance ratio plot')
plt.grid()
plt.show()

In [None]:
# Scree plot: percentage of variance explained by each principal component
# of the PCA currently held in `pca`.
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
label = ['PC' + str(x) for x in range(1, len(per_var) + 1)]
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(x=range(1, len(per_var) + 1), height=per_var, tick_label=label, width=0.6)
ax.set_ylabel("Percentage of explained variance")
ax.set_xlabel('Principal components')
ax.set_title('Scree plot')  # was misspelled 'Screen plot'
# One tick label per component: rotate them so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()

Based on the scree plot, we can see that there is a significant drop in explained variance after the first few principal components.
Therefore, we will select the first 4 principal components for modeling.

In [352]:
# Perform PCA on the feature set
# Rebinds `pca` to a 4-component model (replacing the earlier full PCA) and
# projects the standardized features onto those components.
pca = PCA(n_components = 4)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Shapes before and after the PCA projection, one per line.
print(X_scaled.shape, X_pca.shape, sep='\n')

Creating a DataFrame for the principal components

In [None]:
# Frame of the projected data; this is what the classifier is trained on.
principal_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2', 'PC3', 'PC4'])
principal_df.head()

In [356]:
# Put the label next to the principal components, column-wise.
y_df = pd.DataFrame({'class': y})
final_df = pd.concat((principal_df, y_df), axis='columns')

Visualizing the principal components using scatter plot

In [None]:
# First two principal components, coloured by the target label.
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x='PC1',
    y='PC2',
    hue='class',
    palette='Set2',
    data=final_df,
)
plt.title('Principal Components Scatter Plot')
plt.show()

In [None]:
plt.figure()
# Cumulative variance captured by the components kept in `pca`.
# Entry i of the cumulative sum covers the first i + 1 components, so the
# x axis is numbered from 1, and the ratios are scaled by 100 so the values
# actually match the "%" unit in the axis label.
plt.plot(
    np.arange(1, len(pca.explained_variance_ratio_) + 1),
    np.cumsum(pca.explained_variance_ratio_) * 100,
)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Variance (%)')
plt.title('Explained Variance')
plt.show()

In [None]:
# Scree plot for the components kept in `pca`: one bar per component,
# height = percentage of variance that component explains.
per_var = np.round(100 * pca.explained_variance_ratio_, decimals=1)
label = [f'PC{x}' for x in range(1, len(per_var) + 1)]
plt.bar(
    x=range(1, len(per_var) + 1),
    height=per_var,
    tick_label=label,
)
plt.ylabel("percentage of explained variance")
plt.xlabel('principal components')
plt.title('scree plot')

Model Building

In [360]:
# Define the model: k-nearest-neighbours classifier voting over the
# 5 closest training points under Euclidean distance.
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(
    n_neighbors=5,
    metric='euclidean',
    p=2,
)

In [361]:
# Split the dataset into training and testing sets.
#
# The split is made on the raw features, and the scaler and PCA are then
# fitted on the training rows only. Fitting them on the full data set before
# splitting lets statistics of the held-out rows leak into preprocessing and
# makes the test scores optimistic. random_state and test_size are unchanged,
# so the same rows end up in each partition as before.
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

# Preprocessing learned from the training rows only.
train_scaler = StandardScaler().fit(X_train_raw)
train_pca = PCA(n_components=4).fit(train_scaler.transform(X_train_raw))

# Apply the same fitted transforms to both partitions.
X_train = train_pca.transform(train_scaler.transform(X_train_raw))
X_test = train_pca.transform(train_scaler.transform(X_test_raw))

In [None]:
# Train the classifier on the training partition.
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out partition; the bare name on the last
# line displays the array as the cell output.
y_pred = model.predict(X=X_test)
y_pred

Evaluate Model 

In [None]:
# Confusion matrix with normalize='true', i.e. normalized over the actual
# (row) classes.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    normalize='true',
)
print("Confusion Matrix:\n", cm)

In [None]:
# Heatmap of the confusion matrix: rows are actual classes, columns are
# predicted classes.
sns.heatmap(data=cm, cmap='Blues', annot=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# F1 score of the predictions on the held-out partition.
from sklearn.metrics import f1_score
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f1)

In [None]:
# Fraction of held-out rows whose label was predicted correctly.
from sklearn.metrics import accuracy_score
ac = accuracy_score(y_true=y_test, y_pred=y_pred)
print(ac)

In [None]:
# Text report of the per-class metrics on the held-out partition.
from sklearn.metrics import classification_report
class_rep = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", class_rep)