In [29]:
# Importing the required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [30]:
# Attach Google Drive to the Colab runtime so files stored on it can be read
# through the local filesystem (this cell only works inside Google Colab).
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the spam dataset from the CSV file stored on the mounted drive
DATA_PATH = "/content/drive/MyDrive/Spam Email Detection - spam.csv"
data = pd.read_csv(DATA_PATH)

# Fail early, with a readable message, if the two columns this notebook reads
# (`v1` and `v2`) are not present in the file.
missing_columns = {"v1", "v2"} - set(data.columns)
assert not missing_columns, f"Dataset is missing expected columns: {sorted(missing_columns)}"

# Preview the first rows instead of rendering the whole frame
data.head()


In [31]:
# Fixed seed so the train/test split, and every metric computed from it,
# is identical on each run of the notebook.
RANDOM_STATE = 42

# Splitting the data into training (80%) and testing (20%) sets.
# `v2` is used as the input text and `v1` as the target label.
# `stratify` keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    data["v2"],
    data["v1"],
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=data["v1"],
)

# Creating a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vocabulary and IDF weights on the training text only, then reuse
# that fitted vectorizer to transform the test text.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [32]:
# Build a logistic regression classifier and train it on the TF-IDF features
# (`fit` returns the fitted estimator, so creation and training can be chained)
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Score the trained model on the held-out test set and report the result
accuracy = model.score(X_test_tfidf, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9650224215246637


In [None]:
# Get the model's predictions and the true labels for the test set
predictions = model.predict(X_test_tfidf)
labels = y_test

# Create the confusion matrix (rows: actual class, columns: predicted class).
# The function is called through the `metrics` module because the result is
# stored under the name `confusion_matrix`, which replaces the imported
# function of the same name. Going through the module keeps this cell
# re-runnable instead of failing with "'numpy.ndarray' object is not callable"
# the second time it is executed.
confusion_matrix = metrics.confusion_matrix(labels, predictions)

# Print the confusion matrix
print(confusion_matrix)

In [None]:
# Recover the class names in the order used by the confusion matrix:
# scikit-learn orders rows/columns by the sorted unique labels that occur in
# the true and predicted values, so the same ordering is rebuilt here.
class_names = np.unique(
    np.concatenate([np.asarray(labels), np.asarray(predictions)])
)

# Labelled view of the matrix: rows are actual classes, columns are predictions
confusion_matrix_df = pd.DataFrame(
    confusion_matrix, index=class_names, columns=class_names
)

# Draw the confusion matrix as an annotated heatmap so every cell
# (actual class x predicted class) is visible, not just per-class totals.
fig, ax = plt.subplots(figsize=(8, 6))
counts = confusion_matrix_df.to_numpy()
image = ax.imshow(counts, cmap="Blues")
fig.colorbar(image, ax=ax)

tick_positions = range(len(class_names))
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_names)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_names)

# Write the count inside each cell; use white text on the darker cells so the
# number stays readable.
text_threshold = counts.max() / 2
for row in tick_positions:
    for col in tick_positions:
        ax.text(
            col,
            row,
            str(counts[row, col]),
            ha="center",
            va="center",
            color="white" if counts[row, col] > text_threshold else "black",
        )

ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()