In [None]:
import pandas as pd
import string
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
import csv

In [None]:
# Download NLTK data: fetch the 'stopwords' corpus, which preprocess_text reads
# through stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load dataset
# The CSV lives on Google Drive, so Drive has to be mounted before the path below exists.
# Re-running this cell is harmless: an already-mounted Drive is left in place.
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/spamdetection/spam.csv'
# The first line of the file is its own header ("v1", "v2", ...). skiprows=1 drops that line so
# it is not parsed as a message with the bogus label "v1", while header=None keeps the
# positional column names (0, 1, 2, ...) that the following cells index by.
raw_df = pd.read_csv(file_path, encoding='ISO-8859-1', header=None, skiprows=1)
print(raw_df.head(10))

      0                                                  1    2    3    4
0    v1                                                 v2  NaN  NaN  NaN
1   ham  Go until jurong point, crazy.. Available only ...  NaN  NaN  NaN
2   ham                      Ok lar... Joking wif u oni...  NaN  NaN  NaN
3  spam  Free entry in 2 a wkly comp to win FA Cup fina...  NaN  NaN  NaN
4   ham  U dun say so early hor... U c already then say...  NaN  NaN  NaN
5   ham  Nah I don't think he goes to usf, he lives aro...  NaN  NaN  NaN
6  spam  FreeMsg Hey there darling it's been 3 week's n...  NaN  NaN  NaN
7   ham  Even my brother is not like to speak with me. ...  NaN  NaN  NaN
8   ham  As per your request 'Melle Melle (Oru Minnamin...  NaN  NaN  NaN
9  spam  WINNER!! As a valued network customer you have...  NaN  NaN  NaN


In [None]:
# Merge every column after the label (positions 1 onward) into one string per row
def _join_non_null(row):
    """Join the non-NaN cells of one row into a single space-separated string."""
    present_cells = row.dropna()
    return ' '.join(present_cells.astype(str))


raw_df['text'] = raw_df.iloc[:, 1:].apply(_join_non_null, axis=1)

# Keep only the label column and the merged text, then give both readable names
raw_df = raw_df[[0, 'text']]
raw_df.columns = ['label', 'text']

In [None]:
# Preview the first rows of the frame now that it holds only 'label' and 'text'
print(raw_df.head())

  label                                               text
0    v1                                                 v2
1   ham  Go until jurong point, crazy.. Available only ...
2   ham                      Ok lar... Joking wif u oni...
3  spam  Free entry in 2 a wkly comp to win FA Cup fina...
4   ham  U dun say so early hor... U c already then say...


In [None]:
# Display the data distribution: number of rows for each distinct value in 'label'
print(raw_df['label'].value_counts())

label
ham     4825
spam     747
v1         1
Name: count, dtype: int64


In [None]:
# Preprocessing
# Translation table that deletes every character in string.punctuation; built once and reused.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use by _preprocessing_resources() so the stop-word list and the stemmer
# are created once instead of once per message.
_PREPROCESSING_CACHE = {}


def _preprocessing_resources():
    """Return the cached (stop_words, stemmer) pair, creating it on first use."""
    if not _PREPROCESSING_CACHE:
        # Build both objects before touching the cache so a failure (for example the
        # stop-word corpus not being downloaded yet) never leaves it half-filled.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = SnowballStemmer('english')
        _PREPROCESSING_CACHE['stop_words'] = stop_words
        _PREPROCESSING_CACHE['stemmer'] = stemmer
    return _PREPROCESSING_CACHE['stop_words'], _PREPROCESSING_CACHE['stemmer']


def preprocess_text(text):
    """Normalise one message for vectorisation.

    Steps, in order: lowercase, strip punctuation, split on whitespace, drop English
    stop words, stem each remaining word, and re-join the words with single spaces.

    Parameters
    ----------
    text : str or any
        The raw message. Anything that is not a ``str`` (NaN, None, numbers, ...) is
        treated as missing.

    Returns
    -------
    str
        The space-joined stemmed tokens, or ``''`` for missing / non-string input.
    """
    if not isinstance(text, str):  # NaN floats, None and other non-strings have no text
        return ''

    stop_words, stemmer = _preprocessing_resources()

    # Punctuation is removed before tokenising, so "free!!!" and "free" become the same token.
    tokens = text.lower().translate(_PUNCTUATION_TABLE).split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

In [None]:
# Normalise the raw text column in one pass: missing values become '' and
# every remaining entry is cast to str
raw_df['text'] = raw_df['text'].fillna('').astype(str)

# Run each message through preprocess_text and store the result in its own column
raw_df['processed_text'] = raw_df['text'].apply(preprocess_text)

In [None]:
# Report the row count and how many messages preprocessing reduced to ''
processed_column = raw_df['processed_text']
empty_total = (processed_column == '').sum()
print(f"Number of samples in processed text: {len(processed_column)}")
print(f"Number of empty strings in processed text: {empty_total}")

Number of samples in processed text: 5573
Number of empty strings in processed text: 5


In [None]:
# Remove empty strings if any: keep only the rows whose processed text is not ''
raw_df = raw_df[raw_df['processed_text'] != '']

In [None]:
# Ensure there are still samples left after filtering (prints the remaining row count)
print(f"Number of samples after removing empty strings: {len(raw_df)}")

Number of samples after removing empty strings: 5568


In [None]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    raw_df['processed_text'],
    raw_df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Initialize TF-IDF Vectorizer with its default settings (no arguments are overridden)
tfidf = TfidfVectorizer()

In [None]:
# Fit and transform the training data
# The vectorizer is fitted on the training split only; the test split is not seen here
X_train_tfidf = tfidf.fit_transform(X_train)

In [None]:
# Transform the testing data with the already-fitted vectorizer (transform only, no refit)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Initialize models: three classifiers that will be trained on the same TF-IDF features
nb_model = MultinomialNB()
lr_model = LogisticRegression(max_iter=1000)  # max_iter=1000 caps the solver's iterations
svm_model = SVC(kernel='linear')  # support vector classifier with a linear kernel

In [None]:
# Train models: fit each classifier on the training TF-IDF matrix and the training labels
nb_model.fit(X_train_tfidf, y_train)
lr_model.fit(X_train_tfidf, y_train)
svm_model.fit(X_train_tfidf, y_train)

In [None]:
# Make predictions: each trained model labels the rows of the test TF-IDF matrix
nb_pred = nb_model.predict(X_test_tfidf)
lr_pred = lr_model.predict(X_test_tfidf)
svm_pred = svm_model.predict(X_test_tfidf)

In [None]:
# Filter out unexpected values from y_test: keep only the test rows whose true label is
# one of the two expected classes, so any other label value is excluded from evaluation
valid_labels = ['ham', 'spam']
y_test_filtered = y_test[y_test.isin(valid_labels)]

In [None]:
# Apply the same row selection to every model's predictions so they stay aligned
# with y_test_filtered; the boolean mask is computed once and shared
has_valid_label = y_test.isin(valid_labels)
nb_pred_filtered = nb_pred[has_valid_label]
lr_pred_filtered = lr_pred[has_valid_label]
svm_pred_filtered = svm_pred[has_valid_label]

In [None]:
# Evaluate models
def evaluate_model(y_test, y_pred, labels=('ham', 'spam')):
    """Compute accuracy, precision, recall and F1 for a two-class prediction.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.
    labels : sequence of two labels, optional
        ``(negative, positive)`` class labels, in that order. Precision, recall and F1
        are reported for the second (positive) label. Defaults to ``('ham', 'spam')``,
        so the function no longer depends on a ``valid_labels`` global from another cell.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``. Any metric whose denominator is zero is
        reported as 0.

    Raises
    ------
    ValueError
        If ``labels`` does not contain exactly two entries.

    Warns
    -----
    RuntimeWarning
        If the confusion matrix cannot be computed; all metrics are then reported as 0.
    """
    import warnings

    label_order = list(labels)
    if len(label_order) != 2:
        raise ValueError("evaluate_model expects exactly two labels: (negative, positive)")

    # Calculate confusion matrix. With labels=[negative, positive], rows are true classes and
    # columns are predicted classes, so ravel() yields tn, fp, fn, tp in that order.
    try:
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=label_order).ravel()
    except ValueError as exc:
        # Keep the best-effort fallback, but say so: silent zeros look like a model that
        # got everything wrong rather than an evaluation that never ran.
        warnings.warn(
            f"evaluate_model: confusion matrix could not be computed ({exc}); "
            "reporting all metrics as 0.",
            RuntimeWarning,
            stacklevel=2,
        )
        tn = fp = fn = tp = 0

    # Calculate precision, recall, and F1-score (guarding each division against a zero denominator)
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else 0

    return accuracy, precision, recall, f1

In [None]:
# Evaluate each model on the filtered test labels and its matching filtered predictions;
# every result holds (accuracy, precision, recall, f1) in that order
nb_results = evaluate_model(y_test_filtered, nb_pred_filtered)
lr_results = evaluate_model(y_test_filtered, lr_pred_filtered)
svm_results = evaluate_model(y_test_filtered, svm_pred_filtered)

In [None]:
# Replace any None/NaN metric with 0 so the formatted report below never shows a missing value
def _zero_if_missing(value):
    """Return 0 for a None/NaN value; pass every other value through unchanged."""
    if pd.isna(value):
        return 0
    return value


nb_results = [_zero_if_missing(metric) for metric in nb_results]
lr_results = [_zero_if_missing(metric) for metric in lr_results]
svm_results = [_zero_if_missing(metric) for metric in svm_results]

In [None]:
# Build one formatted summary line per model, then print them together
summary_lines = [
    "Naive Bayes Results: Accuracy = {:.2f}, Precision = {:.2f}, Recall = {:.2f}, F1-Score = {:.2f}".format(*nb_results),
    "Logistic Regression Results: Accuracy = {:.2f}, Precision = {:.2f}, Recall = {:.2f}, F1-Score = {:.2f}".format(*lr_results),
    "SVM Results: Accuracy = {:.2f}, Precision = {:.2f}, Recall = {:.2f}, F1-Score = {:.2f}".format(*svm_results),
]
print("\n".join(summary_lines))

Naive Bayes Results: Accuracy = 0.96, Precision = 1.00, Recall = 0.72, F1-Score = 0.84
Logistic Regression Results: Accuracy = 0.97, Precision = 0.99, Recall = 0.84, F1-Score = 0.91
SVM Results: Accuracy = 0.98, Precision = 0.99, Recall = 0.90, F1-Score = 0.94


In [None]:
# Side-by-side comparison table: one row per model, one column per metric.
# Each *_results list is ordered (accuracy, precision, recall, f1), so a metric's
# position in metric_names is also its index inside every result list.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
per_model_results = [nb_results, lr_results, svm_results]

results_df = pd.DataFrame({
    'Model': ['Naive Bayes', 'Logistic Regression', 'SVM'],
    **{
        metric: [scores[position] for scores in per_model_results]
        for position, metric in enumerate(metric_names)
    },
})

print(results_df)

                 Model  Accuracy  Precision    Recall  F1-Score
0          Naive Bayes  0.958707   1.000000  0.722892  0.839161
1  Logistic Regression  0.974865   0.992857  0.837349  0.908497
2                  SVM  0.983842   0.986842  0.903614  0.943396
