# Introduction
This colab worksheet provides a starting point for Task 1 (the natural language processing assignment).

In [1]:
import os
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix as sk_confusion_matrix, classification_report
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Load the data

In [2]:
# Read the labelled training set from CSV into a DataFrame
data = pd.read_csv('data/spam_detection_training_data.csv')

# Class balance: number of Not Spam (0) vs Spam (1) rows
print(data['label'].value_counts())

# Raw message bodies and their targets as NumPy arrays
text, labels = data['text'].values, data['label'].values

# The test CSV is only read for its 'text' column here
test_data = pd.read_csv('data/spam_detection_test_data.csv')
test_text = test_data['text'].values

# Sanity check: one label per training message, and the size of the test set
print(text.shape, labels.shape)
print(test_text.shape)

label
0    2551
1    1068
Name: count, dtype: int64
(3619,) (3619,)
(1552,)


# Data Visualisation
Here's an example of how to display the text based on its label.

In [3]:
# Examples of Spam and not Spam
def print_text(text, label):
  """Print a message followed by a line stating whether it is spam.

  Args:
    text: the message body to display.
    label: 0 means "not spam"; any other value is reported as spam.
  """
  verdict = '\nis not spam!' if label == 0 else '\nis spam!'
  print(text, verdict)

# Draw one row index uniformly at random (unseeded, so the example changes
# from run to run) and display that message with its verdict.
idx = np.random.randint(len(text))
print_text(text[idx], labels[idx])

Subject: noms / actual flow for 3 / 19 / 01
we agree with the nom . for 3 / 19 / 01 .
- - - - - - - - - - - - - - - - - - - - - - forwarded by david avila / lsp / enserch / us on 03 / 20 / 2001
01 : 24 pm - - - - - - - - - - - - - - - - - - - - - - - - - - -
" eileen ponton " on 03 / 20 / 2001 10 : 05 : 59 am
to : david avila / lsp / enserch / us @ tu , charlie stone / texas utilities @ tu , melissa
jones / texas utilities @ tu , hpl . scheduling @ enron . com , liz . bellamy @ enron . com
cc :
subject : noms / actual flow for 3 / 19 / 01
date nom mcf mmbtu
3 / 19 / 01 60 , 000 59 , 944 61 , 562
btu = 1 . 027 
is not spam!


# Preprocessing, model evaluation (confusion matrix) and exporting results

In [4]:

# Fetch the NLTK stopword corpus (the cell output shows it being downloaded
# and unzipped into the local nltk_data directory).
nltk.download('stopwords')

# Define stopwords once: clean_text() reads this module-level set for its
# per-token membership test, so it is built a single time here.
stop_words = set(stopwords.words("english"))

# Text preprocessing function
def clean_text(text):
    """Normalise a raw message into a space-separated string of content words.

    Steps, in order:
      1. lowercase the text;
      2. remove e-mail addresses (any whitespace-free run containing '@');
      3. remove URLs and bare domain names;
      4. drop every character that is not a-z or whitespace
         (punctuation, digits, symbols);
      5. split on whitespace and drop English stopwords.

    Args:
        text: the raw message as a str.

    Returns:
        The cleaned message: remaining words joined by single spaces
        (an empty string if nothing survives).
    """
    text = text.lower()

    # Addresses and URLs must be removed BEFORE the character filter below:
    # these patterns key on '@', '.', ':' and '/', which that filter deletes.
    # Running the filter first made them dead code and glued each address
    # into one junk token (e.g. "johndoeexamplecom").
    text = re.sub(r'\S+@\S+', '', text)
    # The trailing \b stops the bare-domain alternative from eating the start
    # of an ordinary word after a missing space (e.g. "please.continue").
    text = re.sub(r'http\S+|www\.\S+|\S+\.(?:com|net|org|co|in|uk)\b', '', text)

    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation and numbers

    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

# Add a 'clean_text' column to the training frame first, then the test frame,
# by running every raw message through clean_text().
for frame in (data, test_data):
    frame['clean_text'] = frame['text'].map(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sahil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
# Initialize the TF-IDF vectorizer; max_features=5000 caps the vocabulary size
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training data and transform it; .toarray() turns the result into
# a dense NumPy array.
# NOTE(review): the vectorizer is fitted on the whole training frame, i.e.
# before the later train/validation split, so validation rows take part in
# fitting the vocabulary/IDF weights. Confirm this is acceptable for the
# reported validation scores.
x_train_full = vectorizer.fit_transform(data['clean_text']).toarray()
y_train_full = data['label']

# The test set is only transformed (never fitted), so it shares the training vocabulary
x_test = vectorizer.transform(test_data['clean_text']).toarray()


In [6]:
def confusion_matrix(true_labels, pred_labels):
    """Print the confusion matrix and the classification report.

    Output goes to stdout only; the function returns None, so callers should
    not wrap it in print().

    Args:
        true_labels: ground-truth labels.
        pred_labels: labels predicted by a classifier, same length.
    """
    matrix = sk_confusion_matrix(true_labels, pred_labels)
    # sep="\n" reproduces the blank line that follows each heading
    print("Confusion Matrix:\n", matrix, sep="\n")
    report = classification_report(true_labels, pred_labels)
    print("\nClassification Report:\n", report, sep="\n")

In [7]:
# Hold out 20% of the training rows for validation (fixed seed, so the split is repeatable)
x_train, x_val, y_train, y_val = train_test_split(x_train_full, y_train_full, test_size=0.2, random_state=42)

# Candidate classifiers to compare on the validation split
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVC": LinearSVC()
}

for name, clf in models.items():

    print(f"\nTesting model: {name}\n")
    clf.fit(x_train, y_train)
    preds = clf.predict(x_val)

    # confusion_matrix() prints its own report and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" after
    # every report.
    confusion_matrix(y_val, preds)

    print("____________________________________________________\n")




Testing model: Naive Bayes

Confusion Matrix:

[[487  22]
 [  9 206]]

Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       509
           1       0.90      0.96      0.93       215

    accuracy                           0.96       724
   macro avg       0.94      0.96      0.95       724
weighted avg       0.96      0.96      0.96       724

None
____________________________________________________


Testing model: Logistic Regression

Confusion Matrix:

[[500   9]
 [  3 212]]

Classification Report:

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       509
           1       0.96      0.99      0.97       215

    accuracy                           0.98       724
   macro avg       0.98      0.98      0.98       724
weighted avg       0.98      0.98      0.98       724

None
____________________________________________________


Testing model: Linear SVC





In [8]:
def save_as_csv(pred_labels, location = '.'):
    """Write the test-set predictions to ``<location>/results_task1.csv``.

    The file has one integer label per line (format ``%d``).

    Args:
        pred_labels: the 1552 predicted labels, as a NumPy array or any
            array-like (list, pandas Series); 1-D input is written as a
            single column.
        location: directory to write into. It is created, including missing
            parents, if it does not exist yet. Defaults to the current directory.

    Raises:
        AssertionError: if the number of labels is not 1552.
    """
    # Accept lists / Series as well as arrays, which lack .shape or ndarray.reshape
    pred_labels = np.asarray(pred_labels)

    assert pred_labels.shape[0] == 1552, 'Error: wrong number of labels, should be 1552 test labels'

    if pred_labels.ndim == 1:
        pred_labels = pred_labels.reshape(-1, 1)

    # np.savetxt does not create directories, so a fresh checkout without
    # the output folder used to fail with FileNotFoundError.
    if location:
        os.makedirs(location, exist_ok=True)

    np.savetxt(os.path.join(location, 'results_task1.csv'), pred_labels, delimiter=',', fmt='%d')


In [9]:
# Refit the chosen classifier (logistic regression) on every labelled row,
# not just the training part of the validation split. fit() returns the
# estimator itself, so construction and training are chained.
best_model = LogisticRegression(max_iter=1000).fit(x_train_full, y_train_full)

# x_test was already transformed with the fitted vectorizer, so predict directly
test_preds = best_model.predict(x_test)


In [11]:
# Saving locally: writes the test predictions to results/results_task1.csv
# (save_as_csv appends '/results_task1.csv' to the given location).
save_as_csv(test_preds, location='results')