In [None]:
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Load the labelled e-mail corpus (columns: label, text, label_num) and
# preview its first rows.
dataset = pd.read_csv(filepath_or_buffer="spam_ham_dataset.csv")
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [None]:
# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

# Cache for the stop-word set so the NLTK corpus is read once, not per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess_text(message):
    """Clean one message and return its remaining tokens.

    Steps, in order:
      1. delete every character found in ``string.punctuation``;
      2. lower-case the text;
      3. split on whitespace;
      4. keep only purely alphabetic tokens that are not English stop words.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    list[str]
        Lower-cased tokens in their original order (duplicates preserved).
    """
    cleaned = message.translate(_PUNCT_TABLE).lower()

    # Set membership is O(1); the previous version re-read the stop-word
    # corpus and scanned it as a list for every single token.
    stop_words = _english_stopwords()

    return [
        word
        for word in cleaned.split()
        if word.isalpha() and word not in stop_words
    ]

In [None]:
# Class balance: number of messages per label (the rendered output shows
# ham outnumbering spam, 3672 to 1499).
dataset['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,3672
spam,1499


In [None]:
# Preview the first rows again; the frame has not been modified since it was loaded.
dataset.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [None]:
# Partition the message bodies by class label.
ham = dataset.loc[dataset["label"] == "ham", "text"]
spam = dataset.loc[dataset["label"] == "spam", "text"]

# Report the size of each partition, one shape tuple per line.
print(ham.shape, spam.shape, sep="\n")

(3672,)
(1499,)


In [None]:
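# Optional: downsample ham to the spam count so the classes are balanced and the
# model is not biased toward the majority class. Currently disabled (the code
# below is commented out).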

# ham = ham.sample(spam.shape[0])
# print(ham.shape, "YAY !!")

In [None]:
# Fetch the NLTK stop-word corpus that preprocess_text relies on.
nltk.download("stopwords")

# Flatten every spam message into one long list of cleaned tokens.
spam_words = []
for message in spam:
    spam_words.extend(preprocess_text(message))

# Ten most frequent tokens across all spam messages.
print("The top 10 words are\n", pd.Series(spam_words).value_counts().iloc[:10])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


The top 10 words are
 subject        1657
com             992
http            983
company         728
e               631
www             587
information     520
font            515
td              504
get             485
Name: count, dtype: int64


In [None]:
# Flatten every ham message into one long list of cleaned tokens.
ham_words = []
for message in ham:
    ham_words.extend(preprocess_text(message))

# Ten most frequent tokens across all ham messages. The newline after the
# label keeps the first row of the Series from being glued onto the label
# text, matching the format used for the spam word counts.
print("The top 10 words are\n", pd.Series(ham_words).value_counts().head(10))

The top 10 words are ect        13897
hou         7281
enron       6555
subject     6403
gas         2861
deal        2789
com         2717
please      2715
meter       2459
cc          2359
Name: count, dtype: int64


In [None]:
# Replace each raw message with its list of cleaned tokens (punctuation,
# stop words and non-alphabetic tokens removed by preprocess_text).
# NOTE: this overwrites the raw text in place, so the cell is meant to run
# exactly once per load of the dataset.
dataset["text"] = dataset["text"].map(preprocess_text)
dataset.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,"[subject, enron, methanol, meter, follow, note...",0
1,2349,ham,"[subject, hpl, nom, january, see, attached, fi...",0
2,3624,ham,"[subject, neon, retreat, ho, ho, ho, around, w...",0
3,4685,spam,"[subject, photoshop, windows, office, cheap, m...",1
4,2030,ham,"[subject, indian, springs, deal, book, teco, p...",0


In [None]:
# Join each token list back into one space-separated string, the input
# format expected by the text vectorizer.
# Series.map applies the function element-wise. Series.agg with an
# element-wise callable is deprecated and emitted a warning here.
dataset["text"] = dataset["text"].map(lambda tokens: " ".join(map(str, tokens)))
dataset.head()

  dataset["text"] = dataset["text"].agg(lambda x: " ".join(map(str,x)))


Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,subject enron methanol meter follow note gave ...,0
1,2349,ham,subject hpl nom january see attached file hpln...,0
2,3624,ham,subject neon retreat ho ho ho around wonderful...,0
3,4685,spam,subject photoshop windows office cheap main tr...,1
4,2030,ham,subject indian springs deal book teco pvr reve...,0


In [None]:
# Learn the bag-of-words vocabulary from the cleaned messages. fit() returns
# the estimator itself, so both names refer to the same fitted object.
# NOTE(review): the vocabulary is learned from the full corpus, before the
# train/test split performed later in the notebook.
bow_transformer = CountVectorizer().fit(dataset["text"])
vectorizer = bow_transformer

# Show a slice of the learned vocabulary and its total size.
print(
    vectorizer.get_feature_names_out()[20:40],
    len(vectorizer.vocabulary_),
    sep="\n",
)

['aashqcsny' 'aavilable' 'aaxrzm' 'ab' 'aba' 'ababa' 'abacha' 'aback'
 'abackof' 'abacus' 'abacustech' 'abandon' 'abandone' 'abandoned' 'abarch'
 'abasements' 'abash' 'abashed' 'abate' 'abater']
45637


In [None]:
# Turn every cleaned message into a sparse row of token counts.
df_bow = bow_transformer.transform(dataset["text"])

# Matrix dimensions and the number of stored non-zero entries.
print(
    f"Shape of matrix: {df_bow.shape}",
    f"Non-zero occurance: {df_bow.nnz}",
    sep="\n",
)

Shape of matrix: (5171, 45637)
Non-zero occurance: 319964


In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Learn the IDF weights from the count matrix, then re-weight the same matrix.
tfidtrans = TfidfTransformer()
tfidtrans.fit(df_bow)
tfidfinal = tfidtrans.transform(df_bow)

print(tfidfinal.shape)

(5171, 45637)


In [None]:
# Encode each distinct cleaned message string as an integer code.
# pd.factorize returns a (codes, uniques) pair.
FactorResult = pd.factorize(dataset['text'])
# Overwrite the text column with the integer codes; the cleaned strings are
# no longer available in `dataset` after this line.
# NOTE(review): the train/test split later in the notebook uses `tfidfinal`
# and dataset['label_num'], not this column — confirm this step is still needed.
dataset['text'] = FactorResult[0]
dataset.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,0,0
1,2349,ham,1,0
2,3624,ham,2,0
3,4685,spam,3,1
4,2030,ham,4,0


In [None]:
# Hold out 20% of the messages for evaluation.
# random_state pins the shuffle so the split (and every metric derived from
# it) is reproducible across kernel restarts. stratify keeps the ham/spam
# ratio the same in both partitions, which matters because the classes are
# imbalanced.
msg_train, msg_test, label_train, label_test = train_test_split(
    tfidfinal,
    dataset["label_num"],
    test_size=0.2,
    random_state=42,
    stratify=dataset["label_num"],
)
print(msg_train.shape)
print(msg_test.shape)
print(label_train.shape)
print(label_test.shape)

(4136, 45637)
(1035, 45637)
(4136,)
(1035,)


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier with library-default hyper-parameters,
# trained on the TF-IDF features of the training partition.
clf = XGBClassifier()
clf.fit(X=msg_train, y=label_train)

In [None]:
# Predict labels for the held-out messages and report plain accuracy.
predict = clf.predict(msg_test)

print(f'The accuracy score of the given thing is {metrics.accuracy_score(y_true=label_test, y_pred=predict)}')

The accuracy score of the given thing is 0.9806763285024155


In [None]:
def preprocess_user_input(message):
    """Clean a user-supplied message the same way the training text was cleaned.

    Delegates to ``preprocess_text`` (defined earlier in this notebook) so the
    training-time and inference-time cleaning cannot drift apart, then joins
    the surviving tokens into the single space-separated string the fitted
    vectorizer expects.

    Parameters
    ----------
    message : str
        Raw message text entered by the user.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces ("" if nothing survives).
    """
    return " ".join(preprocess_text(message))

# Read one message from the notebook user.
user_message = input("Enter a message:\n")

# Clean it with the same steps used on the training corpus.
preprocessed_message = preprocess_user_input(user_message)

# Token counts from the fitted vectorizer, then TF-IDF re-weighting.
user_bow = bow_transformer.transform([preprocessed_message])
user_tfidf = tfidtrans.transform(user_bow)

# Classify the single message.
prediction = clf.predict(user_tfidf)

# label_num 0 is 'ham' in the dataset preview; anything else is reported as spam.
print(
    "The message is classified as HAM (not spam)."
    if prediction[0] == 0
    else "The message is classified as SPAM."
)


Enter a message:
Dear learners,  There will be a live interactive session where a Course team member will explain some sample problems, how they are solved - that will help you solve the weekly assignments.  We invite you to join the session and get your doubts cleared and learn better.  Date: Mar 17, 2025 - Monday Time: 06:00 PM - 08:00 PM Link to join: https://meet.google.com/zpm-rebn-hnk  Happy Learning.  -NPTEL Team
The message is classified as HAM (not spam).


In [None]:
import joblib
from flask import Flask, render_template, request

# Persist the fitted classifier itself. The previous code dumped `prediction`,
# a one-element array of predicted labels, which has no .predict() method
# when loaded back as the model.
# NOTE(review): the TF-IDF transformer (`tfidtrans`) the classifier was
# trained with is not persisted; a consumer that loads only these two files
# cannot reproduce the training-time features.
joblib.dump(clf, "/content/spamNLP.plk")
joblib.dump(vectorizer, "/content/vectorizer.plk")

['/content/vectorizer.plk']

In [None]:
from google.colab.output import eval_js
# Ask the Colab front end for the proxy URL of kernel port 5000 and print it
# (the Flask server started in this notebook reports listening on port 5000).
print(eval_js("google.colab.kernel.proxyPort(5000)"))

https://l28z36plz6-496ff2e9c6d22116-5000-colab.googleusercontent.com/


In [None]:
app = Flask(__name__)

# Artefacts saved with joblib; spamNLP.plk is expected to hold the fitted
# classifier and vectorizer.plk the fitted CountVectorizer.
model = joblib.load("spamNLP.plk")
vectorizer = joblib.load("vectorizer.plk")


@app.route("/", methods=["GET", "POST"])
def index():
    """Render the form page and, on POST, classify the submitted e-mail text.

    The submitted text goes through the same pipeline the classifier was
    trained on: text cleaning -> token counts -> TF-IDF weighting. The
    previous version passed raw, uncleaned text through the count vectorizer
    only, so the model received features on a different scale from its
    training data.

    Returns
    -------
    The rendered ``index.html`` template with ``result`` set to "Spam",
    "Not Spam", or None when no form was submitted.
    """
    result = None

    if request.method == "POST":
        email_text = request.form["email"]
        # preprocess_user_input and tfidtrans come from earlier cells of this
        # notebook; they are the cleaning function and the fitted TF-IDF
        # transformer used at training time.
        cleaned_text = preprocess_user_input(email_text)
        email_counts = vectorizer.transform([cleaned_text])
        email_vector = tfidtrans.transform(email_counts)
        prediction = model.predict(email_vector)[0]
        result = "Spam" if prediction == 1 else "Not Spam"

    return render_template("index.html", result=result)


if __name__ == "__main__":
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
