In [1]:
import pandas as pd #pandas library used for data manipulation

In [2]:
import re #re = regular expressions, used for pattern matching and text substitution

In [3]:
import nltk #nltk = Natural language tool kit, is a part of NLP

In [4]:
from nltk.corpus import stopwords #stopwords = am, i, was, were

In [5]:
from nltk.stem import PorterStemmer

In [None]:
# Fetch the NLTK "stopwords" corpus that stopwords.words("english") reads later in the notebook.
nltk.download("stopwords")

In [7]:
# Single Porter stemmer instance; preprocess_text() calls stemmer.stem() on every kept word.
stemmer = PorterStemmer()

In [8]:
# English stop-word list held as a set, because preprocess_text() tests membership for every word.
stop_words = set(stopwords.words("english"))

In [9]:
# Load the dataset and keep only the two columns the rest of the notebook uses.
df = pd.read_csv("spam_detection_15000.csv", encoding = 'latin-1') [["Label", "Message"]]

In [10]:
df.head() # preview the first 5 rows to sanity-check the loaded Label/Message columns

Unnamed: 0,Label,Message
0,ham,Can you send me the file before Friday?
1,ham,Don't forget to bring your ID and documents.
2,ham,Are you joining the team call today?
3,spam,This is not a scam! You really won. Contact us...
4,spam,This is not a scam! You really won. Contact us...


In [11]:
# Preview the last 5 rows of the dataset.
df.tail()

Unnamed: 0,Label,Message
14995,spam,Congratulations! You've won a $57 gift card. C...
14996,spam,Congratulations! You've won a $199 gift card. ...
14997,ham,Lunch at Starbucks around 2:45 PM?
14998,spam,Congratulations! You've won a $363 gift card. ...
14999,ham,Lunch at Domino's around 4:45 AM?


In [12]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
label_ids = {"ham": 0, "spam": 1}

# Guard against re-running this cell: once the column already holds 0/1,
# mapping it again would turn every value into NaN.
if not df["Label"].isin(list(label_ids.values())).all():
    encoded = df["Label"].map(label_ids)
    # .map() yields NaN for anything outside label_ids; fail loudly here
    # instead of passing NaN targets on to the model.
    unmapped = df.loc[encoded.isna(), "Label"].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in 'Label' column: {list(unmapped)!r}")
    df["Label"] = encoded

In [13]:
# Check that the Label column now holds 0 (ham) / 1 (spam) instead of text.
df.head()

Unnamed: 0,Label,Message
0,0,Can you send me the file before Friday?
1,0,Don't forget to bring your ID and documents.
2,0,Are you joining the team call today?
3,1,This is not a scam! You really won. Contact us...
4,1,This is not a scam! You really won. Contact us...


In [14]:
def preprocess_text(text):
    """Turn a raw message into a space-separated string of stemmed tokens.

    Every character that is not a letter, digit or underscore is replaced by
    a space, the text is lowercased and split on whitespace, tokens found in
    the module-level ``stop_words`` are dropped, and the remaining tokens are
    stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The kept, stemmed tokens joined by single spaces.
    """
    normalized = re.sub(r"\W", " ", text).lower()
    kept_stems = []
    for token in normalized.split():
        if token in stop_words:
            # stop words carry little signal, so they are skipped entirely
            continue
        kept_stems.append(stemmer.stem(token))
    return " ".join(kept_stems)

In [15]:
# Run preprocess_text on every message and store the result in a new column.
df["cleaned_message"] = df["Message"].apply(preprocess_text)

In [16]:
# Compare the raw messages with their cleaned, stemmed counterparts.
df.head()

Unnamed: 0,Label,Message,cleaned_message
0,0,Can you send me the file before Friday?,send file friday
1,0,Don't forget to bring your ID and documents.,forget bring id document
2,0,Are you joining the team call today?,join team call today
3,1,This is not a scam! You really won. Contact us...,scam realli contact us winner yahoo com
4,1,This is not a scam! You really won. Contact us...,scam realli contact us winner gmail com


In [17]:
# Importing Data Science - ML Libraries using SKLEARN

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer # converting text to numerical format

In [19]:
from sklearn.model_selection import train_test_split # distributing data into train and test for predict

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
from sklearn.metrics import accuracy_score, classification_report # check the accuracy of ML model

In [22]:
# Same preview as the earlier df.head() after cleaning; only imports ran in between, so df is unchanged.
df.head()

Unnamed: 0,Label,Message,cleaned_message
0,0,Can you send me the file before Friday?,send file friday
1,0,Don't forget to bring your ID and documents.,forget bring id document
2,0,Are you joining the team call today?,join team call today
3,1,This is not a scam! You really won. Contact us...,scam realli contact us winner yahoo com
4,1,This is not a scam! You really won. Contact us...,scam realli contact us winner gmail com


In [23]:
# TF-IDF features over the cleaned text, capped at 3000 vocabulary terms.
# NOTE(review): this call fits the vocabulary and IDF weights on every row of df,
# including rows that later end up in the test split.
vectorizer = TfidfVectorizer(max_features = 3000)
X = vectorizer.fit_transform(df["cleaned_message"]) # input data

In [24]:
# Target vector: the Label column (encoded earlier as ham -> 0, spam -> 1).
y = df["Label"] # output data

In [25]:
# Split the cleaned text (not the pre-computed matrix X) so the TF-IDF
# vocabulary and IDF weights are learned from training messages only. Fitting
# the vectorizer on all rows before splitting leaks test-set statistics into
# the features. Same train_size and random_state as before, so the row
# partition is unchanged.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df["cleaned_message"], y, train_size = 0.8, random_state = 42
)
X_train = vectorizer.fit_transform(X_train_text) # refit on the training split only
X_test = vectorizer.transform(X_test_text) # reuse the training vocabulary and IDF weights

In [26]:
# Logistic regression classifier created with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [27]:
model.fit(X_train, y_train) # learn the model parameters from the training split only

In [28]:
# Predicted 0/1 labels for the held-out test split.
y_pred = model.predict(X_test)

In [29]:
# Share of test messages classified correctly, shown as a percentage.
# NOTE(review): the recorded 100.00% is suspicious. The df.tail() preview shows templated,
# near-identical messages, so the test split may contain (near-)duplicates of training rows; verify.
print(f"accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")

accuracy: 100.00%


In [30]:
# Per-class precision, recall, F1 and support on the test split (0 = ham, 1 = spam).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2089
           1       1.00      1.00      1.00       911

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000



In [31]:
def predict_email(email_text):
    """Classify a single raw email/message as spam or ham.

    The text is cleaned with ``preprocess_text``, converted to TF-IDF features
    with the already-fitted module-level ``vectorizer`` and scored with the
    trained module-level ``model``.

    Parameters
    ----------
    email_text : str
        Raw text of the email to classify.

    Returns
    -------
    str
        ``"Spam"`` when the model predicts label 1, otherwise
        ``"Ham - not Spam"``.
    """
    processed_text = preprocess_text(email_text)
    # transform() takes an iterable of documents, hence the one-element list
    vectorized_text = vectorizer.transform([processed_text])
    prediction = model.predict(vectorized_text)
    return "Spam" if prediction[0] == 1 else "Ham - not Spam"

In [32]:
# Sample follow-up email passed to predict_email() in the next cell.
email = '''Hi [Recipient's Name],

Hope you're having a good week! Just wanted to quickly follow up on [briefly mention topic, e.g., "our chat yesterday" or "the document I sent"].

Let me know if you have any questions.

Best,

[Your Name]'''

In [33]:
# "\n" (not "/n") so the prediction starts on its own line below the email text.
print(f"Email: {email}\nprediction: {predict_email(email)}")

Email: Hi [Recipient's Name],

Hope you're having a good week! Just wanted to quickly follow up on [briefly mention topic, e.g., "our chat yesterday" or "the document I sent"].

Let me know if you have any questions.

Best,

[Your Name]"/n prediction: Ham - not Spam


In [34]:
# Second sample: an email that discusses spam detection itself.
# NOTE(review): the recorded run labels this message "Spam".
email = '''I hope this email finds you well.

I'm writing to initiate a discussion regarding our current spam detection mechanisms and explore potential avenues for improvement. As we all know, effective spam detection is crucial for maintaining a secure and productive digital environment, protecting us from phishing attempts, malware, and general inbox clutter.

I'd like to understand our current challenges, review the effectiveness of existing solutions, and brainstorm new strategies or technologies that could bolster our defenses against evolving spam tactics.

Please let me know your availability for a brief meeting next week to discuss this further. Alternatively, if you have any immediate thoughts or suggestions, please feel free to reply to this email.

Your insights are highly valued as we work towards a more robust and efficient spam-free experience.

Best regards,

[Your Name]'''

In [36]:
# "\n" (not "/n") so the prediction starts on its own line below the email text.
print(f"Email: {email}\nprediction: {predict_email(email)}")

Email: I hope this email finds you well.

I'm writing to initiate a discussion regarding our current spam detection mechanisms and explore potential avenues for improvement. As we all know, effective spam detection is crucial for maintaining a secure and productive digital environment, protecting us from phishing attempts, malware, and general inbox clutter.

I'd like to understand our current challenges, review the effectiveness of existing solutions, and brainstorm new strategies or technologies that could bolster our defenses against evolving spam tactics.

Please let me know your availability for a brief meeting next week to discuss this further. Alternatively, if you have any immediate thoughts or suggestions, please feel free to reply to this email.

Your insights are highly valued as we work towards a more robust and efficient spam-free experience.

Best regards,

[Your Name]"/n prediction: Spam
