In [1]:
import pandas as pd

In [2]:
import re

In [3]:
import nltk

In [4]:
from nltk.corpus import stopwords

In [5]:
from nltk.stem import PorterStemmer

In [6]:
import nltk
print(nltk.data.path)

['C:\\Users\\shradha/nltk_data', 'C:\\ProgramData\\anaconda3\\nltk_data', 'C:\\ProgramData\\anaconda3\\share\\nltk_data', 'C:\\ProgramData\\anaconda3\\lib\\nltk_data', 'C:\\Users\\shradha\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']


In [7]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shradha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
stemmer = PorterStemmer()

In [9]:
stop_words = set(stopwords.words("english"))

In [10]:
df = pd.read_csv(r"C:\Users\shradha\OneDrive\Desktop\Project1(codeTechie)\spam_ham_dataset.csv", encoding='latin-1')

In [11]:
df.head()

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [12]:
df.columns = ['label', 'message']

In [13]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1}) # converted HAM = 0 and SPAM = 1

In [14]:
df.head()

Unnamed: 0,label,message
0,0,Subject: enron methanol ; meter # : 988291\r\n...
1,0,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,0,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,1,"Subject: photoshop , windows , office . cheap ..."
4,0,Subject: re : indian springs\r\nthis deal is t...


In [15]:
def preprocess_text(text):
    text = re.sub(r"\W", " ",text)  #remove special symbols
    text =  text.lower() #converting all the text into lowercase
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    #remove stopwords and stem words
    return " ".join(words)

In [16]:
df["cleaned_message"] = df["message"].apply(preprocess_text)

In [17]:
df.head()

Unnamed: 0,label,message,cleaned_message
0,0,Subject: enron methanol ; meter # : 988291\r\n...,subject enron methanol meter 988291 follow not...
1,0,"Subject: hpl nom for january 9 , 2001\r\n( see...",subject hpl nom januari 9 2001 see attach file...
2,0,"Subject: neon retreat\r\nho ho ho , we ' re ar...",subject neon retreat ho ho ho around wonder ti...
3,1,"Subject: photoshop , windows , office . cheap ...",subject photoshop window offic cheap main tren...
4,0,Subject: re : indian springs\r\nthis deal is t...,subject indian spring deal book teco pvr reven...


In [18]:
#Importing Data Science - ML Libraries using SKlEARN

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
from sklearn.model_selection import train_test_split #distributing data into train and test for predict

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
from sklearn.metrics import accuracy_score, classification_report #check the accuracy of ML model

In [23]:
df.head()

Unnamed: 0,label,message,cleaned_message
0,0,Subject: enron methanol ; meter # : 988291\r\n...,subject enron methanol meter 988291 follow not...
1,0,"Subject: hpl nom for january 9 , 2001\r\n( see...",subject hpl nom januari 9 2001 see attach file...
2,0,"Subject: neon retreat\r\nho ho ho , we ' re ar...",subject neon retreat ho ho ho around wonder ti...
3,1,"Subject: photoshop , windows , office . cheap ...",subject photoshop window offic cheap main tren...
4,0,Subject: re : indian springs\r\nthis deal is t...,subject indian spring deal book teco pvr reven...


In [24]:
vectorizer = TfidfVectorizer(max_features=3000)  
X = vectorizer.fit_transform(df["cleaned_message"])  # Input data

In [25]:
y = df["label"] #output data

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

In [27]:
model = LogisticRegression()

In [28]:
model.fit(X_train, y_train) # using train data we can predict the test data

In [29]:
y_pred = model.predict(X_test)
print("Prediction successful ✅")

Prediction successful ✅


In [30]:
print(f"accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")

accuracy: 96.13%


In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2952
           1       0.96      0.91      0.93      1185

    accuracy                           0.96      4137
   macro avg       0.96      0.95      0.95      4137
weighted avg       0.96      0.96      0.96      4137



In [32]:
def predict_email(email_text):
    processed_text = preprocess_text(email_text)  # use the global preprocess_text function
    vectorized_text = vectorizer.transform([processed_text])
    prediction = model.predict(vectorized_text)[0]
    return "Spam" if prediction == 1 else "Ham - Not Spam"

In [33]:
email = "Hello Shradha,Congrats on finishing the 5-day Generative AI Intensive course! As we wrap things up, this is our final round up of reminders and announcements.📋 Reminders and Announcements🎯 Capstone Project: Level up your skills and enhance your portfolio with a real-world capstone project. You’ll create a notebook showcasing a use case that leverages some of the key generative AI capabilities learned throughout this course. For bonus points, you’ll have the option to create a blog post and/or YouTube video around your use case. The top 10 winners will have their work featured on Kaggle and Google Cloud’s social media. Additional details about the capstone project, evaluation, and submission process will be shared in another email today. Participation in the capstone project is optional.⭐ Kaggle badge and certificate: You are eligible to earn a badge and certificate on your Kaggle profile for participating in the capstone project. The badges and certificates will be added to your profile by the end of April 2025.👀 Look out for the Kaggle Learn Guide: All of the course content will be aggregated and made publicly available next week as a handy Kaggle Learn Guide. We’ll share it with you as soon as it's ready. In the meantime, all of the livestream recordings can be found here.🎟 Google Cloud NEXT: If you happen to be at Google Cloud NEXT this year, don’t forget to drop by at the “Become a GenAI Expert” session led by Paige Bailey and Anant Nawalgaria. We will also be giving out limited copies of the abridged whitepapers at the Coding Challenge hub. We hope to see you there!📢 Share your feedback: Please fill out this quick feedback form to let us know more about your experience. It’ll help us improve the program for future participants. Your feedback is completely anonymous and should only take a few minutes.And with that, thank you very much for joining us this week. We enjoyed connecting with you all, and wish you nothing but the best as you continue to grow your skills in this field.Cheers,The Kaggle Team"


In [34]:
print(f"Email: {email}\n Prediction : {predict_email(email)}")

Email: Hello Shradha,Congrats on finishing the 5-day Generative AI Intensive course! As we wrap things up, this is our final round up of reminders and announcements.📋 Reminders and Announcements🎯 Capstone Project: Level up your skills and enhance your portfolio with a real-world capstone project. You’ll create a notebook showcasing a use case that leverages some of the key generative AI capabilities learned throughout this course. For bonus points, you’ll have the option to create a blog post and/or YouTube video around your use case. The top 10 winners will have their work featured on Kaggle and Google Cloud’s social media. Additional details about the capstone project, evaluation, and submission process will be shared in another email today. Participation in the capstone project is optional.⭐ Kaggle badge and certificate: You are eligible to earn a badge and certificate on your Kaggle profile for participating in the capstone project. The badges and certificates will be added to your