In [None]:
import nltk
# Fetch the NLTK resources the preprocessing cell relies on.
# word_tokenize loads 'punkt_tab' on NLTK >= 3.9 (it replaced the pickled
# 'punkt' models); 'punkt' is still fetched so older installs keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Labelled e-mail corpus; the cells below read its 'text' and 'spam' columns.
data = pd.read_csv("emails.csv")

# Shared text-normalisation resources used by preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmatised tokens.

    Steps: lowercase, replace every non-letter character with a space,
    tokenise with ``word_tokenize``, drop tokens found in ``stop_words``,
    and lemmatise the rest with ``lemmatizer`` (called without a POS tag).

    Args:
        text: The raw text. Missing values (``None``/NaN/``pd.NA``) yield an
            empty string; any other non-string value is converted with
            ``str()`` first.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing remains).
    """
    if not isinstance(text, str):
        # A blank CSV cell arrives as float NaN, which has no .lower();
        # treat it as an empty document instead of crashing .apply().
        if pd.isna(text):
            return ''
        text = str(text)

    # Lowercase first so the stop-word lookup below is case-insensitive.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    tokens = word_tokenize(letters_only)
    filtered_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
    return ' '.join(filtered_tokens)

# Clean every e-mail body once; the cleaned column feeds the vectorizer.
data['processed_text'] = data['text'].apply(preprocess_text)
y = data['spam']

# Split the cleaned text *before* vectorizing so the vocabulary is learned
# from the training rows only; fitting on the whole corpus would leak
# information from the held-out rows into the features.
# test_size and random_state are unchanged from the previous split.
text_train, text_test, y_train, y_test = train_test_split(
    data['processed_text'], y, test_size=0.2, random_state=42
)

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus count matrix, kept so anything that still refers to X works.
X = vectorizer.transform(data['processed_text'])

model = MultinomialNB()
model.fit(X_train, y_train)

# Per-class precision/recall/F1 on the held-out 20%.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       1.00      0.99      0.99       856
           1       0.98      0.99      0.98       290

    accuracy                           0.99      1146
   macro avg       0.99      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146



In [None]:
# Sanity check: a message written to look like spam.
random_text = "Get rich quick! Click here to win a million dollars!!!"

# Apply the same cleaning and the already-fitted vectorizer used for training,
# so the feature columns match what the model was trained on.
preprocessed_text = preprocess_text(random_text)
text_vector = vectorizer.transform([preprocessed_text])
prediction = model.predict(text_vector)

# The label is compared against 1, which this notebook treats as "spam".
print(
    "The text is classified as spam."
    if prediction[0] == 1
    else "The text is not classified as spam."
)

The text is classified as spam.


In [None]:
# Sanity check: an ordinary, non-promotional message.
nrml_text = "Hello, i am balaji"

# Same path as training: clean the text, then vectorize with the fitted
# vectorizer before asking the model for a label.
preprocessed_text = preprocess_text(nrml_text)
text_vector = vectorizer.transform([preprocessed_text])
prediction = model.predict(text_vector)

# The label is compared against 1, which this notebook treats as "spam".
print(
    "The text is classified as spam."
    if prediction[0] == 1
    else "The text is not classified as spam."
)

The text is not classified as spam.
