In [15]:
import pandas as pd
import re
import nltk
import spacy
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from collections import defaultdict, Counter
import subprocess

In [16]:
# Download NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [17]:
# =============================
# STEP 1: Load & Clean Datasets
# =============================

# Load Enron and SMS datasets
enron_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/enron emails.csv", encoding='ISO-8859-1')
sms_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/mail_data.csv")

# Clean Enron
enron_df = enron_df[['Message', 'Category']].rename(columns={'Message': 'message', 'Category': 'label'})
# Clean SMS
sms_df = sms_df.rename(columns={'Message': 'message', 'Category': 'label'})

# Encode labels
combined_df = pd.concat([enron_df, sms_df], ignore_index=True)
combined_df['label'] = combined_df['label'].map({'ham': 0, 'spam': 1})
combined_df.dropna(subset=['message', 'label'], inplace=True)
combined_df.drop_duplicates(subset=['message'], inplace=True)
combined_df.reset_index(drop=True, inplace=True)

In [18]:
# =============================
# STEP 2: Exploratory Data Analysis
# =============================
print("Total samples:", len(combined_df))
print("Label distribution:\n", combined_df['label'].value_counts())

combined_df['label'].value_counts().plot(kind='bar')
plt.title("Spam vs Ham Distribution")
plt.xlabel("Label (0 = Ham, 1 = Spam)")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("/content/drive/MyDrive/Colab Notebooks/eda_label_distribution.png")
plt.close()

combined_df['text_length'] = combined_df['message'].apply(len)
print("Text length stats:\n", combined_df['text_length'].describe())

combined_df[combined_df['text_length'] < 5000]['text_length'].hist(bins=50)
plt.title("Message Length Distribution (<5000 chars)")
plt.xlabel("Length")
plt.ylabel("Frequency")
plt.tight_layout()
plt.savefig("/content/drive/MyDrive/Colab Notebooks/eda_text_length.png")
plt.close()

print("Top 20 frequent words:")
print(Counter(" ".join(combined_df['message'].str.lower()).split()).most_common(20))

Total samples: 35595
Label distribution:
 label
0.0    20413
1.0    15182
Name: count, dtype: int64
Text length stats:
 count    35595.000000
mean      1171.920438
std       1809.583574
min          2.000000
25%        225.000000
50%        577.000000
75%       1375.000000
max      32453.000000
Name: text_length, dtype: float64
Top 20 frequent words:
[('.', 412805), ('-', 365855), (',', 298987), ('the', 220446), ('to', 170955), ('/', 157280), (':', 153732), ('and', 124095), ('of', 112471), ('a', 90473), ('in', 79526), ('you', 73681), ('for', 70187), ('is', 59098), ('this', 55052), ('_', 54237), ('i', 53573), ("'", 50127), ('on', 48056), (')', 45804)]


In [None]:
# =============================
# STEP 3: Text Preprocessing
# =============================
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

def process_sentence(sentence):
    nouns = []
    base_words = []
    final_words = []
    words_2 = word_tokenize(sentence)
    sentence = re.sub(r'[^ \w\s]', '', sentence)
    sentence = re.sub(r'_', ' ', sentence)
    words = word_tokenize(sentence)
    pos_tagged_words = pos_tag(words)
    for token, tag in pos_tagged_words:
        base_words.append(lemmatizer.lemmatize(token, tag_map[tag[0]]))
    for word in base_words:
        if word not in stop_words:
            final_words.append(word)
    sent = " ".join(final_words)
    for token, tag in pos_tag(words_2):
        if tag == 'NN' and len(token) > 1:
            nouns.append(token)
    return sent, nouns

def clean(email):
    email = email.lower()
    sentences = sent_tokenize(email)
    total_nouns = []
    string = ""
    for sent in sentences:
        sentence, nouns = process_sentence(sent)
        string += " " + sentence
        total_nouns += nouns
    return string.strip(), total_nouns

def ents(text):
    doc = nlp(text)
    expls = dict()
    if doc.ents:
        for ent in doc.ents:
            label = ent.label_
            word = ent.text
            if label in expls:
                expls[label].append(word)
            else:
                expls[label] = [word]
        return expls
    return 'no'

# Apply preprocessing
combined_df['cleaned_text'], combined_df['nouns'] = zip(*combined_df['message'].apply(clean))
combined_df['entities'] = combined_df['message'].apply(ents)

# Save
combined_df.drop(columns=['text_length'], inplace=True)
combined_df.to_csv("/content/drive/MyDrive/Colab Notebooks/processed_emails_combined.csv", index=False)
print("\n✅ Done. Saved to 'processed_emails_combined.csv'")



✅ Done. Saved to 'processed_emails_combined.csv'
