In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora, models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import joblib

# Load data
df = pd.read_csv("/kaggle/input/enron-email-dataset/emails.csv", nrows=100)
emails = pd.DataFrame({'email_body': []})

def extract_email_body(df):
    email_bodies = []

    for email in df['message']:
        if "FileName:" in email:
            start = email.index("FileName:") + len("FileName:")
            end = email[start:].index("\n") + start
            body = email[end+1:].replace("\n", "")
            email_bodies.append(body)
        else:
            email_bodies.append(email)
    
    email_df = pd.DataFrame({'email_body': email_bodies})
    
    return email_df

emails = extract_email_body(df)

# Preprocess email text
def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove numbers
    text = re.sub('\d', '', text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())
    return text

emails['text'] = emails['email_body'].apply(preprocess)

# Create bigrams from email text
text_list = [doc.split() for doc in emails['text']]
bigram_phrases = Phrases(text_list, min_count=5, threshold=100)
bigram = Phraser(bigram_phrases)
bigram_text_list = [bigram[doc] for doc in text_list]
emails['bigram_text'] = [' '.join(doc) for doc in bigram_text_list]

# Create dictionary and bag-of-words representation of bigram text
dictionary = corpora.Dictionary([doc.split() for doc in emails['bigram_text']])
corpus = [dictionary.doc2bow(doc.split()) for doc in emails['bigram_text']]

optimal_num_topics = 8

# Fit the LDA model with the optimal number of topics
lda_model = models.LdaModel(corpus=corpus,
                            id2word=dictionary,
                            num_topics=optimal_num_topics,
                            random_state=42,
                            passes=10)

# Add topic labels to email data
emails['topic_label'] = [max(lda_model[corpus[i]], key=lambda x:x[1])[0] for i in range(len(corpus))]

# Create descriptive labels for each topic
topic_labels = {}
for i, topic_dist in enumerate(lda_model[corpus]):
    topic_words = [word for word, prob in lda_model.show_topic(max(topic_dist, key=lambda x: x[1])[0])]
    topic_labels[i] = '_'.join(topic_words)

emails['topic_label'] = [topic_labels[max(lda_model[corpus[i]], key=lambda x:x[1])[0]] for i in range(len(corpus))]

# Encode target labels
le = LabelEncoder()
emails['target'] = le.fit_transform(emails['topic_label'])

# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(emails['email_body'])
y_train = emails['target']

# Train SVM model on full dataset
svm = SVC(kernel="linear")
svm.fit(X_train, y_train)

# Save trained model
joblib.dump(svm, "email_topic_classifier.joblib")

# Save the vectorizer and label encoder
joblib.dump(vectorizer, "vectorizer.joblib")
joblib.dump(le, "label_encoder.joblib")

['label_encoder.joblib']

In [2]:
!pip install pyspellchecker
!pip install spacy

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting pyspellchecker
  Downloading pyspellchecker-0.7.1-py3-none-any.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.1
[0m/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m

In [3]:
import pandas as pd
import re
import string
import nltk
import torch
import numpy as np
import os
import joblib
from torch.utils.data import TensorDataset
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora, models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from scipy.special import softmax
from torch.utils.data import DataLoader
from typing import Dict, List, Union
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
os.environ["WANDB_DISABLED"] = "true"

# Add this custom data collator
class CustomDataCollator(DataCollatorWithPadding):
    def __call__(self, features: List[Union[Dict[str, torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Convert input features to dictionaries
        features = [dict(zip(["input_ids", "attention_mask", "labels"], feature)) for feature in features]
        labels = [feature.pop('labels') for feature in features]
        batch = super().__call__(features)
        batch['labels'] = torch.stack(labels)
        return batch

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

def load_data(filepath, nrows):
    """
    Load email data from a CSV file.
    """
    df = pd.read_csv(filepath, nrows=nrows)
    return df

def extract_email_body(df):
    """
    Extract email body from email messages.
    """
    email_bodies = []

    for email in df['message']:
        if "FileName:" in email:
            start = email.index("FileName:") + len("FileName:")
            end = email[start:].index("\n") + start
            body = email[end+1:].replace("\n", "")
            email_bodies.append(body)
        else:
            email_bodies.append(email)

    email_df = pd.DataFrame({'email_body': email_bodies})

    return email_df

def preprocess(text):
    """
    Preprocess the text by converting to lowercase, removing punctuation, numbers, and stopwords, and lemmatizing words.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove numbers
    text = re.sub('\d', '', text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())
    return text

def create_bigrams(text_list):
    """
    Create bigrams from the list of tokenized documents.
    """
    bigram_phrases = Phrases(text_list, min_count=5, threshold=100)
    bigram = Phraser(bigram_phrases)
    bigram_text_list = [bigram[doc] for doc in text_list]
    return bigram_text_list

def fit_lda(corpus, dictionary, num_topics, passes=10, random_state=42):
    """
    Fit an LDA model to the corpus using the given number of topics.
    """
    lda_model = models.LdaModel(corpus=corpus,
                                id2word=dictionary,
                                num_topics=num_topics,
                                random_state=random_state,
                                passes=passes)
    return lda_model

def assign_topic_labels(lda_model, corpus):
    """
    Assign topic labels based on the words with the highest probability in each topic.
    """
    topic_labels = {}
    for i, topic_dist in enumerate(lda_model[corpus]):
        topic_words = [word for word, prob in lda_model.show_topic(max(topic_dist, key=lambda x: x[1])[0])]
        topic_labels[i] = '_'.join(topic_words)
    return topic_labels

def encode_dataset(texts, labels, tokenizer):
    """
    Encode the dataset using the BERT tokenizer.
    """
    input_ids, attention_masks = [], []

    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels.to_numpy())  # Convert pandas Series to NumPy array

    dataset = TensorDataset(input_ids, attention_masks, labels)
    return dataset

def convert_labels_to_onehot(labels, num_classes):
    onehot_encoder = OneHotEncoder(sparse=False, categories=[range(num_classes)])
    labels = labels.reshape(-1, 1)
    onehot_labels = onehot_encoder.fit_transform(labels)
    return onehot_labels

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probabilities = softmax(logits, axis=-1)
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='weighted', zero_division=0)
    recall = recall_score(labels, predictions, average='weighted', zero_division=0)
    f1 = f1_score(labels, predictions, average='weighted', zero_division=0)
    cm = confusion_matrix(labels, predictions)

    num_classes = len(np.unique(labels))
    if num_classes == 1:
        auc = float('nan')
    else:
        onehot_labels = convert_labels_to_onehot(labels, num_classes)
        auc = roc_auc_score(onehot_labels, probabilities, average='weighted', multi_class='ovr')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm,
        'auc': auc,
    }

# Load data
df = load_data("/kaggle/input/enron-email-dataset/emails.csv", nrows=50)
emails = extract_email_body(df)

# Preprocess email text
emails['text'] = emails['email_body'].apply(preprocess)

# Create bigrams from email text
text_list = [doc.split() for doc in emails['text']]
bigram_text_list = create_bigrams(text_list)
emails['bigram_text'] = [' '.join(doc) for doc in bigram_text_list]

# Create dictionary and bag-of-words representation of bigram text
dictionary = corpora.Dictionary([doc.split() for doc in emails['bigram_text']])
corpus = [dictionary.doc2bow(doc.split()) for doc in emails['bigram_text']]

optimal_num_topics = 8

# Fit the LDA model with the optimal number of topics
lda_model = fit_lda(corpus, dictionary, num_topics=optimal_num_topics)

# Add topic labels to email data
emails['topic_label'] = [max(lda_model[corpus[i]], key=lambda x:x[1])[0] for i in range(len(corpus))]

# Create descriptive labels for each topic
topic_labels = assign_topic_labels(lda_model, corpus)
emails['topic_label'] = [topic_labels[max(lda_model[corpus[i]], key=lambda x:x[1])[0]] for i in range(len(corpus))]

# Encode target labels
le = LabelEncoder()
emails['target'] = le.fit_transform(emails['topic_label'])

# Split the dataset into training and testing data
raw_X_train, raw_X_test, y_train, y_test = train_test_split(emails['email_body'], emails['target'], test_size=0.25, random_state=42)

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(y_train)))

# Encode the dataset for BERT
train_dataset = encode_dataset(raw_X_train, y_train, tokenizer)
test_dataset = encode_dataset(raw_X_test, y_test, tokenizer)

# Define training arguments and trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
)

custom_collator = CustomDataCollator(tokenizer)
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=custom_collator,
    compute_metrics=compute_metrics, 
)

# Train the BERT model
trainer.train()

# Evaluate the model on the testing data
test_results = trainer.evaluate()

print("Test Results:")
for metric, value in test_results.items():
    print(f"{metric}: {value}")

# Save trained model, tokenizer, and label encoder
bert_model.save_pretrained("bert_topic_classifier")
tokenizer.save_pretrained("bert_topic_classifier")
joblib.dump(le, "label_encoder.joblib")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 13
  Batch size = 8


Trainer is attempting to log a value of "[[0 0 0 3 0]
 [0 0 0 2 0]
 [0 0 0 1 0]
 [0 0 0 6 0]
 [0 0 0 1 0]]" of type <class 'numpy.ndarray'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Configuration saved in bert_topic_classifier/config.json


Test Results:
eval_loss: 1.4735068082809448
eval_accuracy: 0.46153846153846156
eval_precision: 0.21301775147928995
eval_recall: 0.46153846153846156
eval_f1: 0.291497975708502
eval_confusion_matrix: [[0 0 0 3 0]
 [0 0 0 2 0]
 [0 0 0 1 0]
 [0 0 0 6 0]
 [0 0 0 1 0]]
eval_auc: 0.5632034632034632
eval_runtime: 11.1696
eval_samples_per_second: 1.164
eval_steps_per_second: 0.179
epoch: 2.0


Model weights saved in bert_topic_classifier/pytorch_model.bin
tokenizer config file saved in bert_topic_classifier/tokenizer_config.json
Special tokens file saved in bert_topic_classifier/special_tokens_map.json


['label_encoder.joblib']