# Text classification

Using the dataset `dataset_emails.csv` (or other dataset of your choice) create three text classificators:
* Using rule-based approach (regex)
* Using naive-bayes
* Using Spacy 3 

Finally, compare the results and show what is better and why. 

In [4]:
# Cell 1: Rule-based classifier (regex approach)
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from collections import Counter
# Cell 2: Naive Bayes classifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# Cell 3: spaCy classifier
import spacy
from spacy.training.example import Example



#### Rule-based (regex)


In [13]:
import re

# Basic text preprocessing
def preprocess_text(text):
    """Convert to lowercase and remove extra whitespace"""
    return re.sub(r'\s+', ' ', str(text).lower()).strip()

# Rule-based prediction using regex
def rule_based_predict(texts):
    # Define regex patterns for each category
    patterns = {
        'send': [
            r'\bsend\b', r'\bcompose\b', r'\bwrite\b', r'\bnew message\b'
        ],
        'list': [
            r'\bmailing list\b', r'\bsubscribers\b', r'\bgroup email\b'
        ],
        'trash': [
            r'\bdelete\b', r'\bremove\b', r'\bspam\b', r'\bjunk\b'
        ],
        'read': [
            r'\bopen\b', r'\bview\b', r'\bread\b', r'\bseen\b'
        ],
        'reply': [
            r'\breply\b', r'\brespond\b', r'\banswer\b'
        ],
        'untrash': [
            r'\brestore\b', r'\brecover\b', r'\bmove from trash\b'
        ],
        'forward': [
            r'\bforward\b', r'\bpass along\b', r'\bsend again\b'
        ],
        'star': [
            r'\bmark\b', r'\bimportant\b', r'\bstar\b', r'\bflag\b'
        ],
        'trash_list': [
            r'\bdelete from list\b', r'\bunsubscribe\b'
        ]
    }

    predictions = []
    for text in texts:
        text = preprocess_text(text)
        scores = {category: 0 for category in patterns}

        for category, pattern_list in patterns.items():
            for pattern in pattern_list:
                matches = re.findall(pattern, text)
                scores[category] += len(matches)

        # Assign the category with the highest score, default to 'unknown' if no matches
        best_category = max(scores, key=scores.get)
        predictions.append(best_category if scores[best_category] > 0 else 'unknown')

    return predictions


In [14]:
# EXECUTION CODE
print("Running Rule-based Classifier")

# Load dataset
df = pd.read_csv('dataset_emails.csv')
print(f"Dataset loaded: {len(df)} rows")
print("Dataset columns:", df.columns.tolist())
print("Class distribution:")
print(df['label'].value_counts())

# Preprocess prompts
df['processed_prompt'] = df['prompt'].apply(preprocess_text)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_prompt'], df['label'], test_size=0.3, random_state=42
)
print(f"Test set size: {len(X_test)} samples")

# Make predictions
predictions = rule_based_predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, predictions)
print("\nRule-Based Classifier Results:")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, predictions))

print("Rule-based classifier execution complete")

Running Rule-based Classifier
Dataset loaded: 1000 rows
Dataset columns: ['prompt', 'label']
Class distribution:
label
send          100
list          100
trash         100
read          100
reply         100
untrash       100
forward       100
star          100
trash_list    100
unknown       100
Name: count, dtype: int64
Test set size: 300 samples

Rule-Based Classifier Results:
Accuracy: 0.3800
Classification Report:
              precision    recall  f1-score   support

     forward       1.00      0.40      0.57        30
        list       0.00      0.00      0.00        20
        read       0.45      0.29      0.36        34
       reply       1.00      0.71      0.83        24
        send       0.50      0.37      0.43        35
        star       0.87      0.57      0.68        23
       trash       0.94      0.46      0.62        37
  trash_list       0.00      0.00      0.00        29
     unknown       0.16      0.86      0.27        35
     untrash       0.67      0.06  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Naive-Bayes


In [None]:
# EXECUTION CODE
print("Running Naive Bayes Classifier")

# Load dataset
df = pd.read_csv('dataset_emails.csv')
print(f"Dataset loaded: {len(df)} rows")
print("Dataset columns:", df.columns.tolist())
print("Class distribution:")
print(df['label'].value_counts())

# Preprocess prompts
df['processed_prompt'] = df['prompt'].apply(preprocess_text)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_prompt'], df['label'], test_size=0.3, random_state=42
)
print(f"Training set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")

# Vectorize the text
vectorizer = CountVectorizer(max_features=1000)
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

# Train the classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_transformed, y_train)



Running Naive Bayes Classifier
Dataset loaded: 1000 rows
Dataset columns: ['prompt', 'label']
Class distribution:
label
send          100
list          100
trash         100
read          100
reply         100
untrash       100
forward       100
star          100
trash_list    100
unknown       100
Name: count, dtype: int64
Training set size: 700 samples
Test set size: 300 samples


In [8]:
# Make predictions
predictions = nb_classifier.predict(X_test_transformed)

# Evaluate
accuracy = accuracy_score(y_test, predictions)
print("\nNaive Bayes Classifier Results:")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, predictions))

print("Naive Bayes classifier execution complete")


Naive Bayes Classifier Results:
Accuracy: 0.7467
Classification Report:
              precision    recall  f1-score   support

     forward       0.76      0.93      0.84        30
        list       0.80      0.80      0.80        20
        read       0.71      0.71      0.71        34
       reply       0.65      1.00      0.79        24
        send       0.92      0.66      0.77        35
        star       0.64      1.00      0.78        23
       trash       0.79      0.51      0.62        37
  trash_list       0.79      0.93      0.86        29
     unknown       0.81      0.37      0.51        35
     untrash       0.73      0.82      0.77        33

    accuracy                           0.75       300
   macro avg       0.76      0.77      0.74       300
weighted avg       0.77      0.75      0.73       300

Naive Bayes classifier execution complete


#### Spacy 3

In [10]:
# EXECUTION CODE
print("Running spaCy Classifier")

# Load dataset
df = pd.read_csv('dataset_emails.csv')
print(f"Dataset loaded: {len(df)} rows")
print("Dataset columns:", df.columns.tolist())
print("Class distribution:")
print(df['label'].value_counts())

# Preprocess prompts
df['processed_prompt'] = df['prompt'].apply(preprocess_text)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_prompt'], df['label'], test_size=0.3, random_state=42
)
print(f"Training set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")

# Initialize spaCy model
nlp = spacy.blank("en")

# Add text categorizer
textcat = nlp.add_pipe("textcat")

# Add labels
labels = list(set(y_train))
for label in labels:
    textcat.add_label(label)

# Prepare training data
train_data = []
for text, label in zip(X_train, y_train):
    cats = {l: (1.0 if l == label else 0.0) for l in labels}
    train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cats}))


Running spaCy Classifier
Dataset loaded: 1000 rows
Dataset columns: ['prompt', 'label']
Class distribution:
label
send          100
list          100
trash         100
read          100
reply         100
untrash       100
forward       100
star          100
trash_list    100
unknown       100
Name: count, dtype: int64
Training set size: 700 samples
Test set size: 300 samples


In [11]:
# Train the model
print("Training spaCy model (this may take a few minutes)...")
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for i in range(5):  # Reduced iterations for simplicity
        losses = {}
        for example in train_data:
            nlp.update([example], sgd=optimizer, losses=losses)
        print(f"Iteration {i+1}/5 completed. Loss: {losses}")

Training spaCy model (this may take a few minutes)...
Iteration 1/5 completed. Loss: {'textcat': 42.441631624418015}
Iteration 2/5 completed. Loss: {'textcat': 18.22192149075362}
Iteration 3/5 completed. Loss: {'textcat': 7.963955891138791}
Iteration 4/5 completed. Loss: {'textcat': 4.269872050853483}
Iteration 5/5 completed. Loss: {'textcat': 2.2127169262793087}


In [12]:
# Make predictions
print("Making predictions...")
predictions = []
for text in X_test:
    doc = nlp(text)
    scores = doc.cats
    predictions.append(max(scores, key=scores.get))

# Evaluate
accuracy = accuracy_score(y_test, predictions)
print("\nspaCy Classifier Results:")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, predictions))

print("spaCy classifier execution complete")

Making predictions...

spaCy Classifier Results:
Accuracy: 0.7967
Classification Report:
              precision    recall  f1-score   support

     forward       0.90      0.87      0.88        30
        list       0.65      0.85      0.74        20
        read       0.78      0.74      0.76        34
       reply       0.96      0.92      0.94        24
        send       0.94      0.83      0.88        35
        star       0.79      1.00      0.88        23
       trash       0.83      0.51      0.63        37
  trash_list       0.80      0.97      0.88        29
     unknown       0.91      0.60      0.72        35
     untrash       0.59      0.88      0.71        33

    accuracy                           0.80       300
   macro avg       0.81      0.82      0.80       300
weighted avg       0.82      0.80      0.79       300

spaCy classifier execution complete
