In [2]:
import pandas as pd
import re

def clean_text(text):
    """Normalize a reply string for downstream modelling.

    Lowercases the text, deletes every character that is neither a word
    character nor whitespace, then collapses each whitespace run into a
    single space and trims both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. It is empty when the input consisted only of
        punctuation and/or whitespace.
    """
    text = text.lower()
    # Strip punctuation *before* collapsing whitespace: deleting a
    # free-standing token such as the comma in "hello , world" would
    # otherwise leave two adjacent spaces in the result.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def preprocess_data(input_file, output_file):
    """Clean the raw reply dataset and write the result to a new CSV.

    Rows with any missing value are dropped, the ``reply`` column is passed
    through ``clean_text``, and the ``label`` column is stripped of
    surrounding whitespace and lowercased. Rows whose reply or label ends up
    empty are removed before saving.

    Args:
        input_file: Path of the CSV to read; must contain ``reply`` and
            ``label`` columns.
        output_file: Path the cleaned CSV is written to (without the index).
    """
    raw_df = pd.read_csv(input_file)

    # Build a new frame instead of mutating the loaded one in place.
    df = raw_df.dropna().copy()

    df['reply'] = df['reply'].apply(clean_text)

    # Strip as well as lowercase so "Positive " and "positive" are one class.
    df['label'] = df['label'].str.strip().str.lower()

    # A reply made only of punctuation cleans down to ''. Written to CSV that
    # is an empty field, which read_csv loads back as NaN -- reintroducing the
    # missing values this step is meant to remove. Drop such rows here.
    df = df[(df['reply'] != '') & (df['label'] != '')]

    df.to_csv(output_file, index=False)

if __name__ == "__main__":
    # Run the cleaning step: raw e-mail CSV in, cleaned CSV out.
    preprocess_data(
        input_file='/content/Data/emails.csv',
        output_file='/content/Data/cleaned_emails.csv',
    )

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import joblib

def train_baseline_model(input_file, model_output_path, vectorizer_output_path):
    """Train and persist a TF-IDF + logistic-regression baseline classifier.

    The cleaned dataset is split 80/20 with ``random_state=42``, a
    ``TfidfVectorizer`` (at most 5000 features) is fitted on the training
    replies only, and a ``LogisticRegression`` model is trained on the
    result. Accuracy and weighted F1 on the held-out split are printed, then
    the model and the vectorizer are saved with ``joblib``.

    Args:
        input_file: Path of the cleaned CSV with ``reply`` and ``label``
            columns.
        model_output_path: Destination for the fitted model.
        vectorizer_output_path: Destination for the fitted vectorizer.
    """
    import os  # local import: only needed to create the output directories

    df = pd.read_csv(input_file)

    # read_csv loads empty fields as NaN, and TfidfVectorizer rejects NaN
    # documents. Filling (rather than dropping) keeps the row count -- and
    # therefore the seeded split below -- unchanged.
    X = df['reply'].fillna('').astype(str)
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the vocabulary on the training split only so that no information
    # from the test replies leaks into the features.
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_tfidf, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"Baseline Model Accuracy: {accuracy}")
    print(f"Baseline Model F1 Score: {f1}")

    # joblib.dump does not create missing parent directories, so make sure
    # they exist before saving.
    for artifact_path in (model_output_path, vectorizer_output_path):
        if isinstance(artifact_path, (str, os.PathLike)):
            parent_dir = os.path.dirname(artifact_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(model, model_output_path)
    joblib.dump(tfidf_vectorizer, vectorizer_output_path)

if __name__ == "__main__":
    # Train the baseline on the cleaned data and persist both artifacts.
    train_baseline_model(
        input_file='/content/Data/cleaned_emails.csv',
        model_output_path='/content/models/baseline_model.joblib',
        vectorizer_output_path='/content/models/tfidf_vectorizer.joblib',
    )

Baseline Model Accuracy: 0.9953051643192489
Baseline Model F1 Score: 0.9952978860372445


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer, IntervalStrategy
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: A two-item iterable unpacked as ``(predictions, labels)``. The
            predictions are reduced to class ids with an arg-max over axis 1.

    Returns:
        A dict with ``"accuracy"`` and the weighted ``"f1"`` score.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(gold, predicted_ids),
        "f1": f1_score(gold, predicted_ids, average='weighted'),
    }

def train_transformer_model(input_file, model_output_dir):
    """Fine-tune ``distilbert-base-uncased`` as a reply classifier.

    Labels are mapped to integer ids in order of first appearance, the data
    is split 80/20 with ``random_state=42``, replies are tokenized
    (truncated to 128 tokens) and the model is trained for 3 epochs with the
    Hugging Face ``Trainer``. The fine-tuned model, the tokenizer and a
    one-row ``label_mapping.csv`` (columns = integer ids, values = label
    names) are written to ``model_output_dir``.

    Args:
        input_file: Path of the cleaned CSV with ``reply`` and ``label``
            columns.
        model_output_dir: Directory that receives the training output, the
            saved model/tokenizer and the label mapping.
    """
    df = pd.read_csv(input_file)

    # read_csv loads empty fields as NaN, which the tokenizer cannot encode.
    # Filling (rather than dropping) keeps the row count -- and therefore the
    # seeded split below -- unchanged.
    df['reply'] = df['reply'].fillna('').astype(str)

    # Map labels to integers
    unique_labels = df['label'].unique()
    label_to_int = {label: i for i, label in enumerate(unique_labels)}
    int_to_label = {i: label for i, label in enumerate(unique_labels)}
    df['label_int'] = df['label'].map(label_to_int)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        df['reply'], df['label_int'], test_size=0.2, random_state=42
    )

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    def tokenize_function(examples):
        """Tokenize a list of strings, truncating to 128 tokens and padding."""
        return tokenizer(examples, truncation=True, padding=True, max_length=128)

    train_encodings = tokenize_function(X_train.tolist())
    test_encodings = tokenize_function(X_test.tolist())

    # Wrap the encodings in Hugging Face Datasets for the Trainer.
    train_dataset = Dataset.from_dict({
        'input_ids': train_encodings['input_ids'],
        'attention_mask': train_encodings['attention_mask'],
        'labels': y_train.tolist()
    })
    test_dataset = Dataset.from_dict({
        'input_ids': test_encodings['input_ids'],
        'attention_mask': test_encodings['attention_mask'],
        'labels': y_test.tolist()
    })

    # Store the label names in the model config too, so the saved checkpoint
    # is self-describing. str() keeps the mapping JSON-serializable.
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=len(unique_labels),
        id2label={i: str(label) for i, label in int_to_label.items()},
        label2id={str(label): i for label, i in label_to_int.items()}
    )

    # NOTE: logging, evaluation and save strategies are left at the library
    # defaults. The per-epoch evaluation/save settings and
    # load_best_model_at_end were removed earlier because they raised a
    # TypeError with the installed transformers version.
    training_args = TrainingArguments(
        output_dir=model_output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        report_to="none"  # Disable reporting to services like W&B
    )

    # NOTE(review): with no evaluation strategy configured, eval_dataset and
    # compute_metrics are presumably only used if trainer.evaluate() is
    # called -- confirm against the installed transformers version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Save the fine-tuned model and tokenizer side by side.
    trainer.save_model(model_output_dir)
    tokenizer.save_pretrained(model_output_dir)

    # Save label mappings (one row; column names are the integer ids).
    pd.DataFrame([int_to_label]).to_csv(f"{model_output_dir}/label_mapping.csv", index=False)

if __name__ == "__main__":
    # Fine-tune DistilBERT on the cleaned data and store it under models/.
    train_transformer_model(
        input_file='/content/Data/cleaned_emails.csv',
        model_output_dir='/content/models/distilbert_finetuned',
    )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.2261


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import joblib
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch
import numpy as np
import json

def evaluate_baseline_model(input_file, model_path, vectorizer_path):
    """Score the saved baseline model on the held-out 20% split.

    The split is recreated with ``test_size=0.2`` and ``random_state=42``,
    the persisted model and vectorizer are loaded with ``joblib``, and the
    test replies are vectorized and classified.

    Args:
        input_file: Path of the cleaned CSV with ``reply`` and ``label``
            columns.
        model_path: Path of the saved model.
        vectorizer_path: Path of the saved TF-IDF vectorizer.

    Returns:
        A tuple ``(accuracy, weighted_f1)``.
    """
    df = pd.read_csv(input_file)

    # read_csv loads empty fields as NaN, which the vectorizer rejects.
    # Filling (rather than dropping) keeps the row count -- and therefore the
    # seeded split below -- unchanged.
    X = df['reply'].fillna('').astype(str)
    y = df['label']
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # NOTE: joblib.load unpickles the files; only load trusted artifacts.
    model = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)

    X_test_tfidf = vectorizer.transform(X_test)
    y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    return accuracy, f1

def evaluate_transformer_model(input_file, model_dir):
    """Score the fine-tuned DistilBERT model on the held-out 20% split.

    The label mapping is read from ``label_mapping.csv`` in ``model_dir``,
    the split is recreated with ``test_size=0.2`` and ``random_state=42``,
    and every test reply is classified one at a time with gradients
    disabled.

    Args:
        input_file: Path of the cleaned CSV with ``reply`` and ``label``
            columns.
        model_dir: Directory holding the saved model, tokenizer and
            ``label_mapping.csv``.

    Returns:
        A tuple ``(accuracy, weighted_f1)``.

    Raises:
        ValueError: If the dataset contains a label that is missing from the
            saved label mapping.
    """
    df = pd.read_csv(input_file)

    # read_csv loads empty fields as NaN, which the tokenizer cannot encode.
    # Filling (rather than dropping) keeps the row count -- and therefore the
    # seeded split below -- unchanged.
    df['reply'] = df['reply'].fillna('').astype(str)

    # Load label mapping: one row whose column names are the integer ids.
    label_mapping_df = pd.read_csv(f"{model_dir}/label_mapping.csv")
    # CSV column names come back as strings; convert them to integers.
    int_to_label = {int(k): v for k, v in label_mapping_df.iloc[0].to_dict().items()}
    label_to_int = {v: k for k, v in int_to_label.items()}

    df['label_int'] = df['label'].map(label_to_int)

    # A label absent from the mapping becomes NaN here; fail fast with a
    # clear message instead of scoring against NaN targets.
    unmapped = df.loc[df['label_int'].isna(), 'label'].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Labels not present in {model_dir}/label_mapping.csv: {unmapped}"
        )

    X = df['reply']
    y = df['label_int']
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    tokenizer = DistilBertTokenizerFast.from_pretrained(model_dir)
    model = DistilBertForSequenceClassification.from_pretrained(model_dir)
    model.eval()

    predictions = []
    with torch.no_grad():
        for text in X_test:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
            logits = model(**inputs).logits
            predictions.append(torch.argmax(logits, dim=1).item())

    true_labels = y_test.tolist()

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')
    return accuracy, f1

if __name__ == "__main__":
    import os  # only needed here, to create the results directory

    cleaned_data_path = '/content/Data/cleaned_emails.csv'
    baseline_model_path = '/content/models/baseline_model.joblib'
    tfidf_vectorizer_path = '/content/models/tfidf_vectorizer.joblib'
    transformer_model_dir = '/content/models/distilbert_finetuned'
    results_path = '/content/Results/model_comparison.json'

    print("Evaluating Baseline Model...")
    baseline_accuracy, baseline_f1 = evaluate_baseline_model(
        cleaned_data_path, baseline_model_path, tfidf_vectorizer_path
    )
    print(f"Baseline Model - Accuracy: {baseline_accuracy:.4f}, F1 Score: {baseline_f1:.4f}")

    print("\nEvaluating Transformer Model...")
    transformer_accuracy, transformer_f1 = evaluate_transformer_model(
        cleaned_data_path, transformer_model_dir
    )
    print(f"Transformer Model - Accuracy: {transformer_accuracy:.4f}, F1 Score: {transformer_f1:.4f}")

    # Compare on weighted F1. A tie falls through to the baseline branch.
    if transformer_f1 > baseline_f1:
        print("\nTransformer model performs better and is recommended for production.")
        best_model = "Transformer"
    else:
        print("\nBaseline model performs better and is recommended for production.")
        best_model = "Baseline"

    # Collect results; float() keeps the values plain JSON numbers.
    results = {
        "baseline_model": {
            "accuracy": float(baseline_accuracy),
            "f1_score": float(baseline_f1)
        },
        "transformer_model": {
            "accuracy": float(transformer_accuracy),
            "f1_score": float(transformer_f1)
        },
        "best_model_for_production": best_model
    }

    # open() does not create missing directories, so create the results
    # directory first.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=4)
    # Report the path actually written instead of a hard-coded, different one.
    print(f"\nModel comparison results saved to {results_path}")

Evaluating Baseline Model...
Baseline Model - Accuracy: 0.9953, F1 Score: 0.9953

Evaluating Transformer Model...
Transformer Model - Accuracy: 0.9977, F1 Score: 0.9977

Transformer model performs better and is recommended for production.

Model comparison results saved to results/model_comparison.json
