In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# Replace 'your_file.csv' with the actual path to your CSV file
try:
    df = pd.read_csv('combined_emails_with_natural_pii.csv')
except FileNotFoundError:
    print("Error: CSV file not found. Please check the file path.")
    exit()

# Display the first few rows to get an idea of the data
print("First few rows of the data:")
print(df.head())

# Check the distribution of categories
print("\nCategory distribution:")
print(df['type'].value_counts())

First few rows of the data:
                                               email      type
0  Subject: Unvorhergesehener Absturz der Datenan...  Incident
1  Subject: Customer Support Inquiry\n\nSeeking i...   Request
2  Subject: Data Analytics for Investment\n\nI am...   Request
3  Subject: Krankenhaus-Dienstleistung-Problem\n\...  Incident
4  Subject: Security\n\nDear Customer Support, I ...   Request

Category distribution:
type
Incident    9586
Request     6860
Problem     5037
Change      2517
Name: count, dtype: int64


In [5]:
X = df['email']
y = df['type']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTraining data size: {len(X_train)}")
print(f"Testing data size: {len(X_test)}")


Training data size: 19200
Testing data size: 4800


In [7]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')  # Remove common English stop words
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("\nShape of TF-IDF training data:", X_train_tfidf.shape)
print("Shape of TF-IDF testing data:", X_test_tfidf.shape)


Shape of TF-IDF training data: (19200, 28040)
Shape of TF-IDF testing data: (4800, 28040)


In [8]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tfidf, y_train)

In [9]:
y_pred = naive_bayes_classifier.predict(X_test_tfidf)

In [10]:
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy on the test set: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy on the test set: 0.67

Classification Report:
              precision    recall  f1-score   support

      Change       1.00      0.11      0.20       504
    Incident       0.61      0.98      0.75      1917
     Problem       0.43      0.01      0.03      1007
     Request       0.77      0.92      0.84      1372

    accuracy                           0.67      4800
   macro avg       0.70      0.51      0.45      4800
weighted avg       0.66      0.67      0.57      4800



In [11]:
naive_bayes_classifier_balanced = MultinomialNB(class_weight='balanced')
naive_bayes_classifier_balanced.fit(X_train_tfidf, y_train)
y_pred_balanced = naive_bayes_classifier_balanced.predict(X_test_tfidf)
print("\nAccuracy with balanced class weights:", accuracy_score(y_test, y_pred_balanced))
print("\nClassification Report with balanced class weights:")
print(classification_report(y_test, y_pred_balanced))

TypeError: __init__() got an unexpected keyword argument 'class_weight'

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
logistic_regression_balanced = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42)
logistic_regression_balanced.fit(X_train_tfidf, y_train)

In [14]:
y_pred_lr_balanced = logistic_regression_balanced.predict(X_test_tfidf)

In [15]:
print("\nAccuracy with Logistic Regression (balanced class weights):", accuracy_score(y_test, y_pred_lr_balanced))
print("\nClassification Report with Logistic Regression (balanced class weights):")
print(classification_report(y_test, y_pred_lr_balanced))


Accuracy with Logistic Regression (balanced class weights): 0.7527083333333333

Classification Report with Logistic Regression (balanced class weights):
              precision    recall  f1-score   support

      Change       0.86      0.85      0.85       504
    Incident       0.71      0.78      0.75      1917
     Problem       0.53      0.42      0.47      1007
     Request       0.90      0.92      0.91      1372

    accuracy                           0.75      4800
   macro avg       0.75      0.74      0.74      4800
weighted avg       0.74      0.75      0.75      4800



In [16]:
from sklearn.svm import SVC

In [17]:
svm_balanced = SVC(class_weight='balanced', kernel='linear', random_state=42)
svm_balanced.fit(X_train_tfidf, y_train)

In [18]:
y_pred_svm_balanced = svm_balanced.predict(X_test_tfidf)

In [19]:
print("\nAccuracy with SVM (balanced class weights):", accuracy_score(y_test, y_pred_svm_balanced))
print("\nClassification Report with SVM (balanced class weights):")
print(classification_report(y_test, y_pred_svm_balanced))


Accuracy with SVM (balanced class weights): 0.7489583333333333

Classification Report with SVM (balanced class weights):
              precision    recall  f1-score   support

      Change       0.85      0.90      0.87       504
    Incident       0.76      0.67      0.71      1917
     Problem       0.50      0.61      0.55      1007
     Request       0.94      0.91      0.92      1372

    accuracy                           0.75      4800
   macro avg       0.76      0.77      0.76      4800
weighted avg       0.76      0.75      0.75      4800



In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
rf_balanced = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_balanced.fit(X_train_tfidf, y_train)

In [22]:
y_pred_rf_balanced = rf_balanced.predict(X_test_tfidf)

In [23]:
print("\nAccuracy with Random Forest (balanced class weights):", accuracy_score(y_test, y_pred_rf_balanced))
print("\nClassification Report with Random Forest (balanced class weights):")
print(classification_report(y_test, y_pred_rf_balanced))


Accuracy with Random Forest (balanced class weights): 0.7464583333333333

Classification Report with Random Forest (balanced class weights):
              precision    recall  f1-score   support

      Change       0.96      0.61      0.74       504
    Incident       0.66      0.97      0.78      1917
     Problem       0.83      0.14      0.24      1007
     Request       0.86      0.93      0.89      1372

    accuracy                           0.75      4800
   macro avg       0.83      0.66      0.67      4800
weighted avg       0.78      0.75      0.70      4800



In [51]:
from sklearn.tree import DecisionTreeClassifier

In [60]:
def train_decision_tree_model(csv_file_path):
    # Load dataset
    data = pd.read_csv(csv_file_path)
    
    # Extract features and labels
    emails = data['email']
    labels = data['type']
    
    # Convert text to TF-IDF features
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(emails)
    
    # Split into training and testing sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    # Initialize and train Decision Tree classifier
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Print classification metrics
    report = classification_report(y_test, y_pred)
    print(report)
    

In [61]:
path = "combined_emails_with_natural_pii.csv"
train_decision_tree_model(path)

              precision    recall  f1-score   support

      Change       0.62      0.62      0.62       479
    Incident       0.66      0.66      0.66      1920
     Problem       0.39      0.38      0.38      1009
     Request       0.82      0.83      0.83      1392

    accuracy                           0.64      4800
   macro avg       0.62      0.62      0.62      4800
weighted avg       0.64      0.64      0.64      4800



In [26]:
!pip uninstall tensorflow tensorflow-macos tensorflow-metal -y


Found existing installation: tensorflow 2.16.2
Uninstalling tensorflow-2.16.2:
  Successfully uninstalled tensorflow-2.16.2
Found existing installation: tensorflow-macos 2.16.2
Uninstalling tensorflow-macos-2.16.2:
  Successfully uninstalled tensorflow-macos-2.16.2
[0m

In [28]:
!python3 -m pip install --upgrade pip

Defaulting to user installation because normal site-packages is not writeable


In [29]:
!pip install tensorflow-macos

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow-macos
  Using cached tensorflow_macos-2.16.2-cp39-cp39-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting tensorflow==2.16.2 (from tensorflow-macos)
  Using cached tensorflow-2.16.2-cp39-cp39-macosx_12_0_arm64.whl.metadata (4.1 kB)
Collecting numpy<2.0.0,>=1.23.5 (from tensorflow==2.16.2->tensorflow-macos)
  Using cached numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (61 kB)
Using cached tensorflow_macos-2.16.2-cp39-cp39-macosx_12_0_arm64.whl (2.1 kB)
Using cached tensorflow-2.16.2-cp39-cp39-macosx_12_0_arm64.whl (227.0 MB)
Using cached numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl (14.0 MB)
Installing collected packages: numpy, tensorflow, tensorflow-macos
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all t

In [30]:
!pip install tensorflow-metal

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow-metal
  Downloading tensorflow_metal-1.2.0-cp39-cp39-macosx_12_0_arm64.whl.metadata (1.3 kB)
Downloading tensorflow_metal-1.2.0-cp39-cp39-macosx_12_0_arm64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow-metal
Successfully installed tensorflow-metal-1.2.0


In [1]:
!pip install "numpy~=1.26.0"


Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
print(np.__version__)

1.26.4


In [7]:
import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print("GPU devices found:")
    for gpu in gpus:
        print(f"- Name: {gpu.name}, Type: {gpu.device_type}")
    try:
        # Check for Metal devices specifically
        metal_devices = [gpu for gpu in gpus if 'METAL' in gpu.name]
        if metal_devices:
            print("\nMetal (MPS) is available and being used by TensorFlow.")
        else:
            print("\nMetal (MPS) is NOT explicitly listed, but TensorFlow might still be using the GPU.")
    except Exception as e:
        print(f"\nError checking for Metal devices: {e}")
else:
    print("No GPU devices found.")

GPU devices found:
- Name: /physical_device:GPU:0, Type: GPU

Metal (MPS) is NOT explicitly listed, but TensorFlow might still be using the GPU.


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [10]:
# Load your CSV file
try:
    df = pd.read_csv('combined_emails_with_natural_pii.csv')
except FileNotFoundError:
    print("Error: CSV file not found. Please check the file path.")
    exit()

In [11]:
X = df['email']
y = df['type']

In [29]:
# Convert categories to numerical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

In [30]:
# Split data into training and testing sets
X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# Feature Extraction with TF-IDF and N-grams
max_words = 10000  # Keep this for potential embedding layer size
ngram_range = (1, 2)
tfidf_vectorizer = TfidfVectorizer(max_features=max_words, stop_words='english', ngram_range=ngram_range)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

print("Shape of TF-IDF training data:", X_train_tfidf.shape)
print("Shape of TF-IDF testing data:", X_test_tfidf.shape)

Shape of TF-IDF training data: (19200, 10000)
Shape of TF-IDF testing data: (4800, 10000)


In [35]:
# Reshape TF-IDF output for LSTM (add a time step dimension)
X_train_reshaped = X_train_tfidf.reshape(X_train_tfidf.shape[0], 1, X_train_tfidf.shape[1])
X_test_reshaped = X_test_tfidf.reshape(X_test_tfidf.shape[0], 1, X_test_tfidf.shape[1])

print("Shape of reshaped training data for LSTM:", X_train_reshaped.shape)
print("Shape of reshaped testing data for LSTM:", X_test_reshaped.shape)

Shape of reshaped training data for LSTM: (19200, 1, 10000)
Shape of reshaped testing data for LSTM: (4800, 1, 10000)


In [36]:
# Build the LSTM Model (adjusted input shape)
embedding_dim = 128
model = Sequential()
model.add(LSTM(128, input_shape=(1, max_words)))  # Input shape adjusted
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)


In [37]:
# Calculate class weights
class_weights = class_weight.compute_class_weight('balanced',
                                                 classes=np.unique(y_train_encoded),
                                                 y=y_train_encoded)
class_weight_dict = dict(enumerate(class_weights))
print("Class Weights:", class_weight_dict)

Class Weights: {0: 2.384500745156483, 1: 0.6258964662928674, 2: 1.1910669975186103, 3: 0.8746355685131195}


In [45]:
# Early stopping and model checkpoint callbacks
epochs = 30
batch_size = 32
early_stopping = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(filepath='best_lstm_tfidf_ngrams.keras',
                                   monitor='val_accuracy',
                                   save_best_only=True,
                                   mode='max',
                                   verbose=1)

In [46]:
# Train the model
history = model.fit(X_train_reshaped, y_train_encoded,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.1,
                    callbacks=[early_stopping, model_checkpoint],
                    class_weight=class_weight_dict)

Epoch 1/30
[1m539/540[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.8897 - loss: 0.2201
Epoch 1: val_accuracy improved from -inf to 0.73594, saving model to best_lstm_tfidf_ngrams.keras
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 23ms/step - accuracy: 0.8897 - loss: 0.2201 - val_accuracy: 0.7359 - val_loss: 0.6255
Epoch 2/30
[1m538/540[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.9201 - loss: 0.1684
Epoch 2: val_accuracy improved from 0.73594 to 0.74323, saving model to best_lstm_tfidf_ngrams.keras
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 22ms/step - accuracy: 0.9201 - loss: 0.1685 - val_accuracy: 0.7432 - val_loss: 0.6915
Epoch 3/30
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9315 - loss: 0.1409
Epoch 3: val_accuracy did not improve from 0.74323
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 22ms/step - ac

In [47]:
# Load the best saved model
best_model = load_model('best_lstm_tfidf_ngrams.keras')


In [48]:
# Evaluate the best model on the test set
y_pred_probs = best_model.predict(X_test_reshaped)
y_pred = y_pred_probs.argmax(axis=-1)
y_pred_labels = label_encoder.inverse_transform(y_pred)
y_test_labels = label_encoder.inverse_transform(y_test_encoded)

[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step


In [50]:
accuracy = accuracy_score(y_test_labels, y_pred_labels)
print(f"\nAccuracy on the test set (LSTM with TF-IDF and N-grams): {accuracy:.2f}")
print("\nClassification Report (LSTM with TF-IDF and N-grams):")
print(classification_report(y_test_labels, y_pred_labels))


Accuracy on the test set (LSTM with TF-IDF and N-grams): 0.74

Classification Report (LSTM with TF-IDF and N-grams):
              precision    recall  f1-score   support

      Change       0.87      0.85      0.86       504
    Incident       0.72      0.70      0.71      1917
     Problem       0.49      0.51      0.50      1007
     Request       0.91      0.91      0.91      1372

    accuracy                           0.74      4800
   macro avg       0.75      0.74      0.75      4800
weighted avg       0.74      0.74      0.74      4800



In [4]:
pip install transformers datasets scikit-learn accelerate torch

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from datasets import Dataset
from transformers import (
    XLMRobertaTokenizer, 
    XLMRobertaForSequenceClassification, 
    Trainer, 
    TrainingArguments
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def mask_text(text):
    text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '{{EMAIL}}', text)
    text = re.sub(r'\+?\d[\d\s\-()]{7,}\d', '{{PHONE}}', text)
    return text

In [3]:
df = pd.read_csv("combined_emails_with_natural_pii.csv")
df['email'] = df['email'].apply(mask_text)

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['type'])

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['email'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

In [5]:
pip install sentencepiece


Defaulting to user installation because normal site-packages is not writeable
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (7.7 kB)
Downloading sentencepiece-0.2.0-cp39-cp39-macosx_11_0_arm64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
model_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)

train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map: 100%|██████████| 19200/19200 [00:06<00:00, 3179.58 examples/s]
Map: 100%|██████████| 4800/4800 [00:01<00:00, 3006.55 examples/s]


In [5]:
num_labels = len(df['label'].unique())
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
pip install --upgrade transformers accelerate


Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Using cached transformers-4.51.3-py3-none-any.whl (10.4 MB)
Using cached tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl (2.7 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.39.3
    Uninstalling transformers-4.39.3:
      Successfully uninstalled transformers-4.39.3
Successfully installed tokenizers-0.21.1 transformers-4.51.3
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install --upgrade transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [8]:
pip show transformers

Name: transformers
Version: 4.51.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /Users/vighneshms/Library/Python/3.9/lib/python/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install --upgrade datasets accelerate evaluate

Defaulting to user installation because normal site-packages is not writeable
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [6]:
from transformers import TrainingArguments

args = TrainingArguments(output_dir="test", evaluation_strategy="epoch")
print("Looks good!")

TypeError: __init__() got an unexpected keyword argument 'evaluation_strategy'

In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33m232104[0m ([33m232104-iiit-trichy[0m). Use [1m`wandb login --relogin`[0m to force relogin


wandb-core(56005) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(56006) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(56012) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(56013) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(56033) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(56034) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(56050) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(56051) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(56058) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(56059) MallocStackLogging: can't turn off malloc stack logging because 

RuntimeError: MPS backend out of memory (MPS allocated: 8.08 GB, other allocations: 2.27 GB, max allowed: 9.07 GB). Tried to allocate 256 bytes on shared pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [8]:
preds = trainer.predict(val_dataset)
pred_labels = preds.predictions.argmax(axis=-1)

print("Classification Report:\n")
print(classification_report(val_labels, pred_labels, target_names=label_encoder.classes_))


RuntimeError: MPS backend out of memory (MPS allocated: 8.08 GB, other allocations: 1.45 GB, max allowed: 9.07 GB). Tried to allocate 32.00 KB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [36]:
import pandas as pd
from datasets import Dataset
from transformers import (
    XLMRobertaTokenizer,
    XLMRobertaForSequenceClassification,
    TrainingArguments,
    Trainer
)
import torch
import gc

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x29f00beb0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 2a3a3c3a0, raw_cell="import pandas as pd
from datasets import Dataset
f.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/vighneshms/Downloads/Email_classifier/models/models1.ipynb#Y131sZmlsZQ%3D%3D>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x29f00beb0>> (for post_run_cell), with arguments args (<ExecutionResult object at 2a3a3c2b0, execution_count=36 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 2a3a3c3a0, raw_cell="import pandas as pd
from datasets import Dataset
f.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/vighneshms/Downloads/Email_classifier/models/models1.ipynb#Y131sZmlsZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

In [32]:
#device = torch.device("cpu")

# Clear memory
# gc.collect()
# if torch.cuda.is_available():
#     torch.cuda.empty_cache()

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x29f00beb0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 1724b0a30, raw_cell="#device = torch.device("cpu")

# Clear memory
# gc.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/vighneshms/Downloads/Email_classifier/models/models1.ipynb#Y132sZmlsZQ%3D%3D>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x29f00beb0>> (for post_run_cell), with arguments args (<ExecutionResult object at 1724b0940, execution_count=32 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 1724b0a30, raw_cell="#device = torch.device("cpu")

# Clear memory
# gc.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/vighneshms/Downloads/Email_classifier/models/models1.ipynb#Y132sZmlsZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

In [33]:
df = pd.read_csv("combined_emails_with_natural_pii.csv")  # Replace with your actual filename
df = df.dropna()

# Label mapping
labels = df["type"].unique().tolist()
label2id = {label: i for i, label in enumerate(labels)}
df["label"] = df["type"].map(label2id)

dataset = Dataset.from_pandas(df[["email", "label"]])

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x29f00beb0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 153f910d0, raw_cell="df = pd.read_csv("combined_emails_with_natural_pii.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/vighneshms/Downloads/Email_classifier/models/models1.ipynb#Y135sZmlsZQ%3D%3D>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x29f00beb0>> (for post_run_cell), with arguments args (<ExecutionResult object at 153f91070, execution_count=33 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 153f910d0, raw_cell="df = pd.read_csv("combined_emails_with_natural_pii.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/vighneshms/Downloads/Email_classifier/models/models1.ipynb#Y135sZmlsZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

In [26]:
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

def extract_subject_and_body(email_text):
    lines = email_text.splitlines()
    subject = ""
    body = ""
    for line in lines:
        if line.lower().startswith("subject:"):
            subject = line.replace("Subject:", "").strip()
        else:
            body += line.strip() + " "
    return subject, body.strip()

def tokenize_with_subject(batch):
    subjects, bodies = [], []
    for email in batch["email"]:
        subject, body = extract_subject_and_body(email)
        subjects.append(subject)
        bodies.append(body)
    combined_texts = [f"Subject: {s} [SEP] Body: {b}" for s, b in zip(subjects, bodies)]
    return tokenizer(combined_texts, padding="max_length", truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_with_subject, batched=True)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)


Map:  42%|████▏     | 10000/24000 [00:03<00:04, 3329.92 examples/s]wandb-core(59092) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(59093) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Map: 100%|██████████| 24000/24000 [00:08<00:00, 2961.72 examples/s]


In [30]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,                      # Enables evaluation
    save_steps=500,                    # Save checkpoint every 500 steps
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=1,               # Limit number of checkpoints
    no_cuda=True                      # Forces CPU usage if CUDA/MPS is not usable
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer.train()

wandb-core(59373) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(59374) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Step,Training Loss
10,1.3359
20,1.4025
30,1.3348


wandb-core(59390) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(59391) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(59407) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(59408) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(59423) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(59424) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(59460) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(59461) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(59481) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(59482) MallocStackLogging: can't turn off malloc stack logging because 

KeyboardInterrupt: 