In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/selected-data/selected_data.csv


In [12]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification



In [13]:
import pandas as pd
import numpy as np
import torch
import transformers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification



In [16]:
# Read the e-mail dataset from the attached Kaggle input and display its
# (rows, columns) size as the cell output.
csv_path = '/kaggle/input/selected-data/selected_data.csv'
data = pd.read_csv(csv_path)
data.shape

(29026, 6)

In [17]:

# Text preprocessing setup.
# max_length is the fixed sequence length handed to the tokenizer by
# preprocess_text (used there for both padding and truncation).
max_length = 512
# Pretrained WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [18]:
def preprocess_text(text):
    """Tokenize one e-mail body into fixed-length model inputs.

    Relies on the notebook-level ``tokenizer`` and ``max_length``.

    Parameters
    ----------
    text : object
        The raw e-mail body. Missing values (``None``, ``NaN``, ``pd.NA``) are
        treated as an empty string so they are not tokenized as the literal
        words "None" / "nan". Any other non-string value is converted with
        ``str``.

    Returns
    -------
    tuple
        ``(input_ids, attention_mask)`` taken from the tokenizer output
        (requested with ``return_tensors='pt'``), padded/truncated to
        ``max_length``.
    """
    # `text != text` is only true for NaN; it avoids importing math just for isnan.
    is_missing = text is None or text is pd.NA or (isinstance(text, float) and text != text)
    text = '' if is_missing else str(text)

    # Calling the tokenizer directly is the supported replacement for the
    # deprecated `encode_plus`; the keyword arguments are unchanged.
    encoded_text = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded_text['input_ids'], encoded_text['attention_mask']

In [19]:

# Features are the e-mail bodies; the target is the recipient in the 'To' column.
X = data['body']
y = data['To']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize each split. preprocess_text returns one (input_ids, attention_mask)
# pair per e-mail, so zip(*...) regroups the pairs into a tuple of ids and a
# tuple of masks.
X_train_ids, X_train_mask = zip(*[preprocess_text(text) for text in X_train])
X_test_ids, X_test_mask = zip(*[preprocess_text(text) for text in X_test])

# Concatenate the per-e-mail tensors along dim 0 into one tensor per split.
X_train_ids = torch.cat(X_train_ids, dim=0)
X_train_mask = torch.cat(X_train_mask, dim=0)
X_test_ids = torch.cat(X_test_ids, dim=0)
X_test_mask = torch.cat(X_test_mask, dim=0)

# Perform label encoding.
# The encoder is fitted on the full label column rather than on y_train only:
# the split above is not stratified, so a recipient may occur only in the test
# split, and LabelEncoder.transform raises ValueError for labels it has not
# been fitted on. When every test label also occurs in training, the resulting
# classes (and therefore the encoded values) are the same as fitting on y_train.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Convert numpy arrays to tensors.
# Note: this rebinds y_train / y_test from the original label Series to the
# integer-encoded tensors; the encoded numpy arrays stay available as
# y_train_encoded / y_test_encoded.
y_train = torch.tensor(y_train_encoded).long()
y_test = torch.tensor(y_test_encoded).long()

# Create the train and test datasets
train_dataset = TensorDataset(X_train_ids, X_train_mask, y_train)
test_dataset = TensorDataset(X_test_ids, X_test_mask, y_test)
# Create the dataloaders (only the training loader shuffles, so test batches
# keep the order of y_test)
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [20]:

# Create the model: pretrained BERT with a sequence-classification head that
# has one output per class known to the label encoder.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the optimizer and loss function.
# transformers.AdamW is deprecated in favour of torch.optim.AdamW and may be
# missing from newer transformers releases. eps and weight_decay are passed
# explicitly because the torch defaults (1e-8, 1e-2) differ from the defaults
# of the transformers implementation (1e-6, 0.0) that this notebook relied on.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# NOTE: the training loop reads the loss from the model output (labels are
# passed to the forward call), so loss_fn is not called there.
loss_fn = torch.nn.CrossEntropyLoss()

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [None]:


# Training loop: fine-tune for a fixed number of epochs and evaluate on the
# test split after every epoch. Per-epoch metrics are collected in `history`.
epochs = 10
history = {key: [] for key in ('train_loss', 'train_acc', 'test_loss', 'test_acc')}

# Global step counter, used only for the progress display.
current_step = 0
total_steps = epochs * len(train_dataloader)

for epoch in range(epochs):
    # ---- optimisation pass over the training batches ----
    model.train()
    train_loss, train_acc = 0.0, 0.0

    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        preds = torch.max(logits, dim=1).indices

        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that dividing by the
        # dataset size below gives a per-sample average
        # (assumes outputs.loss is a batch mean - TODO confirm).
        train_loss += input_ids.size(0) * loss.item()
        train_acc += (preds == labels).sum().item()

        current_step += 1
        progress = current_step / total_steps
        print(f"Epoch: {epoch + 1}/{epochs}, Step: {step + 1}/{len(train_dataloader)}, Progress: {progress:.2%}", end="\r")

    n_train = len(train_dataset)
    train_loss /= n_train
    train_acc /= n_train
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)

    # ---- evaluation pass over the test batches (no gradient tracking) ----
    model.eval()
    test_loss, test_acc = 0.0, 0.0

    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = (t.to(device) for t in batch)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits = outputs.loss, outputs.logits

            preds = torch.max(logits, dim=1).indices

            test_loss += input_ids.size(0) * loss.item()
            test_acc += (preds == labels).sum().item()

    n_test = len(test_dataset)
    test_loss /= n_test
    test_acc /= n_test
    history['test_loss'].append(test_loss)
    history['test_acc'].append(test_acc)

    # Epoch summary; `step` and `progress` still hold the values from the
    # last training batch of this epoch.
    print(f"Epoch: {epoch + 1}/{epochs}, Step: {step + 1}/{len(train_dataloader)}, Progress: {progress:.2%}")
    print(f"Train Loss: {train_loss:.4f} Train Acc: {train_acc:.4f}")
    print(f"Test Loss: {test_loss:.4f} Test Acc: {test_acc:.4f}")
    print()

Epoch: 1/10, Step: 1452/1452, Progress: 10.00%
Train Loss: 0.2200 Train Acc: 0.9183
Test Loss: 0.1009 Test Acc: 0.9612

Epoch: 2/10, Step: 1452/1452, Progress: 20.00%
Train Loss: 0.0638 Train Acc: 0.9750
Test Loss: 0.0670 Test Acc: 0.9693

Epoch: 3/10, Step: 1452/1452, Progress: 30.00%
Train Loss: 0.0327 Train Acc: 0.9857
Test Loss: 0.0669 Test Acc: 0.9776

Epoch: 4/10, Step: 1452/1452, Progress: 40.00%
Train Loss: 0.0303 Train Acc: 0.9865
Test Loss: 0.0808 Test Acc: 0.9687

Epoch: 5/10, Step: 1452/1452, Progress: 50.00%
Train Loss: 0.0243 Train Acc: 0.9875
Test Loss: 0.0643 Test Acc: 0.9802

Epoch: 6/10, Step: 1452/1452, Progress: 60.00%
Train Loss: 0.0241 Train Acc: 0.9886
Test Loss: 0.0599 Test Acc: 0.9811

Epoch: 7/10, Step: 1452/1452, Progress: 70.00%
Train Loss: 0.0207 Train Acc: 0.9888
Test Loss: 0.0740 Test Acc: 0.9738

Epoch: 8/10, Step: 1365/1452, Progress: 79.40%

In [None]:
# Notes carried over from the original cell:
#   loss      - CrossEntropyLoss
#   optimizer - AdamW (Adaptive Moment Estimation with Weight Decay)
# Saving the model: only the state dict (the weights) is written, to a path
# relative to the current working directory.
trained_weights = model.state_dict()
torch.save(trained_weights, 'saved_model.pt')

In [None]:
# Plot the per-epoch training and validation curves recorded in `history`,
# one figure per metric (loss first, then accuracy).
# The x axis uses 1-based epoch numbers so it matches the "Epoch: k/N" lines
# printed during training. Each curve gets its own x range because an
# interrupted run can leave the training list one entry longer than the
# validation list (training metrics are appended before evaluation starts).
for metric_key, metric_name in (('loss', 'Loss'), ('acc', 'Accuracy')):
    train_values = history[f'train_{metric_key}']
    test_values = history[f'test_{metric_key}']

    fig, ax = plt.subplots()
    ax.plot(range(1, len(train_values) + 1), train_values, label=f'Training {metric_name}')
    ax.plot(range(1, len(test_values) + 1), test_values, label=f'Validation {metric_name}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(metric_name)
    ax.set_title(f'Training and Validation {metric_name}')
    ax.legend()
    plt.show()

In [None]:
# Generate classification report
model.eval()
y_pred_encoded = []
with torch.no_grad():
    for batch in test_dataloader:
        # Only predictions are needed here, so the label tensor of each batch
        # (third element) is left on the CPU and ignored.
        input_ids, attention_mask = (t.to(device) for t in batch[:2])

        outputs = model(input_ids, attention_mask=attention_mask)
        y_pred_encoded.extend(outputs.logits.argmax(dim=1).cpu().tolist())

# Decode BOTH sides back to the original recipient labels before comparing.
# y_test was rebound to a tensor of encoded integers when the datasets were
# built, so passing it next to decoded string predictions would compare two
# different label spaces. test_dataloader does not shuffle, so predictions are
# in the same order as y_test_encoded.
y_true_labels = label_encoder.inverse_transform(y_test_encoded)
y_pred_labels = label_encoder.inverse_transform(y_pred_encoded)
print(classification_report(y_true_labels, y_pred_labels))


In [None]:

# Loading the saved model
# Rebuild the same architecture, then overwrite its weights with the state
# dict written to 'saved_model.pt'.
loaded_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
# Put the model on the same device as the rest of the notebook and switch to
# inference mode (disables dropout) so it is ready for prediction.
loaded_model.to(device)
loaded_model.eval()
# map_location remaps the stored tensors to `device`, so a checkpoint saved
# from a GPU session can still be loaded when only a CPU is available.
loaded_model.load_state_dict(torch.load('saved_model.pt', map_location=device))

In [None]:
import torch
from sklearn.preprocessing import LabelEncoder

# Sample email
sample_email = ["I hope this email finds you well. I wanted to schedule a meeting to discuss the upcoming project. Please let me know your availability. Thank you."]

# Tokenize with the same preprocess_text used for the training data. It
# already returns padded input ids together with the matching attention mask,
# so no separate sequence-conversion or padding step is needed
# (texts_to_sequences / pad_sequences are not available on BertTokenizer or
# in torch.nn.utils.rnn).
preprocessed_sample = [preprocess_text(email) for email in sample_email]

# Stack the per-e-mail tensors into one batch and move it to the model's device
sample_input_ids = torch.cat([ids for ids, _ in preprocessed_sample], dim=0).to(device)
sample_attention_mask = torch.cat([mask for _, mask in preprocessed_sample], dim=0).to(device)

# Predict the recipient(s) using the trained model:
model.eval()
with torch.no_grad():
    outputs = model(sample_input_ids, attention_mask=sample_attention_mask)
    logits = outputs.logits
    predicted_classes = logits.argmax(dim=1).cpu().numpy()

# Map the predicted class indices back to the original recipient labels
predicted_labels = label_encoder.inverse_transform(predicted_classes)
print("Predicted Recipient(s):", predicted_labels)


In [None]:
# Take the row labelled 1 of the dataset as a spot check.
# NOTE: this row comes from the full dataset, so it may be part of the
# training split; a correct prediction here is not a held-out result.
body_2 = data.loc[1, 'body']
recipient_2 = data.loc[1, 'To']

print("Body (2nd email):", body_2)
print("Original Recipient (2nd email):", recipient_2)

# Preprocessing: preprocess_text already tokenizes, pads and builds the
# attention mask, so its output can be fed to the model directly
# (texts_to_sequences / pad_sequences are not available on BertTokenizer or
# in torch.nn.utils.rnn).
preprocessed_body_2 = preprocess_text(body_2)
body_2_input_ids, body_2_attention_mask = preprocessed_body_2

# Move the inputs to the model's device
body_2_input_ids = body_2_input_ids.to(device)
body_2_attention_mask = body_2_attention_mask.to(device)

# Predict the recipient using the trained model:
model.eval()
with torch.no_grad():
    outputs = model(body_2_input_ids, attention_mask=body_2_attention_mask)
    logits = outputs.logits
    predicted_recipient_2 = logits.argmax(dim=1).cpu().numpy()

# Map the predicted class index back to the original recipient label
predicted_label_2 = label_encoder.inverse_transform(predicted_recipient_2)
print("Predicted Recipient (2nd email):", predicted_label_2)
