In [None]:
!pip install pandas
!pip install torch
!pip install transformers
!pip install scikit-learn

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

In [None]:
# Read the complete dataset from disk, then hold out 20% of the rows for evaluation.
# The fixed random_state makes the train/test partition repeatable between runs.
DATASET_CSV = "email_subjects_dataset_large.csv"
df = pd.read_csv(DATASET_CSV)

train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

In [None]:
# Tokenizer for the 'bert-base-uncased' vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with exactly the same settings, so the options are
# declared once and shared between the two calls.
_encode_options = dict(
    truncation=True,
    padding=True,
    return_tensors='pt',
    max_length=512,
)

# Encode the 'Subject' column of each split into PyTorch tensors.
train_encodings = tokenizer(list(train_df['Subject']), **_encode_options)

test_encodings = tokenizer(list(test_df['Subject']), **_encode_options)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the 'Category' values into integer class ids. The encoder is fitted on the
# training split only and then re-used, unchanged, to encode the test split.
label_encoder = LabelEncoder()

_train_ids = label_encoder.fit_transform(list(train_df['Category']))
_test_ids = label_encoder.transform(list(test_df['Category']))

train_labels = torch.tensor(_train_ids)
test_labels = torch.tensor(_test_ids)


def _as_tensor_dataset(encodings, labels):
    """Bundle token ids, attention mask and labels into a single TensorDataset."""
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)


# Training batches are shuffled; test batches keep their original order.
train_dataset = _as_tensor_dataset(train_encodings, train_labels)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)

test_dataset = _as_tensor_dataset(test_encodings, test_labels)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=8)

In [None]:
# Build a BERT sequence classifier whose output size equals the number of distinct
# values found in the dataset's 'Category' column.
_num_categories = len(df['Category'].unique())
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=_num_categories,
)

In [None]:
# Fine-tune the classifier: use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# torch.optim.AdamW is used in place of the AdamW class from `transformers`, which is
# deprecated in favour of the PyTorch implementation. eps and weight_decay are set
# explicitly to the values the transformers class used by default, so the optimisation
# settings stay the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

NUM_EPOCHS = 3

for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0.0

    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move all three to `device`.
        inputs, attention_mask, labels = batch
        inputs, attention_mask, labels = inputs.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model return the training loss alongside the logits.
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report progress once per epoch so a stalled or diverging run is visible.
    mean_loss = running_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS} - mean training loss: {mean_loss:.4f}")

In [None]:
# Score the fine-tuned model on the held-out split. Predicted and true class ids are
# collected batch by batch and summarised in a single report at the end.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():  # gradients are not needed for evaluation
    for batch in test_loader:
        # Each batch is (input_ids, attention_mask, labels); move all three to `device`.
        inputs, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(inputs, attention_mask=attention_mask)
        logits = outputs.logits
        preds = logits.argmax(dim=1)  # index of the highest-scoring class per example

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

report = classification_report(all_labels, all_preds)
print("Classification Report:\n", report)

In [None]:
# Persist only the model's weights (its state dict), not the whole model object.
trained_weights = model.state_dict()
torch.save(trained_weights, 'Trained_ESC_dict.pth')

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Inference: rebuild the classifier, load the fine-tuned weights and label a few
# example subject lines.

# Category names the classifier can emit.
# NOTE(review): assumed to be exactly the set of values in the dataset's 'Category'
# column -- confirm against the training data.
categories = ["Technology", "Health", "Finance", "Travel", "Food",
              "Fashion", "Sports", "Education", "Entertainment", "Science",
              "Art", "Business", "Music", "Fitness", "Home",
              "Gaming", "Environment", "Books", "Pets", "Movies",
              "Automotive", "Social Media", "Career", "Shopping", "Weather"]

# The training labels were produced with sklearn's LabelEncoder, which numbers the
# classes in sorted order. Class id i therefore refers to the i-th name of the
# *sorted* list, not to position i of the hand-written list above; indexing the
# unsorted list would attach the wrong name to every prediction.
class_names = sorted(categories)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the weights written by this notebook's save step. map_location='cpu' lets a
# checkpoint that was saved from a GPU run be loaded on a machine without CUDA; the
# model below lives on the CPU as well.
CHECKPOINT_PATH = 'Trained_ESC_dict.pth'
model_state_dict = torch.load(CHECKPOINT_PATH, map_location='cpu')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(categories))

# strict=False tolerates key differences, but a mismatch must not pass silently:
# any weights that were not loaded keep their initial values and would make the
# predictions meaningless.
load_result = model.load_state_dict(model_state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print("Warning: checkpoint does not match the model exactly.")
    print("  Missing keys:", load_result.missing_keys)
    print("  Unexpected keys:", load_result.unexpected_keys)

model.eval()

input_text = ["Cars and Carts", "PRoduct sales"]

input_encodings = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt', max_length=512)

with torch.no_grad():
    outputs = model(**input_encodings)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)

# Translate each predicted class id into its category name.
interpreted_predictions = [class_names[class_id] for class_id in predictions.tolist()]

print("Predictions:", interpreted_predictions)