In [None]:
# Step 0: Load the data
import pandas as pd

# The AG News CSV files are expected under 'dataset/ag-news-classification-dataset'
train_df = pd.read_csv('dataset/ag-news-classification-dataset/train.csv')
test_df = pd.read_csv('dataset/ag-news-classification-dataset/test.csv')

# One text per row: the title, a single space, then the description
train_texts, test_texts = (
    (frame['Title'] + " " + frame['Description']).tolist()
    for frame in (train_df, test_df)
)

# 'Class Index' is decremented by one so that class ids start at 0
train_labels, test_labels = (
    (frame['Class Index'] - 1).tolist()
    for frame in (train_df, test_df)
)

In [None]:

# Step 1: Tokenize the text and extract BERT [CLS] embeddings
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm

# Encoder and tokenizer are both loaded from the same pretrained checkpoint name,
# so the token ids produced by the tokenizer belong to this model's vocabulary.
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to extract embeddings
def get_embeddings(text, model, tokenizer):
    """Return the first-token ([CLS]) embedding of ``text``.

    Args:
        text: Input handed to ``tokenizer``. It is truncated to at most 512
            tokens. If a batch is passed, only the first sequence's embedding
            is returned, because the result is indexed with ``[0, 0, :]``.
        model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object exposing ``last_hidden_state``.
        tokenizer: Callable that turns ``text`` into a mapping of PyTorch
            tensors (it is called with ``return_tensors='pt'``).

    Returns:
        ``last_hidden_state[0, 0, :]`` as a tensor that is not attached to the
        autograd graph (1-D for the usual (batch, seq, hidden) output).
    """
    # Local import keeps this function usable even when the notebook's import
    # cell does not import torch explicitly.
    import torch

    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # The embeddings are only read, never trained on. Disabling gradient
    # tracking avoids building an autograd graph on every call, which saves
    # memory and time over a long extraction loop.
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state[0, 0, :]

# Build and save one embedding matrix per requested share of the training set
fractions = [1.0]
for frac in fractions:
    print(f"Generating embeddings for {int(frac*100)}% of the training data.")

    # Keep only the leading portion of the training texts for this fraction
    subset_size = int(len(train_texts) * frac)
    subset_texts = train_texts[:subset_size]

    # One embedding per text, converted to NumPy as soon as it is produced;
    # tqdm reports progress over the texts.
    embeddings = np.array([
        get_embeddings(text, model, tokenizer).detach().numpy()
        for text in tqdm(subset_texts)
    ])

    # Written to the working directory, one file per fraction
    np.save(f'bert_embeddings_ag_{int(frac*100)}.npy', embeddings)


In [None]:
# Step 2: Load embeddings and train the classifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train and test model for different fractions
for frac in fractions:
    print(f"\nTraining with {int(frac*100)}% data")

    # Load the embeddings for the current fraction. The file name has to match
    # the one written by the embedding step ('bert_embeddings_ag_<pct>.npy');
    # the previous name without the 'ag_' part pointed at a file that is never
    # created.
    embeddings = np.load(f'bert_embeddings_ag_{int(frac*100)}.npy')

    # The embeddings cover the first len(embeddings) training texts, so pair
    # them with the labels of those same rows. Passing the full label list
    # would make train_test_split fail for any fraction below 1.0.
    subset_labels = train_labels[:len(embeddings)]

    # Split the data into train and test sets (fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(
        embeddings, subset_labels, test_size=0.2, random_state=42
    )

    # Train a classifier on the frozen embeddings
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)

    # Predict and evaluate on the held-out split
    y_pred = clf.predict(X_test)

    # Print the classification report
    print(f"Classification Report for {int(frac*100)}% training data:")
    print(classification_report(y_test, y_pred))
