In [7]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification

In [6]:
# Load the pre-trained BERT tokenizer and a sequence-classification model.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# The classification head defaults to 2 output labels, but this notebook decodes
# predictions into 4 categories, so the head is sized explicitly. Keep this value
# in sync with the index -> category mapping used to decode predictions.
NUM_LABELS = 4

# NOTE: "bert-base-uncased" ships no classifier weights, so the head is randomly
# initialised on load. Fine-tune the model (or load a fine-tuned checkpoint)
# before trusting any predicted category.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Class index -> category name, used to decode the classifier's predicted index.
# The ordering follows the labelling logic of the reference model on GitHub.
# There is no dedicated "entertainment" index; the working assumption is that
# such text could be mapped to "sport" if it is similar enough in context.
label_mapping = dict(
    enumerate(
        [
            "business",
            "tech",
            "politics",
            "sport",
        ]
    )
)

In [9]:
def classify_text(text):
    """Predict the category label for a single piece of text.

    Args:
        text: The text to classify. Input longer than 512 tokens is truncated.

    Returns:
        The entry of ``label_mapping`` for the highest-scoring class.

    Raises:
        KeyError: If the predicted class index has no entry in ``label_mapping``.
    """
    # Tokenize, then place the tensors on the same device as the model so the
    # forward pass also works when the model has been moved to a GPU.
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = inputs.to(model.device)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax is monotonic, so the arg-max of the raw logits already identifies
    # the most likely class; .item() requires exactly one input text.
    predicted_class_index = logits.argmax(dim=-1).item()
    return label_mapping[predicted_class_index]


In [12]:
# Read the summaries to classify from a CSV file in the working directory.
summaries_path = 'summaries.csv'
summaries_df = pd.read_csv(filepath_or_buffer=summaries_path)

In [14]:
# Preview the first rows; a bare trailing expression uses the notebook's rich
# DataFrame display instead of the plain-text repr produced by print().
summaries_df.head()

                                                text
0  Time Warner profits up 76% to $1.13bn for the ...
1  Dollar gains on Greenspan speech ahead of G7 f...
2  Yukos' owner Menatep Group says it will ask Ro...


In [15]:
# Run every summary through the classifier and report its predicted category.
for summary_text in summaries_df['text']:
    predicted_category = classify_text(summary_text)
    print(f"Summary: {summary_text}\nPredicted Category: {predicted_category}\n")

Summary: Time Warner profits up 76% to $1.13bn for the three months to December. Firm now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes.
Predicted Category: business

Summary: Dollar gains on Greenspan speech ahead of G7 finance ministers. In late trading in New York, the dollar reached $1.2871 against the euro. Greenspan highlighted the US government's willingness to curb spending and rising household savings.
Predicted Category: business

Summary: Yukos' owner Menatep Group says it will ask Rosneft to repay a loan that Yugansk had secured on its assets. Legal experts said RosneFT's purchase of Yugansk would include such obligations.
Predicted Category: business

