In [1]:
import html
import json
import os

import numpy as np
import torch
import torch.nn.functional as F
from IPython.display import display, HTML
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForCausalLM

**INFORMATION MODEL**

In [2]:
# Define the path to your files
path_to_files = "information-model"

# Load the tokenizer
# NOTE(review): the checkpoint name is "information-model", yet the objects are
# bound to *_opinion names. Later cells pass `model_opinion` outputs as the
# opinion-side probabilities — confirm this pairing is intentional.
tokenizer_opinion = AutoTokenizer.from_pretrained(path_to_files)

# Load the model configuration and weights
model_opinion = AutoModelForCausalLM.from_pretrained(path_to_files).to('cpu')  # Load model on CPU


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\users\i747613\appdata\local\miniconda3\envs\lab\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\users\i747613\appdata\local\miniconda3\envs\lab\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "c:\users\i747613\appdata\local\miniconda3\envs\lab\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\users\i747613\appdata\local\miniconda3\envs\lab\lib\site-packages\traitlets\config\appl

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


**OPINION MODEL**

In [3]:
# Define the path to your files
# (rebinds the same `path_to_files` name used by the previous loading cell)
path_to_files = "opinion-model"

# Load the tokenizer
# NOTE(review): the checkpoint name is "opinion-model", yet the objects are
# bound to *_information names. Later cells pass `model_information` outputs as
# the information-side probabilities — confirm this pairing is intentional.
tokenizer_information = AutoTokenizer.from_pretrained(path_to_files)

# Load the model configuration and weights
model_information = AutoModelForCausalLM.from_pretrained(path_to_files).to('cpu')  # Load model on CPU



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

**NORMALIZE THE PROBABILITIES**

In [4]:
def evaluate_probabilities(model, tokenizer, text):
    """Run ``text`` through ``model`` and return softmax-normalised probabilities.

    Args:
        model: Object exposing ``eval()`` and callable with the tokenizer's
            output as keyword arguments; its result must have a ``.logits``
            tensor.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``.
        text: The input string to score.

    Returns:
        A CPU tensor with the same shape as the model's logits, where the last
        dimension has been passed through softmax.
        (Presumably ``(batch, sequence, vocab)`` for a causal LM — not checked here.)
    """
    # Put the model in evaluation mode before the forward pass.
    model.eval()
    encoded = tokenizer(text, return_tensors="pt")

    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        raw_logits = model(**encoded).logits

    return F.softmax(raw_logits, dim=-1).cpu()

def aggregate_probabilities(probabilities, dim=1):
    """Reduce a probability tensor by taking the maximum along one axis.

    Args:
        probabilities: Tensor of probabilities, e.g. the value returned by
            ``evaluate_probabilities``.
        dim: Axis to reduce over. Defaults to ``1``, which is the axis this
            function has always used, so existing callers are unaffected.

    Returns:
        Tensor holding the maximum values along ``dim`` (that axis is removed).

    NOTE(review): ``classify_tokens`` reduces over the last axis (``dim=-1``)
    to obtain one peak probability per token, whereas the default here reduces
    over axis 1. If the input is laid out as (batch, sequence, vocab) — TODO
    confirm — the default takes the max across sequence positions rather than
    across the vocabulary. Pass ``dim=-1`` for the per-token peak if that is
    what is intended.
    """
    return torch.max(probabilities, dim=dim).values

def classify_text(text_probs_opinion, text_probs_information):
    """Label a text by comparing the mean of two probability tensors.

    Args:
        text_probs_opinion: Tensor of opinion-side probabilities.
        text_probs_information: Tensor of information-side probabilities.

    Returns:
        ``"opinion"`` if the opinion mean is strictly larger,
        ``"information"`` if the information mean is strictly larger,
        otherwise ``"uncertain"``.
    """
    opinion_score = torch.mean(text_probs_opinion).item()
    information_score = torch.mean(text_probs_information).item()

    # Guard clauses: a strict winner on either side decides the label.
    if opinion_score > information_score:
        return "opinion"
    if information_score > opinion_score:
        return "information"

    # Neither side is strictly larger.
    return "uncertain"

def classify_tokens(opinion_probs, information_probs):
    """Label each position by comparing the two models' peak probabilities.

    Both inputs are indexed as ``[0, :, :]``, so they must be 3-dimensional;
    only the first entry along the leading axis is used.

    Args:
        opinion_probs: Opinion-side probability tensor.
        information_probs: Information-side probability tensor.

    Returns:
        A list of ``'opinion'``, ``'information'`` or ``'neutral'`` strings,
        one per position. Positions are paired with ``zip``, so the result is
        as long as the shorter of the two inputs.
    """
    # Peak probability over the last axis for every position
    opinion_peaks = opinion_probs[0, :, :].max(dim=-1).values
    information_peaks = information_probs[0, :, :].max(dim=-1).values

    verdicts = []
    for opinion_peak, information_peak in zip(opinion_peaks, information_peaks):
        if opinion_peak > information_peak:
            verdicts.append('opinion')
        elif information_peak > opinion_peak:
            verdicts.append('information')
        else:
            verdicts.append('neutral')
    return verdicts

def highlight_text(sentence, tokenizer, opinion_probs, information_probs):
    """Render ``sentence`` as HTML with every token coloured by its classification.

    Args:
        sentence: The text to render.
        tokenizer: Tokenizer exposing ``encode`` and ``convert_ids_to_tokens``.
        opinion_probs: Opinion-side probability tensor, forwarded to
            ``classify_tokens``.
        information_probs: Information-side probability tensor, forwarded to
            ``classify_tokens``.

    Returns:
        A string of space-separated ``<span>`` elements, one per token:
        red for ``'opinion'``, green for ``'information'``, yellow otherwise.
        Tokens and classifications are paired with ``zip``, so the output is
        as long as the shorter of the two.

    NOTE(review): tokens are produced here with ``add_special_tokens=False``,
    while ``evaluate_probabilities`` calls the tokenizer with its default
    settings. If the tokenizer adds special tokens by default, the token list
    and the probability positions may be offset from each other — confirm the
    intended alignment.
    """
    # Token strings for the sentence (no special tokens).
    token_ids = tokenizer.encode(sentence, add_special_tokens=False)
    words = tokenizer.convert_ids_to_tokens(token_ids)

    # Classify tokens
    classifications = classify_tokens(opinion_probs, information_probs)

    # Anything that is neither 'opinion' nor 'information' falls back to yellow.
    colors = {'opinion': 'red', 'information': 'green'}

    highlighted_sentence = []
    for word, classification in zip(words, classifications):
        color = colors.get(classification, 'yellow')
        # Escape &, < and > so token text (e.g. "<s>") cannot be parsed as markup.
        safe_word = html.escape(str(word), quote=False)
        highlighted_sentence.append(f"<span style='color: {color}; padding: 2px;'>{safe_word}</span>")

    return " ".join(highlighted_sentence)

**KAGGLE NEWSPAPER DATASET TESTING**

In [26]:
# Headlines used as evaluation inputs (per the original note: taken from the
# Kaggle newspaper dataset). Order matters: the `labels` list defined after this
# one is matched to these entries by position.
headlines = [
    "Supreme Court Rules Biden Properly Ended Trump's 'Remain In Mexico' Policy.",
    "White House Releases Progress Report On Travel Industry Improvements.",
    "Let Us Not Celebrate a Fifth Anniversary of the Syrian Conflict.",
    "How Tim Walz has already changed the campaign.",
    "With the closure of checkpoints, Israeli Arabs cannot come to Jenin and Tulkarm to shop, and West Bank Palestinians cannot leave to work in Israel.",
    "Winning a gold medal means taking home a piece of the Eiffel Tower.",
    "Ukraine: 9,000 Of Its Troops Killed Since Russia Began War." ,
    "195 House Republicans Voted Against Birth Control Protections.",
    "Ex-DHS Aide Suggests She 'Went Very Public' Because She Didn't Trust Inspector General.",
    "Beautiful And Sad At The Same Time: Ukrainian Cultural Festival Takes On A Deeper Meaning This Year.",
    "In A Nod To JFK, Joe Biden Pushing 'Moonshot' To Fight Cancer.",
    "Come On, Bernie! Why Democrats Left Child Tax Credit Out Of The Inflation Reduction Act.",
    "France Praises YouTubers For Resisting Effort To Smear The Pfizer Vaccine.",
    "'Friends' Cast Says The 1 Thing Fans Won't Want To Hear.",
    "13 Shot, 10 Dead In 'Racially Motivated Hate Crime' At Buffalo Supermarket: Sheriff.",
    "Texas Gov. On State's Deadliest School Shooting: 'It Could Have Been Worse'.",
    "'The Mandalorian' Crew's Biggest Worry About Baby Yoda Seems Kind Of Ridiculous Now.",
    "Tom Brady Takes Batting Practice. Gronk Shags The Balls. Enough Said.",
    "Barron Trump Had COVID-19, First Lady Melania Trump Says.",
    "The Greatest Love Story Ever Told.",
    "Tucker Carlson's 'Very Odd' Pronunciation Of 'Ottawa' Has People In Stitches.",
    "California Drought Tests History Of Endless Growth.",
    "Crock Pot Ham for the Easiest Easter Ever!",
    "All Over The Country, Kids Are Getting Shocked With Tasers And Sprayed With Chemicals In School.",
    "Man Exonerated After 30 Years On Death Row.",
    "The Fight Rages On... In the GOP.",
    "Americans Work Too Hard.",
    "UNC Charlotte Students: We Protect Our Family, And We Take Sexual Assault Seriously.",
    "Obama's Next Step On Iran Deal: Selling It.",
    "Iran Eases Ban On Women Attending Sports Matches."
]
# Manually assigned label for each headline ("Information" or "Opinion").
# labels[i] is the reference label for headlines[i], so the two lists must stay
# the same length and in the same order.
labels = [
    "Information",
    "Information",
    "Opinion",
    "Opinion",
    "Information",
    "Opinion",
    "Information",
    "Information",
    "Opinion",
    "Information",
    "Information",
    "Opinion",
    "Information",
    "Opinion",
    "Information",
    "Information",
    "Opinion",
    "Opinion",
    "Information",
    "Opinion",
    "Information",
    "Information",
    "Information",
    "Information",
    "Information",
    "Opinion",
    "Opinion",
    "Opinion",
    "Information",
    "Information"
]

In [27]:
# For every headline: score it with both models, colour its tokens, decide a
# headline-level label, and render the result as HTML in the notebook.
predictions = []
for input_text in headlines:
    # Probabilities from each of the two language models
    opinion_probs = evaluate_probabilities(model_opinion, tokenizer_opinion, input_text)
    information_probs = evaluate_probabilities(model_information, tokenizer_information, input_text)

    # Reduced probabilities that drive the headline-level decision
    text_probs_opinion = aggregate_probabilities(opinion_probs)
    text_probs_information = aggregate_probabilities(information_probs)

    # Token-level colouring (token strings come from tokenizer_opinion)
    highlighted_sentence = highlight_text(input_text, tokenizer_opinion, opinion_probs, information_probs)

    # Upper-cased so predictions compare equal to the upper-cased reference labels
    classification = classify_text(text_probs_opinion, text_probs_information).upper()
    predictions.append(classification)

    # Assemble the HTML fragment for this headline
    output = "".join((
        f"<p><strong>Headline:</strong> {input_text}</p>",
        f"<p>{highlighted_sentence}</p>",
        f"<p><strong>Model Output: </strong> <em> {classification} </em></p>",
        "<hr style='border: 1px solid #ccc;' />",
    ))

    display(HTML(output))


In [28]:
# Upper-case the reference labels so they match the upper-cased predictions,
# then print scikit-learn's per-class precision/recall/F1 summary.
labels = [label.upper() for label in labels]
report = classification_report(labels, predictions)
print("Evaluataion Report\n", report)

Evaluataion Report
               precision    recall  f1-score   support

 INFORMATION       0.86      1.00      0.92        18
     OPINION       1.00      0.75      0.86        12

    accuracy                           0.90        30
   macro avg       0.93      0.88      0.89        30
weighted avg       0.91      0.90      0.90        30

