In [None]:
import shap
import torch
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification
import torch.nn.functional as F

# Load model and tokenizer
model_path = "../models/DeBERTa_98_only_content"

model = DebertaV2ForSequenceClassification.from_pretrained(model_path)
tokenizer = DebertaV2Tokenizer.from_pretrained(model_path)

# Move model to device (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

sample_text_center = """Besides his most recent trip to Quetta, Mr. Rahami visited Karachi, Pakistan, in 2005. Both of those cities’ reputations have become entwined with the militant groups who have sheltered there: Karachi as a haven for the Pakistani Taliban and Al Qaeda, and Quetta as the headquarters of the exiled Afghan Taliban leadership. But both cities are also home to generations of Afghans who have fled violence in their home country.
Much about his New Jersey life did seem unremarkable. Amarjit Singh, a limousine driver, was friends with Mr. Rahami at Edison High School. The person he knew, he said, was a determined student with an abundance of friends and a string of girlfriends. “Everyone seemed to like him,” he said. “Smart, funny, humble.”
He viewed the teenage Mr. Rahami as the prototypical immigrant, teetering between two worlds. While he wore jeans and sweatshirts like his friends and worked at a Pathmark supermarket after school, he preferred Afghan music and prayed at the mosque on Friday. Collisions between those worlds sometimes led to rifts with his father, who was more religious and traditional. “The two of them would argue,” Mr. Singh said. “There seemed to be a lot of tension.”
His father was especially displeased when Mr. Rahami had a daughter with a high school girlfriend, according to friends. Reached at her home on Monday night, she declined to comment. “My heart is just broken,” said the woman, who The New York Times is not identifying. “I don’t even know what to think.”
After high school, Mr. Singh said that he and Mr. Rahami had worked together for a while on the night shift at Royal Fried Chicken in Newark. Mr. Singh worked the fryer in the back. Mr. Rahami handled the register. Whenever Mr. Singh got into a dispute with customers, he remembered Mr. Rahami stepping in as the peacemaker. In recent years, the two drifted apart. Mr. Singh was also aware that Mr. Rahami had traveled abroad and that he had become more religious and had taken to wearing Muslim robes.
The events on Monday were not Mr. Rahami’s first encounter with law enforcement. He was arrested in 2014 on weapons and aggravated assault charges for allegedly stabbing a relative in the leg in a domestic incident, according to court documents. He spent over three months in jail on the charges, according to a high-ranking law enforcement official with knowledge of the investigation. A grand jury, however, declined to indict Mr. Rahami. He also spent a day in jail in February 2012 for allegedly violating a restraining order, the official said."""

sample_text_left = """The government must take immediate action to address income inequality. It is simply unacceptable that the wealthiest individuals in our society continue to amass fortunes while millions of hardworking families struggle to make ends meet. Policies such as a progressive tax system, affordable healthcare, and a living wage for all workers are not just morally right, they are necessary to create a more just society. We need to ensure that everyone has access to the resources they need to thrive, regardless of their background or economic status. Economic justice must be at the forefront of our policy agenda."""

sample_text_right = """The importance of a strong national defense cannot be overstated. We live in a world where threats to our security are real and ever-present. The government must prioritize the protection of our borders, ensure our military remains the most powerful in the world, and enforce strict immigration policies to safeguard our way of life. A strong economy and job creation should be the focus of any government, not handouts or entitlement programs. It's time to stop rewarding failure and start promoting individual responsibility and the values that have made our country great.
"""
sample_text_something_else = """The Earth's atmosphere is a complex layer of gases surrounding our planet, essential for life. It consists primarily of nitrogen (78%) and oxygen (21%), with trace amounts of carbon dioxide, argon, and other gases. This layer plays a crucial role in regulating the planet's temperature through the greenhouse effect, which traps heat and helps maintain conditions suitable for life. The atmosphere is divided into several layers: the troposphere, where weather occurs, the stratosphere, which contains the ozone layer, the mesosphere, the thermosphere, and the exosphere. The ozone layer is particularly important as it absorbs the majority of the Sun’s harmful ultraviolet radiation. Winds, weather patterns, and storms are all influenced by the interaction of these layers with the Earth's surface."""

# Tokenize input
encodings = tokenizer(sample_text_center, truncation=True, padding=True, max_length=384, return_tensors='pt').to(device)
print("Tokenized Input:", encodings)

# Get model output
with torch.no_grad():
    outputs = model(**encodings)
print("Model Output:", outputs)

# Calculate probabilities
probs = F.softmax(outputs.logits, dim=-1)
print("Probabilities:", probs)

# Get the predicted class (highest probability)
predicted_class = torch.argmax(probs, dim=-1).item()

# Mapping of class indices to labels
class_labels = ['left', 'center', 'right']

# Output the prediction
predicted_label = class_labels[predicted_class]
print(f"The model predicts the bias as: {predicted_label}")


Tokenized Input: {'input_ids': tensor([[    1,  5216,   315,   370,  1031,  1400,   264, 81346,   261,   945,
           260, 77983,  6177,  3451, 26003,   261,  4146,   261,   267,  2589,
           260,  2147,   265,   421,  2350,   276, 52868,   286,   638, 67525,
           275,   262, 20203,  1303,   328,   286, 25958,   343,   294, 26003,
           283,   266,  1791,   270,   262, 14390, 14589,   263,  2513, 25580,
           261,   263, 81346,   283,   262,  7020,   265,   262, 38145, 14402,
         14589,  2297,   260,   420,   462,  2350,   281,   327,   425,   264,
          5409,   265, 58711,   328,   286, 10752,  2742,   267,   308,   425,
           658,   260,  4801,   314,   315,   485,  3744,   432,   464,  1329,
         58632,   260, 30497, 28474,  7552,   261,   266, 30607,  1986,   261,
           284,   774,   275,   945,   260, 77983,  6177,   288, 22304,  1418,
          1059,   260,   279,   604,   313,  1322,   261,   313,   357,   261,
           284,   266

In [None]:
import shap
import torch
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification
import re
import unicodedata

# Preprocessing functions
def remove_boilerplate(text):
    boilerplate_patterns = [
        r"(?:According to|As reported by|As per)?\s?(CNN|Fox News|Reuters|BBC|The New York Times|AP|Washington Post)[,:\s]*(reports|says|states)?",
        r"(?:Reported by|From the article in)?\s?(CNN|Fox News|Reuters|BBC|The New York Times|AP|Washington Post)"
    ]
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)
    return text.strip()

def remove_accents(text):
    return ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )

def preprocess_for_bert(text):
    if not isinstance(text, str):
        return ""
    text = remove_boilerplate(text)
    text = remove_accents(text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.lower().strip()

# Model loading
model_path = "../models/DeBERTa_98_only_content"
model = DebertaV2ForSequenceClassification.from_pretrained(model_path)
tokenizer = DebertaV2Tokenizer.from_pretrained(model_path)
model.eval()

def predict(texts):
    texts = [preprocess_for_bert(text) for text in texts]
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=384)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return probs.detach().numpy()

# SHAP explainer
explainer = shap.Explainer(predict, tokenizer)

# Input text
text = "The Biden administration is delusional."

# Get SHAP values
shap_values = explainer([text])

# Plot SHAP values
shap.plots.text(shap_values[0])  # Text plot to visualize feature importance

# Display text-based SHAP values for clarity
shap_values[0].data

probs = predict([text])
predicted_class = np.argmax(probs)
class_prob = probs[0][predicted_class]
labels = ["Left", "Center", "Right"]  # Update this according to your classes
predicted_label = labels[predicted_class]

# Print predicted class and probability
print(f"Predicted Class: {predicted_label} (Probability: {class_prob:.4f})")

Predicted Class: Right (Probability: 0.9999)


In [22]:
import shap
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import unicodedata
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Preprocessing functions
def remove_boilerplate(text):
    boilerplate_patterns = [
        r"(?:According to|As reported by|As per)?\s?(CNN|Fox News|Reuters|BBC|The New York Times|AP|Washington Post)[,:\s]*(reports|says|states)?",
        r"(?:Reported by|From the article in)?\s?(CNN|Fox News|Reuters|BBC|The New York Times|AP|Washington Post)"
    ]
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)
    return text.strip()

def remove_accents(text):
    return ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )

def preprocess_for_bert(text):
    if not isinstance(text, str):
        return ""
    text = remove_boilerplate(text)
    text = remove_accents(text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.lower().strip()

# Load the model and tokenizer
model_path = "../models/DeBERTa_98_only_content"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.eval()

# Define the prediction function
def predict(texts):
    texts = [preprocess_for_bert(text) for text in texts]
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=384)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return probs.detach().numpy()

# Create SHAP explainer
explainer = shap.Explainer(predict, tokenizer)

# Define function for extracting n-grams (word combinations)
def get_ngrams(text, n=2):
    vectorizer = CountVectorizer(ngram_range=(n, n))
    ngrams = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    return feature_names

# Example text
text = """                                                           Pfizer says its COVID-19 vaccine is shown to be extremely effective in young teenagers.                                      Esteban Felix/AP                            hide caption              toggle caption              Esteban Felix/AP                                           Pfizer says its COVID-19 vaccine is shown to be extremely effective in young teenagers.                  Esteban Felix/AP                  New clinical trials showed that Pfizer's COVID-19 vaccine elicits "100% efficacy and robust antibody responses" in adolescents from 12 to 15 years old, the drug company announced Wednesday. The trial included 2,260 participants; the results are even better than earlier responses from participants ages 16 to 25.  Pfizer and its vaccine partner BioNTech said they will submit the results "as soon as possible" to the U.S. Food and Drug Administration and the European Medicines Agency, asking regulators to expand their authorizations for the vaccine's use in young people.  Pfizer will submit the data "in the coming weeks," Pfizer CEO and Chairman Albert Bourla said in a news release about the trial. Calling the results encouraging, he added that the company is acting "with the hope of starting to vaccinate this age group before the start of the next school year."      During the clinical trial, 18 people who were in the placebo group developed COVID-19, while none of the people in the vaccinated group did. Blood tests showed a strong immune system response one month after participants received the second vaccine dose, according to a summary of test data released by Pfizer.  As for potential side effects, Pfizer said the vaccine was tolerated well in the late-stage trial.  Currently, the Pfizer-BioNTech vaccine is authorized for use only in people who are at least 16. The companies, which developed the vaccine together, are working to test it in children as young as 6 months. In the first part of that study, a group of kids from 5 to 11 years old got their first shots last week. A second group, ages 2 to 5, are slated to receive their first doses next week.    Moderna, whose COVID-19 vaccine is authorized for people 18 and older in the U.S., is also testing its vaccine in adolescents; it announced a trial of around 3,000 participants from 12 to 18 years old in December. Moderna also said earlier this month it had administered the first doses of its vaccine to young children in a separate study that involves kids from 6 months to less than 12 years old.  Johnson & Johnson, whose vaccine got U.S. authorization one month ago, has also been moving to include children in clinical trials. The company will test the vaccine in only a small number of adolescents initially, with plans to expand the study if it is shown to be safe, according to a spokesperson at Janssen, the Johnson & Johnson subsidiary that developed the vaccine.
"""

# Extract n-grams (e.g., bigrams)
ngrams = get_ngrams(text, n=2)
print("Bigrams:", ngrams)

# Get SHAP values
shap_values = explainer([text])

# Visualize word importance and n-gram combinations
shap.plots.text(shap_values[0])

# Print predicted class and probability
class_labels = ['left', 'center', 'right']
output = predict([text])
predicted_class = np.argmax(output)
predicted_prob = np.max(output)
print(f"Predicted Class: {class_labels[predicted_class]} (Probability: {predicted_prob:.4f})")


Bigrams: ['000 participants' '100 efficacy' '11 years' '12 to' '12 years'
 '15 years' '16 the' '16 to' '18 and' '18 people' '18 years' '19 vaccine'
 '19 while' '25 pfizer' '260 participants' 'about the' 'according to'
 'acting with' 'added that' 'administered the' 'administration and'
 'adolescents from' 'adolescents initially' 'adolescents it'
 'after participants' 'age group' 'agency asking' 'ages 16' 'ages to'
 'ago has' 'albert bourla' 'also been' 'also said' 'also testing'
 'and chairman' 'and drug' 'and its' 'and older' 'and robust' 'and the'
 'announced trial' 'announced wednesday' 'antibody responses' 'ap hide'
 'ap new' 'ap pfizer' 'are at' 'are even' 'are slated' 'are working'
 'around 000' 'as for' 'as months' 'as possible' 'as soon' 'as young'
 'asking regulators' 'at janssen' 'at least' 'authorization one'
 'authorizations for' 'authorized for' 'be extremely' 'be safe'
 'been moving' 'before the' 'better than' 'biontech said'
 'biontech vaccine' 'blood tests' 'bourla said'

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [02:36, 156.47s/it]              


Predicted Class: left (Probability: 0.9997)
