In [None]:
pip install openai transformers-interpret


Collecting transformers-interpret
  Downloading transformers_interpret-0.10.0-py3-none-any.whl.metadata (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting captum>=0.3.1 (from transformers-interpret)
  Downloading captum-0.8.0-py3-none-any.whl.metadata (26 kB)
Collecting numpy<2.0 (from captum>=0.3.1->transformers-interpret)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting jedi>=0.16 (from ipython<8.0.0,>=7.31.1->transformers-interpret)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.10->captum>=0.3.1->transformers-interpret)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-

In [None]:
# Mount Google Drive at /content/drive (Colab only); the model checkpoint used below
# is read from a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import openai
import torch
import os
from transformers_interpret import SequenceClassificationExplainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint location: replace with your model path or a Hugging Face model id.
model_name = "/content/drive/MyDrive/new_model/"

# Tokenizer and classifier are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Prefer the GPU when one is available, otherwise stay on the CPU, then switch to eval mode.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Attribution explainer shared by the functions defined below.
cls_explainer = SequenceClassificationExplainer(model, tokenizer)

#Collapse WordPiece fragments ("##xyz") into whole words and add up their scores
def merge_subword_attributions(attributions):
    """Merge sub-word tokens into words, summing the attribution of each fragment.

    Args:
        attributions: Iterable of (token, score) pairs. A token that starts with
            "##" is treated as a continuation of the word before it.

    Returns:
        List of (word, score) tuples in input order. Every "##" marker is removed
        from the text, and words that end up empty are dropped.
    """
    groups = []  # one mutable [word, score] entry per word under construction

    for token, score in attributions:
        fragment = token.replace("##", "")
        extends_previous = token.startswith("##") and groups and groups[-1][0]

        if extends_previous:
            groups[-1][0] += fragment
            groups[-1][1] += score
        else:
            groups.append([fragment, score])

    # Entries whose text is empty never make it into the result.
    return [(word, total) for word, total in groups if word]

#Decide whether the attributions are strong enough to be worth explaining
def should_explain(input_text, total_attribution_score, alpha=0.15):
    """Return whether the total attribution exceeds a length-scaled threshold.

    The threshold is `alpha` multiplied by the number of whitespace-separated
    words in `input_text`, so longer inputs need a larger total score.
    """
    word_count = len(input_text.split())
    return total_attribution_score > alpha * word_count


# Load the API key from the environment instead of embedding it in the notebook,
# so the secret never ends up in a saved cell, an output, or version control.
# Set it before running this cell, e.g. os.environ["OPENAI_API_KEY"] = getpass.getpass().
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# If no key is configured the client constructor fails immediately with a descriptive error,
# rather than every later request failing with an authentication error.
client = openai.OpenAI(api_key=OPENAI_API_KEY)


def generate_gpt_explanation(prediction, key_words, confidence):
    """Ask the chat model for a plain-language explanation of a prediction.

    Args:
        prediction: Label predicted by the classifier; interpolated into the prompt.
        key_words: Iterable of strings naming the most influential words.
        confidence: Confidence of the prediction as a fraction; it is rendered
            with the `.2%` format, so 0.97 appears as "97.00%".

    Returns:
        The text content of the first completion choice.
    """
    prompt = f"""
    The model classified this text as '{prediction}' with {confidence:.2%} confidence.
    The most important words influencing this decision were: {', '.join(key_words)}.
    Provide a simple explanation for a non-technical user.
    """

    # Use the module-level `client`, which was built from OPENAI_API_KEY. The old code
    # created a second client from `openai.api_key`, an attribute this notebook never
    # sets, so the configured key was ignored on every call.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0  # Ensures consistent responses
    )

    return response.choices[0].message.content

#Entry point: classify the text, check attribution strength, then ask GPT to explain
def explain_text(input_text, visualize=True):
    """Explain the classifier's prediction for `input_text`.

    Args:
        input_text: Text to classify and explain.
        visualize: When true (and an explanation is produced), also render the
            explainer's attribution visualisation.

    Returns:
        The GPT-generated explanation, or a fixed fallback sentence when
        should_explain() judges the attributions too weak.
    """
    # The explainer has to be called before its result attributes are read.
    cls_explainer(input_text)
    word_scores = merge_subword_attributions(cls_explainer.word_attributions)

    label = cls_explainer.predicted_class_name
    confidence = cls_explainer.pred_probs.max().item()

    strength = sum(abs(weight) for _, weight in word_scores)
    if should_explain(input_text, strength):
        # Keep the five words with the largest absolute attribution.
        ranked = sorted(word_scores, key=lambda pair: abs(pair[1]), reverse=True)
        key_words = [word for word, _ in ranked[:5]]

        explanation = generate_gpt_explanation(label, key_words, confidence)
        if visualize:
            cls_explainer.visualize()
        return explanation

    return f"The model classified this as '{label}' with {confidence:.2%} confidence. No strong influencing words detected."




In [None]:
import openai

# Read the API key from the environment instead of pasting it into the notebook,
# so the secret is never saved with the cell. Set OPENAI_API_KEY before running.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# If no key is configured the client constructor fails immediately with a descriptive error.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

def generate_gpt_explanation(prediction, key_words, confidence, max_tokens=200):
    """Ask the chat model for a short, misinformation-focused explanation.

    Args:
        prediction: Label predicted by the classifier; interpolated into the prompt.
        key_words: Iterable of strings naming the most influential words.
        confidence: Confidence of the prediction as a fraction (rendered with `.2%`).
        max_tokens: Upper bound on the length of the reply, in tokens. The prompt
            itself asks for 50-80 words; this is only a safety cap.

    Returns:
        The text content of the first completion choice.
    """
    prompt = f"""
    The model classified this text as '{prediction}' with {confidence:.2%} confidence.

    The most important words influencing this decision were: {', '.join(key_words)}.

    Provide a **concise, context-aware explanation** (50-80 words) about why these words suggest {prediction}.
    - Explain **how these words are used in misinformation**.
    - Provide **one real-world example** of misinformation using these words.
    - Compare how these words are used in **real scientific news**.
    - Make the explanation **brief and easy to understand**.
    """

    # An 80-word answer commonly needs more than 100 tokens, and the previous hard cap of
    # 100 cut replies off mid-sentence. The default now leaves headroom.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=max_tokens
    )

    return response.choices[0].message.content





In [None]:

# Example usage: the label, key words and confidence are supplied by hand here,
# so this exercises only the GPT prompt and makes no classifier call.
explanation = generate_gpt_explanation("Misinformation", ["cures", "research"], 0.97)
print(explanation)

In misinformation, the words "cures" and "research" are often used to make false claims about miracle treatments or breakthroughs that have not been scientifically proven. For example, a fake news article might claim that a certain herb cures cancer based on "research" that doesn't actually exist. In real scientific news, these words are used cautiously, with research findings undergoing rigorous testing and peer review before any claims of cures are made. Misinformation exploits these terms to spread false hope and misleading


In [None]:
def generate_explanation(input_text, visualize=True):
    """Build a template-based explanation of the classifier's prediction (no LLM call).

    Runs the attribution explainer on `input_text`, merges sub-word fragments into
    words, and reports up to three words with the largest positive attribution and
    up to two words with the largest-magnitude negative attribution.

    Args:
        input_text: Text to classify and explain.
        visualize: When true, also render the explainer's attribution visualisation.

    Returns:
        A multi-line, Markdown-formatted explanation string.
    """
    # The explainer has to be called before its result attributes are read.
    cls_explainer(input_text)
    merged_attributions = merge_subword_attributions(cls_explainer.word_attributions)

    # Strongest attributions first, so the slices below pick the most influential words.
    merged_attributions.sort(key=lambda x: abs(x[1]), reverse=True)

    predicted_label = cls_explainer.predicted_class_name
    confidence = cls_explainer.pred_probs.max().item()

    # Positive scores supported the prediction; negative scores pushed against it.
    top_positive = [word for word, score in merged_attributions if score > 0][:3]
    top_negative = [word for word, score in merged_attributions if score < 0][:2]

    # The header used to be a malformed literal (f"" **This ...), a syntax error that
    # stopped the whole cell from running.
    parts = [
        f"**This statement was classified as '{predicted_label}' with {confidence:.2%} confidence.**\n\n"
    ]

    if top_positive:
        parts.append("**The model identified the following words as important in making this decision:**\n")
        # One bullet per line; previously the entries were appended with no separator
        # and ran together on a single line.
        parts.extend(f"- **\"{word}\"** reinforced this classification.\n" for word in top_positive)

    if top_negative:
        parts.append("\n**However, the following words slightly pushed against this classification:**\n")
        parts.extend(f"- **\"{word}\"** was present but did not strongly contribute.\n" for word in top_negative)

    parts.append("\nThis explanation is based on which words the model found important in making its prediction.")

    if visualize:
        cls_explainer.visualize()

    return "".join(parts)

# Example usage: classify one sentence and print the template-based explanation
# (visualize defaults to True, so the attribution visualisation is rendered as well).
explanation = generate_explanation("Diabetes misinformation can be harmful, spreading false claims about causes, treatments, and cures.")
print(explanation)


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,LABEL_1 (1.00),LABEL_1,-0.28,"[CLS] diabetes mis ##in ##form ##ation can be harmful , spreading false claims about causes , treatments , and cure ##s . [SEP]"
,,,,


🟢 **This statement was classified as 'LABEL_1' with 99.86% confidence.**

**The model identified the following words as important in making this decision:**
- **"diabetes"** reinforced this classification.
- **"harmful"** reinforced this classification.
- **","** reinforced this classification.

**However, the following words slightly pushed against this classification:**
- **"misinformation"** was present but did not strongly contribute.
- **"cures"** was present but did not strongly contribute.

📌 *This explanation is based on which words the model found important in making its prediction.*


In [None]:
import openai
import torch
import os
from transformers_interpret import SequenceClassificationExplainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#Checkpoint to load (replace with the actual model path)
model_name = "/content/drive/MyDrive/new_model/"

#Tokenizer and classifier come from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

#Run on the GPU when available, otherwise on the CPU; eval mode for inference
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

#Transformers-Interpret explainer used by the functions below
cls_explainer = SequenceClassificationExplainer(model, tokenizer)

#Join "##" sub-word fragments into whole words and sum their scores
def merge_subword_attributions(attributions):
    """Merge sub-word tokens into words, summing the attribution of each fragment.

    Args:
        attributions: Iterable of (token, score) pairs. A token that starts with
            "##" continues the word before it.

    Returns:
        List of (word, score) tuples in input order, with every "##" marker
        removed from the text. Words that end up empty are omitted.
    """
    merged = []
    word, total = "", 0.0  # word currently being assembled and its running score

    for token, score in attributions:
        fragment = token.replace("##", "")

        if word and token.startswith("##"):
            # Continuation piece: extend the current word.
            word, total = word + fragment, total + score
            continue

        # A new word starts here; emit the finished one first (if there is one).
        if word:
            merged.append((word, total))
        word, total = fragment, score

    if word:
        merged.append((word, total))

    return merged

#Rank words by attribution strength and report the strongest ones
def extract_key_words(text, top_n=5):
    """Return the most influential words for `text` plus the total attribution.

    Args:
        text: Input passed to the attribution explainer.
        top_n: Number of highest-ranked words to return.

    Returns:
        (key_words, total_attribution_score): the `top_n` words with the largest
        absolute attribution, and the sum of absolute attributions over all words.
    """
    word_scores = merge_subword_attributions(cls_explainer(text))

    # Largest absolute attribution first.
    ranked = sorted(word_scores, key=lambda pair: abs(pair[1]), reverse=True)

    total_attribution_score = sum(abs(weight) for _, weight in ranked)
    top_words = [word for word, _ in ranked[:top_n]]

    return top_words, total_attribution_score

#Function to classify text using the fine-tuned BERT model
def classify_with_bert(text):
    """Classify `text` with the fine-tuned sequence-classification model.

    Args:
        text: Input string; it is truncated to at most 128 tokens.

    Returns:
        (predicted_label, confidence): "Misinformation" when class index 1 has the
        highest probability, otherwise "Real", together with that class's softmax
        probability as a Python float.
    """
    # Send the inputs to the device the model actually lives on instead of re-deriving
    # it. No padding is requested: a single sequence needs none, so the model no longer
    # processes 128 positions for every short input.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = torch.softmax(logits, dim=-1)[0]
    predicted_label_index = int(probabilities.argmax())
    predicted_label = "Misinformation" if predicted_label_index == 1 else "Real"

    # .item() yields a plain float, which formats, sorts and serialises without surprises.
    confidence = probabilities[predicted_label_index].item()

    return predicted_label, confidence

#Function to test multiple variations of an input
def test_multiple_cases(original_text, variations):
    """Classify each candidate text and return the result with the highest confidence.

    Args:
        original_text: Used as the only candidate when `variations` is empty.
        variations: Iterable of candidate texts to classify and score.

    Returns:
        Dict with keys "text", "predicted_label", "confidence", "key_words" and
        "total_attribution_score" for the best candidate. Candidates are ranked by
        confidence, with total attribution score as the tie-breaker.
    """
    # An empty `variations` used to fail with IndexError at results[0]; fall back to
    # the original text so there is always at least one candidate.
    candidates = list(variations) or [original_text]

    results = []
    for variation in candidates:
        predicted_label, confidence = classify_with_bert(variation)
        key_words, total_attribution_score = extract_key_words(variation)

        results.append({
            "text": variation,
            "predicted_label": predicted_label,
            "confidence": confidence,
            "key_words": key_words,
            "total_attribution_score": total_attribution_score
        })

    # max() keeps the first of equally ranked entries, which is the same element that
    # sorted(..., reverse=True)[0] selected, without sorting the whole list.
    return max(results, key=lambda x: (x["confidence"], x["total_attribution_score"]))
#Read the API key from the environment instead of pasting it into the notebook,
#so the secret is never saved with the cell. Set OPENAI_API_KEY before running.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

#Explicitly pass the API key when initializing OpenAI client; if none is configured
#the constructor fails immediately with a descriptive error
client = openai.OpenAI(api_key=OPENAI_API_KEY)

def generate_gpt_explanation(prediction, key_words, confidence, max_tokens=200):
    """Generates a user-friendly explanation using OpenAI's GPT model.

    Args:
        prediction: Label predicted by the classifier; interpolated into the prompt
            (also in lower case).
        key_words: Sequence of strings naming the most influential words. When it is
            empty no request is sent.
        confidence: Confidence of the prediction as a fraction (rendered with `.2%`).
        max_tokens: Upper bound on the length of the reply, in tokens. The prompt
            itself asks for 50-80 words; this is only a safety cap.

    Returns:
        The text content of the first completion choice, or a fixed sentence when
        `key_words` is empty.
    """
    if not key_words:
        return f"The model classified this text as '{prediction}' but did not find strong influencing words for further explanation."

    prompt = f"""
    The model classified this text as '{prediction}' with {confidence:.2%} confidence.

    The most important words influencing this decision were: {', '.join(key_words)}.

    Provide a **concise, context-aware explanation** (50-80 words) about why these words suggest {prediction}.
    - Explain **how these words are used in {prediction.lower()} content**.
    - Provide **one real-world example** of {prediction.lower()} content using these words.
    - Compare how these words are used in **real scientific news**.
    - Make the explanation **brief and easy to understand**.
    """

    # An 80-word answer commonly needs more than 100 tokens, so the former cap of 100
    # could cut the reply off mid-sentence. The default now leaves headroom.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # Keeps responses consistent
        max_tokens=max_tokens
    )

    return response.choices[0].message.content



def explain_text(original_text, variations, top_n=5):
    """Classify several candidate texts, keep the best one and explain it with GPT.

    Args:
        original_text: Forwarded to test_multiple_cases.
        variations: Candidate texts that test_multiple_cases classifies and ranks.
        top_n: Maximum number of key words to report and to send to GPT. The key
            words arrive pre-selected from test_multiple_cases, so this can shorten
            that list but not extend it.

    Returns:
        Dict with keys "prediction", "confidence" (formatted as a percentage string),
        "best_text", "key_words" and "explanation".
    """
    # Step 1: Test multiple cases and pick the best
    best_case = test_multiple_cases(original_text, variations)

    # Step 2: Extract final key words & confidence.
    # top_n used to be accepted but silently ignored; apply it to the selected words.
    key_words = best_case["key_words"][:top_n]
    prediction = best_case["predicted_label"]
    confidence = best_case["confidence"]

    # Step 3: Generate Explanation Using GPT-3.5
    gpt_explanation = generate_gpt_explanation(prediction, key_words, confidence)

    return {
        "prediction": prediction,
        "confidence": f"{confidence:.2%}",
        "best_text": best_case["text"],
        "key_words": key_words,
        "explanation": gpt_explanation
    }




In [None]:

# Example usage: the input sentence is passed as its own single variation,
# so it is the only candidate that gets classified and explained.
user_text = "Diabetes misinformation can be harmful, spreading false claims about causes, treatments, and cures."

explanation_result = explain_text(user_text, [user_text])

print(explanation_result)


{'prediction': 'Misinformation', 'confidence': '99.87%', 'best_text': 'Diabetes misinformation can be harmful, spreading false claims about causes, treatments, and cures.', 'key_words': ['misinformation', 'diabetes', 'harmful', 'cures', 'claims'], 'explanation': 'In misinformation content, words like "misinformation," "harmful," "cures," and "claims" are often used to promote false information about health topics like diabetes. For example, a misleading article might claim that a certain product can cure diabetes without scientific evidence. In contrast, real scientific news would use these words in a factual and evidence-based manner, discussing the potential harms of misinformation and the importance of evidence-based treatments for diabetes.'}


In [None]:
!pip install numpy.rec

[31mERROR: Could not find a version that satisfies the requirement numpy.rec (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for numpy.rec[0m[31m
[0m

In [None]:
import openai
import torch
import os
from transformers_interpret import SequenceClassificationExplainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint to load (replace with the actual model path)
model_name = "/content/drive/MyDrive/new_model/"

# Pick the device once: GPU when available, otherwise CPU
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# Inference only
model.eval()

# Transformers-Interpret explainer used by the functions below
cls_explainer = SequenceClassificationExplainer(model, tokenizer)

# Join "##" sub-word fragments into whole words and sum their scores
def merge_subword_attributions(attributions):
    """Merge sub-word tokens into words, summing the attribution of each fragment.

    Args:
        attributions: Iterable of (token, score) pairs. A token that starts with
            "##" continues the word before it.

    Returns:
        List of (word, score) tuples in input order, with every "##" marker
        removed from the text. Words that end up empty are omitted.
    """
    merged = []

    for token, score in attributions:
        fragment = token.replace("##", "")
        extends_previous = token.startswith("##") and bool(merged) and bool(merged[-1][0])

        if extends_previous:
            # Replace the last entry with the extended word and its updated score.
            word, total = merged.pop()
            total += score
            merged.append((word + fragment, total))
        else:
            merged.append((fragment, score))

    # Entries whose text is empty are not reported.
    return [pair for pair in merged if pair[0]]


def extract_key_words(text, top_n=5):
    """Return the `top_n` most influential words for `text` and the total attribution.

    Words are ranked by absolute attribution; the second element of the returned
    tuple is the sum of absolute attributions over all merged words.
    """
    ranked = sorted(
        merge_subword_attributions(cls_explainer(text)),
        key=lambda entry: abs(entry[1]),
        reverse=True,
    )

    strongest_words = [entry[0] for entry in ranked[:top_n]]
    total_attribution_score = sum(abs(entry[1]) for entry in ranked)

    return strongest_words, total_attribution_score

def should_explain(input_text, total_attribution_score, alpha=0.15):
    """Tell whether the attributions are strong enough to justify an explanation.

    The score must exceed `alpha` times the number of whitespace-separated words
    in `input_text`, so the bar rises with the length of the input.
    """
    minimum_score = alpha * len(input_text.split())
    return total_attribution_score > minimum_score

def classify_with_bert(text):
    """Classifies text using the BERT model.

    Args:
        text: Input string; it is truncated to at most 128 tokens.

    Returns:
        (predicted_label, confidence): "Misinformation" when class index 1 has the
        highest probability, otherwise "Real", together with that class's softmax
        probability as a Python float.
    """
    # Follow the model's own device rather than re-deriving it, and skip padding to
    # max_length: a single sequence needs none, and padding only adds wasted compute.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = torch.softmax(logits, dim=-1)[0]
    predicted_label_index = int(probabilities.argmax())
    predicted_label = "Misinformation" if predicted_label_index == 1 else "Real"

    # Return a plain float instead of a framework scalar.
    confidence = probabilities[predicted_label_index].item()

    return predicted_label, confidence

def test_multiple_cases(original_text, variations):
    """Runs the model on multiple variations of the text and selects the best prediction.

    Args:
        original_text: Used as the only candidate when `variations` is empty.
        variations: Iterable of candidate texts to classify and score.

    Returns:
        Dict with keys "text", "predicted_label", "confidence", "key_words" and
        "total_attribution_score" for the best candidate. Candidates are ranked by
        confidence, with total attribution score as the tie-breaker.
    """
    # With no variations the old code raised IndexError at results[0]; evaluate the
    # original text instead so a result is always available.
    candidates = list(variations) or [original_text]

    results = []
    for variation in candidates:
        predicted_label, confidence = classify_with_bert(variation)
        key_words, total_attribution_score = extract_key_words(variation)

        results.append({
            "text": variation,
            "predicted_label": predicted_label,
            "confidence": confidence,
            "key_words": key_words,
            "total_attribution_score": total_attribution_score
        })

    # max() returns the first of equally ranked entries, the same element that
    # sorted(..., reverse=True)[0] picked, without sorting everything.
    return max(results, key=lambda x: (x["confidence"], x["total_attribution_score"]))

# Read the API key from the environment instead of pasting it into the notebook,
# so the secret is never saved with the cell. Set OPENAI_API_KEY before running.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# If no key is configured the client constructor fails immediately with a descriptive error.
client = openai.OpenAI(api_key=OPENAI_API_KEY)


def generate_gpt_explanation(prediction, key_words, confidence, user_text, max_tokens=200):
    """Generates a user-friendly explanation using OpenAI's GPT model.

    Args:
        prediction: Label predicted by the classifier; interpolated into the prompt
            (also in lower case).
        key_words: Sequence of strings naming the most influential words. When it is
            empty no request is sent.
        confidence: Confidence of the prediction as a fraction (rendered with `.2%`).
        user_text: Sentence quoted in the prompt as the original sentence.
        max_tokens: Upper bound on the length of the reply, in tokens. The prompt
            itself asks for 50-80 words; this is only a safety cap.

    Returns:
        The text content of the first completion choice, or a fixed sentence when
        `key_words` is empty.
    """
    if not key_words:
        return f"The model classified this text as '{prediction}' but did not find strong influencing words for further explanation."

    prompt = f"""
    The model classified this text as '{prediction}' with {confidence:.2%} confidence.

    **Original sentence:** "{user_text}"

    The most important words influencing this decision were: {', '.join(key_words)}.

    Provide a **concise, context-aware explanation** (50-80 words) about why these words suggest {prediction}.
    - Explain how these words are used in {prediction.lower()} content.
    - Provide one real-world example of {prediction.lower()} content using these words.
    - Compare how these words are used in real scientific news.
    - Make the explanation brief and easy to understand.
    """

    # The prompt asks for 50-80 words, which cannot fit in the former cap of 50 tokens,
    # so every reply was cut short. The default now leaves room for the full answer.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=max_tokens
    )

    return response.choices[0].message.content

def explain_text(original_text, variations, top_n=5):
    """Runs multiple test cases on the model before selecting the best one for explanation.

    Args:
        original_text: Forwarded to test_multiple_cases.
        variations: Candidate texts that test_multiple_cases classifies and ranks.
        top_n: Maximum number of key words to report and to send to GPT. The key
            words arrive pre-selected from test_multiple_cases, so this can shorten
            that list but not extend it.

    Returns:
        Dict with keys "prediction", "confidence" (formatted as a percentage string),
        "best_text", "key_words" and "explanation". The explanation is a fixed
        sentence when should_explain() judges the attributions too weak.
    """
    best_case = test_multiple_cases(original_text, variations)

    best_text = best_case["text"]
    # top_n used to be accepted but silently ignored; apply it to the selected words.
    key_words = best_case["key_words"][:top_n]
    prediction = best_case["predicted_label"]
    confidence = best_case["confidence"]
    total_attribution_score = best_case["total_attribution_score"]

    # The score, key words and label all describe `best_text`, so the length-based
    # threshold and the sentence quoted to GPT must use that same text. Using
    # `original_text` here mixed the statistics of two different sentences.
    if should_explain(best_text, total_attribution_score):
        explanation = generate_gpt_explanation(prediction, key_words, confidence, best_text)
    else:
        explanation = "The model's attribution scores were too low for a reliable explanation."

    return {
        "prediction": prediction,
        "confidence": f"{confidence:.2%}",
        "best_text": best_text,
        "key_words": key_words,
        "explanation": explanation
    }


RuntimeError: Failed to import transformers.modeling_utils because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
No module named 'numpy.rec'

In [None]:

# Example usage: explain the best-scoring sentence among several hand-written variations.
# Note that user_text itself is not part of test_variations, so only the variations
# are classified; user_text is passed separately as the original text.
user_text = "Diabetes misinformation can be harmful, spreading false claims about causes, treatments, and cures."
test_variations = [
    "Some say diabetes can be cured with natural remedies, but is it true?",
    "Doctors warn against unverified diabetes treatments claiming to be effective.",
    "A new study suggests potential diabetes treatments, but more research is needed.",
    "False claims about diabetes cures are spreading rapidly on social media."
]

explanation_result = explain_text(user_text, test_variations)
print(explanation_result)


NameError: name 'explain_text' is not defined