In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Load the CSV file with claims and facts; the file is decoded as latin-1.
file_path = "/content/Combined.csv"
data = pd.read_csv(file_path, encoding='latin-1')

# Single source of truth for the checkpoint, so the tokenizer and the model
# are always loaded from the same repository.
MODEL_ID = "microsoft/Phi-3-mini-128k-instruct"

# Prefer the GPU, but fall back to the CPU instead of requiring CUDA to be
# present on the machine that runs this script.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=device,
    torch_dtype="auto",
    # NOTE(review): this allows code shipped with the checkpoint to run
    # locally; keep it only while the checkpoint source is trusted.
    trust_remote_code=True,
)

# Set up the pipeline for text generation
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Sampling parameters shared by every generation call.
generation_args = {
    "max_new_tokens": 750,  # Leaves room for the longer narratives
    "temperature": 0.7,  # Moderate randomness for more controlled outputs
    "do_sample": True,
    "top_p": 0.85,
    "top_k": 60,
}

# Function to generate a structured counter-narrative for each claim using its fact
def generate_counter_narrative(
    claim,
    fact,
    *,
    min_words=200,
    max_words=400,
    max_attempts=2,
    generator=None,
    gen_kwargs=None,
):
    """Generate a fact-based counter-narrative for one claim.

    Args:
        claim: The claim text to rebut.
        fact: The fact used to rebut the claim.
        min_words: Smallest acceptable word count of the narrative. The same
            value is quoted in the prompt so the request and the check agree.
        max_words: Largest acceptable word count of the narrative.
        max_attempts: Maximum number of generations to try. The default of 2
            matches the previous "generate once, retry once" behaviour.
        generator: Callable invoked as ``generator(prompt, **kwargs)`` that
            returns ``[{"generated_text": ...}]``. Defaults to the
            module-level ``pipe``.
        gen_kwargs: Keyword arguments forwarded to ``generator``. Defaults to
            the module-level ``generation_args``.

    Returns:
        The stripped generated text. The first attempt whose word count lies
        in ``[min_words, max_words]`` is returned; if no attempt qualifies,
        the last attempt is returned. An empty string is returned, without
        calling the generator, when the claim or the fact is missing or blank.
    """
    # Missing CSV cells arrive as NaN/None; formatting them into the prompt
    # would ask the model to rebut the literal text "nan".
    if any(pd.isna(value) or not str(value).strip() for value in (claim, fact)):
        return ""

    if generator is None:
        generator = pipe
    if gen_kwargs is None:
        gen_kwargs = generation_args

    # Ask the pipeline for the completion only, instead of slicing the prompt
    # off the front of the returned text by character count.
    call_kwargs = {**gen_kwargs, "return_full_text": False}

    # Phi-3 chat layout: system turn, user turn, then an open assistant turn.
    # The task instructions belong to the user turn; placing them inside a
    # closed assistant turn presents them as something the model already said.
    prompt = (
        "<|system|>\n"
        "You are a helpful assistant who provides fact-based counter-narratives "
        "for claims related to America's election stories.<|end|>\n"
        "<|user|>\n"
        f"Claim: {claim}\n"
        f"Fact: {fact}\n"
        f"The counter-narrative should be between {min_words}-{max_words} words, "
        "using a structured approach:\n"
        "- Start with an introduction that addresses the claim directly.\n"
        "- Use the fact to create a body that dismantles the claim logically.\n"
        "- Conclude by reinforcing the counter-narrative's position, "
        "avoiding any unsupported assumptions.<|end|>\n"
        "<|assistant|>\n"
    )

    counter_narrative = ""
    # Always generate at least once, even if max_attempts is given as 0.
    for _ in range(max(1, max_attempts)):
        output = generator(prompt, **call_kwargs)
        counter_narrative = output[0]["generated_text"].strip()
        if min_words <= len(counter_narrative.split()) <= max_words:
            break

    return counter_narrative

# Build the counter-narrative for one row from its 'Claim' and 'Fact' columns.
def _narrative_for_row(row):
    return generate_counter_narrative(row['Claim'], row['Fact'])


# Run the generator over every row and store the results in a new column.
data['Counter-Narrative'] = data.apply(_narrative_for_row, axis=1)

# Write the augmented table, without the index column, to a new CSV file.
output_file = "Combined_counter_narratives.csv"
data.to_csv(output_file, index=False)

print(f"Counter-narratives generated and saved to {output_file}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Counter-narratives generated and saved to Combined_counter_narratives.csv
