In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from tqdm import tqdm

In [None]:
# Read the tweet CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/2024-us-presidential-elections-twitter-data/preprocessedtranslated_tweets_us24.csv'
)

In [None]:
# Checkpoint used for the prompt-based stance classifier.
model_name = "declare-lab/flan-alpaca-large"

# Load the seq2seq weights and the tokenizer that belongs to the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap both in a text2text-generation pipeline placed on device 0.
generator = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0,
)

In [None]:
# Create prompts in advance
def create_prompt(text):
    """Build the zero-shot stance prompt for a single tweet.

    Args:
        text: Tweet text; it is interpolated between double quotes.

    Returns:
        The instruction, the quoted tweet and the answer cue as one string.
    """
    instruction = (
        "Classify the tweet's stance towards Democrats in one of the following categories: "
        "Pro Democrat, Anti Democrat, or Neutral.\n\n"
    )
    tweet_line = f"Tweet: \"{text}\"\n\n"
    answer_cue = "Answer with only one of the above labels:"
    return "".join((instruction, tweet_line, answer_cue))

# One prompt per row, in row order.
df["Prompt"] = [create_prompt(tweet) for tweet in df["Text"]]

In [None]:
batch_size = 32
stances = []

# Canonical labels keyed by how the model's answer begins, so this column uses
# the same three labels the prompt asks for.
_STANCE_BY_PREFIX = (
    ("anti", "Anti Democrat"),
    ("pro", "Pro Democrat"),
    ("neutral", "Neutral"),
)


def _label_from_generation(generated_text):
    """Turn raw generated text into a stance label.

    Only the first whitespace-separated word is inspected. If it starts with
    "pro", "anti" or "neutral" (case-insensitive) the matching full label is
    returned. An unrecognised first word is returned unchanged so unexpected
    model output stays visible in the results. An empty generation yields
    "Neutral" instead of raising IndexError.
    """
    words = generated_text.strip().split()
    if not words:
        return "Neutral"
    first_word = words[0]
    lowered = first_word.lower()
    for prefix, label in _STANCE_BY_PREFIX:
        if lowered.startswith(prefix):
            return label
    return first_word


for i in tqdm(range(0, len(df), batch_size), desc="Classifying in Batches"):
    batch_prompts = df["Prompt"].iloc[i:i + batch_size].tolist()
    # batch_size is forwarded so the pipeline really runs the slice as one
    # batch; without it the list is processed one prompt at a time.
    outputs = generator(
        batch_prompts,
        batch_size=batch_size,
        max_length=64,
        do_sample=False,
        num_return_sequences=1,
    )
    stances.extend(_label_from_generation(out["generated_text"]) for out in outputs)

# Add stance column (copy first so the assignment does not target a slice view)
df = df.iloc[:len(stances)].copy()
df["Stance"] = stances

In [None]:
# Persist the labelled tweets, then preview the first rows.
df.to_csv(path_or_buf="withStance.csv", index=False)
print(df.loc[:, ["Text", "Stance"]].head())

## Alternative Method: Entity Matching + Sentiment Analysis

In [None]:
import pandas as pd
import re
from itertools import chain
from tqdm.auto import tqdm
from transformers import pipeline
from datasets import Dataset  # HuggingFace dataset

In [None]:
# Calls tqdm's pandas integration hook (presumably to enable `progress_apply`
# on pandas objects — TODO confirm it is still needed: no `progress_apply`
# call appears in the visible cells).
tqdm.pandas()

In [None]:
# 1) Reload the tweet CSV for this approach.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/2024-us-presidential-elections-twitter-data/preprocessedtranslated_tweets_us24.csv'
)

# 2) Initialize a sentiment-analysis pipeline
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
def generate_aliases(person):
    """List every surface form under which a person may be mentioned.

    Args:
        person: Mapping with required keys "first" and "last" and optional
            keys "nicknames" (iterable of strings) and "handle".

    Returns:
        A list in the order: full name, first name, last name, nicknames,
        handle, "@"-prefixed handle. Duplicates are not removed.
    """
    given, family = person["first"], person["last"]
    names = [f"{given} {family}", given, family]
    if "nicknames" in person:
        names += person["nicknames"]
    if "handle" in person:
        handle = person["handle"]
        names += [handle, f"@{handle}"]
    return names

# People (expanded into aliases by generate_aliases) for each side.
dem_people = [
    dict(first="Joe", last="Biden", nicknames=["JoeBiden"], handle="JoeBiden"),
    dict(first="Kamala", last="Harris", nicknames=["KamalaHarris"], handle="KamalaHarris"),
]
rep_people = [
    dict(first="Donald", last="Trump", nicknames=["Trump"], handle="realDonaldTrump"),
    dict(first="Nikki", last="Haley", nicknames=["NikkiHaley"], handle="NikkiHaley"),
    dict(first="Mike", last="Pence", nicknames=["Pence"], handle="Mike_Pence"),
    dict(first="Vivek", last="Ramaswamy", nicknames=["Vivek2024"], handle="VivekGRamaswamy"),
    dict(first="JD", last="Vance", nicknames=["JDVance"], handle="JDVance1"),
    dict(first="Robert", last="Kennedy Jr.", nicknames=["RFKJr", "Kennedy2024"], handle="RobertKennedyJr"),
]

# Entities: every alias of every person, followed by party names and slogans.
dem_entities = [alias for person in dem_people for alias in generate_aliases(person)]
dem_entities += [
    "democrat", "democrats", "Democrats", "democratic party",
    "dnc", "vote blue", "blue wave", "bidenomics",
]
rep_entities = [alias for person in rep_people for alias in generate_aliases(person)]
rep_entities += [
    "republican", "Republican", "republicans", "gop",
    "rnc", "maga", "trump2024", "drain the swamp",
]

# Regex patterns
def compile_pattern(entities):
    """Compile one case-insensitive regex that matches any entity as a whole token.

    A match must not be directly preceded or followed by a word character.
    Lookarounds are used instead of ``\\b`` because ``\\b`` can never match
    next to a non-word character, which made aliases such as "@JoeBiden"
    (after a space) or "Kennedy Jr." (before a space) unmatchable.

    Args:
        entities: Iterable of literal strings; duplicates and empty strings
            are ignored.

    Returns:
        A compiled pattern whose group 1 is the matched entity. If no usable
        entity is given the pattern never matches (an empty alternation would
        otherwise match in every text).
    """
    # Longest first so "Joe Biden" wins over "Joe"; ties are sorted
    # alphabetically to keep the pattern deterministic across runs.
    sorted_ents = sorted({ent for ent in entities if ent}, key=lambda ent: (-len(ent), ent))
    if not sorted_ents:
        return re.compile(r'(?!)')
    alternation = '|'.join(re.escape(ent) for ent in sorted_ents)
    pat = r'(?<!\w)(' + alternation + r')(?!\w)'
    return re.compile(pat, flags=re.IGNORECASE)

# One compiled matcher per side.
dem_pattern, rep_pattern = (
    compile_pattern(entity_list) for entity_list in (dem_entities, rep_entities)
)


In [None]:
def truncate_sent(text, max_len=512):
    """Cut ``text`` down to its first ``max_len`` characters.

    Non-string input (e.g. None or NaN) yields an empty string. The limit is
    counted in characters, not tokens.
    """
    if isinstance(text, str):
        # Slicing leaves a string that is already short enough unchanged.
        return text[:max_len]
    return ""

In [None]:
df = pd.read_csv('/kaggle/input/2024-us-presidential-elections-twitter-data/preprocessedtranslated_tweets_us24.csv')
ds = Dataset.from_pandas(df[['Text']])  # Just keep the needed column

# Load sentiment model (batch capable) on device 0
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=0)

# 1) Cheap character-level cut so very long texts are not tokenized in full;
#    non-string rows become "".
truncated_texts = [truncate_sent(t) for t in ds['Text']]

# 2) Run sentiment analysis in batch. truncation=True is the actual safety net:
#    a 512-character string can still tokenize to more than the model's maximum
#    sequence length (e.g. long runs of punctuation or emoji).
sentiments = sentiment_analyzer(truncated_texts, batch_size=32, truncation=True)

# 3) Store one {"label", "score"} result per row next to the text.
ds = ds.add_column("Sentiment", sentiments)

In [None]:
# 1) Update infer_stance to return a dict
def infer_stance(example, pos_threshold=0.7, neg_threshold=0.7):
    """Derive a stance towards Democrats from entity mentions and sentiment.

    Rows that mention neither side are "Neutral". Otherwise the sentiment of
    the text decides. If a Democratic entity is mentioned, confident positive
    sentiment means "Pro Democrat" and confident negative means
    "Anti Democrat". If only a Republican entity is mentioned the mapping is
    inverted. A text mentioning both sides is treated as Democrat-directed.

    The sentiment is taken from the row's precomputed "Sentiment" value when
    present (the batch pass stores it for the same first 512 characters), so
    the model is not run a second time row by row. Rows without it fall back
    to calling ``sentiment_analyzer``.

    Args:
        example: Mapping with key "Text" and optionally "Sentiment"
            (a mapping with "label" and "score").
        pos_threshold: Minimum score for a POSITIVE label to count.
        neg_threshold: Minimum score for a NEGATIVE label to count.

    Returns:
        ``{"Stance": <label>}`` so it can be used with ``Dataset.map``.
    """
    text = example['Text'] or ""  # guard against None
    mentions_dem = bool(dem_pattern.search(text))
    mentions_rep = bool(rep_pattern.search(text))

    if not (mentions_dem or mentions_rep):
        return {"Stance": "Neutral"}

    sent = example.get("Sentiment") if hasattr(example, "get") else None
    if not sent:
        # truncation=True guards against texts whose first 512 characters
        # still exceed the model's maximum token length.
        sent = sentiment_analyzer(text[:512], truncation=True)[0]
    label, score = sent['label'], sent['score']

    stance = "Neutral"
    # Here "not mentions_dem" implies mentions_rep, which flips the polarity.
    if label == "POSITIVE" and score >= pos_threshold:
        stance = "Pro Democrat" if mentions_dem else "Anti Democrat"
    elif label == "NEGATIVE" and score >= neg_threshold:
        stance = "Anti Democrat" if mentions_dem else "Pro Democrat"

    return {"Stance": stance}

# 2) Run infer_stance on every row; existing columns are kept and a
#    "Stance" column is added.
ds = ds.map(
    function=infer_stance,
    fn_kwargs=dict(pos_threshold=0.7, neg_threshold=0.7),
    remove_columns=[],
    desc="Inferring stance",
)

# 3) Copy the labels back onto the DataFrame
df['Stance'] = ds['Stance']

# 4) Show how many tweets fall into each class
print(df.Stance.value_counts())


## Zero-Shot Classification

In [None]:
# Write the current DataFrame to disk without the index.
df.to_csv(path_or_buf='stance.csv', index=False)

In [None]:
# Keep only rows with non-blank text. Missing values are turned into "" first:
# NaN survives `.str.strip()` and is truthy under `.astype(bool)`, so without
# the fillna those rows would be kept and later break the classifier.
df = df[df['Text'].fillna("").astype(str).str.strip().ne("")].reset_index(drop=True)

In [None]:
from transformers import pipeline

# The three stance labels offered to the NLI model.
candidate_labels = ["Pro Democrat", "Anti Democrat", "Neutral"]

# Zero-shot classifier placed on device 0 (a P100 according to the author).
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0,
)

# Classify every tweet; the pipeline batches internally.
results = classifier(
    df["Text"].tolist(),
    candidate_labels=candidate_labels,
    batch_size=32,
    multi_label=False,
)

# The first entry of each result's "labels" list is used as the stance.
df["Stance"] = [result["labels"][0] for result in results]


In [None]:
batch_size = 32
texts = df['Text'].tolist()
results = []

# Same classification as a manual loop so tqdm can report progress per batch.
for start in tqdm(range(0, len(texts), batch_size), desc="Zero‑Shot Batches"):
    results += classifier(
        texts[start:start + batch_size],
        candidate_labels=candidate_labels,
        multi_label=False,
        batch_size=batch_size,
    )

# Extract the top label for each result
df['Stance'] = [result['labels'][0] for result in results]

# Quick check
print(df.Stance.value_counts())


In [None]:
# Write the zero-shot labels to disk without the index.
df.to_csv(path_or_buf='stanceZeroShot.csv', index=False)

## Using the CardiffNLP Twitter-RoBERTa Sentiment Model

In [None]:
import pandas as pd
import re
from itertools import chain
from transformers import pipeline
from tqdm.auto import tqdm

# 1) Load the tweets, coerce Text to str (missing -> ""), drop blank rows
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/2024-us-presidential-elections-twitter-data/preprocessedtranslated_tweets_us24.csv'
)
df = df.assign(Text=df['Text'].fillna("").astype(str))
df = df.loc[df['Text'].str.strip().astype(bool)].reset_index(drop=True)


In [None]:
def generate_aliases(person):
    """Return all name variants for one person.

    ``person`` must have "first" and "last"; "nicknames" (iterable) and
    "handle" are optional. The result is ordered: full name, first, last,
    nicknames, handle, "@handle". Duplicates are kept.
    """
    first, last = person["first"], person["last"]
    nicknames = person["nicknames"] if "nicknames" in person else []
    handles = [person["handle"], f"@{person['handle']}"] if "handle" in person else []
    return [f"{first} {last}", first, last, *nicknames, *handles]

# People (expanded into aliases by generate_aliases) for each side.
dem_people = [
    dict(first="Joe", last="Biden", nicknames=["JoeBiden"], handle="JoeBiden"),
    dict(first="Kamala", last="Harris", nicknames=["KamalaHarris"], handle="KamalaHarris"),
]
rep_people = [
    dict(first="Donald", last="Trump", nicknames=["Trump"], handle="realDonaldTrump"),
    dict(first="Nikki", last="Haley", nicknames=["NikkiHaley"], handle="NikkiHaley"),
    dict(first="Mike", last="Pence", nicknames=["Pence"], handle="Mike_Pence"),
    dict(first="Vivek", last="Ramaswamy", nicknames=["Vivek2024"], handle="VivekGRamaswamy"),
    dict(first="JD", last="Vance", nicknames=["JDVance"], handle="JDVance1"),
    dict(first="Robert", last="Kennedy Jr.", nicknames=["RFKJr", "Kennedy2024"], handle="RobertKennedyJr"),
]

# Entities: every alias of every person, followed by party names and slogans.
dem_entities = [alias for person in dem_people for alias in generate_aliases(person)]
dem_entities += [
    "democrat", "democrats", "Democrats", "democratic party",
    "dnc", "vote blue", "blue wave", "bidenomics",
]
rep_entities = [alias for person in rep_people for alias in generate_aliases(person)]
rep_entities += [
    "republican", "Republican", "republicans", "gop",
    "rnc", "maga", "trump2024", "drain the swamp",
]

# Regex patterns
def compile_pattern(entities):
    """Compile one case-insensitive regex that matches any entity as a whole token.

    A match must not be directly preceded or followed by a word character.
    Lookarounds are used instead of ``\\b`` because ``\\b`` can never match
    next to a non-word character, which made aliases such as "@JoeBiden"
    (after a space) or "Kennedy Jr." (before a space) unmatchable.

    Args:
        entities: Iterable of literal strings; duplicates and empty strings
            are ignored.

    Returns:
        A compiled pattern whose group 1 is the matched entity. If no usable
        entity is given the pattern never matches (an empty alternation would
        otherwise match in every text).
    """
    # Longest first so "Joe Biden" wins over "Joe"; ties are sorted
    # alphabetically to keep the pattern deterministic across runs.
    sorted_ents = sorted({ent for ent in entities if ent}, key=lambda ent: (-len(ent), ent))
    if not sorted_ents:
        return re.compile(r'(?!)')
    alternation = '|'.join(re.escape(ent) for ent in sorted_ents)
    pat = r'(?<!\w)(' + alternation + r')(?!\w)'
    return re.compile(pat, flags=re.IGNORECASE)

# One compiled matcher per side.
dem_pattern, rep_pattern = (
    compile_pattern(entity_list) for entity_list in (dem_entities, rep_entities)
)


In [None]:
# Sentiment pipeline for the CardiffNLP Twitter RoBERTa checkpoint, on device 0.
sentiment_pipe = pipeline(
    task="sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=0,
)

# 4) Batch infer + map to stance
texts = df['Text'].tolist()
stances = []
batch_size = 32

In [None]:
# 1) Update infer_stance to return a dict
def infer_stance(example, pos_threshold=0.7, neg_threshold=0.7):
    """Derive a stance towards Democrats using the CardiffNLP sentiment model.

    Rows that mention neither side are "Neutral". Otherwise the sentiment of
    the text decides. If a Democratic entity is mentioned, confident positive
    sentiment means "Pro Democrat" and confident negative means
    "Anti Democrat". If only a Republican entity is mentioned the mapping is
    inverted. A text mentioning both sides is treated as Democrat-directed.

    This version scores with ``sentiment_pipe``, the pipeline created in this
    section, rather than the SST-2 ``sentiment_analyzer`` from the earlier
    section. The checkpoint's raw label ids are translated to
    POSITIVE/NEUTRAL/NEGATIVE before the thresholds are applied.

    Args:
        example: Mapping with key "Text".
        pos_threshold: Minimum score for a positive prediction to count.
        neg_threshold: Minimum score for a negative prediction to count.

    Returns:
        ``{"Stance": <label>}`` so it can be used with ``Dataset.map``.
    """
    # cardiffnlp/twitter-roberta-base-sentiment reports generic ids
    # (0 = negative, 1 = neutral, 2 = positive); other label spellings are
    # upper-cased so "positive"/"negative" style outputs work as well.
    readable_labels = {"LABEL_0": "NEGATIVE", "LABEL_1": "NEUTRAL", "LABEL_2": "POSITIVE"}

    text = example['Text'] or ""  # guard against None
    mentions_dem = bool(dem_pattern.search(text))
    mentions_rep = bool(rep_pattern.search(text))

    if not (mentions_dem or mentions_rep):
        return {"Stance": "Neutral"}

    # truncation=True guards against texts whose first 512 characters still
    # exceed the model's maximum token length.
    sent = sentiment_pipe(text[:512], truncation=True)[0]
    raw_label, score = sent['label'], sent['score']
    label = readable_labels.get(raw_label, str(raw_label).upper())

    stance = "Neutral"
    # Here "not mentions_dem" implies mentions_rep, which flips the polarity.
    if label == "POSITIVE" and score >= pos_threshold:
        stance = "Pro Democrat" if mentions_dem else "Anti Democrat"
    elif label == "NEGATIVE" and score >= neg_threshold:
        stance = "Anti Democrat" if mentions_dem else "Pro Democrat"

    return {"Stance": stance}

# 2) Build the dataset from the DataFrame cleaned in THIS section. The `ds`
#    left over from the earlier section was created from the unfiltered CSV,
#    so its rows no longer line up with `df` once blank tweets were dropped
#    (and it does not exist when this section runs on its own).
from datasets import Dataset

ds = Dataset.from_pandas(df[['Text']], preserve_index=False)

# 3) Map over the dataset to add the new column
ds = ds.map(
    infer_stance,
    fn_kwargs={"pos_threshold": 0.7, "neg_threshold": 0.7},
    remove_columns=[],     # keep all existing columns
    desc="Inferring stance"
)

# 4) Bring it back into your DataFrame (same row count and order as ds)
df['Stance'] = ds['Stance']

# 5) Inspect
print(df['Stance'].value_counts())


## Few-Shot (In-Context) Learning

In [None]:
import pandas as pd
import re
from itertools import chain
from transformers import pipeline
from tqdm.auto import tqdm


In [None]:
from datasets import Dataset

In [None]:
    import pandas as pd
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
    
    # Load the dataset for this section. Note this is a different file
    # (final_raw.csv) from the one read by the earlier sections.
    df = pd.read_csv("/kaggle/input/2024-us-presidential-elections-twitter-data/final_raw.csv")
    
    # Hand-written (tweet, label) pairs used as in-context examples.
    # The list is grouped by theme below; labels are one of
    # "Pro Democrat", "Anti Democrat" or "Neutral".
    # Order matters: the prompt builder adds examples in list order and stops
    # at the first one that no longer fits, so earlier entries are the ones
    # most likely to reach the model.
    few_shot_examples = [
    # Pro Democrat
    ("Biden’s student loan relief is a game changer for millions!", "Pro Democrat"),
    ("Finally, some real climate action from the White House.", "Pro Democrat"),
    ("Kamala Harris handled that debate like a pro. #Leadership", "Pro Democrat"),
    ("The Democrats are really pushing for universal healthcare. About time!", "Pro Democrat"),
    ("Thanks to Biden’s infrastructure bill, roads in my town are finally fixed.", "Pro Democrat"),

    # Anti Democrat
    ("Wow, another day, another tax hike. Thanks, Democrats.", "Anti Democrat"),
    ("Open borders and inflation — the Dems' legacy. #Bidenomics", "Anti Democrat"),
    ("I miss the days before Bidenomics ruined everything.", "Anti Democrat"),
    ("The left only cares about pronouns and cancel culture.", "Anti Democrat"),
    ("Why do Democrats hate energy independence so much?", "Anti Democrat"),

    # Neutral
    ("Kamala Harris speaks at Howard University tomorrow.", "Neutral"),
    ("Joe Biden is set to meet with tech leaders this afternoon.", "Neutral"),
    ("The Democratic primary results will be announced at 8 PM.", "Neutral"),
    ("New legislation introduced in Congress today.", "Neutral"),
    ("The Senate is expected to vote on the immigration bill soon.", "Neutral"),

    # Sarcasm / complex tone (positive wording, all labelled Anti Democrat)
    ("Oh great, another inspiring speech from Biden. Can’t wait.", "Anti Democrat"),
    ("Democrats are saving America, one trillion dollar bill at a time.", "Anti Democrat"),
    ("Guess who just tweeted about climate change again? Kamala. Yay.", "Anti Democrat"),
    ("Because banning gas stoves will totally fix everything. Thanks Dems!", "Anti Democrat"),

    # High sentiment intensity (mixed labels)
    ("Absolutely love how the Biden admin is prioritizing green energy!", "Pro Democrat"),
    ("Sick and tired of Democrats ruining the country!", "Anti Democrat"),
    ("So many lies from both parties. I’m done.", "Neutral"),
    ("Democrats are doing amazing work on healthcare reform.", "Pro Democrat"),
    ("No comment on the Dems, but I’ll just say... wow.", "Neutral"),
]


In [None]:
MAX_TOKENS = 512


def create_few_shot_prompt_safe(text, examples, tokenizer, max_tokens=MAX_TOKENS):
    """Build a few-shot stance prompt that stays within a token budget.

    Examples are added in the given order until the next one would push the
    COMPLETE prompt (header + examples + final instruction + tweet) past
    ``max_tokens``. The final instruction is included in the budget check, so
    the finished prompt cannot end up longer than the limit that was tested
    against.

    Args:
        text: The tweet to classify.
        examples: Iterable of ``(tweet, label)`` pairs.
        tokenizer: Callable returning a mapping with an "input_ids" sequence
            for a string.
        max_tokens: Token budget for the whole prompt; defaults to MAX_TOKENS.

    Returns:
        The prompt string. If the tweet plus instruction alone exceed the
        budget, the prompt is returned without any examples.
    """
    prompt = "Classify the stance towards Democrats in these examples:\n"
    # Fixed tail of every prompt; it must be counted in the budget as well.
    query = (
        "Now classify this new tweet. Only respond with Pro Democrat, Anti Democrat, or Neutral.\n"
        f"Tweet: {text}\nStance:"
    )
    for ex_text, ex_label in examples:
        example = f"Tweet: {ex_text}\nStance: {ex_label}\n\n"
        candidate = prompt + example
        if len(tokenizer(candidate + query)["input_ids"]) > max_tokens:
            break
        prompt = candidate
    return prompt + query

In [None]:
import tqdm

In [None]:
# Checkpoint used for the few-shot stance classifier.
model_name = "declare-lab/flan-alpaca-large"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Generator pipeline on device 0
generator = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0,
)

# The preceding `import tqdm` cell rebinds the name `tqdm` to the module, which
# is not callable; import the progress-bar class again so the loop below works.
from tqdm.auto import tqdm

# Build prompts with truncation
df["prompt"] = df["Text"].apply(lambda x: create_few_shot_prompt_safe(x, few_shot_examples, tokenizer))

# Convert to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(df[["prompt"]])

# Read the prompt column once; indexing the dataset column inside the loop
# would rebuild the whole column for every batch.
prompts = list(hf_dataset["prompt"])

# Run generation batch by batch with a tqdm progress bar
outputs = []
batch_size = 16
for i in tqdm(range(0, len(prompts), batch_size), desc="Classifying Tweets"):
    batch_prompts = prompts[i:i+batch_size]
    # batch_size is forwarded so the pipeline really runs the slice as one
    # batch; without it the list is processed one prompt at a time.
    output = generator(
        batch_prompts,
        batch_size=batch_size,
        max_length=15,
        do_sample=False,
        num_beams=3,
        early_stopping=True,
    )
    # Keep only the first line of each generation.
    outputs.extend(out["generated_text"].strip().split("\n")[0] for out in output)

# Normalize labels
label_map = {
    "pro": "Pro Democrat",
    "anti": "Anti Democrat",
    "neutral": "Neutral",
    "democrat": "Pro Democrat",
    "against": "Anti Democrat"
}

# Ordered keyword rules. "anti"/"against" are checked first because
# "anti democrat" also contains the keyword "democrat".
_STANCE_RULES = (
    (re.compile(r"\b(?:anti|against)\b"), label_map["anti"]),
    (re.compile(r"\bneutral\b"), label_map["neutral"]),
    (re.compile(r"\b(?:pro|democrats?)\b"), label_map["pro"]),
)


def _normalize_stance(raw_output):
    """Map one raw model output to a canonical stance label.

    Matching is case-insensitive and on whole words, so trailing punctuation
    or a leading "Stance:" does not turn a recognisable answer into the
    fallback. Output containing none of the keywords yields "Neutral".
    """
    lowered = str(raw_output).lower()
    for pattern, label in _STANCE_RULES:
        if pattern.search(lowered):
            return label
    return "Neutral"


# A plain list is assigned by position; a length mismatch with df raises
# instead of being silently filled with NaN through index alignment.
df["stance"] = [_normalize_stance(raw) for raw in outputs]

# Save final CSV (optional)


In [None]:
# Write the few-shot results to disk without the index.
df.to_csv(path_or_buf="inshot3.csv", index=False)