In [9]:
import torch  # Import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
from tqdm import tqdm

# Use correct model identifier
MODEL_NAME = "MoritzLaurer/deberta-v3-large-zeroshot-v1.1-all-33"

def initialize_classifier():
    try:
        # Initialize with proper configuration
        return pipeline(
            "zero-shot-classification",
            model=MODEL_NAME,
            tokenizer=MODEL_NAME,
            device=0 if torch.cuda.is_available() else -1
        )
    except Exception as e:
        print(f"Initialization error: {e}")
        return None

# Improved keyword list for relevance (US, Russia, Ukraine)
# Updated keyword list for relevance (US, Russia, Ukraine)
RELEVANCE_KEYWORDS = {
    # United States (US)
    'us', 'usa', 'u.s.', 'united states', 'america', 'american', 'washington', 'biden', 'trump', 'white house',
    'congress', 'senate', 'house of representatives', 'new york', 'california', 'florida', 'texas', 'chicago',
    'detroit', 'democrat', 'republican', 'federal reserve', 'department of state', 'pentagon', 'capitol hill',
    'president', 'supreme court', 'nato', 'united nations', 'military', 'veterans', 'nra', 'fbi', 'cia', 'nsa', 'socal',
    'midwest', 'east coast', 'west coast', 'southwest', 'great lakes',

    # Russia
    'russia', 'russian', 'moscow', 'kremlin', 'putin', 'novorossiya', 'saint petersburg', 'crimea', 'russian military',
    'soviet union', 'russian government', 'vladimir putin', 'kremlin spokesperson', 'russian foreign policy', 'russian opposition',
    'donbas', 'ukraine', 'sevastopol', 'russian navy', 'russian soldiers', 'russian troops', 'red square', 'duma', 'siberia',
    'putin regime', 'oligarchs', 'russian economy', 'russian diplomats', 'russian air force', 'russian propaganda', 'russian hackers',
    'gas pipeline', 'russian election interference',

    # Ukraine
    'ukraine', 'ukrainian', 'kyiv', 'zelensky', 'volodymyr zelensky', 'donetsk', 'kharkiv', 'dnipropetrovsk', 'crimea',
    'donbas', 'ukrainian military', 'ukrainian army', 'ukraine war', 'russia ukraine conflict', 'kiev', 'battalion', 'ukrainian president',
    'ukraine crisis', 'ukraine conflict', 'ukrainian forces', 'eastern ukraine', 'southeast ukraine', 'maidan', 'ukraine nato',
    'ukraine refugees', 'ukraine foreign policy', 'ukraine resistance', 'ukraine war crimes', 'ukraine sanctions', 'ukraine peace talks',

    # Recent Developments
    'zelenskyy', 'trump zelenskyy meeting', 'oval office confrontation', 'us ukraine relations', 'us aid to ukraine', 'peace talks russia ukraine',
    'jd vance', 'ukraine rare earth minerals', 'european support for zelenskyy', 'china reacts to trump zelenskyy meeting', 'keir starmer zelenskyy summit',
    'russia us relations', 'impeachment inquiry zelenskyy', 'ukraine nato aspirations', 'zelenskyy european tour', 'ukraine war strategy', 'zelenskyy public address'
}


def analyze_dataset(df):
    classifier = initialize_classifier()
    if not classifier:
        return df

    # Apply sentiment analysis for US, Russia, and Ukraine separately
    df['us_sentiment'], df['russia_sentiment'], df['ukraine_sentiment'] = zip(*df.progress_apply(lambda row: analyze_row(row, classifier), axis=1))
    return df

def analyze_row(row, classifier):
    text = row['cleaned_text'] or row['cleaned_title']

    if not is_us_related(text) and not is_russia_related(text) and not is_ukraine_related(text):
        return "irrelevant", "irrelevant", "irrelevant"

    # Sentiment towards the US, Russia, and Ukraine
    us_result = classify_sentiment(classifier, text, "This text expresses {} sentiment toward the United States.")
    russia_result = classify_sentiment(classifier, text, "This text expresses {} sentiment toward Russia.")
    ukraine_result = classify_sentiment(classifier, text, "This text expresses {} sentiment toward Ukraine.")

    return us_result, russia_result, ukraine_result

def classify_sentiment(classifier, text, hypothesis_template):
    try:
        result = classifier(
            text,
            candidate_labels=["favor", "oppose", "neutral"],
            hypothesis_template=hypothesis_template
        )
        return result['labels'][0]
    except Exception as e:
        print(f"Error during classification: {e}")
        return "error"

def is_us_related(text):
    # Ensure text is a string and handle missing values
    if not isinstance(text, str):
        text = str(text) if text is not None else ""
    return any(keyword in text.lower() for keyword in RELEVANCE_KEYWORDS if 'us' in keyword)

def is_russia_related(text):
    # Ensure text is a string and handle missing values
    if not isinstance(text, str):
        text = str(text) if text is not None else ""
    return any(keyword in text.lower() for keyword in RELEVANCE_KEYWORDS if 'russia' in keyword)

def is_ukraine_related(text):
    # Ensure text is a string and handle missing values
    if not isinstance(text, str):
        text = str(text) if text is not None else ""
    return any(keyword in text.lower() for keyword in RELEVANCE_KEYWORDS if 'ukraine' in keyword)


if __name__ == "__main__":
    # Load data with error handling
    try:
        df = pd.read_csv("thread_cleaned.csv")
        tqdm.pandas()  # Enable progress bar for apply
        print(f"Columns before processing: {df.columns.tolist()}")  # Check initial columns
        analyzed_df = analyze_dataset(df)
        print(f"Columns after processing: {analyzed_df.columns.tolist()}")  # Check columns after processing
        analyzed_df.to_csv("analyzed_sentiments.csv", index=False)
        print(analyzed_df[['cleaned_title', 'us_sentiment', 'russia_sentiment', 'ukraine_sentiment']].head(10))
    except Exception as e:
        print(f"Critical error: {e}")


Columns before processing: ['date_utc', 'timestamp', 'title', 'text', 'subreddit', 'comments', 'url', 'cleaned_title', 'cleaned_text']


Device set to use cuda:0
100%|██████████| 4423/4423 [02:42<00:00, 27.21it/s]

Columns after processing: ['date_utc', 'timestamp', 'title', 'text', 'subreddit', 'comments', 'url', 'cleaned_title', 'cleaned_text', 'us_sentiment', 'russia_sentiment', 'ukraine_sentiment']
                                       cleaned_title us_sentiment  \
0  zelensky tells bbc ukraine ready sign minerals...   irrelevant   
1  tens thousands anti government protesters hold...   irrelevant   
2                  trump rift disinformation ukraine   irrelevant   
3  u.s ukraine finalized natural resources deal k...   irrelevant   
4                               starmer trump russia   irrelevant   
5  european pravda text minerals deal agreed ukra...   irrelevant   
6                            poland film poland 1940   irrelevant   
7  lawrence trump humiliated world stage france's...   irrelevant   
8  rubio explains voted resolution condemning rus...   irrelevant   
9  robert fico ready block european summit aid uk...   irrelevant   

  russia_sentiment ukraine_sentiment  
0       ir




In [10]:
print(analyzed_df)

        date_utc     timestamp  \
0     2025-03-02  1.740955e+09   
1     2025-03-01  1.740845e+09   
2     2025-02-27  1.740677e+09   
3     2025-02-26  1.740587e+09   
4     2025-02-26  1.740575e+09   
...          ...           ...   
4418  2025-03-03  1.741019e+09   
4419  2025-03-02  1.740930e+09   
4420  2025-03-02  1.740936e+09   
4421  2025-02-28  1.740724e+09   
4422  2025-02-25  1.740492e+09   

                                                  title  \
0     Zelensky tells BBC Ukraine 'ready to sign' min...   
1     Tens of thousands of anti-government protester...   
2            Trump rift opens disinformation on Ukraine   
3     The U.S. and Ukraine have finalized a natural ...   
4          What can Starmer get out of Trump on Russia?   
...                                                 ...   
4418  UA POV: Ukrainian commander of the ground forc...   
4419  UA POV: The Lunch menu during Zelenskyy's visi...   
4420  UA POV: According to BILD, United States Presi...   
4