# Stance Classification for PhemeRumors
- read in the data and pre-process
- set up an LLM
- Investigate different prompting schemes
    - just context
    - context + few-shot learning
    - content + few-shot learning + reasoning

In [2]:
# Package installations to work on WIRE

! pip install transformers
! pip install langchain
! pip install accelerate
! pip install einops
! pip install deepspeed

Collecting transformers
  Using cached transformers-4.33.1-py3-none-any.whl (7.6 MB)
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Using cached huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2023.8.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (771 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting safetensors>=0.3.1 (from transformers)
  Using cached safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Installing collected packages: tokenizers, safetensors, regex, huggingface-hub, transformers
Successfully installed huggingface-hub-0.17.1 regex-2023.8.8 safetensors-0.3.3 tokenizers-0.13.3 transformers-4.33.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m

In [2]:
import os, re, pandas as pd, numpy as np, ast, json
from pprint import pprint
from tqdm import tqdm  

import torch
from langchain import PromptTemplate, FewShotPromptTemplate, HuggingFacePipeline, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import deepspeed

from sklearn.metrics import classification_report

from matplotlib import pyplot as plt
import seaborn as sns

2023-09-12 21:21:45.801333: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2023-09-12 21:21:47,695] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


# 1. Import and Preprocess Data

In [3]:
file = os.path.join("phemerumours","data_merged.csv")

In [4]:
df = pd.read_csv(file)

In [5]:
df.head()

Unnamed: 0,tweet_id,stance,event,full_text
0,576755174531862529,agree,putinmissing,Coup? RT @jimgeraghty: Rumors all Russian mili...
1,576319832800555008,agree,putinmissing,Hoppla! @L0gg0l: Swiss Rumors: Putin absence d...
2,576513463738109954,disagree,putinmissing,Putin reappears on TV amid claims he is unwell...
3,552783667052167168,agree,charliehebdo,France: 10 people dead after shooting at HQ of...
4,552793679082311680,agree,charliehebdo,"11 confirmed dead, Francois Hollande to visit ..."


In [6]:
df.shape

(2859, 4)

In [7]:
# Define a dictionary with the original values as keys and new values as corresponding values  
event_dict = {  
    'putinmissing': "Russian President Putin has gone missing",   
    'charliehebdo': "there was a shooting event at Charlie Hebdo in Paris",   
    'prince-toronto': "singer Prince will play a secret show in Toronto",   
    'ferguson': "There is unrest in Ferguson, Missouri",   
    'germanwings-crash': "A Germanwings plane crashed",   
    'ottawashooting': "There was a shooting event in Ottawa",   
    'sydneysiege': "There is a hostage situation in Sydney",   
    'ebola-essien': "Michael Essien contracted Ebola"  
}  
  
# Use the map function to replace the values  
df['event'] = df['event'].map(event_dict)  

# 2. Connect to LLM
- for standing up a smaller huggingface model
```python
llm = HuggingFacePipeline.from_model_id(model_id="declare-lab/flan-alpaca-gpt4-xl", task = 'text2text-generation', device=0,
                                      model_kwargs={"max_length":500, "do_sample":False})
```
- for a mid-sized, more modern, huggingface model. You can use accelerate and chance ``` device = "auto"``` to use multiple GPUs
```python
model = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map=0,
    max_length=200,
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

llm = HuggingFacePipeline(pipeline=pipe)
```


In [8]:
model = "google/flan-ul2"

tokenizer = AutoTokenizer.from_pretrained(model)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    device_map="auto",
    max_length=1000,
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)

llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

look at a few examples

In [9]:
question = '''What is the stance of of the following social media post toward the U.S. Army. Give the stance as either 'FOR', "AGAINST', or 'NEUTRAL'. Only return the stance:
"@vondeveen If the Army wants to actually recruit people, maybe stop breaking people and actually prosecute sexual assualt #nomorewar."
'''
llm(question)

'AGAINST'

In [10]:
question = '''What is the stance of of the following social media post toward the U.S. Army. Give the stance as either 'FOR', "AGAINST', or 'NEUTRAL'. Only return the stance:
"@artfulask I have never seen a pink-eared duck before. #Army"
'''
llm(question)

'NEUTRAL'

In [11]:
question = '''What is the stance of of the following social media post toward the U.S. Army. Give the stance as either 'FOR', "AGAINST', or 'NEUTRAL'. Only return the stance:
"I think the @Army helped me become disciplined. I would have surely flunked out of college chasing tail if I didn't get some discipline there. #SFL"
'''
llm(question)

'FOR'

# 3. Prompt Engineering for Stance Classification

## 3(a). Task-only prompt

In [12]:
# task-only prompt

task_template = '''
Classify the statement as to whether it "SUPPORTS", "DENIES", or is "NEUTRAL". Only return the classification label for the statement, and no other text.

statement: {statement}
'''

task_prompt = PromptTemplate(
    input_variables=["statement"],
    template=task_template
)

### Run an example

In [13]:
# Create a sample for all examples

random_sample_df = df.sample(50)
y_true = random_sample_df['stance'].tolist()

In [14]:
llm_chain = LLMChain(prompt=task_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [15]:
# converting to given labels for comparison

y_true = [str.lower(i) for i in y_true]
y_pred = [text.lower().replace('denies', 'disagree').replace('supports', 'agree').replace("neutrals", "neutral") for text in results]

# generating the classification report
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

       agree       0.00      0.00      0.00         4
    disagree       0.00      0.00      0.00         0
     neutral       0.90      0.80      0.85        46

    accuracy                           0.74        50
   macro avg       0.30      0.27      0.28        50
weighted avg       0.83      0.74      0.78        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Run on all Data

In [30]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2859it [05:32,  8.61it/s]


In [31]:
np.unique(results, return_counts=True)

array(['', 'DENIES', 'NEUTRAL', 'STRESS', 'SUPPORTS', 'Support',
       'Supports',
       'The statement "I WILL KEEP THE BROWN FAMILY IN PRAYERS!" is classified as "SUPPORTS".',
       'The statement "SUPPORTS" the statement.', 'X'], dtype='<U85')

In [32]:
output_list = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'support' in lower_word:  
        output_list.append('agree')  
    elif 'neg' in lower_word or 'denies' in lower_word:  
        output_list.append('disagree')  
    elif 'neutral' in lower_word:  
        output_list.append('neutral')  
    else:  
        output_list.append('neutral')
        
df['task_preds'] = output_list

In [33]:
np.unique(df['task_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([ 323,  532, 2004]))

In [34]:
report = classification_report(df['stance'], df['task_preds'])

print(report)

              precision    recall  f1-score   support

       agree       0.03      0.05      0.04       238
    disagree       0.01      0.33      0.01         9
     neutral       0.89      0.68      0.77      2612

    accuracy                           0.63      2859
   macro avg       0.31      0.35      0.27      2859
weighted avg       0.81      0.63      0.71      2859



## 3(b). Context prompt

In [20]:
# context prompt

context_template = '''
The following statement is a social media post commenting on whether a rumor is true. Classify the statement as to whether it "SUPPORTS", "DENIES", or is "NEUTRAL" toward the rumor below being true. Only return the classification label for the statement toward the rumor being true, and no other text.

rumor: {event}
statement: {statement}
'''

context_prompt = PromptTemplate(
    input_variables=["event","statement"],
    template=context_template
)

### Run an example

In [21]:
llm_chain = LLMChain(prompt=context_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [22]:
# converting to given labels for comparison

y_true = [str.lower(i) for i in y_true]
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'support' in lower_word:  
        y_pred.append('agree')  
    elif 'neg' in lower_word or 'denies' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')

# generating the classification report  
report = classification_report(y_true, y_pred)  
  
print(report)  

              precision    recall  f1-score   support

       agree       0.14      0.50      0.22         2
    disagree       0.00      0.00      0.00         0
     neutral       0.96      0.46      0.62        48

    accuracy                           0.46        50
   macro avg       0.37      0.32      0.28        50
weighted avg       0.92      0.46      0.60        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Run on all Data

In [39]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2859it [06:02,  7.89it/s]


In [40]:
np.unique(results, return_counts=True)

array(['', 'DENIES', 'MT', 'NEUTRAL', 'STRESS!', 'SUPPORTS', 'Supports',
       'The statement "SUPPORTS" the rumor that Prince will play a secret show in Toronto.',
       'The statement "SUPPORTS" the rumor that a Germanwings plane crashed.',
       'The statement "SUPPORTS" the rumor that there is a hostage situation in Sydney.',
       'The statement "SUPPORTS" the rumor that there was a shooting event at Charlie Hebdo in Paris.',
       'The statement "SUPPORTS" the rumor that there was a shooting event in Ottawa.',
       'X'], dtype='<U93')

In [41]:
output_list = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'support' in lower_word:  
        output_list.append('agree')  
    elif 'neg' in lower_word or 'denies' in lower_word:  
        output_list.append('disagree')  
    elif 'neutral' in lower_word:  
        output_list.append('neutral')  
    else:  
        output_list.append('neutral')
        
df['context_preds'] = output_list

In [42]:
np.unique(df['context_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([ 262,  254, 2343]))

In [43]:
report = classification_report(df['stance'], df['context_preds'])

print(report)

              precision    recall  f1-score   support

       agree       0.23      0.25      0.24       238
    disagree       0.01      0.33      0.02         9
     neutral       0.93      0.83      0.88      2612

    accuracy                           0.78      2859
   macro avg       0.39      0.47      0.38      2859
weighted avg       0.86      0.78      0.82      2859



## 3(c). Context + FSP

In [23]:
# Create an example template

example_template = '''
rumor: {rumor}
statement: {statement}
stance: {stance}
'''

example_prompt = PromptTemplate(
    input_variables=["rumor","statement", "stance"],
    template=example_template
)

In [24]:
# Give some examples

examples = [
    {'rumor':"Putin has gone missing",
     'statement':"Putin reappears on TV amid claims he is unwell and under threat of coup http://t.co/YZln23EUx1 http://t.co/ZsAnBa5gz3",
     'stance': 'DENIES'},
    {'rumor':"Michael Essien contracted Ebola",
     'statement': '''What? "@FootballcomEN: Unconfirmed reports claim that Michael Essien has contracted Ebola. http://t.co/GsEizhwaV7"''',
     'stance': 'NEUTRAL'},
    {'rumor':"A Germanwings plane crashed",
     'statement': '''@thatjohn @planefinder why would they say urgence in lieu of mayday which is standard ?''',
     'stance': 'NEUTRAL'},
    {'rumor':"There is a hostage situation in Sydney",
     'statement': '''@KEEMSTARx dick head it's not confirmed its Jihadist extremists. Don't speculate''',
     'stance': 'NEUTRAL'},
    {'rumor':"singer Prince will play a secret show in Toronto",
     'statement': '''OMG. #Prince rumoured to be performing in Toronto today. Exciting!''',
     'stance': 'SUPPORTS'}
]

In [25]:
prefix = """
The following are social media posts commenting on whether a rumor is true. Each statement can either 'SUPPORT', be 'AGAINST', or be 'NEUTRAL' toward their associated rumor.
"""

suffix = '''
Now, classify the following statement as to whether it "SUPPORTS", "DENIES", or is "NEUTRAL" toward the rumor below being true. Only return the classification label for the statement toward the rumor, and no other text.

rumor: {event}
statement: {statement}
'''

few_shot_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["event", "statement"],
    example_separator="\n"
)

### Run an example

In [26]:
llm_chain = LLMChain(prompt=few_shot_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [27]:
# converting to given labels for comparison

y_true = [str.lower(i) for i in y_true]
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'support' in lower_word:  
        y_pred.append('agree')  
    elif 'denies' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')

# generating the classification report
report = classification_report(y_true, y_pred)
  
print(report)

              precision    recall  f1-score   support

       agree       0.14      0.50      0.22         2
    disagree       0.00      0.00      0.00         0
     neutral       0.97      0.79      0.87        48

    accuracy                           0.78        50
   macro avg       0.37      0.43      0.37        50
weighted avg       0.94      0.78      0.85        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Run on all Data

In [80]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2859it [09:02,  5.27it/s]


In [81]:
np.unique(results, return_counts=True)

(array(["'AGAINST'", "'NEUTRAL'", "'SUPPORTS'", 'AGAINST', 'DENIES',
        'NEUTRAL', 'Support', 'Supports', 'X'], dtype='<U10'),
 array([ 275,    1,   67,   74,   45, 2256,   63,   76,    2]))

In [82]:
output_list = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'support' in lower_word:  
        output_list.append('agree')  
    elif 'denies' in lower_word:  
        output_list.append('disagree')  
    elif 'neutral' in lower_word:  
        output_list.append('neutral')  
    else:  
        output_list.append('neutral')
        
df['fsp_preds'] = output_list

In [83]:
np.unique(df['fsp_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([ 206,   45, 2608]))

In [84]:
report = classification_report(df['stance'], df['fsp_preds'])

print(report)

              precision    recall  f1-score   support

       agree       0.21      0.18      0.20       238
    disagree       0.04      0.22      0.07         9
     neutral       0.92      0.92      0.92      2612

    accuracy                           0.86      2859
   macro avg       0.39      0.44      0.40      2859
weighted avg       0.86      0.86      0.86      2859



## 3(d). Context + FSP + Reasoning

In [16]:
# Create an example template

example_and_reason_template = '''
rumor: {rumor}
statement: {statement}
stance: {stance}
reason: {reason}
'''

example_and_reason_prompt = PromptTemplate(
    input_variables=["rumor","statement", "stance", "reason"],
    template=example_and_reason_template
)

In [17]:
# Give some examples

examples = [
    {'rumor':"Putin has gone missing",
     'statement':"Putin reappears on TV amid claims he is unwell and under threat of coup http://t.co/YZln23EUx1 http://t.co/ZsAnBa5gz3",
     'stance': 'DENIES',
     'reason': "the statement denies that Putin has gone missing by saying he has reappeared on TV."
    },
    {'rumor':"Michael Essien contracted Ebola",
     'statement': '''What? "@FootballcomEN: Unconfirmed reports claim that Michael Essien has contracted Ebola. http://t.co/GsEizhwaV7"''',
     'stance': 'NEUTRAL',
     'reason': "the statement is neutral toward wether Michael Essen contracted Ebola, as it mostly just repeats the original post from @FootballcomEN while asking for more information."
    },
    {'rumor':"A Germanwings plane crashed",
     'statement': '''@thatjohn @planefinder why would they say urgence in lieu of mayday which is standard ?''',
     'stance': 'NEUTRAL',
     'reason': "the statement is neutral toward wether a germanwings plane crashed as it is only asking for clarifiying details about what has been reported."
    },
    {'rumor':"There is a hostage situation in Sydney",
     'statement': '''@KEEMSTARx dick head it's not confirmed its Jihadist extremists. Don't speculate''',
     'stance': 'NEUTRAL',
     'reason': "the statement is neutral toward wether there is a hostage siutation happening in Sydney as it is admonishing someone for speculating on a detail of the situation."
    },
    {'rumor':"singer Prince will play a secret show in Toronto",
     'statement': '''OMG. #Prince rumoured to be performing in Toronto today. Exciting!''',
     'stance': 'SUPPORTS',
     'reason': 'The statement supports the rumor that the singer Prince performing a show in Toronto, as it expresses excitment at the singer performing.'
    }
]

In [18]:
prefix = """
The following are social media posts commenting on whether a rumor is true. Each statement can support, deny, or be neutral toward its associated rumor and Each statement has the reason for its stance toward the rumor.
"""

suffix = '''
Now, classify the following statement as to whether it "SUPPORTS", "DENIES", or is "NEUTRAL" toward the rumor below being true, and give your reason for the classification. Only return the classification for the statement towards the rumor and the reasoning for the classification in the form of:'stance: STANCE, reason: REASON'

rumor: {event}
statement: {statement}
'''

few_shot_and_reason_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_and_reason_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["event", "statement"],
    example_separator="\n"
)

### Run an example

In [None]:
llm_chain = LLMChain(prompt=few_shot_and_reason_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [None]:
y_pred = []  
reasons = []  
  
for result in results:  
    split_result = result.split('reason:')  
    if len(split_result) == 2:  
        # Extract stance and reason, remove leading/trailing whitespace and convert stance to lowercase  
        stance, reason = split_result
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reason = reason.strip()
        reasons.append(reason)
    else:
        # model did not give a reason
        stance = split_result[0]
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reasons.append('')
    if 'for' in stance:  
        y_pred.append('agree')  
    elif 'against' in stance:  
        y_pred.append('disagree')  
    elif 'neutral' in stance:  
        y_pred.append('neutral')
    else:  
        y_pred.append('neutral')

y_true = [str.lower(i) for i in y_true]

# generating the classification report
report = classification_report(y_true, y_pred)
print(report)

### Run on all Data

In [98]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2859it [43:25,  1.10it/s]


In [99]:
y_pred = []  
reasons = []  
  
for result in results:  
    split_result = result.split('reason:')  
    if len(split_result) == 2:  
        # Extract stance and reason, remove leading/trailing whitespace and convert stance to lowercase  
        stance, reason = split_result
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reason = reason.strip()
        reasons.append(reason)
    else:
        # model did not give a reason
        stance = split_result[0]
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reasons.append('')
    if 'for' in stance:  
        y_pred.append('agree')  
    elif 'against' in stance:  
        y_pred.append('disagree')  
    elif 'neutral' in stance:  
        y_pred.append('neutral')
    else:  
        y_pred.append('neutral')
        
df['fsp_reason_preds'] = y_pred
df['fsp_reason_reasons'] = reasons

In [100]:
np.unique(df['fsp_reason_preds'], return_counts=True)

(array(['neutral'], dtype=object), array([2859]))

In [101]:
report = classification_report(df['stance'], df['fsp_reason_preds'])

print(report)

              precision    recall  f1-score   support

       agree       0.00      0.00      0.00       238
    disagree       0.00      0.00      0.00         9
     neutral       0.91      1.00      0.95      2612

    accuracy                           0.91      2859
   macro avg       0.30      0.33      0.32      2859
weighted avg       0.83      0.91      0.87      2859



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
df.to_csv("phemerumors_flan-alpaca-3B_results.csv")