# Stance Classification for SemEval2016
- read in the data and pre-process
- set up an LLM
- Investigate different prompting schemes
    - just context
    - context + few-shot learning
    - content + few-shot learning + reasoning

In [12]:
# Package installations to work on WIRE

! pip install transformers
! pip install langchain
! pip install accelerate
! pip install einops
! pip install deepspeed


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: 

In [27]:
import os, re, pandas as pd, numpy as np, ast, json
from pprint import pprint
from tqdm import tqdm  

import torch
from langchain import PromptTemplate, FewShotPromptTemplate, HuggingFacePipeline, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import deepspeed

from sklearn.metrics import classification_report

from matplotlib import pyplot as plt
import seaborn as sns

# 1. Import and Preprocess Data

In [28]:
file = os.path.join("semeval","data_merged.csv")

In [29]:
df = pd.read_csv(file)

In [30]:
df.head()

Unnamed: 0,tweet_id,event,full_text,stance
0,101,Atheism,dear lord thank u for all of ur blessings forg...,disagree
1,102,Atheism,"Blessed are the peacemakers, for they shall be...",disagree
2,103,Atheism,I am not conformed to this world. I am transfo...,disagree
3,104,Atheism,Salah should be prayed with #focus and #unders...,disagree
4,105,Atheism,And stay in your houses and do not display you...,disagree


In [31]:
df.shape

(2814, 4)

# 2. Connect to LLM
- for standing up a smaller huggingface model
```python
llm = HuggingFacePipeline.from_model_id(model_id="declare-lab/flan-alpaca-gpt4-xl", task = 'text2text-generation', device=0,
                                      model_kwargs={"max_length":500, "do_sample":False})
```
- for a mid-sized, more modern, huggingface model. You can use accelerate and chance ``` device = "auto"``` to use multiple GPUs
```python
model = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map=0,
    max_length=200,
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

llm = HuggingFacePipeline(pipeline=pipe)
```


In [32]:
model = "declare-lab/flan-alpaca-xxl"

tokenizer = AutoTokenizer.from_pretrained(model)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    device_map="auto",
    max_length=1000,
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

look at a few examples

In [33]:
question = '''What is the stance of of the following social media post toward the U.S. Army. Give the stance as either 'FOR', "AGAINST', or 'NEUTRAL'. Only return the stance:
"@vondeveen If the Army wants to actually recruit people, maybe stop breaking people and actually prosecute sexual assualt #nomorewar."
'''
llm(question)

'AGAINST'

In [34]:
question = '''What is the stance of of the following social media post toward the U.S. Army. Give the stance as either 'FOR', "AGAINST', or 'NEUTRAL'. Only return the stance:
"@artfulask I have never seen a pink-eared duck before. #Army"
'''
llm(question)

'NEUTRAL'

In [35]:
question = '''What is the stance of of the following social media post toward the U.S. Army. Give the stance as either 'FOR', "AGAINST', or 'NEUTRAL'. Only return the stance:
"I think the @Army helped me become disciplined. I would have surely flunked out of college chasing tail if I didn't get some discipline there. #SFL"
'''
llm(question)

'FOR'

# 3. Prompt Engineering for Stance Classification

## 3(a). Task-only prompt

In [36]:
# task-only prompt

task_template = '''
Classify the statement as to whether it is 'FOR', 'AGAINST', or 'NEUTRAL'. Only return the classification label for the statement, and no other text.

statement: {statement}
'''

task_prompt = PromptTemplate(
    input_variables=["statement"],
    template=task_template
)

### Run an example

In [37]:
# Create a sample for all examples

random_sample_df = df.sample(50)
y_true = random_sample_df['stance'].tolist()

In [38]:
llm_chain = LLMChain(prompt=task_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [39]:
# converting to given labels for comparison

y_true = [str.lower(i) for i in y_true]
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word or 'denies' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')

# generating the classification report
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

       agree       0.28      0.62      0.38         8
    disagree       0.69      0.69      0.69        26
     neutral       0.83      0.31      0.45        16

    accuracy                           0.56        50
   macro avg       0.60      0.54      0.51        50
weighted avg       0.67      0.56      0.57        50



### Run on all Data

In [40]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2814it [15:57,  2.94it/s]


In [41]:
np.unique(results, return_counts=True)

(array(['AAGAINST', 'AGAINST', 'AGGAINST', 'Against', 'FOR', 'NEG',
        'NEUTRAL'], dtype='<U8'),
 array([  76, 1057,    1,   35, 1287,    1,  357]))

In [42]:
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word or 'denies' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')
        
df['task_preds'] = y_pred

In [43]:
np.unique(df['task_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([1287, 1168,  359]))

In [44]:
report = classification_report(df['stance'], df['task_preds'])

print(report)

              precision    recall  f1-score   support

       agree       0.35      0.61      0.44       731
    disagree       0.59      0.51      0.55      1342
     neutral       0.46      0.22      0.30       741

    accuracy                           0.46      2814
   macro avg       0.47      0.45      0.43      2814
weighted avg       0.49      0.46      0.46      2814



## 3(b). Context prompt

In [45]:
# context prompt

context_template = '''
The following statement is a social media post expressing an opinion about an entity. Classify the statement as to whether it is 'FOR', 'AGAINST', or 'NEUTRAL' toward the entity below. Only return the classification label for the statement toward the entity, and no other text.

entity: {event}
statement: {statement}
'''

context_prompt = PromptTemplate(
    input_variables=["event","statement"],
    template=context_template
)

### Run an example

In [46]:
llm_chain = LLMChain(prompt=context_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [47]:
# converting to given labels for comparison

y_true = [str.lower(i) for i in y_true]
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')

# generating the classification report  
report = classification_report(y_true, y_pred)  
  
print(report)  

              precision    recall  f1-score   support

       agree       0.45      0.62      0.53         8
    disagree       0.71      0.85      0.77        26
     neutral       0.62      0.31      0.42        16

    accuracy                           0.64        50
   macro avg       0.60      0.59      0.57        50
weighted avg       0.64      0.64      0.62        50



### Run on all Data

In [48]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2814it [17:58,  2.61it/s]


In [49]:
np.unique(results, return_counts=True)

(array(['AAGAINST', 'AGAINST', 'Against', 'Atheism', 'FOR', 'NEUTRAL'],
       dtype='<U8'),
 array([  35, 1390,   25,    1,  905,  458]))

In [50]:
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')
        
df['context_preds'] = y_pred

In [51]:
np.unique(df['context_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([ 905, 1450,  459]))

In [52]:
report = classification_report(df['stance'], df['context_preds'])

print(report)

              precision    recall  f1-score   support

       agree       0.56      0.69      0.62       731
    disagree       0.70      0.75      0.73      1342
     neutral       0.58      0.36      0.44       741

    accuracy                           0.63      2814
   macro avg       0.61      0.60      0.60      2814
weighted avg       0.63      0.63      0.62      2814



## 3(c). Context + FSP

In [53]:
# Create an example template

example_template = '''
entity: {entity}
statement: {statement}
stance: {stance}
'''

example_prompt = PromptTemplate(
    input_variables=["entity","statement", "stance"],
    template=example_template
)

In [54]:
# Give some examples

examples = [
    {'entity':"Atheism",
     'statement':"Leaving Christianity enables you to love the people you once rejected. #freethinker #Christianity #SemST",
     'stance': 'FOR'},
    {'entity':"Climate Change is a Real Concern",
     'statement':"@AlharbiF I'll bomb anything I can get my hands on, especially if THEY aren't christian. #graham2016 #GOP #SemST",
     'stance': 'NEUTRAL'},
    {'entity':"Feminist Movement",
     'statement':"Always a delight to see chest-drumming alpha males hiss and scuttle backwards up the wall when a feminist enters the room. #manly #SemST",
     'stance': 'FOR'},
    {'entity':"Hillary Clinton",
     'statement':"Would you wanna be in a long term relationship with some bitch that hides her emails, & lies to your face? Then #Dontvote #SemST",
     'stance': 'AGAINST'},
    {'entity':"Legalization of Abortion",
     'statement':"@k_yoder That lady needs help, mental illness is a serious issue. #SemST",
     'stance': 'NEUTRAL'},
]

In [55]:
prefix = """
The following statements are social media posts expressing opinions about an entities. Each statement can either be 'FOR', 'AGAINST', or 'NEUTRAL' toward their associated entity.
"""

suffix = '''
Now, classify the following statement as to whether it is 'FOR', 'AGAINST', or 'NEUTRAL' toward the entity below. Only return the classification label for the statement toward the entity, and no other text.

entity: {event}
statement: {statement}
'''

few_shot_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["event", "statement"],
    example_separator="\n"
)

### Run an example

In [56]:
llm_chain = LLMChain(prompt=few_shot_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [57]:
# converting to given labels for comparison

y_true = [str.lower(i) for i in y_true]
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')

# generating the classification report  
report = classification_report(y_true, y_pred)  
  
print(report)  

              precision    recall  f1-score   support

       agree       0.50      0.50      0.50         8
    disagree       0.72      0.81      0.76        26
     neutral       0.69      0.56      0.62        16

    accuracy                           0.68        50
   macro avg       0.64      0.62      0.63        50
weighted avg       0.68      0.68      0.68        50



### Run on all Data

In [58]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2814it [35:24,  1.32it/s]


In [59]:
np.unique(results, return_counts=True)

(array(['AGAINST', 'Against', 'Atheism', 'FOR', 'NEUTRAL'], dtype='<U7'),
 array([1352,    4,    1,  791,  666]))

In [60]:
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')
        
df['fsp_preds'] = y_pred

In [61]:
np.unique(df['fsp_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([ 791, 1356,  667]))

In [62]:
report = classification_report(df['stance'], df['fsp_preds'])

print(report)

              precision    recall  f1-score   support

       agree       0.61      0.66      0.64       731
    disagree       0.71      0.72      0.71      1342
     neutral       0.47      0.43      0.45       741

    accuracy                           0.63      2814
   macro avg       0.60      0.60      0.60      2814
weighted avg       0.62      0.63      0.62      2814



## 3(d). Context + FSP + Reasoning

In [63]:
# Create an example template

example_and_reason_template = '''
entity: {entity}
statement: {statement}
stance: {stance}
reason: {reason}
'''

example_and_reason_prompt = PromptTemplate(
    input_variables=["entity","statement", "stance", "reason"],
    template=example_and_reason_template
)

In [64]:
# Give some examples

examples = [
    {'entity':"Atheism",
     'statement':'''Leaving Christianity enables you to love the people you once rejected. #freethinker #Christianity #SemST''',
     'stance': 'FOR',
     'reason': "the statement is for atheism as it claims leaving a religion like christianity makes one more loving."
    },
    {'entity':"Climate Change is a Real Concern",
     'statement':'''@AlharbiF I'll bomb anything I can get my hands on, especially if THEY aren't christian. #graham2016 #GOP #SemST''',
     'stance': 'NEUTRAL',
     'reason': "the statement does not explicity reference climate change and it is not clear who the author would bomb, so it is neutral toward climate change."
    },
    {'entity':"Feminist Movement",
     'statement':'''Always a delight to see chest-drumming alpha males hiss and scuttle backwards up the wall when a feminist enters the room. #manly #SemST''',
     'stance': 'FOR',
     'reason': "the statement supports the feminist movement as it makes fun of alpha males, who are typically seen as being against feminism."
    },
    {'entity':"Hillary Clinton",
     'statement':'''Would you wanna be in a long term relationship with some bitch that hides her emails, & lies to your face? Then #Dontvote #SemST''',
     'stance': 'AGAINST',
     'reason': "the statement is against Hillary Clinton as it calles her a bitch and claims she does bad things like hides emails and lies."
    },
    {'entity':"Legalization of Abortion",
     'statement':'''@k_yoder That lady needs help, mental illness is a serious issue. #SemST''',
     'stance': 'NEUTRAL',
     'reason': "the statement is neutral toward legalization of abortion as it does not talk about abortion and its not clear if the mentioned lady is related to abortion at all."
    }
]

In [65]:
prefix = """
The following statements are social media posts expressing opinions about an entities. Each statement can either be 'FOR', 'AGAINST', or 'NEUTRAL' toward their associated entity and includes the reason why they have that stance.
"""

suffix = '''
Now, classify the following statement as to whether it is 'FOR', 'AGAINST', or 'NEUTRAL' toward the entity below and give your reasoning. Only return the classification label for the statement toward the target and the reason for the classification label in the form of: 'stance: STANCE, reason: REASON'

entity: {event}
statement: {statement}
'''

few_shot_and_reason_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_and_reason_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["event", "statement"],
    example_separator="\n"
)

### Run an example

In [66]:
llm_chain = LLMChain(prompt=few_shot_and_reason_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))

Token indices sequence length is longer than the specified maximum sequence length for this model (576 > 512). Running this sequence through the model will result in indexing errors


In [67]:
y_pred = []  
reasons = []  
  
for result in results:  
    split_result = result.split('reason:')  
    if len(split_result) == 2:  
        # Extract stance and reason, remove leading/trailing whitespace and convert stance to lowercase  
        stance, reason = split_result
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reason = reason.strip()
        reasons.append(reason)
    else:
        # model did not give a reason
        stance = split_result[0]
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reasons.append('')
    if 'for' in stance:  
        y_pred.append('agree')  
    elif 'against' in stance:  
        y_pred.append('disagree')  
    elif 'neutral' in stance:  
        y_pred.append('neutral')
    else:  
        y_pred.append('neutral')

y_true = [str.lower(i) for i in y_true]

# generating the classification report
report = classification_report(y_true, y_pred)
print(report)

              precision    recall  f1-score   support

       agree       0.29      0.88      0.44         8
    disagree       0.72      0.69      0.71        26
     neutral       1.00      0.06      0.12        16

    accuracy                           0.52        50
   macro avg       0.67      0.54      0.42        50
weighted avg       0.74      0.52      0.47        50



### Run on all Data

In [68]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2814it [3:54:55,  5.01s/it]


In [69]:
y_pred = []  
reasons = []  
  
for result in results:  
    split_result = result.split('reason:')  
    if len(split_result) == 2:  
        # Extract stance and reason, remove leading/trailing whitespace and convert stance to lowercase  
        stance, reason = split_result
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reason = reason.strip()
        reasons.append(reason)
    else:
        # model did not give a reason
        stance = split_result[0]
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reasons.append('')
    if 'for' in stance:  
        y_pred.append('agree')  
    elif 'against' in stance:  
        y_pred.append('disagree')  
    elif 'neutral' in stance:  
        y_pred.append('neutral')
    else:  
        y_pred.append('neutral')
        
df['fsp_reason_preds'] = y_pred
df['fsp_reason_reasons'] = reasons

In [70]:
np.unique(df['fsp_reason_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([1374, 1328,  112]))

In [71]:
report = classification_report(df['stance'], df['fsp_reason_preds'])

print(report)

              precision    recall  f1-score   support

       agree       0.44      0.82      0.57       731
    disagree       0.68      0.67      0.68      1342
     neutral       0.75      0.11      0.20       741

    accuracy                           0.57      2814
   macro avg       0.62      0.54      0.48      2814
weighted avg       0.64      0.57      0.52      2814



In [72]:
df.to_csv("semeval_flan-alpaca-11B_results.csv")