# Stance Classification for SemEval2016
- read in the data and pre-process
- set up an LLM
- Investigate different prompting schemes
    - just context
    - context + few-shot learning
    - context + few-shot learning + reasoning

In [1]:
# Package installations to work on WIRE

! pip install transformers
! pip install langchain
! pip install accelerate
! pip install einops
! pip install deepspeed

Collecting transformers
  Using cached transformers-4.33.1-py3-none-any.whl (7.6 MB)
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Using cached huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2023.8.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (771 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting safetensors>=0.3.1 (from transformers)
  Using cached safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Installing collected packages: tokenizers, safetensors, regex, huggingface-hub, transformers
Successfully installed huggingface-hub-0.17.1 regex-2023.8.8 safetensors-0.3.3 tokenizers-0.13.3 transformers-4.33.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m

In [2]:
import os, re, pandas as pd, numpy as np, ast, json
from pprint import pprint
from tqdm import tqdm  

import torch
from langchain import PromptTemplate, FewShotPromptTemplate, HuggingFacePipeline, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import deepspeed

from sklearn.metrics import classification_report

from matplotlib import pyplot as plt
import seaborn as sns

2023-09-13 13:08:32.295215: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-13 13:08:32.347494: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2023-09-13 13:08:34,786] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


# 1. Import and Preprocess Data

In [3]:
# Relative path to the merged SemEval CSV (resolved against the current working directory)
file = os.path.join("semeval","data_merged.csv")

In [4]:
# Load the CSV into the DataFrame used by every later cell
df = pd.read_csv(file)

In [5]:
# Preview the first five rows
df.head()

Unnamed: 0,tweet_id,event,full_text,stance
0,101,Atheism,dear lord thank u for all of ur blessings forg...,disagree
1,102,Atheism,"Blessed are the peacemakers, for they shall be...",disagree
2,103,Atheism,I am not conformed to this world. I am transfo...,disagree
3,104,Atheism,Salah should be prayed with #focus and #unders...,disagree
4,105,Atheism,And stay in your houses and do not display you...,disagree


In [6]:
# (rows, columns) of the loaded frame
df.shape

(2814, 4)

# 2. Connect to LLM
- for standing up a smaller huggingface model
```python
llm = HuggingFacePipeline.from_model_id(model_id="declare-lab/flan-alpaca-gpt4-xl", task = 'text2text-generation', device=0,
                                      model_kwargs={"max_length":500, "do_sample":False})
```
- for a mid-sized, more modern, huggingface model. You can use accelerate and change to `device_map="auto"` to use multiple GPUs
```python
model = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map=0,
    max_length=200,
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

llm = HuggingFacePipeline(pipeline=pipe)
```


In [11]:
# Hugging Face checkpoint id, loaded below through a text2text-generation pipeline
model = "declare-lab/flan-alpaca-gpt4-xl"
tokenizer = AutoTokenizer.from_pretrained(model)

# Generation settings: sampling disabled, one returned sequence, up to 1000 tokens
generation_options = dict(
    max_length=1000,
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    device_map=0,
    **generation_options,
)

# Wrap the transformers pipeline so it can be used as a LangChain LLM
llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Look at a few examples:

In [12]:
# Ad-hoc check: ask the model directly for the stance of one post toward the U.S. Army
question = '''What is the stance of the following social media post toward the U.S. Army. Give the stance as either 'FOR', 'AGAINST', or 'NEUTRAL'. Only return the stance:
"@vondeveen If the Army wants to actually recruit people, maybe stop breaking people and actually prosecute sexual assualt #nomorewar."
'''
llm(question)

'AGAINST'

In [13]:
# Ad-hoc check: a post that only mentions the Army in a hashtag
question = '''What is the stance of the following social media post toward the U.S. Army. Give the stance as either 'FOR', 'AGAINST', or 'NEUTRAL'. Only return the stance:
"@artfulask I have never seen a pink-eared duck before. #Army"
'''
llm(question)

'NEUTRAL'

In [14]:
# Ad-hoc check: ask the model directly for the stance of one more post toward the U.S. Army
question = '''What is the stance of the following social media post toward the U.S. Army. Give the stance as either 'FOR', 'AGAINST', or 'NEUTRAL'. Only return the stance:
"I think the @Army helped me become disciplined. I would have surely flunked out of college chasing tail if I didn't get some discipline there. #SFL"
'''
llm(question)

'FOR'

# 3. Prompt Engineering for Stance Classification

## 3(a). Task-only prompt

In [76]:
# task-only prompt: the template has a single {statement} slot and no target entity

task_template = """
Classify the sentiment of the statement as 'POSITIVE', 'NEGATIVE', or 'NEUTRAL'. Only return the sentiment label for the statement, and no other text.

statement: {statement}
"""

task_prompt = PromptTemplate(template=task_template, input_variables=["statement"])

### Run an example

In [77]:
# Create a sample for all examples

# Fixed seed so the same 100 rows are drawn on every run and the sample scores are reproducible
random_sample_df = df.sample(100, random_state=42)
y_true = random_sample_df['stance'].tolist()

In [78]:
llm_chain = LLMChain(prompt=task_prompt, llm=llm)

# The task-only template declares just {statement}, so only that field is supplied
results = [
    llm_chain.run(statement=row['full_text'])
    for _, row in random_sample_df.iterrows()
]

In [79]:
# converting to given labels for comparison

y_true = list(map(str.lower, y_true))

# Keywords are tested in insertion order: 'positive', then 'negative', then 'neutral'
keyword_to_stance = {'positive': 'agree', 'negative': 'disagree', 'neutral': 'neutral'}

# An output containing none of the keywords falls back to 'neutral'
y_pred = [
    next(
        (stance for keyword, stance in keyword_to_stance.items() if keyword in output.lower()),
        'neutral',
    )
    for output in results
]

# generating the classification report
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

       agree       0.35      0.51      0.42        35
    disagree       0.52      0.39      0.44        41
     neutral       0.17      0.12      0.14        24

    accuracy                           0.37       100
   macro avg       0.35      0.34      0.34       100
weighted avg       0.38      0.37      0.36       100



### Run on all Data

In [80]:
# Running across the whole dataset

# total= gives tqdm the row count so it can show a percentage and ETA.
# Only {statement} is passed because the task-only template has no {event} slot.
results = []
for index, row in tqdm(df.iterrows(), total=len(df)):
    results.append(llm_chain.run(statement=row['full_text']))

2814it [07:41,  6.10it/s]


In [81]:
# Count how often each distinct raw model output occurs
np.unique(results, return_counts=True)

(array(['NEGATIVE', 'NEUTRAL', 'POSITIVE'], dtype='<U8'),
 array([1092,  400, 1322]))

In [82]:
# Keywords are tested in insertion order: 'positive', then 'negative', then 'neutral'
keyword_to_stance = {'positive': 'agree', 'negative': 'disagree', 'neutral': 'neutral'}

# An output containing none of the keywords falls back to 'neutral'
y_pred = [
    next(
        (stance for keyword, stance in keyword_to_stance.items() if keyword in output.lower()),
        'neutral',
    )
    for output in results
]

df['task_preds'] = y_pred

In [83]:
# Distribution of the mapped stance predictions
np.unique(df['task_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([1322, 1092,  400]))

In [84]:
# Score the task-only predictions against the dataset's stance column
report = classification_report(y_true=df['stance'], y_pred=df['task_preds'])
print(report)

              precision    recall  f1-score   support

       agree       0.29      0.52      0.37       731
    disagree       0.54      0.44      0.49      1342
     neutral       0.31      0.17      0.22       741

    accuracy                           0.39      2814
   macro avg       0.38      0.38      0.36      2814
weighted avg       0.42      0.39      0.39      2814



## 3(b). Context prompt

In [85]:
# context prompt: the template names the target entity alongside the statement

context_template = """
The following statement is a social media post expressing an opinion about an entity. Classify the sentiment of the statement only toward the entity below as 'NEGATIVE', 'POSITIVE', or 'NEUTRAL'. Only return the sentiment label for the statement toward the entity, and no other text.

entity: {event}
statement: {statement}
"""

context_prompt = PromptTemplate(template=context_template, input_variables=["event", "statement"])

### Run an example

In [86]:
llm_chain = LLMChain(llm=llm, prompt=context_prompt)

# One generation per sampled row, in sample order
results = [
    llm_chain.run(event=record['event'], statement=record['full_text'])
    for _, record in random_sample_df.iterrows()
]



In [87]:
# converting to given labels for comparison

y_true = list(map(str.lower, y_true))

# Keywords are tested in insertion order: 'positive', then 'negative', then 'neutral'
keyword_to_stance = {'positive': 'agree', 'negative': 'disagree', 'neutral': 'neutral'}

# An output containing none of the keywords falls back to 'neutral'
y_pred = [
    next(
        (stance for keyword, stance in keyword_to_stance.items() if keyword in output.lower()),
        'neutral',
    )
    for output in results
]

# generating the classification report
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

       agree       0.47      0.69      0.56        35
    disagree       0.63      0.41      0.50        41
     neutral       0.27      0.25      0.26        24

    accuracy                           0.47       100
   macro avg       0.46      0.45      0.44       100
weighted avg       0.49      0.47      0.46       100



### Run on all Data

In [88]:
# Running across the whole dataset

results = []
# total= gives tqdm the row count so it can show a percentage and ETA
for index, row in tqdm(df.iterrows(), total=len(df)):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2814it [07:44,  6.05it/s]


In [89]:
# Count how often each distinct raw model output occurs
np.unique(results, return_counts=True)

(array(['NEGATIVE', 'NEUTRAL', 'POSITIVE'], dtype='<U8'),
 array([1004,  620, 1190]))

In [90]:
# Keywords are tested in insertion order: 'positive', then 'negative', then 'neutral'
keyword_to_stance = {'positive': 'agree', 'negative': 'disagree', 'neutral': 'neutral'}

# An output containing none of the keywords falls back to 'neutral'
y_pred = [
    next(
        (stance for keyword, stance in keyword_to_stance.items() if keyword in output.lower()),
        'neutral',
    )
    for output in results
]

df['context_preds'] = y_pred

In [91]:
# Distribution of the mapped stance predictions
np.unique(df['context_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([1190, 1004,  620]))

In [92]:
# Score the context-prompt predictions against the dataset's stance column
report = classification_report(y_true=df['stance'], y_pred=df['context_preds'])
print(report)

              precision    recall  f1-score   support

       agree       0.40      0.65      0.49       731
    disagree       0.67      0.50      0.57      1342
     neutral       0.31      0.26      0.29       741

    accuracy                           0.48      2814
   macro avg       0.46      0.47      0.45      2814
weighted avg       0.50      0.48      0.48      2814



## 3(c). Context + FSP

In [93]:
# Create an example template

# Each field becomes one "name: {name}" line; the template starts with a newline
example_fields = ["entity", "statement", "sentiment"]
example_template = "\n" + "".join(f"{name}: {{{name}}}\n" for name in example_fields)

example_prompt = PromptTemplate(template=example_template, input_variables=example_fields)

In [94]:
# Give some examples

# (entity, statement, sentiment) triples, expanded below into the dict form used by the example template
labelled_posts = [
    ("Atheism",
     "Leaving Christianity enables you to love the people you once rejected. #freethinker #Christianity #SemST",
     'POSITIVE'),
    ("Climate Change is a Real Concern",
     "@AlharbiF I'll bomb anything I can get my hands on, especially if THEY aren't christian. #graham2016 #GOP #SemST",
     'NEUTRAL'),
    ("Feminist Movement",
     "Always a delight to see chest-drumming alpha males hiss and scuttle backwards up the wall when a feminist enters the room. #manly #SemST",
     'POSITIVE'),
    ("Hillary Clinton",
     "Would you wanna be in a long term relationship with some bitch that hides her emails, & lies to your face? Then #Dontvote #SemST",
     'NEGATIVE'),
    ("Legalization of Abortion",
     "@k_yoder That lady needs help, mental illness is a serious issue. #SemST",
     'NEUTRAL'),
]

examples = [
    {'entity': entity, 'statement': statement, 'sentiment': sentiment}
    for entity, statement, sentiment in labelled_posts
]

In [95]:
# Instruction text placed before the few-shot examples
prefix = """
The following statements are social media posts expressing opinions about entities. Each statement can have a sentiment of 'POSITIVE', 'NEGATIVE', or 'NEUTRAL' toward its associated entity.
"""

# Instruction and input slots placed after the examples
suffix = '''
Now, classify the sentiment of the following statement toward the following entity as 'POSITIVE', 'NEGATIVE', or 'NEUTRAL'. Only return the sentiment label for the statement toward the entity, and no other text.

entity: {event}
statement: {statement}
'''

few_shot_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["event", "statement"],
    example_separator="\n"
)

### Run an example

In [96]:
llm_chain = LLMChain(llm=llm, prompt=few_shot_prompt)

# One generation per sampled row, in sample order
results = [
    llm_chain.run(event=record['event'], statement=record['full_text'])
    for _, record in random_sample_df.iterrows()
]



In [97]:
# converting to given labels for comparison

y_true = list(map(str.lower, y_true))

# Keywords are tested in insertion order: 'positive', then 'negative', then 'neutral'
keyword_to_stance = {'positive': 'agree', 'negative': 'disagree', 'neutral': 'neutral'}

# An output containing none of the keywords falls back to 'neutral'
y_pred = [
    next(
        (stance for keyword, stance in keyword_to_stance.items() if keyword in output.lower()),
        'neutral',
    )
    for output in results
]

# generating the classification report
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

       agree       0.46      0.66      0.54        35
    disagree       0.65      0.41      0.51        41
     neutral       0.38      0.38      0.38        24

    accuracy                           0.49       100
   macro avg       0.50      0.48      0.47       100
weighted avg       0.52      0.49      0.49       100



### Run on all Data

In [98]:
# Running across the whole dataset

results = []
# total= gives tqdm the row count so it can show a percentage and ETA
for index, row in tqdm(df.iterrows(), total=len(df)):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2814it [10:23,  4.51it/s]


In [99]:
# Count how often each distinct raw model output occurs
np.unique(results, return_counts=True)

(array(['NEGATIVE', 'NEUTRAL', 'POSITIVE'], dtype='<U8'),
 array([1043,  594, 1177]))

In [100]:
# Keywords are tested in insertion order: 'positive', then 'negative', then 'neutral'
keyword_to_stance = {'positive': 'agree', 'negative': 'disagree', 'neutral': 'neutral'}

# An output containing none of the keywords falls back to 'neutral'
y_pred = [
    next(
        (stance for keyword, stance in keyword_to_stance.items() if keyword in output.lower()),
        'neutral',
    )
    for output in results
]

df['fsp_preds'] = y_pred

In [101]:
# Distribution of the mapped stance predictions
np.unique(df['fsp_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([1177, 1043,  594]))

In [102]:
# Score the few-shot predictions against the dataset's stance column
report = classification_report(y_true=df['stance'], y_pred=df['fsp_preds'])
print(report)

              precision    recall  f1-score   support

       agree       0.41      0.66      0.50       731
    disagree       0.69      0.54      0.60      1342
     neutral       0.45      0.36      0.40       741

    accuracy                           0.52      2814
   macro avg       0.52      0.52      0.50      2814
weighted avg       0.55      0.52      0.52      2814



## 3(d). Context + FSP + Reasoning

In [103]:
# Create an example template

# Each field becomes one "name: {name}" line; the template starts with a newline
reason_example_fields = ["entity", "statement", "sentiment", "reason"]
example_and_reason_template = "\n" + "".join(
    f"{name}: {{{name}}}\n" for name in reason_example_fields
)

example_and_reason_prompt = PromptTemplate(
    template=example_and_reason_template,
    input_variables=reason_example_fields,
)

In [104]:
# Give some examples

# Same five posts as the plain few-shot examples, each with a written justification for its label
examples = [
    {'entity':"Atheism",
     'statement':'''Leaving Christianity enables you to love the people you once rejected. #freethinker #Christianity #SemST''',
     'sentiment': 'POSITIVE',
     'reason': "The statement is positive toward atheism as it claims leaving a religion like christianity makes one more loving."
    },
    {'entity':"Climate Change is a Real Concern",
     'statement':'''@AlharbiF I'll bomb anything I can get my hands on, especially if THEY aren't christian. #graham2016 #GOP #SemST''',
     'sentiment': 'NEUTRAL',
     'reason': "The statement does not explicitly reference climate change and it is not clear who the author would bomb, so it is neutral toward climate change."
    },
    {'entity':"Feminist Movement",
     'statement':'''Always a delight to see chest-drumming alpha males hiss and scuttle backwards up the wall when a feminist enters the room. #manly #SemST''',
     'sentiment': 'POSITIVE',
     'reason': "The statement is positive towards the feminist movement as it makes fun of alpha males, who are typically seen as being against feminism."
    },
    {'entity':"Hillary Clinton",
     'statement':'''Would you wanna be in a long term relationship with some bitch that hides her emails, & lies to your face? Then #Dontvote #SemST''',
     'sentiment': 'NEGATIVE',
     'reason': "The statement is negative toward Hillary Clinton as it calls her a bitch and claims she does bad things like hides emails and lies."
    },
    {'entity':"Legalization of Abortion",
     'statement':'''@k_yoder That lady needs help, mental illness is a serious issue. #SemST''',
     'sentiment': 'NEUTRAL',
     'reason': "The statement is neutral toward legalization of abortion as it does not talk about abortion and it's not clear if the mentioned lady is related to abortion at all."
    }
]

In [105]:
# Instruction text placed before the few-shot examples
prefix = """
The following statements are social media posts expressing opinions about entities. The sentiment of each statement can be 'POSITIVE', 'NEGATIVE', or 'NEUTRAL' toward its associated entity. Each statement has a reason for why the sentiment toward the entity is classified as it is.
"""

# Instruction and input slots placed after the examples; asks for the label plus a reason
suffix = '''
Now, classify the sentiment of the following statement toward the entity below as 'POSITIVE', 'NEGATIVE', or 'NEUTRAL' and give your reasoning. Only return the sentiment label for the statement toward the entity and the reason for the sentiment label in the form of: 'sentiment: SENTIMENT, reason: REASON'

entity: {event}
statement: {statement}
'''

few_shot_and_reason_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_and_reason_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["event", "statement"],
    example_separator="\n"
)

### Run an example

In [106]:
llm_chain = LLMChain(llm=llm, prompt=few_shot_and_reason_prompt)

# One generation per sampled row, in sample order
results = [
    llm_chain.run(event=record['event'], statement=record['full_text'])
    for _, record in random_sample_df.iterrows()
]



In [107]:
y_pred = []
reasons = []

for result in results:
    # Split once, case-insensitively, on the first "reason:" marker. This recognises
    # a capitalised "Reason:" and keeps a reason whole even if its text contains
    # "reason:" again, instead of discarding it.
    parts = re.split(r'reason:', result, maxsplit=1, flags=re.IGNORECASE)

    # Everything before the marker is the label part; strip the "sentiment:" tag and commas
    stance = parts[0].lower().replace('sentiment:', '').replace(',', '').strip()

    # Empty string when the model did not give a reason
    reasons.append(parts[1].strip() if len(parts) == 2 else '')

    if 'positive' in stance:
        y_pred.append('agree')
    elif 'negative' in stance:
        y_pred.append('disagree')
    else:
        # 'neutral' and any unrecognised output both map to 'neutral'
        y_pred.append('neutral')

y_true = [str.lower(i) for i in y_true]

# generating the classification report
report = classification_report(y_true, y_pred)
print(report)

              precision    recall  f1-score   support

       agree       0.50      0.86      0.63        35
    disagree       0.73      0.59      0.65        41
     neutral       0.86      0.25      0.39        24

    accuracy                           0.60       100
   macro avg       0.69      0.56      0.56       100
weighted avg       0.68      0.60      0.58       100



### Run on all Data

In [108]:
# Running across the whole dataset

results = []
# total= gives tqdm the row count so it can show a percentage and ETA
for index, row in tqdm(df.iterrows(), total=len(df)):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2814it [39:04,  1.20it/s]


In [109]:
y_pred = []
reasons = []

for result in results:
    # Split once, case-insensitively, on the first "reason:" marker. This recognises
    # a capitalised "Reason:" and keeps a reason whole even if its text contains
    # "reason:" again, instead of discarding it.
    parts = re.split(r'reason:', result, maxsplit=1, flags=re.IGNORECASE)

    # Everything before the marker is the label part; strip the "sentiment:" tag and commas
    stance = parts[0].lower().replace('sentiment:', '').replace(',', '').strip()

    # Empty string when the model did not give a reason
    reasons.append(parts[1].strip() if len(parts) == 2 else '')

    if 'positive' in stance:
        y_pred.append('agree')
    elif 'negative' in stance:
        y_pred.append('disagree')
    else:
        # 'neutral' and any unrecognised output both map to 'neutral'
        y_pred.append('neutral')

df['fsp_reason_preds'] = y_pred
df['fsp_reason_reasons'] = reasons

In [110]:
# Distribution of the mapped stance predictions
np.unique(df['fsp_reason_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([1470, 1145,  199]))

In [111]:
# Score the few-shot + reasoning predictions against the dataset's stance column
report = classification_report(y_true=df['stance'], y_pred=df['fsp_reason_preds'])
print(report)

              precision    recall  f1-score   support

       agree       0.38      0.76      0.51       731
    disagree       0.66      0.56      0.60      1342
     neutral       0.62      0.17      0.26       741

    accuracy                           0.51      2814
   macro avg       0.55      0.50      0.46      2814
weighted avg       0.58      0.51      0.49      2814



In [112]:
# Write the frame, including the prediction columns assigned to df in this notebook, to CSV
# (relative path; the DataFrame index is written as well, which is the pandas default)
df.to_csv("semeval_sentiment_flan-alpaca-3B_results.csv")