# Stance Classification for SemEval2016
- read in the data and pre-process
- set up an LLM
- Investigate different prompting schemes
    - just context
    - context + few-shot learning
    - content + few-shot learning + reasoning

In [1]:
# Package installations to work on WIRE

! pip install transformers
! pip install langchain
! pip install accelerate
! pip install einops
! pip install deepspeed

Collecting transformers
  Using cached transformers-4.33.1-py3-none-any.whl (7.6 MB)
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Using cached huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2023.8.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (771 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting safetensors>=0.3.1 (from transformers)
  Using cached safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Installing collected packages: tokenizers, safetensors, regex, huggingface-hub, transformers
Successfully installed huggingface-hub-0.17.1 regex-2023.8.8 safetensors-0.3.3 tokenizers-0.13.3 transformers-4.33.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m

In [1]:
import os, re, pandas as pd, numpy as np, ast, json
from pprint import pprint
from tqdm import tqdm  

import torch
from langchain import PromptTemplate, FewShotPromptTemplate, HuggingFacePipeline, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import deepspeed

from sklearn.metrics import classification_report

from matplotlib import pyplot as plt
import seaborn as sns

2023-09-14 14:39:26.509606: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2023-09-14 14:39:28,041] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


# 1. Import and Preprocess Data

In [2]:
file = os.path.join("semeval","data_merged.csv")

In [3]:
df = pd.read_csv(file)

In [4]:
df.head()

Unnamed: 0,tweet_id,event,full_text,stance
0,101,Atheism,dear lord thank u for all of ur blessings forg...,disagree
1,102,Atheism,"Blessed are the peacemakers, for they shall be...",disagree
2,103,Atheism,I am not conformed to this world. I am transfo...,disagree
3,104,Atheism,Salah should be prayed with #focus and #unders...,disagree
4,105,Atheism,And stay in your houses and do not display you...,disagree


In [5]:
df.shape

(2814, 4)

# 2. Connect to LLM
- for standing up a smaller huggingface model
```python
llm = HuggingFacePipeline.from_model_id(model_id="declare-lab/flan-alpaca-gpt4-xl", task = 'text2text-generation', device=0,
                                      model_kwargs={"max_length":500, "do_sample":False})
```
- for a mid-sized, more modern, huggingface model. You can use accelerate and chance ``` device = "auto"``` to use multiple GPUs
```python
model = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map=0,
    max_length=200,
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

llm = HuggingFacePipeline(pipeline=pipe)
```


In [6]:
# read in personal access token

with open('personal_hugginface_token.txt', 'r') as file:  
    token = file.read().strip()  

In [7]:
model = "meta-llama/Llama-2-13b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model, token=token)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    token = token,
    device_map="auto",
    max_length=1024,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    repetition_penalty=2.0
)

llm = HuggingFacePipeline(pipeline=pipe)

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

look at a few examples

In [8]:
question = '''What is the stance of the following social media post given in quotes toward the U.S. Army? Give the stance as either for, against, or neutral. Only return the stance and no other text.
"@vondeveen If the Army wants to actually recruit people, maybe stop breaking people and actually prosecute sexual assualt #nomorewar."
'''
llm(question)

'StANCE: Against'

In [9]:
question = '''What is the stance of the following social media post given in quotes toward the U.S. Army? Give the stance as either for, against, or neutral. Only return the stance and no other text.
"@artfulask I have never seen a pink-eared duck before. #Army"
'''
llm(question)

'Stanced Against'

In [10]:
question = '''What is the stance of the following social media post given in quotes toward the U.S. Army? Give the stance as either for, against, or neutral. Only return the stance and no other text.
"I think the @Army helped me become disciplined. I would have surely flunked out of college chasing tail if I didn't get some discipline there. #SFL"
'''
llm(question)

'StANCE: FOR'

# 3. Prompt Engineering for Stance Classification

## 3(a). Task-only prompt

In [11]:
# task-only prompt

task_template = '''
Classify the statement, "{statement}" as to whether it is for, against, or neutral. Only return the classification label for the statement, and no other text.
'''

task_prompt = PromptTemplate(
    
    input_variables=["statement"],
    template=task_template
)

### Run an example

In [12]:
# Create a sample for all examples

random_sample_df = df.sample(100)
y_true = random_sample_df['stance'].tolist()

In [13]:
llm_chain = LLMChain(prompt=task_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [14]:
np.unique(results, return_counts=True)

(array(['  ', '   */', '   For example: "For", "-Against-".',
        '* For: "for".',
        '* For: "onestepforwardintherightdirection". This means that Anna\'s tweet expresses a positive sentiment towards Semantic Scholarship (#semst).',
        'For example if there were any spelling errors in your analysis please ignore them (such a typo) but do not correct such mistakes within my original input data). Please also provide some information about how confident are that this classifications accurate? For instance perhaps on scale of one-to ten where would confidence level be at when assigning accuracy rating here?"',
        'For example if this were a real tweet with that content (which i doubt) then we would get back \'Against\' because RT means retweet which implies they are expressing opposition/disagreement towards what someone else said before them in their original post by repeating its contents while adding some additional commentary at end here indicating disagree ment."',


In [15]:
# converting to given labels for comparison

y_true = [str.lower(i) for i in y_true]
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word or 'denies' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')

# generating the classification report
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

       agree       0.23      0.86      0.37        21
    disagree       1.00      0.09      0.17        53
     neutral       0.28      0.19      0.23        26

    accuracy                           0.28       100
   macro avg       0.50      0.38      0.26       100
weighted avg       0.65      0.28      0.23       100



### Run on all Data

In [15]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2814it [34:45,  1.35it/s]


In [None]:
np.unique(results, return_counts=True)

In [17]:
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word or 'denies' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')
        
df['task_preds'] = y_pred

In [None]:
np.unique(df['task_preds'], return_counts=True)

In [None]:
report = classification_report(df['stance'], df['task_preds'])

print(report)

## 3(b). Context prompt

In [16]:
# context prompt

context_template = '''
The statement, "{statement}" is a social media post expressing an opinion about an entity. Classify the statement in quotes as to whether it is for, against, or neutral toward the entity, "{event}". Only return the classification label for the statement in quotes toward the entity in quotes, and no other text.
'''

context_prompt = PromptTemplate(
    input_variables=["event","statement"],
    template=context_template
)

### Run an example

In [17]:
llm_chain = LLMChain(prompt=context_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [18]:
np.unique(results, return_counts=True)

(array(['* For: Greenlands Ice Melts Faster This Summer Under A Dome Of High Pressure (2019)',
        "Example: For FemInism - would be labeled simply 'For'.  ",
        'Example: For FemInvestmentBanks because they are amazeballs! would be labeled AsFor (or Positive). Against FreeSpeech when expressed by Conservatives? Would likely get marked NeutralNegative(leaning negative) since free speech isn\'t explicitly positive/negative but has nuances depending on perspective of user evaluating context."',
        'Example: For HillayryClintion would be classified simply F (for) because only part of teh tweet mentions GOp which has nothing tdo wuth hte clasification being foe/against etc...',
        'Example: For HilleryClintion (HC) would be labeled HC-F; Against Hiloryclnton(hc); Neutral Toward hilroy clitnon()',
        'Example: For/Against Classification Labels are case-insensitive (either all caps OR lowercase)  \nFor HillaryClintion = F    Againstdt= A\tNeutral towards hilllryclitno

In [19]:
# converting to given labels for comparison

y_true = [str.lower(i) for i in y_true]
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')

# generating the classification report  
report = classification_report(y_true, y_pred)  
  
print(report)  

              precision    recall  f1-score   support

       agree       0.19      0.86      0.31        21
    disagree       1.00      0.04      0.07        53
     neutral       0.25      0.04      0.07        26

    accuracy                           0.21       100
   macro avg       0.48      0.31      0.15       100
weighted avg       0.64      0.21      0.12       100



### Run on all Data

In [23]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2814it [32:05,  1.46it/s]


In [None]:
np.unique(results, return_counts=True)

In [25]:
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')
        
df['context_preds'] = y_pred

In [None]:
np.unique(df['context_preds'], return_counts=True)

In [None]:
report = classification_report(df['stance'], df['context_preds'])

print(report)

## 3(c). Context + FSP

In [20]:
# Create an example template

example_template = '''
entity: {entity}
statement: {statement}
stance: {stance}
'''

example_prompt = PromptTemplate(
    input_variables=["entity","statement", "stance"],
    template=example_template
)

In [21]:
# Give some examples

examples = [
    {'entity': "Atheism",
     'statement':"Leaving Christianity enables you to love the people you once rejected. #freethinker #Christianity #SemST",
     'stance': 'for'},
    {'entity': "Climate Change is a Real Concern",
     'statement':"@AlharbiF I'll bomb anything I can get my hands on, especially if THEY aren't christian. #graham2016 #GOP #SemST",
     'stance': 'neutral'},
    {'entity': "Feminist Movement",
     'statement':"Always a delight to see chest-drumming alpha males hiss and scuttle backwards up the wall when a feminist enters the room. #manly #SemST",
     'stance': 'for'},
    {'entity': "Hillary Clinton",
     'statement':"Would you wanna be in a long term relationship with some bitch that hides her emails, & lies to your face? Then #Dontvote #SemST",
     'stance': 'against'},
    {'entity': "Legalization of Abortion",
     'statement':"@k_yoder That lady needs help, mental illness is a serious issue. #SemST",
     'stance': 'neutral'},
]

In [22]:
prefix = """
The following statements are social media posts expressing opinions about an entities. Each statement can either be for, against, or neutral toward their associated entity.
"""

suffix = '''
Now, classify the following statement toward the following entity. Only return the stance classification of the statement toward the entity and no other text.

entity: {event}
statement: {statement}
stance:
'''

few_shot_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["event", "statement"],
    example_separator="\n"
)

### Run an example

In [23]:
llm_chain = LLMChain(prompt=few_shot_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [24]:
np.unique(results, return_counts=True)

(array(['Please note there will only ever exist three possible classes (for/againstaGAINST) so please do not include any additional information such as confidence scores etc., just provide one word describing how much each should receive based upon its content towards this particular topic area; i e whether they support it strongly ("strongFor") weakley supports but still has reservations("weakfory"), actively opposes(AGAInst), neither endorsements nor opposition expressed whatsoever which leaves everything open ended leaving readers free form interpretation without explicit guidance from authorial intent behind words chosen here today alike...',
        'Please note this will not only test single word sentiment but also multiword phrase sentiments as well since there may exist phrases like "one smallstep" which would have different meaning than just considering each individual words (for/againsta).',
        'Please provide only one word as answer (for/againsto).',
        'Please sel

In [25]:
# converting to given labels for comparison

y_true = [str.lower(i) for i in y_true]
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')

# generating the classification report  
report = classification_report(y_true, y_pred)  
  
print(report)  

              precision    recall  f1-score   support

       agree       0.00      0.00      0.00        21
    disagree       0.00      0.00      0.00        53
     neutral       0.24      0.88      0.38        26

    accuracy                           0.23       100
   macro avg       0.08      0.29      0.13       100
weighted avg       0.06      0.23      0.10       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Run on all Data

In [33]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2814it [1:12:04,  1.54s/it]


In [None]:
np.unique(results, return_counts=True)

In [35]:
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')
        
df['fsp_preds'] = y_pred

In [None]:
np.unique(df['fsp_preds'], return_counts=True)

In [None]:
report = classification_report(df['stance'], df['fsp_preds'])

print(report)

## 3(d). Context + FSP + Reasoning

In [26]:
# Create an example template

example_and_reason_template = '''
entity: {entity}
statement: {statement}
stance: {stance}
reason: {reason}
'''

example_and_reason_prompt = PromptTemplate(
    input_variables=["entity","statement", "stance", "reason"],
    template=example_and_reason_template
)

In [27]:
# Give some examples

examples = [
    {'entity': "Atheism",
     'statement':'''Leaving Christianity enables you to love the people you once rejected. #freethinker #Christianity #SemST''',
     'stance': 'for',
     'reason': "The statement is for atheism as it claims leaving a religion like christianity makes one more loving."
    },
    {'entity': "Climate Change is a Real Concern",
     'statement':'''@AlharbiF I'll bomb anything I can get my hands on, especially if THEY aren't christian. #graham2016 #GOP #SemST''',
     'stance': 'neutral',
     'reason': "The statement does not explicity reference climate change and it is not clear who the author would bomb, so it is neutral toward climate change."
    },
    {'entity': "Feminist Movement",
     'statement':'''Always a delight to see chest-drumming alpha males hiss and scuttle backwards up the wall when a feminist enters the room. #manly #SemST''',
     'stance': 'for',
     'reason': "The statement supports the feminist movement as it makes fun of alpha males, who are typically seen as being against feminism."
    },
    {'entity': "Hillary Clinton",
     'statement':'''Would you wanna be in a long term relationship with some bitch that hides her emails, & lies to your face? Then #Dontvote #SemST''',
     'stance': 'against',
     'reason': "The statement is against Hillary Clinton as it calles her a bitch and claims she does bad things like hides emails and lies."
    },
    {'entity': "Legalization of Abortion",
     'statement':'''@k_yoder That lady needs help, mental illness is a serious issue. #SemST''',
     'stance': 'neutral',
     'reason': "The statement is neutral toward legalization of abortion as it does not talk about abortion and its not clear if the mentioned lady is related to abortion at all."
    }
]

In [28]:
prefix = """
The following statements are social media posts expressing opinions about an entities. Each statement can either be for, against, or neutral toward their associated entity, and each statement includes the reason why they have that stance.
"""

suffix = '''
Now, classify the following statement toward the following entity as for, against, or neutral and give the reason why you classified it as that stance. Only return the stance classification of the statement toward the entity and the reason for that classifcation, and no other text.

entity: {event}
statement: {statement}
stance:
reason:
'''

few_shot_and_reason_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_and_reason_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["event", "statement"],
    example_separator="\n"
)

### Run an example

In [29]:
llm_chain = LLMChain(prompt=few_shot_and_reason_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [30]:
np.unique(results)

array(['',
       'Please answer these questions based only from information provided above',
       'Please answer this question based only from what we know here; do NOT look beyond these texts given above!',
       'Please answer this question based soleately upon these given criteria; do NOT research beyond just reading them here!',
       'Please include only relevant information while answering this question (no needless words)',
       'Please note this should only contain "again",neutral"or neither answer will work',
       'Please note this task has only three options (for/againsta gainstal) because there may sometimes exist situations where none o these apply well enough; hence please do indicate any such cases clearly',
       'Please provide justification',
       'Please provide only two options instead o three because i will use spaCy library which returns top N items(i am using nlu() function)',
       '```'], dtype='<U199')

In [31]:
y_pred = []  
reasons = []  
  
for result in results:  
    split_result = result.split('reason:')  
    if len(split_result) == 2:  
        # Extract stance and reason, remove leading/trailing whitespace and convert stance to lowercase  
        stance, reason = split_result
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reason = reason.strip()
        reasons.append(reason)
    else:
        # model did not give a reason
        stance = split_result[0]
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reasons.append('')
    if 'for' in stance:  
        y_pred.append('agree')  
    elif 'against' in stance:  
        y_pred.append('disagree')  
    elif 'neutral' in stance:  
        y_pred.append('neutral')
    else:  
        y_pred.append('neutral')

y_true = [str.lower(i) for i in y_true]

# generating the classification report
report = classification_report(y_true, y_pred)
print(report)

              precision    recall  f1-score   support

       agree       0.33      0.05      0.08        21
    disagree       0.00      0.00      0.00        53
     neutral       0.26      0.96      0.41        26

    accuracy                           0.26       100
   macro avg       0.20      0.34      0.16       100
weighted avg       0.14      0.26      0.12       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Run on all Data

In [43]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

2814it [3:45:33,  4.81s/it]


In [44]:
y_pred = []  
reasons = []  
  
for result in results:  
    split_result = result.split('reason:')  
    if len(split_result) == 2:  
        # Extract stance and reason, remove leading/trailing whitespace and convert stance to lowercase  
        stance, reason = split_result
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reason = reason.strip()
        reasons.append(reason)
    else:
        # model did not give a reason
        stance = split_result[0]
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reasons.append('')
    if 'for' in stance:  
        y_pred.append('agree')  
    elif 'against' in stance:  
        y_pred.append('disagree')  
    elif 'neutral' in stance:  
        y_pred.append('neutral')
    else:  
        y_pred.append('neutral')
        
df['fsp_reason_preds'] = y_pred
df['fsp_reason_reasons'] = reasons

In [None]:
np.unique(df['fsp_reason_preds'], return_counts=True)

In [None]:
report = classification_report(df['stance'], df['fsp_reason_preds'])

print(report)

In [47]:
df.to_csv("semeval_llama-2-7B_results.csv")