# Stance Classification for CovidLies
- read in the data and pre-process
- set up an LLM
- Investigate different prompting schemes
    - just context
    - context + few-shot learning
    - content + few-shot learning + reasoning

In [1]:
# Package installations to work on WIRE

! pip install transformers
! pip install langchain
! pip install accelerate
! pip install einops
! pip install deepspeed

Collecting transformers
  Using cached transformers-4.33.1-py3-none-any.whl (7.6 MB)
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Using cached huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2023.8.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (771 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting safetensors>=0.3.1 (from transformers)
  Using cached safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Installing collected packages: tokenizers, safetensors, regex, huggingface-hub, transformers
Successfully installed huggingface-hub-0.17.1 regex-2023.8.8 safetensors-0.3.3 tokenizers-0.13.3 transformers-4.33.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m

In [1]:
import os, re, pandas as pd, numpy as np, ast, json
from pprint import pprint
from tqdm import tqdm  

import torch
from langchain import PromptTemplate, FewShotPromptTemplate, HuggingFacePipeline, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import deepspeed

from sklearn.metrics import classification_report

from matplotlib import pyplot as plt
import seaborn as sns

2023-09-13 12:34:59.983309: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2023-09-13 12:35:01,544] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


# 1. Import and Preprocess Data

In [2]:
file = os.path.join("covid-lies","data_merged.csv")

In [3]:
df = pd.read_csv(file)
df = df.rename(columns={'misconception': 'event'}) 

In [4]:
df.head()

Unnamed: 0,tweet_id,stance,full_text,event
0,1233907923765559296,neutral,Getting coronavirus and then coughing on peopl...,Blowing conch shells destroys coronavirus pote...
1,1233907923765559296,neutral,Getting coronavirus and then coughing on peopl...,Eating cabbage prevents coronavirus infection.
2,1233911842910720000,neutral,@Barioth_a_bot そう…ですね…私は常時深海に居ります故…ふわふわ、という感触で...,Swans and dolphins swimming in Venice canals f...
3,1233947734094290944,neutral,RT @TabascoHOY: #Coronavirus☣\n➡Los infectados...,Cocaine cures coronavirus.
4,1233947734094290944,neutral,RT @TabascoHOY: #Coronavirus☣\n➡Los infectados...,Avocado and mint tea cures coronavirus.


In [5]:
df.shape

(3196, 4)

# 2. Connect to LLM
- for standing up a smaller huggingface model
```python
llm = HuggingFacePipeline.from_model_id(model_id="declare-lab/flan-alpaca-gpt4-xl", task = 'text2text-generation', device=0,
                                      model_kwargs={"max_length":500, "do_sample":False})
```
- for a mid-sized, more modern, huggingface model. You can use accelerate and chance ``` device = "auto"``` to use multiple GPUs
```python
model = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map=0,
    max_length=200,
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

llm = HuggingFacePipeline(pipeline=pipe)
```


In [6]:
model = "google/flan-ul2"

tokenizer = AutoTokenizer.from_pretrained(model)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    trust_remote_code=True,
    device_map="auto",
    max_length=1000,
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)

llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

look at a few examples

In [7]:
question = '''What is the stance of of the following social media post toward the U.S. Army. Give the stance as either 'FOR', "AGAINST', or 'NEUTRAL'. Only return the stance:
"@vondeveen If the Army wants to actually recruit people, maybe stop breaking people and actually prosecute sexual assualt #nomorewar."
'''
llm(question)

'AGAINST'

In [8]:
question = '''What is the stance of of the following social media post toward the U.S. Army. Give the stance as either 'FOR', "AGAINST', or 'NEUTRAL'. Only return the stance:
"@artfulask I have never seen a pink-eared duck before. #Army"
'''
llm(question)

'NEUTRAL'

In [9]:
question = '''What is the stance of of the following social media post toward the U.S. Army. Give the stance as either 'FOR', "AGAINST', or 'NEUTRAL'. Only return the stance:
"I think the @Army helped me become disciplined. I would have surely flunked out of college chasing tail if I didn't get some discipline there. #SFL"
'''
llm(question)

'FOR'

# 3. Prompt Engineering for Stance Classification

## 3(a). Task-only prompt

In [10]:
# task-only prompt

task_template = '''
Classify the statement as to whether it is 'SUPPORTS', 'DENIES', 'NEUTRAL', or 'UNRELATED'. Only return the classification label for the statement, and no other text.

statement: {statement}
'''

task_prompt = PromptTemplate(
    input_variables=["statement"],
    template=task_template
)

### Run an example

In [11]:
# Create a sample for all examples

random_sample_df = df.sample(100)
y_true = random_sample_df['stance'].tolist()

In [12]:
llm_chain = LLMChain(prompt=task_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [13]:
# converting to given labels for comparison

y_true = [str.lower(i) for i in y_true]
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'for' in lower_word:  
        y_pred.append('agree')  
    elif 'against' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word or 'unrelated' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')

# generating the classification report
report = classification_report(y_true, y_pred)

print(report)

              precision    recall  f1-score   support

       agree       0.00      0.00      0.00         1
    disagree       0.00      0.00      0.00         1
     neutral       0.98      1.00      0.99        98

    accuracy                           0.98       100
   macro avg       0.33      0.33      0.33       100
weighted avg       0.96      0.98      0.97       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Run on all Data

In [14]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

3196it [28:08,  1.89it/s]


In [15]:
np.unique(results, return_counts=True)

(array(['DENIES', 'NEUTRAL', 'SUPPORTS', 'UNRELATED'], dtype='<U9'),
 array([ 559, 2072,  198,  367]))

In [16]:
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'support' in lower_word:  
        y_pred.append('agree')  
    elif 'denies' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word or 'unrelated' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')
        
df['task_preds'] = y_pred

In [17]:
np.unique(df['task_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([ 198,  559, 2439]))

In [18]:
report = classification_report(df['stance'], df['task_preds'])

print(report)

              precision    recall  f1-score   support

       agree       0.01      0.02      0.01       100
    disagree       0.04      0.28      0.07        85
     neutral       0.95      0.77      0.85      3011

    accuracy                           0.73      3196
   macro avg       0.33      0.36      0.31      3196
weighted avg       0.89      0.73      0.80      3196



## 3(b). Context prompt

In [19]:
# context prompt

context_template = '''
The following statement is social media post about COVID or Coronavirus. Classify the statement as to whether it 'SUPPORTS', 'DENIES', is 'NEUTRAL', or is 'UNRELATED' to the belief below being true. Only return the classification label for the statement toward the belief, and no other text.

belief: {event}
statement: {statement}
'''

context_prompt = PromptTemplate(
    input_variables=["event","statement"],
    template=context_template
)

### Run an example

In [20]:
llm_chain = LLMChain(prompt=context_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [21]:
# converting to given labels for comparison

y_true = [str.lower(i) for i in y_true]
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'support' in lower_word:  
        y_pred.append('agree')  
    elif 'denies' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word or 'unrelated' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')

# generating the classification report  
report = classification_report(y_true, y_pred)  
  
print(report)  

              precision    recall  f1-score   support

       agree       0.00      0.00      0.00         1
    disagree       0.00      0.00      0.00         1
     neutral       0.96      0.55      0.70        98

    accuracy                           0.54       100
   macro avg       0.32      0.18      0.23       100
weighted avg       0.94      0.54      0.69       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Run on all Data

In [22]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

3196it [30:31,  1.74it/s]


In [23]:
np.unique(results, return_counts=True)

(array(['DENIES', 'NEUTRAL', 'SUPPORTS', 'UNRELATED'], dtype='<U9'),
 array([1243, 1600,   24,  329]))

In [24]:
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'support' in lower_word:  
        y_pred.append('agree')  
    elif 'denies' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word or 'unrelated' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')
        
df['context_preds'] = y_pred

In [25]:
np.unique(df['context_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([  24, 1243, 1929]))

In [26]:
report = classification_report(df['stance'], df['context_preds'])

print(report)

              precision    recall  f1-score   support

       agree       0.58      0.14      0.23       100
    disagree       0.04      0.64      0.08        85
     neutral       0.96      0.62      0.75      3011

    accuracy                           0.60      3196
   macro avg       0.53      0.46      0.35      3196
weighted avg       0.93      0.60      0.72      3196



## 3(c). Context + FSP

In [27]:
# Create an example template

example_template = '''
belief: {belief}
statement: {statement}
stance: {stance}
'''

example_prompt = PromptTemplate(
    input_variables=["belief","statement", "stance"],
    template=example_template
)

In [28]:
# Give some examples

examples = [
    {'belief':"Africans are more resistant to coronavirus.",
     'statement':'''Happen now Blacks are Immune to the coronavirus ' there is a GOD https://t.co/LRq7SZYK0G''',
     'stance': 'SUPPORTS'},
    {'belief':"Alex Jones' silver-infused toothpaste kills COVID-19",
     'statement':'''#China #COVID-19 As work resumes in outbreak, brand-new 'normal' emerges https://t.co/VENOSSOnx5 https://t.co/RQoeSWoaHH''',
     'stance': 'UNRELATED'},
    {'belief':"COVID-19 is only as deadly as the seasonal flu.",
     'statement':'''@islandmonk @Stonekettle Closer to 650,000 people will die of the flu this year. The figure of 30,000 is just in the U.S. \nBut the flu has approximately 0.1% mortality vs 2% for COVID-19. Do the math.''',
     'stance': 'DENIES'},
    {'belief':"Coronavirus is genetically engineered.",
     'statement':'''@TheMadKiwi3 @goodfoodgal nah. A biological warfare agent would kill 99% of its victims, not 2% like the corona virus. This is a naturally occurring virus.''',
     'stance': 'DENIES'},
    {'belief':"SARS-CoV-2 can survive for weeks on surfaces.",
     'statement':'''Coronavirus could survive up to 9 days outside the body, study says https://t.co/JUzdJgc5Dz''',
     'stance': 'SUPPORTS'}
]

In [29]:
prefix = """
The following statements are social media posts about COVID or Coronavirus. The statements can support, deny, be neutral, or be unrelated toward its associated COVID belief."""

suffix = '''
Now, classify the following statement as to whether 'SUPPORTS', 'DENIES', is 'NEUTRAL', or is 'UNRELATED' toward the belief below being true. Only return the classification for the statement toward the belief, and no other text.

belief: {event}
statement: {statement}
'''

few_shot_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["event", "statement"],
    example_separator="\n"
)

### Run an example

In [30]:
llm_chain = LLMChain(prompt=few_shot_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [31]:
# converting to given labels for comparison

y_true = [str.lower(i) for i in y_true]
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'support' in lower_word:  
        y_pred.append('agree')  
    elif 'denies' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word or 'unrelated' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')

# generating the classification report  
report = classification_report(y_true, y_pred)  
  
print(report)  

              precision    recall  f1-score   support

       agree       0.00      0.00      0.00         1
    disagree       0.00      0.00      0.00         1
     neutral       0.96      0.54      0.69        98

    accuracy                           0.53       100
   macro avg       0.32      0.18      0.23       100
weighted avg       0.94      0.53      0.68       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Run on all Data

In [32]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

3196it [1:14:35,  1.40s/it]


In [33]:
np.unique(results, return_counts=True)

(array(['DENIES', 'NEUTRAL', 'SUPPORTS', 'stance: DENIES',
        'stance: NEUTRAL', 'stance: SUPPORTS', 'stance: UNRELATED'],
       dtype='<U17'),
 array([  60,   10,    1, 1416,  740,   46,  923]))

In [34]:
y_pred = []  
  
for word in results:  
    lower_word = word.lower()  
    if 'support' in lower_word:  
        y_pred.append('agree')  
    elif 'denies' in lower_word:  
        y_pred.append('disagree')  
    elif 'neutral' in lower_word or 'unrelated' in lower_word:  
        y_pred.append('neutral')  
    else:  
        y_pred.append('neutral')
        
df['fsp_preds'] = y_pred

In [35]:
np.unique(df['fsp_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([  47, 1476, 1673]))

In [36]:
report = classification_report(df['stance'], df['fsp_preds'])

print(report)

              precision    recall  f1-score   support

       agree       0.49      0.23      0.31       100
    disagree       0.04      0.72      0.08        85
     neutral       0.98      0.54      0.70      3011

    accuracy                           0.54      3196
   macro avg       0.50      0.50      0.36      3196
weighted avg       0.94      0.54      0.67      3196



## 3(d). Context + FSP + Reasoning

In [37]:
# Create an example template

example_and_reason_template = '''
belief: {belief}
statement: {statement}
stance: {stance}
reason: {reason}
'''

example_and_reason_prompt = PromptTemplate(
    input_variables=["belief","statement", "stance", "reason"],
    template=example_and_reason_template
)

In [38]:
# Give some examples

examples = [
    {'belief':"Africans are more resistant to coronavirus.",
     'statement':'''Happen now Blacks are Immune to the coronavirus ' there is a GOD https://t.co/LRq7SZYK0G''',
     'stance': 'SUPPORTS',
     'reason': 'The statement supports the beleif that Africans are more resistant to COVID, as it claims black people are immune to COVID, and most people in Africa are black.'
    },
    {'belief':"Alex Jones' silver-infused toothpaste kills COVID-19",
     'statement':'''#China #COVID-19 As work resumes in outbreak, brand-new 'normal' emerges https://t.co/VENOSSOnx5 https://t.co/RQoeSWoaHH''',
     'stance': 'UNRELATED',
     'reason': 'The statement is unrelated to the belief Alex Jones toothpaste as the statement does not talk about it.'
    },
    {'belief':"COVID-19 is only as deadly as the seasonal flu.",
     'statement':'''@islandmonk @Stonekettle Closer to 650,000 people will die of the flu this year. The figure of 30,000 is just in the U.S. \nBut the flu has approximately 0.1% mortality vs 2% for COVID-19. Do the math.''',
     'stance': 'DENIES',
     'reason': 'The statement denies COVID being only as deadly as the flu as it cites numbers that refute this belief.'
    },
    {'belief':"Coronavirus is genetically engineered.",
     'statement':'''@TheMadKiwi3 @goodfoodgal nah. A biological warfare agent would kill 99% of its victims, not 2% like the corona virus. This is a naturally occurring virus.''',
     'stance': 'DENIES',
     'reason': 'The statement denies COVID being genetically engineered as it claims if COVID was an engineered bioweapon, it would have killed much more people than it actually did.'
    },
    {'belief':"SARS-CoV-2 can survive for weeks on surfaces.",
     'statement':'''Coronavirus could survive up to 9 days outside the body, study says https://t.co/JUzdJgc5Dz''',
     'stance': 'SUPPORTS',
     'reason': 'The statement supports the belief that COVID can survive for weeks on surfaces because it claims COVID can survive outside of a body, which implies on a surface, for over a week.'
    }
]

In [39]:
prefix = """
The following statements are social media posts about COVID or Coronavirus. Each statement can support, deny, be neutral, or be unrelated toward its associated COVID belief and Each statement has the reason for that stance.
"""

suffix = '''
Now, classify the following statement as to whether 'SUPPORTS', 'DENIES', is 'NEUTRAL', or is 'UNRELATED' toward the belief below being true, and give your reason for the classification. Only return the classification for the statement toward the belief and the reasoning for the classification in the form of:'stance: STANCE, reason: REASON'

belief: {event}
statement: {statement}
'''

few_shot_and_reason_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_and_reason_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["event", "statement"],
    example_separator="\n"
)

### Run an example

In [40]:
llm_chain = LLMChain(prompt=few_shot_and_reason_prompt, llm=llm)

results=[]
for row in random_sample_df.iterrows():
    results.append(llm_chain.run(event=row[1]['event'], statement=row[1]['full_text']))



In [41]:
y_pred = []  
reasons = []  
  
for result in results:  
    split_result = result.split('reason:')  
    if len(split_result) == 2:  
        # Extract stance and reason, remove leading/trailing whitespace and convert stance to lowercase  
        stance, reason = split_result
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reason = reason.strip()
        reasons.append(reason)
    else:
        # model did not give a reason
        stance = split_result[0]
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reasons.append('')
    if 'support' in stance:  
        y_pred.append('agree')  
    elif 'denies' in stance:  
        y_pred.append('disagree')  
    elif 'neutral' in stance or 'unrelated' in stance:  
        y_pred.append('neutral')  

y_true = [str.lower(i) for i in y_true]

# generating the classification report
report = classification_report(y_true, y_pred)
print(report)

              precision    recall  f1-score   support

       agree       0.00      0.00      0.00         1
    disagree       0.00      0.00      0.00         1
     neutral       0.95      0.41      0.57        98

    accuracy                           0.40       100
   macro avg       0.32      0.14      0.19       100
weighted avg       0.93      0.40      0.56       100



### Run on all Data

In [42]:
# Running across the whole dataset

results = []
for index, row in tqdm(df.iterrows()):
    results.append(llm_chain.run(event=row['event'], statement=row['full_text']))

3196it [4:41:22,  5.28s/it]


In [47]:
results

['stance: DENIES, reason: REASON: This is unrelated to the topic.',
 'stance: DENIES, reason: REASON: Cabbage is not related to the COVID.',
 'stance: UNRELATED, reason: The statement is unrelated to the COVID-19 lockdown.',
 'stance: DENIES, reason: REASON: Cocaine is not a cure for COVID.',
 'stance: DENIES, reason: REASON: This is not true and there is no cure for this virus.',
 'stance: UNRELATED, reason: The statement is unrelated to the belief that Vitamin D cures coronavirus.',
 'stance: UNRELATED, reason: REASON: RT @TabascoHOY: #Coronavirus Los infectados por #COVID19 en el mundo asciende a 83 mil 652 casos, los muertos llegan a 2 mil 791 en #Chin...',
 'stance: UNRELATED, reason: Fennel tea is not a cure for COVID.',
 'stance: UNRELATED, reason: REASON: Essential oils are not related to the flu.',
 'stance: UNRELATED',
 'stance: DENIES, reason: REASON: COVID-19 is more dangerous than the seasonal flu.',
 'stance: DENIES reason: The statement denies COVID being only as deadly 

In [43]:
y_pred = []  
reasons = []  
  
for result in results:  
    split_result = result.split('reason:')  
    if len(split_result) == 2:  
        # Extract stance and reason, remove leading/trailing whitespace and convert stance to lowercase  
        stance, reason = split_result
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reason = reason.strip()
        reasons.append(reason)
    else:
        # model did not give a reason
        stance = split_result[0]
        stance = stance.replace('stance:', '').replace(',', '').strip().lower()
        reasons.append('')
    if 'support' in stance:  
        y_pred.append('agree')  
    elif 'denies' in stance:  
        y_pred.append('disagree')  
    elif 'neutral' in stance or 'unrelated' in stance:  
        y_pred.append('neutral')  
        
df['fsp_reason_preds'] = y_pred
df['fsp_reason_reasons'] = reasons

In [44]:
np.unique(df['fsp_reason_preds'], return_counts=True)

(array(['agree', 'disagree', 'neutral'], dtype=object),
 array([  38, 1845, 1313]))

In [45]:
report = classification_report(df['stance'], df['fsp_reason_preds'])

print(report)

              precision    recall  f1-score   support

       agree       0.50      0.19      0.28       100
    disagree       0.04      0.80      0.07        85
     neutral       0.97      0.42      0.59      3011

    accuracy                           0.43      3196
   macro avg       0.50      0.47      0.31      3196
weighted avg       0.93      0.43      0.57      3196



In [46]:
df.to_csv("covid-lies_flan-UL2_results.csv")