In [None]:
!pip install -q pyarrow==15.0.1
!pip install -q datasets
!pip install -q langchain-groq langchain
import getpass
import os
import time
from collections import Counter

import pandas as pd
from datasets import Dataset, load_dataset
from langchain_groq import ChatGroq
from sklearn.metrics import classification_report
from tqdm.auto import tqdm

In [None]:
# Load the "mediabiasgroup/mbib-base" dataset through datasets.load_dataset.
dataset1 = load_dataset(path="mediabiasgroup/mbib-base")

In [None]:
# Take the Groq API key from the GROQ_API_KEY environment variable so that no
# secret is stored in the notebook; prompt for it interactively when unset.
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass.getpass("Groq API key: ")

# temperature=0 is kept from the original setup (presumably to make the
# predicted labels as repeatable as possible across runs).
# NOTE(review): confirm that 'llama-3.1-70b-versatile' is still served by Groq;
# hosted model IDs are retired from time to time.
chat = ChatGroq(temperature=0, groq_api_key=groq_api_key, model_name='llama-3.1-70b-versatile')

In [None]:
# Shuffle the 'political_bias' split with a fixed seed so the row order (and every
# slice or sample taken from it later) is reproducible, then convert to pandas.
# Dataset.to_pandas() converts the underlying Arrow table directly instead of
# building the frame row by row from Python dicts.
dataset2 = dataset1['political_bias'].shuffle(seed=42).to_pandas()


In [None]:
## Evaluation chunk: rows at positions 2000..2499 of the shuffled frame.
dataset3 = dataset2.iloc[2000:2500]

In [None]:
## Randomly sample 3 few-shot examples per class. The examples are drawn only
## from rows outside the evaluation chunk (dataset3), so a text shown to the
## model as a labelled example can never also be one of the rows it is scored on.
few_shot_pool = dataset2.drop(index=dataset3.index)
df3 = few_shot_pool[few_shot_pool['label'] == 1].sample(n=3, random_state=42)  # label 1 (biased)
df4 = few_shot_pool[few_shot_pool['label'] == 0].sample(n=3, random_state=42)  # label 0 (unbiased)

In [None]:
# Stack the few-shot rows: the label-0 samples (df4) first, then the label-1 samples (df3).
sampled_df = pd.concat([df4, df3], axis=0)

In [None]:
# Keep only the two columns that the prompt examples use.
sampled_df = sampled_df.loc[:, ['text', 'label']]

In [None]:
# Turn every sampled row into a {'text': ..., 'label': ...} dict, in row order.
# DataFrame.to_dict('records') builds the whole list in one call, replacing the
# manual iterrows() + append loop.
pairwise_list = sampled_df[['text', 'label']].to_dict('records')

print(pairwise_list)

In [None]:
# The few-shot prompt takes its examples from this list of text/label dicts.
examples = pairwise_list

In [None]:
# Import the prompt classes from langchain_core.prompts, where they are defined.
# The root-level `from langchain import ...` re-exports are deprecated, and the
# notebook installs an unpinned `langchain`, so the old import can break.
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate

# Layout of a single few-shot example inside the prompt: the text on one line,
# its label on the next.
example_formatter_template = """text: {text}
label: {label}
"""

# Template that renders one example dict (keys 'text' and 'label') with the layout above.
example_prompt = PromptTemplate(
    input_variables=["text", "label"],
    template=example_formatter_template,
)

In [None]:
# Full prompt = instruction prefix + the rendered examples + a suffix that asks
# for the label of the text supplied as {input}.
few_shot_prompt = FewShotPromptTemplate(
    input_variables=["input"],
    example_prompt=example_prompt,  # how each example dict is rendered
    examples=examples,              # the sampled text/label dicts
    example_separator="\n",
    prefix="Here are some examples of politically biased and unbiased text given below. The label for biased text is 1 and that of unbiased is 0:\n",
    suffix="Now only give the final label(0 or 1) indicating unbiased(0) or biased(1) text.\n\ntext: {input}\nlabel: ",
)

In [None]:
# Pipe the rendered few-shot prompt into the chat model; chain.invoke({'input': ...})
# runs both steps.
chain = few_shot_prompt | chat

In [None]:
# Display the evaluation chunk that is sent to the model below.
dataset3

In [None]:
import time
from tqdm import tqdm

pred = []
label = []
misclassified = []
max_retries = 5  # Set a limit for retries

for id, row in tqdm(dataset3.iterrows()):
    input = row['text']
    retries = 0  # Track the number of retries for each request

    while retries <= max_retries:
        try:
            response = chain.invoke({'input': input})
            break  # Exit the retry loop if successful
        except Exception as e:
            retries += 1
            if retries > max_retries:
                print(f"Max retries reached for input: {input}. Skipping this entry.")
                response = None  # You could skip processing for this entry if retries fail
                break
            print(f"Error occurred: {e}. Retrying in {2 ** retries} seconds...")
            time.sleep(2 ** retries)  # Exponential backoff

    if response is None:
        continue  # Skip the rest if response is None

    #print(input)
    print(response.content)
    print(row['label'])

    if response.content != str(row['label']):
        misclassified.append(row['text'])
        label.append(row['label'])

    pred.append(response.content)
    time.sleep(4)  # Control the rate of requests


In [None]:
pred=[int(x) for x in pred]

In [None]:
## Macro-F1-score in the classification report
from sklearn.metrics import classification_report

print(classification_report(dataset3['label'], pred))