In [None]:
import os
import pandas as pd
import openai
import re
import torch
import time
from transformers import AutoTokenizer#, LlamaForCausalLM, LlamaTokenizer, BloomTokenizerFast, BloomForCausalLM
from IPython.display import display, Markdown

### File

In [None]:
# Load the labelled articles. Later cells read the 'text', 'narratives' and
# 'stance_enc' columns without creating them in the visible cells, so they are
# presumably supplied by this CSV - TODO confirm.
df_main = pd.read_csv('./labeled.csv')
#df_main = df_main[df_main['stance'].notna()]
#df_main['index'] = df_main.index
#df_main.head()

In [None]:
def clean_text(text):
    """Normalise a raw string to lowercase ASCII letters separated by single spaces.

    Processing order: e-mail addresses, URLs and HTML-escaped ampersands are
    dropped; digits and non-ASCII characters are removed; every remaining
    character that is neither a word character nor whitespace is stripped;
    underscores are removed; whitespace runs are collapsed and the ends are
    trimmed; finally the text is lower-cased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Remove emails
    text = re.sub(r'\S+@\S+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)

    # Remove HTML-escaped ampersands. This has to happen before punctuation is
    # stripped: once '&' is gone the entity can no longer be recognised and a
    # stray "amp" token would be left behind.
    text = re.sub(r'&amp\b;?', '', text)

    # Remove numbers and non-ASCII characters
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\x00-\x7F]', '', text)

    # Remove punctuation and special characters (this also removes hyphens)
    text = re.sub(r'[^\w\s]', '', text)

    # Underscores count as word characters, so they need an explicit pass
    text = text.replace('_', '')

    # Collapse whitespace last, so gaps opened by the removals above do not
    # survive as double spaces, and trim the ends so short labels compare equal
    text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lowercase
    return text.lower()

def clean_dataframe(df, column_name):
    """Run clean_text over one column of `df`, writing the result back in place.

    Args:
        df: DataFrame holding the column to clean; it is modified in place.
        column_name: Label of the column whose values are passed to clean_text.

    Returns:
        The same DataFrame object that was passed in.
    """
    cleaned_values = df[column_name].apply(clean_text)
    df[column_name] = cleaned_values
    return df

In [None]:
# Normalise the article bodies; clean_dataframe overwrites the 'text' column
# of df_main in place and returns the same frame. Then preview the first rows.
df_main = clean_dataframe(df_main, 'text')
df_main.head()

## GPT-3.5

In [None]:
# Read the key from the environment so the secret is never stored in the
# notebook; export OPENAI_API_KEY before starting the kernel.
openai.api_key = os.environ["OPENAI_API_KEY"]

#### Check for the length of each article in tokens

In [None]:
# Count tokens per article with the GPT-2 tokenizer and report every article
# whose count reaches 4000.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

for i, text in enumerate(df_main['text']):
    num_tokens = len(tokenizer.encode(text))
    if num_tokens < 4000:
        continue
    print(num_tokens, 'tokens at index:', i)

In [None]:
# System message sent with every GPT request (passed as the first argument to
# chat()). Note that the name `sys` would shadow the standard-library module of
# the same name if that module were ever imported in this notebook.
sys = "You are an expert in political science, political narratives, social media, disinformation and misinformation spread in social media. You can carefully read political articles and define the political narrative of each article. You are an expert in recognizing pro russian, anti russian, and neutral narratives."
# Earlier prompt variants, kept for reference:
#question = "What is the main political narrative of the given political article? "
#question = "In 5 words, describe the main political narrative of the given political article? "
#question = "Reply to me in a single word. Does the stance of the following article is pro-russian, anti-russian, or neutral? "
#question = "Reply to me in a single word such as yes or no, and briefly explain your reasoning. Does this article contain the given political narrative or narratives? "

In [None]:
# Prompt fragments: `question` is placed before the article text, `question1`
# between the article text and the list of narratives.
question = "Given the following news article: "
question1 = ". Reply to me in a single word only such as yes or no. Does the given article contain the following political narratives: "

In [None]:
# Stance
# One prompt per article: the shared prefix, a literal "The article: " marker,
# then the cleaned article text.

text = [
    question + "The article: " + df_main['text'][row]
    for row in range(len(df_main))
]
text

In [None]:
# Narratives
# One prompt per article: prefix, article text, the yes/no instruction, the
# article's narratives and a closing question mark.

text = [
    question + df_main['text'][row] + question1 + df_main['narratives'][row] + "?"
    for row in range(len(df_main))
]
text

In [None]:
# creating chat

def chat(system, user_assistant, model="gpt-3.5-turbo"):
    """Send a conversation to the OpenAI chat-completion endpoint and return the reply.

    Args:
        system: Content of the system message.
        user_assistant: Conversation turns in order; entries at even positions
            are sent with role "user", entries at odd positions with role
            "assistant".
        model: Name of the chat model to query. Defaults to "gpt-3.5-turbo".

    Returns:
        The message content of the first choice in the response.

    Raises:
        AssertionError: If `system` is not a str, `user_assistant` is not a
            list, or the first choice's finish_reason is not "stop".
    """
    assert isinstance(system, str), "`system` should be a string"
    assert isinstance(user_assistant, list), "`user_assistant` should be a list"

    # The system message always comes first; the remaining turns alternate
    # between user and assistant, starting with the user.
    msgs = [{"role": "system", "content": system}]
    for position, content in enumerate(user_assistant):
        role = "assistant" if position % 2 else "user"
        msgs.append({"role": role, "content": content})

    response = openai.ChatCompletion.create(model=model, messages=msgs)

    first_choice = response["choices"][0]
    status_code = first_choice["finish_reason"]
    # Any finish reason other than "stop" is treated as a failure.
    assert status_code == "stop", f"The status code was {status_code}."
    return first_choice["message"]["content"]

In [None]:
# Ask the model about every prompt in turn, collecting the replies in gpt_ans.
# Each request is followed by a 21 second pause (presumably to stay under an
# API rate limit - TODO confirm).
gpt_ans = []
for i, prompt_text in enumerate(text):
    response_fn_test = chat(sys, [prompt_text])
    gpt_ans.append(response_fn_test)
    print('index: ', i, 'response: ',  response_fn_test, '\n')
    time.sleep(21)

In [None]:
# in case of job interruption, continue at certain index
# (rows from position 16 onwards, renumbered from 0)

subset = df_main.iloc[16:].reset_index(drop = True)
print(len(subset))
subset

In [None]:
# Snapshot of the answers collected so far. A copy is taken instead of binding
# a second name to `gpt_ans`, so the snapshot stays unchanged if `gpt_ans` is
# appended to afterwards.
gpt_ans1 = list(gpt_ans)
print(len(gpt_ans1))
print(gpt_ans1)

In [None]:
# Snapshot of the answers from the resumed run. A copy is taken instead of
# binding a second name to `gpt_ans`, so the snapshot stays unchanged if
# `gpt_ans` is appended to afterwards.
gpt_ans2 = list(gpt_ans)
print(len(gpt_ans2))
print(gpt_ans2)

In [None]:
# Join the two partial runs into one list, first run followed by second run.
gpt_final = [*gpt_ans1, *gpt_ans2]
print(len(gpt_final))
print(gpt_final)

In [None]:
# df_main['stance_gpt'] = gpt_final

In [None]:
# Normalise the GPT answers with the same cleaner used for the articles, then
# list the distinct answers and how often each occurs.
# NOTE(review): the only visible assignment of the 'stance_gpt' column is
# commented out, so this cell needs that column to exist already - confirm.
df_main = clean_dataframe(df_main, 'stance_gpt')
df_main['stance_gpt'].value_counts()

In [None]:
# Encode the cleaned GPT stance labels as integers: anti-Russian -> 0,
# neutral -> 1, pro-Russian -> 2. Pro-Russian gets its own code so it is not
# merged with anti-Russian. Answers that match none of the keys become NaN.
df_main['stance_gpt'] = df_main['stance_gpt'].map({'antirussian': 0, 'neutral': 1, 'prorussian': 2})

### Classification Results

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Rows are the reference labels (stance_enc), columns the GPT predictions.
cm = confusion_matrix(df_main['stance_enc'], df_main['stance_gpt'])

cm_df = pd.DataFrame(cm)

plt.figure(figsize=(6,5))
# The cells hold integer counts, so they are rendered without a decimal part.
sns.heatmap(cm_df,annot=True, fmt="d")
plt.title('Confusion Matrix')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()

In [None]:
# Per-class precision / recall / F1 of the GPT predictions against stance_enc.
print('\nClassification Report:\n', classification_report(df_main['stance_enc'], df_main['stance_gpt']))

In [None]:
# Persist the frame, including the GPT columns, without writing the index.
df_main.to_csv('df_main_LLMs.csv', index = False)

## BART + PALM 2

### BART Summarizer

In [None]:
from transformers import pipeline, BartTokenizer, BartForConditionalGeneration

# Load the BART summarisation checkpoint and its tokenizer. This rebinds the
# notebook-level name `tokenizer`, which an earlier cell bound to the GPT-2
# tokenizer.
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
bart_pretrained = BartForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Summarise every article with BART. Inputs are truncated to 1024 tokens and
# decoded with 4-beam search.
# NOTE(review): min_length equals max_length (497), which leaves the model no
# freedom in summary length - confirm this is intended.
generation_options = dict(
    num_beams = 4,
    length_penalty = 2.0,
    max_length = 497,
    min_length = 497,
    no_repeat_ngram_size = 3,
)

result = []
for i in range(len(df_main)):
    encoding = tokenizer.batch_encode_plus([df_main['text'][i]], return_tensors = 'pt', max_length = 1024, truncation = True)
    encoded_ids = bart_pretrained.generate(encoding['input_ids'], **generation_options)
    summary = tokenizer.decode(encoded_ids.squeeze(), skip_special_tokens = True)
    result.append(summary)

In [None]:
# Attach the generated summaries as a new column and preview the first rows.
df_main['summary'] = result
df_main.head()

### PALM 2

In [None]:
import google.generativeai as palm

# Read the key from the environment so the secret is never stored in the
# notebook; export PALM_API_KEY before starting the kernel.
palm.configure(api_key = os.environ["PALM_API_KEY"])

def make_prompt(query, relevant_passage):
    """Build the question-answering prompt for one passage.

    The passage is sanitised before insertion: single quotes, double quotes
    and periods are removed and newlines are replaced by spaces.

    Args:
        query: The question text inserted after "QUESTION:".
        relevant_passage: The reference text inserted after "PASSAGE:".

    Returns:
        The formatted prompt string.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been executed first.
    import textwrap

    escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ").replace(".", "")
    prompt = textwrap.dedent("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
  Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
  However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
  strike a friendly and converstional tone.
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

    ANSWER:
  """).format(query=query, relevant_passage=escaped)

    return prompt

In [None]:
# Earlier `query` variants, kept for reference (all disabled):
#query = "Provide a single word answer only. Does the stance of the following article is pro-russian, anti-russian, or neutral? "
#query = "Give me the main political narratives of the given political article? Be concise. "
#query = "In one sentence only, describe the main political narrative of the following article. "

# Prompt fragments: `question` is placed before the article text, `question1`
# between the article text and the list of narratives.
question = "Given the following news article: "
question1 = ". Reply to me in a single word only such as yes or no. Does the given article contain the following political narratives: "

In [None]:
# One prompt per article: prefix, article text, the yes/no instruction, the
# article's narratives and a closing question mark.
text = [
    question + df_main['text'][row] + question1 + df_main['narratives'][row] + "?"
    for row in range(len(df_main))
]
text

In [None]:
import textwrap

# Rebuild `text` with make_prompt, replacing whatever list `text` held before.
# NOTE(review): `query` has no active assignment in the visible cells (the
# candidate definitions are commented out), so this loop raises NameError
# unless `query` is defined elsewhere - confirm which question is intended.
text = []
for i in range(len(df_main)):
    prompt = make_prompt(query, df_main['text'][i])
    text.append(prompt)

In [None]:
# Keep the models that advertise the 'generateText' method and use the first
# one; text_models[0] raises IndexError if no such model is listed.
text_models = [m for m in palm.list_models() if 'generateText' in m.supported_generation_methods]
text_model = text_models[0]

In [None]:
# Send each prompt to PaLM, pausing 3 seconds after every request, and collect
# the `result` of every response in `answers`.
temperature = 0.5
palm_options = dict(
    model = text_model,
    candidate_count = 1,
    temperature = temperature,
    max_output_tokens = 1024,
)

answers = []
for i in range(len(df_main)):
    answer = palm.generate_text(prompt = text[i], **palm_options)
    time.sleep(3)
    #for i, candidate in enumerate(answer.candidates):
        #print(f"Candidate {i}: {candidate['output']}\n")
        #if candidate['output'] == '':
            #answers.append('none')
    reply = answer.result
    answers.append(reply)
    print('index: ', i, 'response: ', reply)

In [None]:
# Attach the PaLM replies as a new column and preview the first rows.
df_main["PALM_narratives"] = answers
df_main.head()

In [None]:
# Split the rows by whether PaLM returned an answer: `nones` keeps the rows
# without one, `df_main` continues with the rest.
has_answer = df_main["PALM_narratives"].notna()
nones = df_main[~has_answer]
# .copy() gives an independent frame, so the column assignments made in later
# cells do not trigger pandas' SettingWithCopyWarning.
df_main = df_main[has_answer].copy()
print(len(nones))
print(len(df_main))

In [None]:
# Distinct PaLM answers and how often each occurs.
df_main["PALM_narratives"].value_counts()

In [None]:
# Encode the answers as 1 (yes) / 0 (no). Only the exact strings 'yes' and
# 'no' are mapped; any other answer becomes NaN.
df_main['PALM_narratives_enc'] = df_main['PALM_narratives'].map({'yes': 1, 'no': 0})
df_main["PALM_narratives_enc"].value_counts()

In [None]:
# Encode the PaLM stance labels: anti-russian -> 0, neutral -> 1,
# pro-russian -> 2; labels outside the dict become NaN.
# NOTE(review): no visible cell creates the 'PALM_stance' column - confirm
# where it comes from.
df_main['PALM_stance_enc'] = df_main['PALM_stance'].map({'anti-russian': 0, 'pro-russian': 2, 'neutral': 1})
df_main["PALM_stance_enc"].value_counts()

### Classification Results

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Compare the reference labels with the *encoded* PaLM predictions; the raw
# 'PALM_stance' column holds strings and cannot be matched against stance_enc.
# Pinning `labels` keeps the matrix 3x3 even when a class is absent, so it
# always fits the three row/column names below (0 = anti, 1 = neutral, 2 = pro).
cm = confusion_matrix(df_main["stance_enc"], df_main['PALM_stance_enc'], labels = [0, 1, 2])

cm_df = pd.DataFrame(cm, index = ['Anti-Russian','Neutral','Pro-Russian'], 
                     columns = ['Anti-Russian','Neutral','Pro-Russian'])

plt.figure(figsize=(6,5))
# The cells hold integer counts, so they are rendered without a decimal part.
sns.heatmap(cm_df,annot=True, fmt="d")
plt.title('Confusion Matrix')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()

In [None]:
# Score the encoded PaLM predictions; the raw 'PALM_stance' column holds
# strings and cannot be compared with the integer codes in stance_enc.
print('\nClassification Report:\n', classification_report(df_main['stance_enc'], df_main['PALM_stance_enc']))

In [None]:
# Persist the frame without the index. This writes to the same path as the
# export in the GPT section, so that file is overwritten.
df_main.to_csv('df_main_LLMs.csv', index = False)