In [None]:
import pandas as pd
import re
from transformers import pipeline
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm

In [None]:
# Load the labelled articles from the CSV sitting next to the notebook.
df_main = pd.read_csv(filepath_or_buffer='./labeled.csv')
# Disabled steps kept for reference: drop rows without text / keep the
# original row index in a separate column.
#df_main = df_main[df_main['text'].notna()]
#df_main['index1'] = df_main.index

In [None]:
def clean_text(text):
    """Normalise one raw article string before classification.

    Removes e-mail addresses, URLs, digits, punctuation and underscores,
    lowercases the result, then collapses every run of whitespace into a
    single space and trims both ends.

    Args:
        text: Raw text. Any non-string value (for example the float NaN
            pandas produces for an empty CSV cell) is treated as missing.

    Returns:
        str: The cleaned, lowercased text; an empty string when ``text``
        is not a string.
    """
    # Missing cells arrive as float NaN / None; re.sub would raise TypeError.
    if not isinstance(text, str):
        return ''

    # Remove emails
    text = re.sub(r'\S+@\S+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation and special characters. '_' counts as a word
    # character for \w, so it has to be listed explicitly; '-' is already
    # covered by [^\w\s].
    text = re.sub(r'[^\w\s]|_', '', text)

    # Convert to lowercase
    text = text.lower()

    # Collapse whitespace last, so gaps left by the removals above do not
    # survive as double spaces, and trim leading/trailing blanks.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def clean_dataframe(df, column_name):
    """Run ``clean_text`` over one column of ``df``.

    The column is overwritten on the DataFrame that was passed in (no copy
    is made), and that same DataFrame is returned.

    Args:
        df: DataFrame holding the text column.
        column_name: Label of the column to clean.

    Returns:
        The input DataFrame, with ``column_name`` replaced by cleaned text.
    """
    cleaned_column = df[column_name].apply(clean_text)
    df[column_name] = cleaned_column
    return df

In [None]:
# Normalise the article text before it is fed to the classifier.
df_main = clean_dataframe(df=df_main, column_name='text')

In [None]:
# Class balance of the hand-labelled stance column.
df_main.loc[:, 'stance'].value_counts()

In [None]:
# Zero-shot classifier built on the facebook/bart-large-mnli checkpoint.
# Only `task` and `model` are passed: `pretrained` is not a documented
# `pipeline()` argument, and naming the checkpoint in `model=` is what
# selects the pretrained weights.
bart_pipeline = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

#### One article classification with premise, labels and hypothesis

In [None]:
# Candidate labels for the classifier: the distinct values of the 'stance'
# column, in the order value_counts() returns them.
stance_freq = df_main.loc[:, 'stance'].value_counts()
stance_names = stance_freq.index.tolist()
print(stance_names)

In [None]:
# Single article used to sanity-check the classifier before the full run.
# .iloc[0] takes the first row by position, so it keeps working if rows
# were filtered and index label 0 no longer exists.
premise = df_main['text'].iloc[0]
labels = stance_names
# `{}` is filled with each candidate label in turn. The template must not
# name a stance itself, otherwise every hypothesis is skewed towards it.
hypothesis = "The stance of this text is {}."

In [None]:
# Baseline: no custom template, so the pipeline uses its built-in one.
predictions = bart_pipeline(premise, candidate_labels=labels, multi_label=False)
print(predictions)

In [None]:
# Same article, this time with the custom hypothesis template.
predictions = bart_pipeline(
    premise,
    candidate_labels=labels,
    multi_label=False,
    hypothesis_template=hypothesis,
)
print(predictions)

#### All articles classification with premise, labels and hypothesis

In [None]:
# Results go into a copy so df_main itself is left unchanged.
df_main1 = df_main.copy()
# One pipeline call per article; the raw result object is stored as-is
# and unpacked in a later cell.
df_main1['BART_labels'] = df_main['text'].apply(
    lambda article: bart_pipeline(
        article,
        candidate_labels=labels,
        multi_label=False,
        hypothesis_template=hypothesis,
    )
)

In [None]:
# Inspect the raw pipeline output stored for each article.
df_main1.loc[:, 'BART_labels']

In [None]:
# Element 0 of 'labels' / 'scores' is taken as the prediction and its score
# (the zero-shot pipeline documents its labels as sorted by likelihood).
df_main1['BART_predicted_category'] = df_main1['BART_labels'].map(lambda result: result['labels'][0])
df_main1['BART_score'] = df_main1['BART_labels'].map(lambda result: result['scores'][0])

In [None]:
# Distribution of the stances predicted by the model.
df_main1.loc[:, 'BART_predicted_category'].value_counts()

In [None]:
# Integer code for each predicted stance.
# NOTE(review): assumed to match the encoding used by 'stance_enc' — confirm.
df_main1['BART_enc'] = df_main1['BART_predicted_category'].map({'Neutral': 1, 'Pro-Russian': 2, 'Anti-Russian': 0})

# Series.map leaves NaN for any label missing from the dict; stop here with
# a readable message instead of failing later inside the metrics calls.
if df_main1['BART_enc'].isna().any():
    unmapped_labels = df_main1.loc[df_main1['BART_enc'].isna(), 'BART_predicted_category'].unique().tolist()
    raise ValueError(f"No stance code defined for predicted labels: {unmapped_labels}")

In [None]:
# First argument is the reference labels, second the predictions, so rows of
# the matrix are actual classes and columns are predicted classes.
# NOTE(review): 'stance_enc' is not created in this notebook — presumably it
# comes from the CSV; confirm.
cm = confusion_matrix(df_main1['stance_enc'], df_main1['BART_enc'])

cm_df = pd.DataFrame(cm)

plt.figure(figsize=(6,5))
# Cells hold integer counts, so annotate them as integers rather than "12.0".
sns.heatmap(cm_df, annot=True, fmt="d")
plt.title('Confusion Matrix')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()

In [None]:
# Per-class precision / recall / F1 of the BART predictions against the
# encoded reference stance.
print(
    '\nClassification Report:\n',
    classification_report(y_true=df_main1['stance_enc'], y_pred=df_main1['BART_enc']),
)