# IBM stance detection with topics and arguments

Stance detection on the IBM datasets, using topics and arguments as input to train the models.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import shap

#shap.initjs()

In [None]:
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('omw-1.4')

## 1. Import datasets

In [None]:
# Input CSV files (relative to the notebook location)
train_path, test_path = '../data/ibm_train.csv', '../data/ibm_test.csv'

# Output folders for saved figures and for saved models / metrics
plots_path, models_path = '../plots/topics and arguments/', '../models/topics and arguments/'

### Training set

In [None]:
train = pd.read_csv(train_path) 

In [None]:
train.head()

### Test set

In [None]:
test = pd.read_csv(test_path) 

In [None]:
test.head()

Concatenate the topic and the argument for each example in the dataset (currently disabled: both assignments in the next cell are commented out).

In [None]:
#train['argument'] = train['topic'] + ' ' + train['argument']
#test['argument'] = test['topic'] + ' ' + test['argument']

## 2. Preprocessing data

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from autocorrect import Speller
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# lower
# remove extra whitespace
# tokenize
# spelling corrections
# remove stopwords (to be verified whether this improves or worsens the results)
# remove punctuation
# lemmatization
# stemming 
# remove urls
# remove tags

In [None]:
class PreprocessArguments:
    """Convert a raw argument string into a list of stemmed word tokens.

    The helpers are built once in ``__init__`` and reused by every call to
    ``preprocess``.
    """

    def __init__(self):
        self.spell = Speller(lang='en')
        self.stopwords_set = set(stopwords.words('english'))
        self.punct_remover = RegexpTokenizer(r'\w+')
        self.porter = PorterStemmer()
        # Kept for the optional lemmatization step, which is not applied
        # by ``preprocess`` at the moment.
        self.wnl = WordNetLemmatizer()

    def preprocess(self, s):
        """Return the cleaned, stemmed tokens of the string ``s``.

        Steps, in order: lowercase, collapse whitespace runs, tokenize,
        spell-correct each token, drop punctuation, drop English stopwords,
        Porter-stem what is left.
        """
        # Lowercase and squeeze repeated whitespace into single spaces
        normalised = ' '.join(s.lower().split())

        # Tokenize, then spell-correct token by token
        corrected = [self.spell(token) for token in word_tokenize(normalised)]

        # Re-join and re-tokenize on \w+ so punctuation tokens disappear
        words = self.punct_remover.tokenize(' '.join(corrected))

        # Filter stopwords first, then stem the survivors
        return [self.porter.stem(word)
                for word in words
                if word not in self.stopwords_set]

In [None]:
preproc = PreprocessArguments()

In [None]:
# Tokenized/stemmed version of every training argument
train['arg_tok'] = train['argument'].map(preproc.preprocess)

In [None]:
# Tokenized/stemmed version of every test argument
test['arg_tok'] = test['argument'].map(preproc.preprocess)

In [None]:
train.head()

## 3. Classification

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

### 3.1 Baseline: Naive Bayes

#### Grid search

In [None]:
def dummy_tokenizer(sentence):
    """Identity function: return ``sentence`` exactly as received.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    so that input which is already a list of tokens is used as-is.
    """
    return sentence

In [None]:
lb = LabelBinarizer()
y_train = lb.fit_transform(train['stance'])

In [None]:
scoring = ['accuracy', 'f1_macro', 'precision', 'recall']

In [None]:
pipe = Pipeline([('preproc', TfidfVectorizer()), ('nb', MultinomialNB())])

In [None]:
params = [
    {'preproc': [TfidfVectorizer()],
     'preproc__tokenizer': [dummy_tokenizer],
     'preproc__preprocessor': [dummy_tokenizer],
     'preproc__token_pattern': [None],
     #'preproc__min_df': [1, 10, 20, 50, 100, 200],
     'preproc__min_df': np.arange(1,6,1),
     #'preproc__max_features': [None, 100, 200, 500],
     #'preproc__ngram_range': [(1,1), (1,2), (1,3), (1,4)],
     'preproc__ngram_range': [(1,1)],
     
     'nb': [MultinomialNB()],
     #'nb__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
     #'nb__alpha': [0.01, 0.05, 0.08, 0.1, 0.5, 0.8, 1, 5, 8]
     'nb__alpha': np.arange(0.01,0.2,0.01)
    }
]

In [None]:
clf = GridSearchCV(estimator=pipe, param_grid=params, scoring=scoring, refit='f1_macro',
                   cv=3, return_train_score=True, n_jobs=-1, verbose=0)

In [None]:
clf.fit(train['arg_tok'], y_train.ravel())

In [None]:
clf.best_params_

In [None]:
clf.best_score_

In [None]:
#pd.DataFrame(clf.cv_results_).sort_values(by='rank_test_f1_macro')[['mean_test_f1_macro', 'param_nb__alpha', 'param_preproc__ngram_range', 'param_preproc__min_df']][:40]
#tmp = pd.DataFrame(clf.cv_results_).sort_values(by='rank_test_f1_macro')[['mean_test_f1_macro', 'param_nb', 'param_preproc__ngram_range', 'param_preproc__min_df']][:600]
#tmp.groupby(by='param_preproc__min_df').count()


In [None]:
#pd.DataFrame(clf.cv_results_).to_csv(models_path+'nb_gridsearch.csv')

#### Evaluation

In [None]:
best_clf = clf.best_estimator_
best_clf.fit(train['arg_tok'], y_train.ravel())
pred_test = best_clf.predict(test['arg_tok'])
y_test = lb.transform(test['stance'])

In [None]:
cm = confusion_matrix(y_test, pred_test)
fig, ax = plt.subplots(figsize=(5,5))
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lb.inverse_transform(clf.classes_)).plot(ax=ax)
plt.savefig(plots_path+'nb_cm.png', bbox_inches ="tight")

In [None]:
print(classification_report(y_test, pred_test))

In [None]:
tmp = test.copy()
tmp['pred'] = lb.inverse_transform(pred_test)

In [None]:
# Per-example outcome flags, with PRO treated as the positive class
gold = tmp['stance']
correct = gold == tmp['pred']
gold_pro = gold == 'PRO'
gold_con = gold == 'CON'

tmp['TP'] = correct & gold_pro    # PRO predicted as PRO
tmp['TN'] = correct & gold_con    # CON predicted as CON
tmp['FP'] = ~correct & gold_con   # CON predicted as something else
tmp['FN'] = ~correct & gold_pro   # PRO predicted as something else
tmp['T'] = correct
tmp['F'] = ~correct

# Count each outcome per topic and order the rows by topic
tmp = tmp.groupby(by='topic')[['TP', 'TN', 'FP', 'FN', 'T', 'F']].sum().reset_index()
tmp = tmp.sort_values(by='topic')

In [None]:
plt.bar(tmp['topic'], tmp['T']/(tmp['T']+tmp['F'])*100, label='Correctly predicted')
plt.bar(tmp['topic'], tmp['F']/(tmp['T']+tmp['F'])*100, bottom=tmp['T']/(tmp['T']+tmp['F'])*100, label='Incorrectly predicted')
plt.title('Percentage of correctly and incorrectly predicted arguments by categories')
plt.ylabel('Percentage of arguments')
plt.yticks(np.arange(0,110,10))
plt.xticks(rotation=90)
plt.grid(axis='y', alpha=0.3)
plt.legend()
plt.savefig(plots_path+'nb_prediction_percentage.png', bbox_inches ='tight')
plt.show()

In [None]:
plt.bar(tmp['topic'], tmp['TP'], label='TP')
plt.bar(tmp['topic'], tmp['TN'], bottom=tmp['TP'], label='TN')
plt.bar(tmp['topic'], tmp['FP'], bottom=tmp['TP']+tmp['TN'], label='FP')
plt.bar(tmp['topic'], tmp['FN'], bottom=tmp['TP']+tmp['TN']+tmp['FP'], label='FN')
plt.title('Confusion matrix by categories')
plt.ylabel('# of arguments')
plt.yticks(np.arange(0,65,5))
plt.xticks(rotation=90)
plt.grid(axis='y', alpha=0.3)
plt.legend()
plt.savefig(plots_path+'nb_cm_categories.png', bbox_inches ='tight')
plt.show()

#### Shap analysis

In [None]:
explainer = shap.Explainer(best_clf.named_steps['nb'].predict,
                           best_clf.named_steps['preproc'].transform(train['arg_tok']).toarray(),
                           feature_names=best_clf.named_steps['preproc'].get_feature_names_out())

In [None]:
shap_values = explainer(best_clf.named_steps['preproc'].transform(test['arg_tok'][:20]).toarray(), max_evals='auto')

In [None]:
shap.plots.beeswarm(shap_values, max_display=10, order=shap_values.abs.max(0), show=False)
plt.savefig(plots_path+'nb_shap_beeswarm.png', bbox_inches ='tight')
plt.show()

In [None]:
test['stance'][:10]

In [None]:
#shap.force_plot(shap_values[0])
shap.force_plot(shap_values[5], link='logit', matplotlib=True, show=False) 
plt.savefig(plots_path+'nb_shap_force_PRO.png', bbox_inches ='tight')
plt.show()

In [None]:
shap.force_plot(shap_values[18], link='logit', matplotlib=True, show=False) 
plt.savefig(plots_path+'nb_shap_force_CON.png', bbox_inches ='tight')
plt.show()

In [None]:
shap.plots.heatmap(shap_values, instance_order=shap_values.sum(1), max_display=10, show=False)
plt.savefig(plots_path+'nb_shap_heatmap.png', bbox_inches ='tight')
plt.show()

In [None]:
shap.plots.waterfall(shap_values[5], max_display=10, show=False)
plt.savefig(plots_path+'nb_shap_waterfall_PRO.png', bbox_inches ='tight')
plt.show()

In [None]:
shap.plots.waterfall(shap_values[18], max_display=10, show=False)
plt.savefig(plots_path+'nb_shap_waterfall_CON.png', bbox_inches ='tight')
plt.show()

### 3.2 SVM

#### Grid search

In [None]:
def dummy_tokenizer(sentence):
    """Identity function: return ``sentence`` exactly as received.

    Used as ``tokenizer`` and ``preprocessor`` of the ``TfidfVectorizer`` in
    the SVM grid, whose input is already a list of tokens.
    """
    return sentence

In [None]:
lb = LabelBinarizer()
y_train = lb.fit_transform(train['stance'])

In [None]:
scoring = ['accuracy', 'f1_macro', 'precision', 'recall']

In [None]:
pipe = Pipeline([('preproc', TfidfVectorizer()), ('svm', SVC())])

In [None]:
params = [
    {'preproc': [TfidfVectorizer()],
     'preproc__tokenizer': [dummy_tokenizer],
     'preproc__preprocessor': [dummy_tokenizer],
     'preproc__token_pattern': [None],
     #'preproc__min_df': [1, 10, 20, 50, 100],
     #'preproc__min_df': np.arange(1,6,1),
     'preproc__min_df': [1],
     #'preproc__max_features': [None, 100, 200, 300, 400, 500, 600],
     #'preproc__ngram_range': [(1,1), (1,2), (1,3), (2,3), (1,4)],
     #'preproc__ngram_range': [(1,1), (1,2), (1,3)],
     'preproc__ngram_range': [(1,1)],
     
     'svm': [SVC()],
     #'svm__C': [0.1, 1, 2, 5, 10, 50],
     'svm__C': np.arange(1,3,0.01),
     #'svm__C': np.arange(1,6,1),
     'svm__kernel': ['poly'],
     #'svm__degree': [2, 3, 4, 5],
     #'svm__degree': np.arange(2,5,1),
     'svm__degree': [2],
     'svm__gamma': ['scale'],
     #'svm__shrinking': [True, False],     
    },
    
    #{'preproc': [TfidfVectorizer()],
     #'preproc__tokenizer': [dummy_tokenizer],
     #'preproc__preprocessor': [dummy_tokenizer],
     #'preproc__token_pattern': [None],
     #'preproc__min_df': [1, 10, 20, 50, 100],
     #'preproc__min_df': np.arange(1,6,1),
     #'preproc__max_features': [None, 100, 200, 300, 400, 500, 600],
     #'preproc__ngram_range': [(1,1), (1,2), (1,3), (2,3), (1,4)],
     #'preproc__ngram_range': [(1,1), (1,2), (1,3)],
     #'preproc__ngram_range': [(1,1)],
     
     #'svm': [SVC()],
     #'svm__C': [0.1, 1, 2, 5, 10, 50],
     #'svm__C': np.arange(1,15,0.5),
     #'svm__kernel': ['rbf', 'sigmoid'],
     #'svm__gamma': ['scale'],
     #'svm__kernel': ['rbf'],
     #'svm__shrinking': [True, False],     
    #},
]

In [None]:
clf = GridSearchCV(estimator=pipe, param_grid=params, scoring=scoring, refit='f1_macro',
                   cv=3, return_train_score=True, n_jobs=-1, verbose=0)

In [None]:
clf.fit(train['arg_tok'], y_train.ravel())

In [None]:
clf.best_params_

In [None]:
clf.best_score_

In [None]:
#pd.DataFrame(clf.cv_results_).sort_values(by='rank_test_f1_macro')[['mean_test_f1_macro', 'param_svm__kernel', 'param_svm__C', 'param_svm__degree', 'param_preproc__ngram_range', 'param_preproc__min_df']][:60]
#tmp = pd.DataFrame(clf.cv_results_).sort_values(by='rank_test_f1_macro')[['mean_test_f1_macro', 'param_svm__C', 'param_svm__degree', 'param_preproc__min_df', 'param_preproc__ngram_range']][:120]
#tmp.groupby(by=['param_preproc__min_df', 'param_preproc__ngram_range']).count()


In [None]:
#pd.DataFrame(clf.cv_results_).to_csv(models_path+'svc_gridsearch3.csv')

#### Evaluation

In [None]:
best_clf = clf.best_estimator_
best_clf.fit(train['arg_tok'], y_train.ravel())
pred_test = best_clf.predict(test['arg_tok'])
y_test = lb.transform(test['stance'])

In [None]:
cm = confusion_matrix(y_test, pred_test)
fig, ax = plt.subplots(figsize=(5,5))
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lb.inverse_transform(clf.classes_)).plot(ax=ax)
plt.savefig(plots_path+'svc_cm.png', bbox_inches ="tight")

In [None]:
print(classification_report(y_test, pred_test))

In [None]:
tmp = test.copy()
tmp['pred'] = lb.inverse_transform(pred_test)

In [None]:
# Per-example outcome flags, with PRO treated as the positive class
gold = tmp['stance']
correct = gold == tmp['pred']
gold_pro = gold == 'PRO'
gold_con = gold == 'CON'

tmp['TP'] = correct & gold_pro    # PRO predicted as PRO
tmp['TN'] = correct & gold_con    # CON predicted as CON
tmp['FP'] = ~correct & gold_con   # CON predicted as something else
tmp['FN'] = ~correct & gold_pro   # PRO predicted as something else
tmp['T'] = correct
tmp['F'] = ~correct

# Count each outcome per topic and order the rows by topic
tmp = tmp.groupby(by='topic')[['TP', 'TN', 'FP', 'FN', 'T', 'F']].sum().reset_index()
tmp = tmp.sort_values(by='topic')

In [None]:
plt.bar(tmp['topic'], tmp['T']/(tmp['T']+tmp['F'])*100, label='Correctly predicted')
plt.bar(tmp['topic'], tmp['F']/(tmp['T']+tmp['F'])*100, bottom=tmp['T']/(tmp['T']+tmp['F'])*100, label='Incorrectly predicted')
plt.title('Percentage of correctly and incorrectly predicted arguments by categories')
plt.ylabel('Percentage of arguments')
plt.yticks(np.arange(0,110,10))
plt.xticks(rotation=90)
plt.grid(axis='y', alpha=0.3)
plt.legend()
plt.savefig(plots_path+'svc_prediction_percentage.png', bbox_inches ='tight')
plt.show()

In [None]:
plt.bar(tmp['topic'], tmp['TP'], label='TP')
plt.bar(tmp['topic'], tmp['TN'], bottom=tmp['TP'], label='TN')
plt.bar(tmp['topic'], tmp['FP'], bottom=tmp['TP']+tmp['TN'], label='FP')
plt.bar(tmp['topic'], tmp['FN'], bottom=tmp['TP']+tmp['TN']+tmp['FP'], label='FN')
plt.title('Confusion matrix by categories')
plt.ylabel('# of arguments')
plt.yticks(np.arange(0,65,5))
plt.xticks(rotation=90)
plt.grid(axis='y', alpha=0.3)
plt.legend()
plt.savefig(plots_path+'svc_cm_categories.png', bbox_inches ='tight')
plt.show()

#### Shap analysis

In [None]:
explainer = shap.Explainer(best_clf.named_steps['svm'].predict,
                           best_clf.named_steps['preproc'].transform(train['arg_tok']).toarray(),
                           feature_names=best_clf.named_steps['preproc'].get_feature_names_out())

In [None]:
shap_values = explainer(best_clf.named_steps['preproc'].transform(test['arg_tok'][:20]).toarray(),
                        max_evals='auto')

In [None]:
shap.plots.beeswarm(shap_values, max_display=10, order=shap_values.abs.max(0), show=False)
plt.savefig(plots_path+'svc_shap_beeswarm.png', bbox_inches ='tight')
plt.show()

In [None]:
test['stance'][:10]

In [None]:
#shap.force_plot(shap_values[0])
shap.force_plot(shap_values[5], link='logit', matplotlib=True, show=False) 
plt.savefig(plots_path+'svc_shap_force_PRO.png', bbox_inches ='tight')
plt.show()

In [None]:
shap.force_plot(shap_values[2], link='logit', matplotlib=True, show=False) 
plt.savefig(plots_path+'svc_shap_force_CON.png', bbox_inches ='tight')
plt.show()

In [None]:
shap.plots.heatmap(shap_values, instance_order=shap_values.sum(1), max_display=10, show=False)
plt.savefig(plots_path+'svc_shap_heatmap.png', bbox_inches ='tight')
plt.show()

In [None]:
shap.plots.waterfall(shap_values[5], max_display=10, show=False)
plt.savefig(plots_path+'svc_shap_waterfall_PRO.png', bbox_inches ='tight')
plt.show()

In [None]:
shap.plots.waterfall(shap_values[2], max_display=10, show=False)
plt.savefig(plots_path+'svc_shap_waterfall_CON.png', bbox_inches ='tight')
plt.show()

### 3.3 BERT

In [None]:
from transformers import AutoTokenizer, pipeline, AutoModel, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, f1_score
import json
from tqdm.auto import tqdm
import copy

#### Load and encode the dataset

In [None]:
train_bert, val_bert = train_test_split(train, test_size=0.2, random_state=42, stratify=train[['stance', 'topic']])

In [None]:
train_bert = Dataset.from_pandas(train_bert[['argument', 'topic', 'stance']], split='train', preserve_index=False)
val_bert = Dataset.from_pandas(val_bert[['argument', 'topic', 'stance']], split='validation', preserve_index=False)
test_bert = Dataset.from_pandas(test[['argument', 'topic', 'stance']], split='test', preserve_index=False)

In [None]:
ibm_dataset = DatasetDict(train=train_bert, val=val_bert, test=test_bert)

In [None]:
id2label = {0: "CON", 1: "PRO"}
label2id = {"CON": 0, "PRO": 1}
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def preprocess_data(data):
    """Tokenize a batch of examples for the two-input BERT model.

    ``data`` is a batch with ``'argument'``, ``'topic'`` and ``'stance'``
    entries. Arguments and topics are tokenized separately with the same
    settings (special tokens, padded/truncated to 64 tokens). The topic
    fields are stored under a ``topic_`` prefix next to the argument fields,
    and stances are mapped to integer ``labels`` through ``label2id``.
    """
    shared_kwargs = dict(add_special_tokens=True,
                         padding='max_length',
                         truncation=True,
                         max_length=64)

    arg_enc = tokenizer(data['argument'], **shared_kwargs)
    topic_enc = tokenizer(data['topic'], **shared_kwargs)

    arg_enc['labels'] = [label2id[stance] for stance in data['stance']]
    for field in ('input_ids', 'attention_mask', 'token_type_ids'):
        arg_enc['topic_' + field] = topic_enc[field]
    return arg_enc

In [None]:
tokenized_dataset = ibm_dataset.map(preprocess_data, batched=True, batch_size=16, remove_columns=['argument', 'topic', 'stance'])

In [None]:
tokenized_dataset.set_format('torch')

#### Finetune the model

In [None]:
class BERTStance(nn.Module):
    """Two-input stance classifier on top of a single shared BERT encoder.

    The argument and the topic are each encoded by the same
    ``bert-base-uncased`` model. The hidden state at sequence position 0 of
    each encoding is taken, the two vectors are concatenated, passed through
    dropout and projected to two logits.
    """

    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.2)
        # Input is twice the hidden size: argument vector + topic vector
        self.classifier = nn.Linear(self.bert.config.hidden_size*2, 2)
        self.softmax = nn.Softmax(dim=1)
        # All BERT parameters stay trainable (no freezing is applied here).

    def _first_token_state(self, input_ids, attention_mask, token_type_ids):
        """Encode one input with BERT; return the hidden state at position 0."""
        encoded = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        return encoded['last_hidden_state'][:, 0, :]

    def forward(self, input_ids, attention_mask, token_type_ids, topic_input_ids, topic_attention_mask, topic_token_type_ids):
        """Return ``(logits, probs)`` for a batch of argument/topic pairs.

        ``probs`` is the softmax of ``logits`` over dimension 1.
        """
        arg_vec = self._first_token_state(input_ids, attention_mask, token_type_ids)
        topic_vec = self._first_token_state(topic_input_ids,
                                            topic_attention_mask,
                                            topic_token_type_ids)

        features = self.dropout(torch.cat((arg_vec, topic_vec), dim=1))
        logits = self.classifier(features)
        return logits, self.softmax(logits)

In [None]:
model = BERTStance()

In [None]:
def _stance_forward(model, batch, device):
    """Run ``model`` on one batch moved to ``device``; return (logits, probs, labels)."""
    logits, probs = model(batch['input_ids'].to(device),
                          attention_mask=batch['attention_mask'].to(device),
                          token_type_ids=batch['token_type_ids'].to(device),
                          topic_input_ids=batch['topic_input_ids'].to(device),
                          topic_attention_mask=batch['topic_attention_mask'].to(device),
                          topic_token_type_ids=batch['topic_token_type_ids'].to(device))
    return logits, probs, batch['labels'].to(device)


def trainloop(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune ``model`` and restore the weights of the best validation epoch.

    Args:
        model: module whose forward returns ``(logits, probs)`` for the keys
            ``input_ids``, ``attention_mask``, ``token_type_ids`` and their
            ``topic_``-prefixed counterparts.
        train_data: sized iterable of training batches (each batch also holds
            ``labels``).
        val_data: sized iterable of validation batches, same layout.
        learning_rate: learning rate given to AdamW.
        epochs: number of passes over ``train_data``.

    Returns:
        ``(model, loss_epochs, f1_epochs)``. The two dicts have ``'train'``
        and ``'val'`` lists with one value per epoch: the mean batch loss and
        the macro-F1 computed over all predictions of the epoch.

    Side effects:
        Writes the state dict of every new best epoch to
        ``models_path + 'bert_stance_t.pt'`` and prints per-epoch metrics.
    """
    best_f1 = 0.0
    best_model = None
    loss_epochs = {'train': [], 'val': []}
    f1_epochs = {'train': [], 'val': []}

    # Set parameters
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    # Linear decay to zero over the total number of optimizer steps
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_data)*epochs)

    device = torch.device('cpu')
    model.to(device)

    for epoch in tqdm(range(epochs)):

        # Training loop
        model.train()
        loss = 0
        y_true, y_pred = [], []

        for batch in train_data:
            # zero out previous gradients
            model.zero_grad()

            logits, probs, labels = _stance_forward(model, batch, device)

            # Compute loss and backpropagate
            batch_loss = loss_fn(logits, labels)
            loss += batch_loss.item()
            batch_loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # update parameters and learning rate
            optimizer.step()
            scheduler.step()

            # Collect predictions; F1 is computed once per epoch because the
            # mean of per-batch macro-F1 is not the macro-F1 of the epoch.
            y_pred.extend(torch.argmax(probs, dim=1).cpu().tolist())
            y_true.extend(labels.cpu().tolist())

        train_loss = loss / len(train_data)
        train_f1 = float(f1_score(y_true, y_pred, average='macro'))
        print(f"Train epoch {epoch} - Loss: {train_loss} - F1: {train_f1}")

        loss_epochs['train'].append(train_loss)
        f1_epochs['train'].append(train_f1)

        # Validation loop
        model.eval()
        loss = 0
        y_true, y_pred = [], []

        with torch.no_grad():
            for batch in val_data:
                logits, probs, labels = _stance_forward(model, batch, device)
                loss += loss_fn(logits, labels).item()
                y_pred.extend(torch.argmax(probs, dim=1).cpu().tolist())
                y_true.extend(labels.cpu().tolist())

        val_loss = loss / len(val_data)
        val_f1 = float(f1_score(y_true, y_pred, average='macro'))
        print(f"Val epoch {epoch} - Loss: {val_loss} - F1: {val_f1}")

        # Save best model (ties go to the later epoch)
        if val_f1 >= best_f1:
            best_f1 = val_f1
            best_model = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), models_path+'bert_stance_t.pt')

        loss_epochs['val'].append(val_loss)
        f1_epochs['val'].append(val_f1)

    # Load best model; nothing to restore if no epoch was run
    if best_model is not None:
        model.load_state_dict(best_model)
    return model, loss_epochs, f1_epochs


In [None]:
batch_size = 16
epochs = 5
lr = 1e-4

In [None]:
train_dataloader = DataLoader(tokenized_dataset['train'], batch_size=batch_size)
eval_dataloader = DataLoader(tokenized_dataset['val'], batch_size=batch_size)

In [None]:
model, loss_epochs, f1_epochs = trainloop(model, train_dataloader, eval_dataloader, lr, epochs)

In [None]:
f1_epochs

In [None]:
loss_epochs

In [None]:
with open(models_path+'bert_f1_t.json', 'w') as fp:
    json.dump(f1_epochs, fp)

with open(models_path+'bert_loss_t.json', 'w') as fp:
    json.dump(loss_epochs, fp)

In [None]:
#test_dataloader = DataLoader(tokenized_dataset['test'], batch_size=batch_size)

In [None]:
#device = torch.device('cpu')
#model.to(device)

#all_pred = []
#model.eval()

#for batch_idx, batch in enumerate(test_dataloader):
#    with torch.no_grad():
#        logits, probs = model(batch['input_ids'].to(device),
#                            attention_mask=batch['attention_mask'].to(device),
#                            token_type_ids=batch['token_type_ids'].to(device),
#                            topic_input_ids=batch['topic_input_ids'].to(device),
#                            topic_attention_mask=batch['topic_attention_mask'].to(device),
#                            topic_token_type_ids=batch['topic_token_type_ids'].to(device))

#    pred = torch.argmax(probs, dim=1).cpu().numpy()
#    all_pred.extend(pred)

In [None]:
#y_test = tokenized_dataset['test']['labels'].numpy()
#y_pred = np.array(all_pred)

#### Evaluate the model

In [None]:
finetuned_model = BERTStance()
finetuned_model.load_state_dict(torch.load(models_path+'bert_stance_t.pt'))
id2label = {0: "CON", 1: "PRO"}
label2id = {"CON": 0, "PRO": 1}
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Tokenize test arguments and topics with identical settings. Both calls get
# a plain list of strings and return PyTorch tensors, because the model's
# forward pass indexes and concatenates these inputs as tensors.
model_inputs = tokenizer(test['argument'].to_list(),
                        add_special_tokens=True,
                        padding='max_length',
                        truncation=True,
                        max_length=64,
                        return_tensors='pt')
tok_topic = tokenizer(test['topic'].to_list(),
                        add_special_tokens=True,
                        padding='max_length',
                        truncation=True,
                        max_length=64,
                        return_tensors='pt')

In [None]:
finetuned_model.eval()
with torch.no_grad():
    logits, probs = finetuned_model(model_inputs['input_ids'],
                                    attention_mask=model_inputs['attention_mask'],
                                    token_type_ids=model_inputs['token_type_ids'],
                                    topic_input_ids=tok_topic['input_ids'],
                                    topic_attention_mask=tok_topic['attention_mask'],
                                    topic_token_type_ids=tok_topic['token_type_ids'])

In [None]:
y_pred = torch.argmax(probs, dim=1).numpy()

In [None]:
y_test = [label2id[l] for l in test['stance']]

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=[0,1])
fig, ax = plt.subplots(figsize=(5,5))
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['CON', 'PRO']).plot(ax=ax)
plt.savefig(plots_path+'bert_cm_t.png', bbox_inches ="tight")

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
tmp = test.copy()
tmp['pred'] = [id2label[i] for i in y_pred]

In [None]:
# Per-example outcome flags, with PRO treated as the positive class
gold = tmp['stance']
correct = gold == tmp['pred']
gold_pro = gold == 'PRO'
gold_con = gold == 'CON'

tmp['TP'] = correct & gold_pro    # PRO predicted as PRO
tmp['TN'] = correct & gold_con    # CON predicted as CON
tmp['FP'] = ~correct & gold_con   # CON predicted as something else
tmp['FN'] = ~correct & gold_pro   # PRO predicted as something else
tmp['T'] = correct
tmp['F'] = ~correct

# Count each outcome per topic and order the rows by topic
tmp = tmp.groupby(by='topic')[['TP', 'TN', 'FP', 'FN', 'T', 'F']].sum().reset_index()
tmp = tmp.sort_values(by='topic')

In [None]:
plt.bar(tmp['topic'], tmp['T']/(tmp['T']+tmp['F'])*100, label='Correctly predicted')
plt.bar(tmp['topic'], tmp['F']/(tmp['T']+tmp['F'])*100, bottom=tmp['T']/(tmp['T']+tmp['F'])*100, label='Incorrectly predicted')
plt.title('Percentage of correctly and incorrectly predicted arguments by categories')
plt.ylabel('Percentage of arguments')
plt.yticks(np.arange(0,110,10))
plt.xticks(rotation=90)
plt.grid(axis='y', alpha=0.3)
plt.legend()
plt.savefig(plots_path+'bert_prediction_percentage_t.png', bbox_inches ='tight')
plt.show()

In [None]:
plt.bar(tmp['topic'], tmp['TP'], label='TP')
plt.bar(tmp['topic'], tmp['TN'], bottom=tmp['TP'], label='TN')
plt.bar(tmp['topic'], tmp['FP'], bottom=tmp['TP']+tmp['TN'], label='FP')
plt.bar(tmp['topic'], tmp['FN'], bottom=tmp['TP']+tmp['TN']+tmp['FP'], label='FN')
plt.title('Confusion matrix by categories')
plt.ylabel('# of arguments')
plt.yticks(np.arange(0,65,5))
plt.xticks(rotation=90)
plt.grid(axis='y', alpha=0.3)
plt.legend()
plt.savefig(plots_path+'bert_cm_categories_t.png', bbox_inches ='tight')
plt.show()

#### Shap analysis

In [None]:
pred = pipeline("text-classification", model=finetuned_model, tokenizer=tokenizer)

In [None]:
explainer = shap.Explainer(pred)

In [None]:
shap_values = explainer(test['argument'][:20])

In [None]:
shap.plots.bar(shap_values[:,:,1].mean(0), max_display=10, show=False)
plt.savefig(plots_path+'bert_shap_PRO.png', bbox_inches ='tight')
plt.show()

In [None]:
shap.plots.bar(shap_values[:,:,0].mean(0), max_display=10, show=False)
plt.savefig(plots_path+'bert_shap_CON.png', bbox_inches ='tight')
plt.show()

In [None]:
test['stance'][:10]

In [None]:
shap.plots.waterfall(shap_values[0,:,1], max_display=10, show=False)
plt.savefig(plots_path+'bert_shap_waterfall_PRO.png', bbox_inches ='tight')
plt.show()

In [None]:
shap.plots.waterfall(shap_values[11,:,0], max_display=10, show=False)
plt.savefig(plots_path+'bert_shap_waterfall_CON.png', bbox_inches ='tight')
plt.show()

In [None]:
shap.plots.text(shap_values[0,:,1])

### 3.4 Prompt tuning GPT2

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, pipeline
from peft import PromptTuningConfig, PromptTuningInit, PeftType, TaskType, get_peft_model, PromptEncoderConfig, PeftConfig, PeftModel
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import evaluate
import json

#### Load and encode the dataset

In [None]:
train_bert, val_bert = train_test_split(train, test_size=0.2, random_state=42)

In [None]:
train_bert = Dataset.from_pandas(train_bert[['argument', 'stance']], split='train', preserve_index=False)
val_bert = Dataset.from_pandas(val_bert[['argument', 'stance']], split='validation', preserve_index=False)
test_bert = Dataset.from_pandas(test[['argument', 'stance']], split='test', preserve_index=False)
ibm_dataset = DatasetDict(train=train_bert, val=val_bert, test=test_bert)

In [None]:
id2label = {0: "CON", 1: "PRO"}
label2id = {"CON": 0, "PRO": 1}
tokenizer = AutoTokenizer.from_pretrained("gpt2", truncation=True, padding_side='left')
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
def preprocess_data(data):
    """Tokenize a batch of arguments and attach integer stance labels.

    ``data`` is a batch with ``'argument'`` and ``'stance'`` entries. The
    arguments are padded to the longest sequence of the batch, and stances
    are mapped to ids through ``label2id``.
    """
    # Truncation has to be requested on the call itself; without it an
    # over-long argument would be encoded past the model's maximum length.
    encoding = tokenizer(data['argument'], padding=True, truncation=True)
    encoding['labels'] = [label2id[l] for l in data['stance']]
    return encoding

In [None]:
tokenized_dataset = ibm_dataset.map(preprocess_data, batched=True, batch_size=16, remove_columns=['argument', 'stance'])
tokenized_dataset.set_format('torch')

#### Prompt tuning of the model

In [None]:
peft_config = PromptEncoderConfig(#PromptTuningConfig(
    #peft_type=PeftType.PROMPT_TUNING,
    peft_type=PeftType.P_TUNING,
    task_type=TaskType.SEQ_CLS,
    #prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=16,
    #prompt_tuning_init_text='Detect if the stance of this tweet is PRO or CON:',
    #tokenizer_name_or_path='gpt2',
)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("gpt2",
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id)
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
model = get_peft_model(model, peft_config)

In [None]:
arguments = TrainingArguments(
    output_dir=models_path+'gpt2',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    weight_decay=0.01,
    metric_for_best_model='f1',
    load_best_model_at_end=True,
)

In [None]:
# Kept so that any other cell referring to `metrics` still works;
# compute_metrics below no longer goes through it.
metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 / precision / recall.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'`` (plain floats).
    """
    # NOTE(review): this used to pass average='macro' through
    # evaluate.combine(...).compute, which does not reliably forward extra
    # keyword arguments to each metric - confirm against the installed
    # version. scikit-learn is called directly so the averaging is explicit.
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0)
    return {
        'accuracy': float(accuracy_score(labels, predictions)),
        'f1': float(f1),
        'precision': float(precision),
        'recall': float(recall),
    }

In [None]:
# Trainer wiring: train on the 'train' split, evaluate on the 'val' split, and
# report the metrics computed by compute_metrics.
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Early stopping is currently disabled; uncomment to enable it.
#trainer.add_callback(EarlyStoppingCallback())

In [None]:
# Run the training loop with the configuration defined above.
trainer.train()

In [None]:
# Save the trained model under gpt2/final_model, then write the model config
# as config.json into the same directory.
# NOTE(review): presumably the config is written so the directory can be loaded
# with from_pretrained in the evaluation section — confirm.
trainer.save_model(models_path+'gpt2/final_model')
trainer.model.config.to_json_file(models_path+'gpt2/final_model/config.json')

In [None]:
# Store the trainer state as JSON alongside the model outputs.
trainer.state.save_to_json(models_path+'gpt2/training_state.json')

In [None]:
# Evaluate on the training split and store the resulting metrics as JSON.
with open(models_path+'gpt2/train_metrics.json', 'w') as metrics_file:
    train_metrics = trainer.evaluate(tokenized_dataset['train'])
    json.dump(train_metrics, metrics_file)

In [None]:
# Evaluate on the validation split and store the resulting metrics as JSON.
with open(models_path+'gpt2/val_metrics.json', 'w') as metrics_file:
    val_metrics = trainer.evaluate(tokenized_dataset['val'])
    json.dump(val_metrics, metrics_file)

In [None]:
# Evaluate on the test split and store the resulting metrics as JSON.
with open(models_path+'gpt2/test_metrics.json', 'w') as metrics_file:
    test_metrics = trainer.evaluate(tokenized_dataset['test'])
    json.dump(test_metrics, metrics_file)

In [None]:
#test_pred = trainer.predict(tokenized_dataset['test'])
#y_pred = test_pred.predictions.argmax(axis=1)

In [None]:
#y_test = tokenized_dataset['test']['labels']

In [None]:
#print(classification_report(y_test, test_pred.predictions.argmax(axis=1)))

#### Evaluate the model

In [None]:
# Reload the saved model and tokenizer from disk for evaluation.
# NOTE(review): training wrapped the model with PEFT, so `final_model`
# presumably holds adapter weights; confirm that loading the directory directly
# restores the trained prompt encoder. The commented-out lines show the
# explicit PEFT route (PeftConfig -> base model -> PeftModel.from_pretrained).
#config = PeftConfig.from_pretrained(models_path+'gpt2/final_model')
#inference_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)
ptuned_model = AutoModelForSequenceClassification.from_pretrained(models_path+'gpt2/final_model')

# Same tokenizer settings as in training: left padding, EOS id reused as padding id.
tokenizer = AutoTokenizer.from_pretrained(models_path+'gpt2/final_model', padding_side='left')
tokenizer.pad_token_id = tokenizer.eos_token_id

#ptuned_model = PeftModel.from_pretrained(inference_model, models_path+'gpt2/final_model')
# Give the model the same padding id the tokenizer uses.
ptuned_model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Class index -> stance name, plus the inverse lookup derived from it.
id2label = {0: "CON", 1: "PRO"}
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Tokenize all test arguments as one padded, truncated batch of PyTorch tensors.
model_inputs = tokenizer(test['argument'].to_list(), return_tensors='pt', padding=True, truncation=True)

In [None]:
# Forward pass without gradient tracking.
# NOTE(review): the whole test set is sent through the model as a single batch;
# consider mini-batches if memory becomes a problem.
with torch.no_grad():
    pred = ptuned_model(**model_inputs)

In [None]:
# Predicted class = index of the largest logit; gold labels are mapped to the
# same integer ids.
y_pred = pred.logits.argmax(dim=1).numpy()
y_test = list(map(label2id.__getitem__, test['stance']))

In [None]:
# Confusion matrix of the GPT-2 predictions on the test set (0 = CON, 1 = PRO),
# drawn on a 5x5 figure and saved to the plots folder.
cm = confusion_matrix(y_test, y_pred, labels=[0,1])
fig, ax = plt.subplots(figsize=(5,5))
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['CON', 'PRO'])
cm_display.plot(ax=ax)
plt.savefig(plots_path+'gpt2_cm.png', bbox_inches="tight")

In [None]:
# Per-class precision/recall/F1 on the test set (classes are the integer ids 0 = CON, 1 = PRO).
print(classification_report(y_test, y_pred))

In [None]:
# Copy of the test set with an extra 'pred' column holding the predicted stance name.
tmp = test.assign(pred=[id2label[i] for i in y_pred])

In [None]:
# Flag every test example with its outcome, treating PRO as the positive class.
# Whole-column comparisons replace the row-wise `apply` calls: they yield the
# same booleans, run faster, and also work when the frame is empty.
is_correct = tmp['stance'] == tmp['pred']
is_pro = tmp['stance'] == 'PRO'
is_con = tmp['stance'] == 'CON'
tmp['TP'] = is_correct & is_pro    # gold PRO, predicted PRO
tmp['TN'] = is_correct & is_con    # gold CON, predicted CON
tmp['FP'] = ~is_correct & is_con   # gold CON, predicted otherwise
tmp['FN'] = ~is_correct & is_pro   # gold PRO, predicted otherwise
tmp['T'] = is_correct
tmp['F'] = ~is_correct

# Count each outcome per topic: one row per topic, ordered by topic name.
outcome_cols = ['TP', 'TN', 'FP', 'FN', 'T', 'F']
tmp = tmp.groupby(by='topic').agg({col: 'sum' for col in outcome_cols}).reset_index()
tmp = tmp.sort_values(by='topic')

In [None]:
# Stacked bars: share of correctly vs. incorrectly predicted arguments per topic.
topic_totals = tmp['T']+tmp['F']
pct_correct = tmp['T']/topic_totals*100
pct_incorrect = tmp['F']/topic_totals*100

plt.bar(tmp['topic'], pct_correct, label='Correctly predicted')
# The incorrect share is stacked on top of the correct share.
plt.bar(tmp['topic'], pct_incorrect, bottom=pct_correct, label='Incorrectly predicted')

plt.title('Percentage of correctly and incorrectly predicted arguments by categories')
plt.ylabel('Percentage of arguments')
plt.yticks(np.arange(0, 110, 10))
plt.xticks(rotation=90)
plt.grid(axis='y', alpha=0.3)
plt.legend()
plt.savefig(plots_path+'gpt2_prediction_percentage.png', bbox_inches='tight')
plt.show()

In [None]:
# Stacked bars with the TP/TN/FP/FN counts per topic; every segment starts
# where the previously drawn segments end.
stack_base = 0
for outcome in ('TP', 'TN', 'FP', 'FN'):
    plt.bar(tmp['topic'], tmp[outcome], bottom=stack_base, label=outcome)
    stack_base = stack_base + tmp[outcome]

plt.title('Confusion matrix by categories')
plt.ylabel('# of arguments')
plt.yticks(np.arange(0, 65, 5))
plt.xticks(rotation=90)
plt.grid(axis='y', alpha=0.3)
plt.legend()
plt.savefig(plots_path+'gpt2_cm_categories.png', bbox_inches='tight')
plt.show()

#### SHAP analysis

In [None]:
# Wrap the reloaded model and tokenizer in a text-classification pipeline and
# build a SHAP explainer on top of it.
# Note: this rebinds `pred`, which previously held the raw model outputs.
pred = pipeline("text-classification", model=ptuned_model, tokenizer=tokenizer)
explainer = shap.Explainer(pred)

In [None]:
# Compute SHAP values for the first 20 test arguments only.
shap_values = explainer(test['argument'][:20])

In [None]:
# Bar plot of the SHAP values for output index 1, averaged over the explained
# examples (top 10 shown).
# NOTE(review): index 1 presumably corresponds to the PRO label (id2label[1]) —
# confirm against the pipeline's output order.
shap.plots.bar(shap_values[:,:,1].mean(0), max_display=10, show=False)
plt.savefig(plots_path+'gpt2_shap_PRO.png', bbox_inches ='tight')
plt.show()

In [None]:
# Bar plot of the SHAP values for output index 0, averaged over the explained
# examples (top 10 shown).
# NOTE(review): index 0 presumably corresponds to the CON label (id2label[0]) —
# confirm against the pipeline's output order.
shap.plots.bar(shap_values[:,:,0].mean(0), max_display=10, show=False)
plt.savefig(plots_path+'gpt2_shap_CON.png', bbox_inches ='tight')
plt.show()

In [None]:
# Display the gold stances of the first 10 test examples (presumably to pick
# the examples explained individually below).
test['stance'][:10]

In [None]:
# Waterfall plot for the explained example at position 5, output index 1
# (saved as the PRO example).
shap.plots.waterfall(shap_values[5,:,1], max_display=10, show=False)
plt.savefig(plots_path+'gpt2_shap_waterfall_PRO.png', bbox_inches ='tight')
plt.show()

In [None]:
# Waterfall plot for the explained example at position 2, output index 0
# (saved as the CON example).
shap.plots.waterfall(shap_values[2,:,0], max_display=10, show=False)
plt.savefig(plots_path+'gpt2_shap_waterfall_CON.png', bbox_inches ='tight')
plt.show()

In [None]:
# Text plot of the SHAP values for the same example and output index as the
# CON waterfall plot.
shap.plots.text(shap_values[2,:,0])

### 3.5 ChatGPT

In [None]:
# Display the gold stances of the first 50 test examples, the subset the
# ChatGPT predictions below are compared against.
test['stance'][:50]

In [None]:
# Stances predicted from the argument text alone (no topic given), one entry
# per example for the first 50 test rows, in test-set order.
# The trailing comment on each entry is the argument that was classified.
pred_stances = [
    'PRO',  # knowledge should be "shared in solidarity"
    'CON',  # faith − belief that is not based on evidence − is one of the world's great evils
    'CON',  # dam construction requires the state to displace individual people
    'CON',  # China's gender imbalance is further increased by the One Child Policy
    'CON',  # laissez-faire capitalism creates social evils that harm its citizens
    'CON',  # it was Hamas that broke the truce
    'PRO',  # gambling increases aggregate demand for goods and services in the economy
    'CON',  # A large dam can cause the loss of entire ecospheres
    'CON',  # Damming can harm local ecosystems
    'PRO',  # the focus of China on population control helps provide a better health service for women
    'CON',  # A large dam can endanger ecosystems by restricting the motion of marine animals
    'CON',  # the right to self-defence requires that peaceful means are first exhausted before resorting to military force, something Israel "did not even contemplate doing
    'PRO',  # Monarchy provides continuity and stability
    'PRO',  # free trade gives optimal economic advantages
    'CON',  # Advertising's cumulative cultural effects, unless quickly checked, will be responsible for destroying the world as we know it
    'CON',  # video games allow children to act out crimes
    'CON',  # advertising attempts to equate the social with the material by utilizing images and slogans to link commodities with the real sources of human happiness
    'CON',  # a perfect God can have no need to create a world
    'PRO',  # Intellectual property is viewed as a necessary way of incentivising the creation of new creative works
    'CON',  # denying the existence of a god leads to moral relativism, leaving one with no moral or ethical foundation
    'CON',  # welfare not only increases poverty but also increases other problems
    'CON',  # The right to free speech conflicts with other rights
    'CON',  # abstinence-only programs deprive teenagers of critical information about sexuality
    'PRO',  # freedom of speech, in order to exist and function, necessarily extends to even the unpopular
    'CON',  # Unfettered markets undermine the social order
    'CON',  # Denying the existence of a god renders life meaningless and miserable
    'CON',  # the universe can be explained without any reference to the supernatural
    'CON',  # It is immoral to create children
    'PRO',  # In charity gambling profits from the venture go to the charity or group of charities, rather than to a municipality or private casino
    'CON',  # Faith is divisive and dangerous
    'CON',  # studies of abstinence programs have not produced sufficient evidence to justify their widespread dissemination
    'CON',  # Subsidies may distort production incentives
    'CON',  # The blockade action is a violation of international law
    'PRO',  # the average standard of living in a declining population, at least in terms of material possessions, will tend to rise
    'CON',  # advertising focuses on looking toward external rewards for a sense of self
    'CON',  # Providing safe-sex education promotes promiscuity
    'PRO',  # government intervention could serve a useful purpose
    'CON',  # democracy will result in the people's distrust and disrespect of governments
    'PRO',  # the expression of dissent or subversive views should be tolerated
    'CON',  # reliance on divine authority lends itself to authoritarianism and dogmatism
    'CON',  # the consequences of Israel’s failure to maintain the blockade would be “an Iranian port in Gaza, only a few dozen kilometers from Tel Aviv and Jerusalem
    'CON',  # A homogeneous community grounded on consensus may be unable to criticize the injustice and exclusionary practices that undermine it
    'CON',  # Certain restrictions on abortion could be used to form a slippery slope against all abortions
    'CON',  # allowing property rights in ideas and information creates artificial scarcity
    'CON',  # Multiculturalism would lead to acceptance of barbaric practices
    'CON',  # violent video games are significantly associated with: increased aggressive behavior, thoughts, and affect; increased physiological arousal; and decreased pro-social (helping) behavior
    'PRO',  # The possibility of getting shot by an armed victim is a substantial deterrent to crime
    'CON',  # abortion causes mental health problems
    'CON',  # prohibiting people from using, reproducing, and trading copyrighted material is an infringement of freedom of speech
    'CON'   # Freedom of speech does not allow a person to contempt the courts
]

In [None]:
# Stances predicted when the topic is supplied together with the argument,
# for the same first 50 test rows and in the same order as `pred_stances`.
# The trailing comment on each entry is the topic of that example.
pred_stances2 = [
    'PRO',  # intellectual property rights
    'CON',  # atheism
    'CON',  # build hydroelectric dams
    'CON',  # the one-child policy of the republic of China
    'CON',  # unleash the free market
    'CON',  # Israel's 2008-2009 military operations against Gaza
    'PRO',  # gambling
    'CON',  # build hydroelectric dams
    'CON',  # build hydroelectric dams
    'PRO',  # the one-child policy of the republic of China
    'CON',  # build hydroelectric dams
    'CON',  # Israel's 2008-2009 military operations against Gaza
    'PRO',  # the monarchy
    'CON',  # unleash the free market
    'CON',  # advertising
    'PRO',  # the sale of violent video games to minors
    'CON',  # advertising
    'CON',  # atheism
    'PRO',  # intellectual property rights
    'CON',  # atheism
    'CON',  # subsidize poor communities
    'CON',  # freedom of speech
    'CON',  # only teach abstinence for sex education in schools
    'PRO',  # freedom of speech
    'CON',  # unleash the free market
    'CON',  # atheism
    'CON',  # atheism
    'PRO',  # have children
    'PRO',  # gambling
    'CON',  # atheism
    'CON',  # only teach abstinence for sex education in schools
    'CON',  # subsidize poor communities
    'CON',  # the blockade of Gaza
    'PRO',  # have children
    'CON',  # advertising
    'CON',  # only teach abstinence for sex education in schools
    'PRO',  # unleash the free market
    'CON',  # democratization
    'PRO',  # freedom of speech
    'CON',  # atheism
    'CON',  # the blockade of Gaza
    'CON',  # multiculturalism
    'CON',  # partial birth abortions
    'PRO',  # intellectual property rights
    'CON',  # multiculturalism
    'PRO',  # the sale of violent video games to minors
    'PRO',  # the right to bear arms
    'CON',  # partial birth abortions
    'PRO',  # intellectual property rights
    'CON'   # freedom of speech
]


In [None]:
# Confusion matrix of the argument-only predictions against the gold stances
# of the first 50 test examples.
cm = confusion_matrix(test['stance'][:50], pred_stances, labels=['PRO', 'CON'])
fig, ax = plt.subplots(figsize=(5,5))
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['PRO', 'CON'])
cm_display.plot(ax=ax)
plt.savefig(plots_path+'chatGPT_cm.png', dpi=300, bbox_inches='tight')

In [None]:
# Classification report for the argument-only predictions (`pred_stances`).
print(classification_report(test['stance'][:50], pred_stances))

In [None]:
# Confusion matrix of the topic+argument predictions against the gold stances
# of the first 50 test examples.
cm = confusion_matrix(test['stance'][:50], pred_stances2, labels=['PRO', 'CON'])
fig, ax = plt.subplots(figsize=(5,5))
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['PRO', 'CON'])
cm_display.plot(ax=ax)
plt.savefig(plots_path+'chatGPT_cm2.png', dpi=300, bbox_inches='tight')

In [None]:
# Classification report for the predictions made with the topic included
# (`pred_stances2`), matching the confusion matrix saved as chatGPT_cm2.png.
print(classification_report(test['stance'][:50], pred_stances2))