<a href="https://colab.research.google.com/github/PaulaDCV/PropagandaDetection/blob/main/Propaganda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and setup

In [None]:
import pandas as pd
import altair as alt
import numpy as np
import string
import re
import itertools as it
from collections import Counter


#Wandb
!pip install wandb
import wandb
wandb.login()
from wandb.sklearn import plot_precision_recall, plot_feature_importances
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc

#Sklearn imports
import sklearn
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

#Nltk imports
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

#Transformers
!pip install -q transformers
!pip install -q datasets
!pip install evaluate

from transformers import pipeline
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[34m[1mwandb[0m: Currently logged in as: [33mpauladelcastillovivero[0m ([33mpdc[0m). Use [1m`wandb login --relogin`[0m to force relogin


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Read the data and connvert into a dataframe
train_path = "/content/drive/MyDrive/ANLP/propaganda_train.tsv"
test_path = "/content/drive/MyDrive/ANLP/propaganda_val.tsv"

traindf = pd.read_csv(train_path,sep='\t')
testdf = pd.read_table(test_path)

In [None]:
#Add a column with a binary label 1-propaganda, 0-not propaganda
traindf["binary_label"] = np.where(traindf["label"] != "not_propaganda", 1,0)
testdf["binary_label"] = np.where(testdf["label"] != "not_propaganda", 1,0)

#Add a column removing the tags <BOS> and <EOS>
traindf["sentence"]=traindf["tagged_in_context"].str.replace('<BOS>', '')
traindf["sentence"]=traindf["sentence"].str.replace('<EOS>', '')
testdf["sentence"]=testdf["tagged_in_context"].str.replace('<BOS> ', '')
testdf["sentence"]=testdf["sentence"].str.replace('<EOS>', '')

#Extract span of text between BOS and EOS
traindf["span"] = traindf["tagged_in_context"].apply(lambda x: re.findall(r'<BOS> (.+?) <EOS>',str(x)))
traindf["span"] = traindf["span"].str[0]
traindf["span_length"] = traindf['span'].str.split().str.len()

#Calculate number of words in each span of text
testdf["span"] = testdf["tagged_in_context"].apply(lambda x: re.findall(r'<BOS> (.+?) <EOS>',str(x)))
testdf["span"] = testdf["span"].str[0]
testdf["span_length"] = testdf['span'].str.split().str.len()

#Length of sentence string
traindf["len"]=traindf["sentence"].str.len()
testdf["len"]=testdf["sentence"].str.len()

#Calculate number of words in each sentence
count = traindf['sentence'].str.split().apply(len).value_counts()
traindf["sentence_length"]=traindf['sentence'].str.split().str.len()
testdf["sentence_length"]=testdf['sentence'].str.split().str.len()

#DataFrame with all data
traindf["type"] = "Train"
testdf["type"] = "Test"
datadf = pd.concat([traindf,testdf])
datadf['binary_label'] = datadf['binary_label'].map({1: "Propaganda", 0:"Not Propaganda"})


In [None]:
#Dataframe with just the propaganda senteneces for multiclass classification
propaganda_train_df = traindf[traindf["binary_label"]==1]
propaganda_test_df = testdf[testdf["binary_label"]==1]
propaganda_df = datadf[datadf["binary_label"]=="Propaganda"]

In [None]:
datadf.to_csv("data.csv")

# EDA

### Class proportions

Training

In [None]:
#Binary class proportions
percentage_train = traindf.groupby("binary_label").count().reset_index()
percentage_train["percentage"] = (percentage_train["tagged_in_context"] / percentage_train['tagged_in_context'].sum()) * 100
percentage_train

Unnamed: 0,binary_label,label,tagged_in_context,sentence,span,span_length,len,sentence_length,type,percentage
0,0,1268,1268,1268,1268,1268,1268,1268,1268,49.569977
1,1,1290,1290,1290,1290,1290,1290,1290,1290,50.430023


In [None]:
#Binary class proportions
percentage_train = traindf.groupby("label").count().reset_index()
percentage_train["percentage"] = (percentage_train["tagged_in_context"] / percentage_train['tagged_in_context'].sum()) * 100
percentage_train

Unnamed: 0,label,tagged_in_context,binary_label,sentence,span,span_length,len,sentence_length,type,percentage
0,appeal_to_fear_prejudice,157,157,157,157,157,157,157,157,6.137608
1,causal_oversimplification,165,165,165,165,165,165,165,165,6.450352
2,doubt,157,157,157,157,157,157,157,157,6.137608
3,"exaggeration,minimisation",169,169,169,169,169,169,169,169,6.606724
4,flag_waving,155,155,155,155,155,155,155,155,6.059421
5,loaded_language,161,161,161,161,161,161,161,161,6.29398
6,"name_calling,labeling",166,166,166,166,166,166,166,166,6.489445
7,not_propaganda,1268,1268,1268,1268,1268,1268,1268,1268,49.569977
8,repetition,160,160,160,160,160,160,160,160,6.254887


In [None]:
#Propaanada types class proportions
percentage_train = propaganda_train_df.groupby("label").count().reset_index()
percentage_train["percentage"] = (percentage_train["tagged_in_context"] / percentage_train['tagged_in_context'].sum()) * 100
percentage_train[["label","percentage"]]

Unnamed: 0,label,percentage
0,appeal_to_fear_prejudice,12.170543
1,causal_oversimplification,12.790698
2,doubt,12.170543
3,"exaggeration,minimisation",13.100775
4,flag_waving,12.015504
5,loaded_language,12.48062
6,"name_calling,labeling",12.868217
7,repetition,12.403101


Testing

In [None]:
percentage_test = testdf.groupby("label").count().reset_index()
percentage_test["percentage"] = (percentage_test["tagged_in_context"] / percentage_test['tagged_in_context'].sum()) * 100
percentage_test

Unnamed: 0,label,tagged_in_context,binary_label,sentence,span,span_length,len,sentence_length,type,percentage
0,appeal_to_fear_prejudice,43,43,43,43,43,43,43,43,6.71875
1,causal_oversimplification,35,35,35,35,35,35,35,35,5.46875
2,doubt,43,43,43,43,43,43,43,43,6.71875
3,"exaggeration,minimisation",30,30,30,30,30,30,30,30,4.6875
4,flag_waving,45,45,45,45,45,45,45,45,7.03125
5,loaded_language,39,39,39,39,39,39,39,39,6.09375
6,"name_calling,labeling",34,34,34,34,34,34,34,34,5.3125
7,not_propaganda,331,331,331,331,331,331,331,331,51.71875
8,repetition,40,40,40,40,40,40,40,40,6.25


In [None]:
percentage_test = testdf.groupby("binary_label").count().reset_index()
percentage_test["percentage"] = (percentage_test["tagged_in_context"] / percentage_test['tagged_in_context'].sum()) * 100
percentage_test

Unnamed: 0,binary_label,label,tagged_in_context,sentence,span,span_length,len,sentence_length,type,percentage
0,0,331,331,331,331,331,331,331,331,51.71875
1,1,309,309,309,309,309,309,309,309,48.28125


In [None]:
#Propanada types class proportions
percentage_test = propaganda_test_df.groupby("label").count().reset_index()
percentage_test["percentage"] = (percentage_test["tagged_in_context"] / percentage_test['tagged_in_context'].sum()) * 100
percentage_test[["label","percentage"]]

Unnamed: 0,label,percentage
0,appeal_to_fear_prejudice,13.915858
1,causal_oversimplification,11.326861
2,doubt,13.915858
3,"exaggeration,minimisation",9.708738
4,flag_waving,14.563107
5,loaded_language,12.621359
6,"name_calling,labeling",11.003236
7,repetition,12.944984


Graphs representing proportions

In [None]:
data_per = datadf
data_per= data_per.groupby(["type", "label", "binary_label"]).size().reset_index(name="count")
data_per['binary_label'] = data_per['binary_label'].map({1: "Propaganda", 0:"Not Propaganda"})

test = data_per[data_per["type"]=="test"]
train = data_per[data_per["type"]=="train"]


test["percent"] = (test['count'] / test['count'].sum()) * 100
train["percent"] = (train['count'] / train['count'].sum()) * 100

data_per = pd.concat([test,train])
data_per


data =  alt.Chart(data_per, title = "     Class proportions", width={"step": 40}).mark_bar().encode(
    x=alt.X("binary_label:N", title = "Binary label"),
    y = alt.Y("percent", title = "Percentage(%)"),
    column = alt.Column("type", title="Dataset type"),
    color = alt.Color("label", title = "Label")
)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["percent"] = (test['count'] / test['count'].sum()) * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["percent"] = (train['count'] / train['count'].sum()) * 100


### Train split

In [None]:
split = pd.DataFrame({"Type":["Test","Train","Train"],
              "Subtype":["Test","Train","Validation"],
              "Percentage":[0.2,0.6,0.2]})

In [None]:
source = pd.DataFrame({"category": [1, 2, 3, 4, 5, 6], "value": [4, 6, 10, 3, 7, 8]})

alt.Chart(split).mark_arc().encode(
    theta="Percentage",
    color="Subtype"
)




### Length of sentences

#### Barchart

Average words in a sentence.

In [None]:
#class
train_lendf = traindf.groupby("label").mean().reset_index()
test_lendf = testdf.groupby("label").mean().reset_index()

train = alt.Chart(train_lendf).mark_bar().encode(
    x = alt.X("label",title = "Label"),
    y = alt.Y("sentence_length", title = "Average sentence length")
)

test =  alt.Chart(test_lendf).mark_bar().encode(
    x = alt.X("label",title = "Label"),
    y = alt.Y("sentence_length", title = "Average sentence length")
)
train |test

In [None]:
#binary class
train_len_binary_df = traindf.groupby("binary_label").mean().reset_index()
test_len_binary_df = testdf.groupby("binary_label").mean().reset_index()

train = alt.Chart(train_len_binary_df, title ="Training data ").mark_bar().encode(
    x = alt.X("binary_label:N",title = "Label"),
    y = alt.Y("sentence_length", title = "Average character length")

)

test =  alt.Chart(test_len_binary_df, title ="Testing data ").mark_bar().encode(
    x = alt.X("binary_label:N",title = "Label"),
    y = alt.Y("sentence_length", title = "Average character length")
)
train | test

Average words in each span, this is every word in between the \<BOS> and \<EOS> tag

In [None]:
#class
train = alt.Chart(train_lendf, title ="Training data" ).mark_bar().encode(
    x = alt.X("label",title = "Label"),
    y = alt.Y("span_length", title = "Average span length")
)

test =  alt.Chart(test_lendf, title ="Testing data ").mark_bar().encode(
    x = alt.X("label",title = "Label"),
    y = alt.Y("span_length", title = "Average span length")
)
train |test

In [None]:
#binary class
train = alt.Chart(train_len_binary_df, title ="Training data ").mark_bar().encode(
    x = alt.X("binary_label:N",title = "Label"),
    y = alt.Y("sentence_length", title = "Average character length")

)

test =  alt.Chart(test_len_binary_df, title ="Testing data ").mark_bar().encode(
    x = alt.X("binary_label:N",title = "Label"),
    y = alt.Y("sentence_length", title = "Average words in a span")
)
train | test

#### Boxplot

In [None]:
#Length of sentence per class
alt.Chart(datadf).mark_boxplot(ticks = True, size = 10).encode(
    x= alt.X('type:O', axis = None,sort = ["Train, Test"]),
    y=alt.Y('sentence_length:Q', title = "Numnber of words",),
    color = "type:N",
    column = alt.Column("binary_label:N",align = "none",title = "Sentence length per class", spacing = 0.2)
).configure_view(
    stroke="transparent"
).configure_scale(
    bandPaddingInner=0,
    bandPaddingOuter=0.1,
).configure_header(
    labelOrient = "bottom",
    labelAlign = "right",
    labelAngle = -75,
)

In [None]:
#Length of span
alt.Chart(propaganda_df).mark_boxplot(ticks = True, size = 10).encode(
    x= alt.X('type:O', axis = None),
    y=alt.Y('span_length:Q', title = "Numnber of words"),
    color = "type:N",
    column = alt.Column("label:N",align = "none",title = "Span length class", spacing = 0.2)
).configure_view(
    stroke="transparent"
).configure_scale(
    bandPaddingInner=0,
    bandPaddingOuter=0.1,
).configure_header(
    labelOrient = "bottom",
    labelAlign = "right",
    labelAngle = -75,
)

# Task 1 - binary

## Baseline

In [None]:
from sklearn.dummy import DummyClassifier

corpus_train = traindf["sentence"]
labels_train = traindf["binary_label"].values

corpus_test = testdf["sentence"]
labels_test = testdf["binary_label"].values

# Create a dummy classifier that predicts classes randomly
random_clf = DummyClassifier(strategy='uniform', random_state=42)

# Train the classifier on your training data
random_clf.fit(corpus_train, labels_train)

# Test the classifier on your test data
labels_pred = random_clf.predict(corpus_test)
print(sklearn.metrics.classification_report(labels_test, labels_pred))


              precision    recall  f1-score   support

           0       0.52      0.48      0.50       331
           1       0.49      0.52      0.50       309

    accuracy                           0.50       640
   macro avg       0.50      0.50      0.50       640
weighted avg       0.50      0.50      0.50       640



## Random Forest

#### Data

In [None]:
corpus_train = [s.translate(str.maketrans('', '', string.punctuation)) for s in traindf["sentence"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))]
corpus_train = [s.translate(str.maketrans("”"," ","")) for s in corpus_train]
corpus_train= [s.translate(str.maketrans("“"," ","")) for s in corpus_train]
labels_train = traindf["binary_label"].values

corpus_test = [s.translate(str.maketrans('', '', string.punctuation)) for s in testdf["sentence"].apply(lambda x: x.lower()).apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))]
corpus_test = [s.translate(str.maketrans("”"," ","")) for s in corpus_test]
corpus_test = [s.translate(str.maketrans("“"," ","")) for s in corpus_test]
labels_test = testdf["binary_label"].values

#### Sweep

In [None]:
def run_sweep(name, max_depth, n_estimators, n_gram_range,max_features):
  vectorizer = CountVectorizer(analyzer = "word", ngram_range=n_gram_range)
  x_train = vectorizer.fit_transform(corpus_train)
  x_test = vectorizer.transform(corpus_test)
  feature_names = ["sentence"]
  labels = ["propaganda","not_propaganda"]
  test_size = 0.19372077488309952

  # train model
  model = RandomForestClassifier(n_estimators = n_estimators,
                                 max_features=max_features,
                                 max_depth =max_depth)
  model.fit(x_train, labels_train)
  model_params = model.get_params()
  model_params["n_gram_range"]=n_gram_range[1]
  # get predictions
  labels_pred = model.predict(x_test)
  labels_probas = model.predict_proba(x_test)
  importances = model.feature_importances_
  indices = np.argsort(importances)[::-1]

  # start a new wandb run and add your model hyperparameters
  wandb.init(project='NLP_RF1_', config=model_params, name = name)

  # Add additional configs to wandb
  wandb.config.update({"test_size" : test_size,
                      "train_len" : len(corpus_train),
                      "test_len" : len(corpus_test)})

  # log additional visualisations to wandb
  plot_class_proportions(labels_train, labels_test, labels)
  plot_learning_curve(model, x_train, labels_train)
  plot_roc(labels_test, labels_probas, labels)
  plot_precision_recall(labels_test, labels_probas, labels)
  plot_feature_importances(model)
  wandb.sklearn.plot_classifier(model, x_train, x_test,labels_train, labels_test, labels_pred, labels_probas, labels,
                                                         model_name=name, feature_names=None)

  #log metrics to wandb
  accuracy = sklearn.metrics.accuracy_score(labels_test, labels_pred)
  precision = sklearn.metrics.precision_score(labels_test, labels_pred)
  recall = sklearn.metrics.recall_score(labels_test, labels_pred)
  f1 = sklearn.metrics.f1_score(labels_test, labels_pred)

  wandb.summary["accuracy"] = accuracy
  wandb.summary["f1"] = f1
  wandb.summary["precision"] = precision
  wandb.summary["recall"] = recall

  # [optional] finish the wandb run, necessary in notebooks
  wandb.finish()

In [None]:
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
}

param_combinations = sorted(param_grid)
param_combinations = [item for item in it.product(*(param_grid[param] for param in param_grid))]


Run multiple parameter combinations

In [None]:
#Unigram
param_combinations
n=0
for param in param_combinations:
  print(param)
  run_sweep(
            name = "unigram"+str(x),
            n_estimators = param[0],
            n_gram_range = (1,1),
            max_features = param[1],
            max_depth = param[2])
  n+=1

In [None]:
#Bigram
param_combinations
n=0
for param in param_combinations:
  print(param)
  run_sweep(
            name = "unigram"+str(x),
            n_estimators = param[0],
            n_gram_range = (1,1),
            max_features = param[1],
            max_depth = param[2])
  n+=1

In [None]:
#Trigram
param_combinations
n=0
for param in param_combinations:
  print(param)
  run_sweep(
            name = "unigram"+str(x),
            n_estimators = param[0],
            n_gram_range = (1,1),
            max_features = param[1],
            max_depth = param[2])
  n+=1

#### Final model

In [None]:
vectorizer = CountVectorizer(analyzer = "word", ngram_range=(1,2))
x_train = vectorizer.fit_transform(corpus_train)
x_test = vectorizer.transform(corpus_test)
feature_names = ["sentence"]
labels = ["propaganda","not_propaganda"]
test_size = 0.19372077488309952

# train model
model = RandomForestClassifier(random_state = 21,
                               n_estimators = 6,
                               max_features="sqrt",
                               max_depth =200)
model.fit(x_train, labels_train)
model_params = model.get_params()
model_params["n_gram_range"]=2

# get predictions
labels_pred = model.predict(x_test)
labels_probas = model.predict_proba(x_test)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
# start a new wandb run and add your model hyperparameters
wandb.init(project='NLP_RF1_', config=model_params, name = "Final")
# Add additional configs to wandb
wandb.config.update({"test_size" : test_size,
                    "train_len" : len(corpus_train),
                    "test_len" : len(corpus_test)})

# log additional visualisations to wandb
plot_class_proportions(labels_train, labels_test, labels)
plot_learning_curve(model, x_train, labels_train)
plot_roc(labels_test, labels_probas, labels)
plot_precision_recall(labels_test, labels_probas, labels)
plot_feature_importances(model)
wandb.sklearn.plot_classifier(model, x_train, x_test,labels_train, labels_test, labels_pred, labels_probas, labels,
                                                        model_name="Final", feature_names=None)
print(sklearn.metrics.classification_report(labels_test, labels_pred))

#log metrics to wandb
accuracy = sklearn.metrics.accuracy_score(labels_test, labels_pred)
precision = sklearn.metrics.precision_score(labels_test, labels_pred)
recall = sklearn.metrics.recall_score(labels_test, labels_pred)
f1 = sklearn.metrics.f1_score(labels_test, labels_pred)

wandb.summary["accuracy"] = accuracy
wandb.summary["f1"] = f1
wandb.summary["precision"] = precision
wandb.summary["recall"] = recall

# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mpauladelcastillovivero[0m ([33mpdc[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting Final.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


              precision    recall  f1-score   support

           0       0.62      0.80      0.70       331
           1       0.69      0.48      0.57       309

    accuracy                           0.65       640
   macro avg       0.66      0.64      0.63       640
weighted avg       0.66      0.65      0.64       640



0,1
accuracy,0.64531
f1,0.56762
precision,0.68981
recall,0.4822


In [None]:
# get predictions
labels_pred_train = model.predict(x_train)
print(sklearn.metrics.classification_report(labels_train, labels_pred_train))


              precision    recall  f1-score   support

           0       0.84      0.97      0.90      1268
           1       0.97      0.81      0.88      1290

    accuracy                           0.89      2558
   macro avg       0.90      0.89      0.89      2558
weighted avg       0.90      0.89      0.89      2558



#### Results

In [None]:
labels_pred = model.predict(x_test)
#Predictions to
results_df = pd.read_csv("/content/drive/MyDrive/ANLP/results.csv")
results_df["prob_prop_RF1"] = labels_probas[:,1]
results_df["predictions_RF1"]=labels_pred
results_df["accuracy_RF1"] = np.where(results_df["binary_label"] != results_df["predictions_RF1"], 0,1)

wrong_predictions = testdf[testdf["accuracy_RF1"] ==0]
wrong_predictions["sentence_"] = [s.translate(str.maketrans('', '', string.punctuation)) for s in wrong_predictions["sentence"].apply(lambda x: x.lower()).apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))]
wrong_predictions["sentence_"] =  [s.translate(str.maketrans("”"," ","")) for s in wrong_predictions["sentence_"]]
wrong_predictions["sentence_"] = [s.translate(str.maketrans("“"," ","")) for s in wrong_predictions["sentence_"] ]

#predicted as not propaganda when propaganda
wrong_predictions_as_not_propaganda = wrong_predictions[wrong_predictions["binary_label"] ==1]

#predicted as propaganda when not propaganda
wrong_predictions_as_propaganda = wrong_predictions[wrong_predictions["binary_label"] ==0 ]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wrong_predictions["sentence_"] = [s.translate(str.maketrans('', '', string.punctuation)) for s in wrong_predictions["sentence"].apply(lambda x: x.lower()).apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wrong_predictions["sentence_"] =  [s.translate(str.maketrans("”"," ","")) for s in wrong_predictions["sentence_"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value i

In [None]:
results_df.to_csv("/content/drive/MyDrive/ANLP/results.csv", index = 0)

In [None]:
results_df = pd.read_csv("/content/drive/MyDrive/ANLP/results.csv")

##### Graphs classes

In [None]:
prop_answers = results_df.groupby(["predictions_RF1","accuracy_RF1"])["tagged_in_context"].count()
prop_answers = prop_answers.reset_index(name = "Count")
prop_answers["accuracy_RF1"] = prop_answers["accuracy_RF1"].map({1:"Correct",0:"Incorrect"})
prop_answers["predictions_RF1"] = prop_answers["predictions_RF1"].map({1:"Propaganda",0:"Not Propaganda"})

alt.Chart(prop_answers,
    title="Predictions per Class"
    ).mark_bar().encode(
    alt.Y("predictions_RF1:O",
          title = "Prediction"
          ),
    alt.X("Count",
          title="Percentage of answers(%)",
          stack = "normalize",
          axis=alt.Axis(format='%')
          ),
    color =alt.Color("accuracy_RF1:O", title = ""))

In [None]:
right_answers =results_df.groupby(["binary_label","accuracy_RF1"])["tagged_in_context"].count()
right_answers = right_answers.reset_index(name ="count")
right_answers["accuracy_RF1"] = right_answers["accuracy_RF1"].map({1:"Correct",0:"Incorrect"})
right_answers["binary_label"] = right_answers["binary_label"].map({1:"Propaganda",0:"Not Propaganda"})

alt.Chart(right_answers,
    title= "Label"
    ).mark_bar().encode(
    y = alt.Y("binary_label:O",
          title = "True label"
          ),
    x = alt.X("count",
          title="Percentage of answers(%)",
          stack = "normalize",
          axis=alt.Axis(format='%')
          ),
    color = alt.Color("accuracy_RF1:O", title = "")
 )


In [None]:
alt.Chart(
    wrong_predictions_as_not_propaganda,
    title= "Propaganda sentences labelled as Not propaganda"
    ).transform_joinaggregate(
    totalCount="count(*)"
    ).transform_calculate(
    PercentOfTotal="1 / datum.totalCount"
    ).mark_bar().encode(
    alt.Y("label",
          title = "True Label"),
    alt.X( "sum(PercentOfTotal):Q",
          title="Percentage of wrong answers(%)",
          axis=alt.Axis(format="%"),
          ),
    color =alt.Color("label", legend = None))

##### Frequency of words

In [None]:
#get the words that appeared most in wrongly predicted sentences
top_words_wrong_predictions = Counter(" ".join(wrong_predictions["sentence_"]).split()).most_common(7)
top_words_wrong_predictions


Most frequent words in sentences predicted as not propaganda when propaganda

In [None]:
top_words_wrong_predictions_as_not_propaganda = Counter(" ".join(wrong_predictions_as_not_propaganda["sentence_"]).split()).most_common(7)
top_words_wrong_predictions_as_not_propaganda

Most frequent words in sentences predicted as propaganda when not propaganda

In [None]:
top_words_wrong_predictions_as_propaganda = Counter(" ".join(wrong_predictions_as_propaganda["sentence_"]).split()).most_common(7)
top_words_wrong_predictions_as_propaganda

##### Length of sentences

In [None]:
lendf = results_df
lendf["accuracy_RF1"] =lendf["accuracy_RF1"].map({1:"Correct",0:"Incorrect"})
lendf["binary_label"] =lendf["binary_label"].map({1:"Propaganda",0:"Not_propaganda"})

#Histogram
alt.Chart(
    lendf,
    title = "Percentage of predictions per sentence length"
    ).transform_joinaggregate(
    total='count(*)'
).transform_calculate(
    pct='1 / datum.total'
).mark_bar().encode(
    y=alt.Y("sentence_length:O",
            title = "Number of words in sentence",
            bin = alt.Bin(step = 10),
            ),
    x = alt.X("sum(pct):Q",axis=alt.Axis(format='%'),
              title = "Number of predictions",
              stack  = "normalize"
              ),
    color = "accuracy_RF1:O"
)

In [None]:
alt.Chart(lendf).mark_boxplot(ticks = True, size = 10).encode(
    y= alt.Y('accuracy_RF1:O', axis = None),
    x=alt.X('sentence_length:Q', title = "Numnber of words",),
    color = alt.Color("accuracy_RF1:N"),
    row = alt.Row("binary_label:N",align = "none",title = "", spacing = 0.2)
).configure_view(
    stroke="transparent"
).configure_scale(
    bandPaddingInner=0,
    bandPaddingOuter=0.1,
).configure_header(
    labelOrient = "left",
    labelAlign = "left",
    labelAngle = 0,
)

##### Boundary predictions

In [None]:
results_df[results_df["prob_prop_RF1"].between(0.46,0.54)]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,label,tagged_in_context,binary_label,sentence,span,span_length,len,sentence_length,...,prob_prop_RF2,predictions_RF2,accuracy_RF2,predictions_roB2,accuracy_roB2,predictions_num_roB2,predictions_num_roB1,predictions_roB1,accuracy_roB1,accuracy_RF1
22,22,22,"name_calling,labeling","In Alexandra, Virginia – <BOS> the suburban ho...",Propaganda,"In Alexandra, Virginia – the suburban home of ...",the suburban home of America’s war-making elite,7,215,38,...,0.0,flag_waving,,"name_calling,labeling",1,6,1,1,1,Correct
52,52,52,doubt,<BOS> Instead of reaching out to the greatest ...,Propaganda,Instead of reaching out to the greatest demogr...,Instead of reaching out to the greatest demogr...,71,420,72,...,0.0,flag_waving,,causal_oversimplification,0,1,1,1,1,Incorrect
58,58,58,not_propaganda,And he may do so even though the implications ...,Not_propaganda,And he may do so even though the implications ...,so would render him an,5,180,31,...,0.0,not_propaganda,,not_propaganda,1,7,0,0,1,Correct
85,85,85,loaded_language,An outbreak of the plague in Madagascar is spr...,Propaganda,An outbreak of the plague in Madagascar is spr...,unprecedented,1,80,13,...,0.142857,not_propaganda,,not_propaganda,0,7,1,0,0,Incorrect
88,88,88,appeal_to_fear_prejudice,"And guess what, <BOS> next time a man is nomin...",Propaganda,"And guess what, next time a man is nominated t...",next time a man is nominated to a position who...,26,155,30,...,0.0,not_propaganda,,causal_oversimplification,0,1,1,1,1,Correct
114,114,114,loaded_language,As for my metaphorical reference to a “Leviath...,Propaganda,As for my metaphorical reference to a “Leviath...,Dr. Fastiggi has succumbed to an unfortunate l...,8,136,21,...,0.0,"name_calling,labeling",,doubt,0,2,1,1,1,Incorrect
123,123,123,not_propaganda,You don’t have to look far for fresh news abou...,Not_propaganda,You don’t have to look far for fresh news abou...,"these days,",2,145,26,...,0.0,not_propaganda,,doubt,0,2,0,0,1,Correct
125,125,125,not_propaganda,"After Ford’s identity became public, Grassley’...",Not_propaganda,"After Ford’s identity became public, Grassley’...",in,1,171,25,...,0.142857,not_propaganda,,not_propaganda,1,7,0,0,1,Incorrect
160,160,160,doubt,Now that ISIS has been driven out of Raqqa and...,Propaganda,Now that ISIS has been driven out of Raqqa and...,by what authority do U.S. forces remain to arm...,23,193,34,...,0.428571,causal_oversimplification,,doubt,1,2,1,1,1,Incorrect
229,229,229,not_propaganda,Hence it is a time when they’re supposed to gr...,Not_propaganda,Hence it is a time when they’re supposed to gr...,and kind toward their fellow Muslims.,6,102,18,...,0.0,not_propaganda,,appeal_to_fear_prejudice,0,0,0,1,0,Correct


## RoBERTa


In [None]:
#Prepare datasets
train_dataset = traindf.drop(["label","span","tagged_in_context","span_length",	"sentence_length","type","len"], axis = 1)
train_dataset.rename(columns={"binary_label": "label","sentence":"text"}, inplace=True)

#Split training into validation and training
train_dataset, validation_dataset = train_test_split(train_dataset, test_size=0.2)
train_dataset = Dataset.from_pandas(train_dataset)
validation_dataset = Dataset.from_pandas(validation_dataset)


test_dataset = testdf.drop(["label","span","tagged_in_context","span_length",	"sentence_length","type","len"], axis = 1)
test_dataset.rename(columns={"binary_label": "label","sentence":"text"}, inplace=True)
test_dataset = Dataset.from_pandas(test_dataset)
train_dataset

Dataset({
    features: ['label', 'text', '__index_level_0__'],
    num_rows: 2046
})

### Sweep to choose hyperparameters using wandb

In [None]:
sweep_config = {
    'method': 'random'
}


# hyperparameters
parameters_dict = {
    'epochs': {
        'value': 15
        },
    'learning_rate': {
        'distribution': 'uniform',
        'max': 0.0001,
        'min': 0.000016952978519689297
        },
    'weight_decay': {
        'values': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
        },

}


sweep_config['parameters'] = parameters_dict
sweep_id = wandb.sweep(sweep_config, project='nlp_propaganda')

def model_init():
  model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
  return model

def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    precision = evaluate.load("precision")
    recall = evaluate.load("recall")
    f1 = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy.compute(predictions = predictions, references = labels)
    precision = precision.compute(predictions = predictions, references = labels)
    recall = recall.compute(predictions = predictions, references = labels)
    f1 = f1.compute(predictions = predictions, references = labels)
    return {"accuracy": accuracy["accuracy"],
            "precision": precision["precision"],
            "recall": recall["recall"],
            "f1": f1["f1"]}


Create sweep with ID: ry4cruss
Sweep URL: https://wandb.ai/pdc/nlp_propaganda/sweeps/ry4cruss


In [None]:
def train(config=None):
  with wandb.init(config=config):
    # set sweep configuration
    config = wandb.config


    # set training arguments
    training_args = TrainingArguments(
        output_dir='/content/drive/MyDrive/ANLP/models/task1',
	      report_to='wandb',  # Turn on Weights & Biases logging
        num_train_epochs=config.epochs,
        learning_rate=config.learning_rate,
        weight_decay=config.weight_decay,
        save_strategy='epoch',
        evaluation_strategy='epoch',
        logging_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model = "f1",
    )

    #define training loop
    trainer = Trainer(
        model= model_init(),
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_validation_dataset,
        compute_metrics=compute_metrics,
    )

    # start training loop
    trainer.train()


In [None]:
wandb.agent(sweep_id, train, count=15)

[34m[1mwandb[0m: Agent Starting Run: u64lqyy9 with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 7.410297246846257e-05
[34m[1mwandb[0m: 	weight_decay: 0.1


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5041,0.521734,0.808594,0.756522,0.949091,0.841935
2,0.3976,0.529144,0.869141,0.972727,0.778182,0.864646
3,0.3051,0.377419,0.880859,0.924603,0.847273,0.88425
4,0.2528,0.393802,0.900391,0.911765,0.901818,0.906764
5,0.2319,0.374432,0.910156,0.904594,0.930909,0.917563
6,0.1773,0.414781,0.908203,0.910072,0.92,0.915009
7,0.13,0.36603,0.927734,0.931159,0.934545,0.932849
8,0.082,0.440871,0.923828,0.964567,0.890909,0.926276
9,0.0468,0.603642,0.908203,0.959677,0.865455,0.910134
10,0.0467,0.535734,0.910156,0.942085,0.887273,0.913858


VBox(children=(Label(value='0.001 MB of 0.019 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.057796…

0,1
eval/accuracy,▁▅▅▆▇▇██▇▇▆█▆▇▇
eval/f1,▁▃▄▆▇▇█▇▆▇▆█▅▇▇
eval/loss,▄▄▁▂▁▂▁▃▆▄▇▄█▅▆
eval/precision,▁█▆▆▆▆▇██▇▇▇▇▇▇
eval/recall,█▁▄▆▇▇▇▆▅▅▄▇▄▇▆
eval/runtime,▂▁▂█▁▁▁▁▁▁▂▁▁▂▂
eval/samples_per_second,▇█▇▁██████▇██▇▇
eval/steps_per_second,▇█▇▁██████▇██▇▇
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.91016
eval/f1,0.91513
eval/loss,0.62356
eval/precision,0.92884
eval/recall,0.90182
eval/runtime,19.2847
eval/samples_per_second,26.55
eval/steps_per_second,3.319
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Agent Starting Run: ezkehlns with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 4.597764598267162e-05
[34m[1mwandb[0m: 	weight_decay: 0.4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4373,0.302138,0.871094,0.847176,0.927273,0.885417
2,0.2684,0.37714,0.914062,0.963855,0.872727,0.916031
3,0.1751,0.359536,0.910156,0.880399,0.963636,0.920139
4,0.1173,0.286631,0.935547,0.941606,0.938182,0.939891
5,0.0503,0.436328,0.921875,0.933579,0.92,0.92674
6,0.0347,0.461001,0.939453,0.965649,0.92,0.942272
7,0.0298,0.46304,0.9375,0.929329,0.956364,0.942652
8,0.017,0.497587,0.9375,0.945055,0.938182,0.941606
9,0.0169,0.509685,0.935547,0.935252,0.945455,0.940325
10,0.0018,0.540337,0.939453,0.917808,0.974545,0.945326


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▅▅▇▆▇▇▇▇▇▇▇███
eval/f1,▁▄▅▇▆▇▇▇▇█▇▇███
eval/loss,▁▃▃▁▅▅▅▆▆▇██▇██
eval/precision,▁█▃▇▆█▆▇▆▅▇▇▇▇▇
eval/recall,▅▁▇▅▄▄▇▅▆█▅▅▇▆▆
eval/runtime,▃▅▃▄█▁▂▆▁▃▆▅▂▆▂
eval/samples_per_second,▆▄▆▅▁█▇▃█▆▃▄▇▃▇
eval/steps_per_second,▆▄▆▅▁█▇▃█▆▃▅▇▃▇
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.94141
eval/f1,0.94545
eval/loss,0.56237
eval/precision,0.94545
eval/recall,0.94545
eval/runtime,19.2056
eval/samples_per_second,26.659
eval/steps_per_second,3.332
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Agent Starting Run: vov71ghj with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 6.354964289508763e-05
[34m[1mwandb[0m: 	weight_decay: 0.5


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4437,0.430518,0.796875,0.977654,0.636364,0.770925
2,0.3352,0.784125,0.830078,1.0,0.683636,0.812095
3,0.1944,0.277883,0.933594,0.9319,0.945455,0.938628
4,0.1264,0.329745,0.939453,0.951852,0.934545,0.943119
5,0.0755,0.374372,0.925781,0.94717,0.912727,0.92963
6,0.0387,0.36508,0.943359,0.939286,0.956364,0.947748
7,0.0128,0.431633,0.945312,0.969582,0.927273,0.947955
8,0.0246,0.467754,0.933594,0.947955,0.927273,0.9375
9,0.0092,0.407731,0.949219,0.94306,0.963636,0.953237
10,0.0131,0.33784,0.951172,0.946429,0.963636,0.954955


0,1
eval/accuracy,▁▂▇▇▇▇█▇███████
eval/f1,▁▃▇▇▇██▇███████
eval/loss,▃█▁▂▂▂▃▄▃▂▂▃▃▃▃
eval/precision,▆█▁▃▃▂▅▃▂▂▅▅▅▄▄
eval/recall,▁▂█▇▇█▇▇██▇▇▇██
eval/runtime,▁▂▂▁▁▂▁▁▂▂▂▂▂█▂
eval/samples_per_second,█▇▇██▇██▇▇▇▇▇▁▇
eval/steps_per_second,█▇▇██▇██▇▇▇▇▇▁▇
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.95117
eval/f1,0.9543
eval/loss,0.409
eval/precision,0.95956
eval/recall,0.94909
eval/runtime,19.3163
eval/samples_per_second,26.506
eval/steps_per_second,3.313
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Agent Starting Run: vzh9sp87 with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 4.560497439369096e-05
[34m[1mwandb[0m: 	weight_decay: 0.1


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4294,0.234541,0.912109,0.919708,0.916364,0.918033
2,0.251,0.526767,0.902344,0.970711,0.843636,0.902724
3,0.1519,0.268532,0.9375,0.929329,0.956364,0.942652
4,0.0759,0.483586,0.923828,0.972,0.883636,0.925714
5,0.0371,0.412687,0.933594,0.935018,0.941818,0.938406
6,0.0325,0.532091,0.925781,0.94717,0.912727,0.92963
7,0.0148,0.67224,0.919922,0.971774,0.876364,0.921606
8,0.0029,0.553797,0.935547,0.951493,0.927273,0.939227
9,0.007,0.474705,0.945312,0.97318,0.923636,0.947761
10,0.0,0.561476,0.935547,0.976378,0.901818,0.937618


VBox(children=(Label(value='0.001 MB of 0.010 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.111948…

0,1
eval/accuracy,▃▁▇▅▆▅▄▆█▆▆▆▆▇▆
eval/f1,▃▁▇▅▇▅▄▇█▆▆▆▆▇▇
eval/loss,▁▆▂▅▄▆█▆▅▆▇▇▇▇▇
eval/precision,▁▇▂▇▃▄▇▅▇██▇▆▇▆
eval/recall,▆▁█▃▇▅▃▆▆▅▄▅▅▅▅
eval/runtime,▁█▅▅▄▅▅▂▆▆▅▂▂▁▄
eval/samples_per_second,█▁▄▄▅▄▄▇▃▃▄▇▇█▅
eval/steps_per_second,█▁▄▄▅▄▄▇▃▃▄▇▇█▅
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.93555
eval/f1,0.93832
eval/loss,0.60409
eval/precision,0.96538
eval/recall,0.91273
eval/runtime,19.2681
eval/samples_per_second,26.572
eval/steps_per_second,3.322
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Agent Starting Run: 1qsb4kvb with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 6.904675662739168e-05
[34m[1mwandb[0m: 	weight_decay: 0.3


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4719,0.282632,0.90625,0.903915,0.923636,0.913669
2,0.292,0.37026,0.916016,0.971545,0.869091,0.917466
3,0.2095,0.290807,0.9375,0.941818,0.941818,0.941818
4,0.1237,0.440585,0.933594,0.965251,0.909091,0.93633
5,0.1054,0.286978,0.933594,0.958175,0.916364,0.936803
6,0.0726,0.281411,0.945312,0.97318,0.923636,0.947761
7,0.0274,0.416606,0.943359,0.939286,0.956364,0.947748
8,0.0378,0.373055,0.9375,0.948339,0.934545,0.941392
9,0.0081,0.417688,0.943359,0.945652,0.949091,0.947368
10,0.0154,0.447613,0.943359,0.969466,0.923636,0.945996


0,1
eval/accuracy,▁▃▇▆▆██▇███▇▅▇▇
eval/f1,▁▂▇▆▆██▇███▇▅▆▇
eval/loss,▁▃▁▅▁▁▄▃▄▅▄▅█▆▆
eval/precision,▁▇▄▆▆▇▄▅▅▇▅▇█▆▆
eval/recall,▅▁▇▄▅▅█▆▇▅▇▅▂▅▅
eval/runtime,▁▁▁▂▁▂▂█▂▂▂▁▂▂▂
eval/samples_per_second,███▇█▇▇▁▇▇▇█▇▇▇
eval/steps_per_second,███▇▇▇▇▁▇▇▇█▇▇▇
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.9375
eval/f1,0.94074
eval/loss,0.50936
eval/precision,0.95849
eval/recall,0.92364
eval/runtime,19.3189
eval/samples_per_second,26.503
eval/steps_per_second,3.313
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Agent Starting Run: fpc09fip with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 4.2870314950120584e-05
[34m[1mwandb[0m: 	weight_decay: 0.2


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4324,0.754861,0.867188,0.812689,0.978182,0.887789
2,0.2558,0.379816,0.923828,0.972,0.883636,0.925714
3,0.2034,0.304571,0.941406,0.920962,0.974545,0.946996
4,0.0967,0.408985,0.939453,0.969231,0.916364,0.942056
5,0.0641,0.257896,0.953125,0.937282,0.978182,0.957295
6,0.0362,0.381668,0.941406,0.939068,0.952727,0.945848
7,0.0184,0.333934,0.955078,0.966667,0.949091,0.957798
8,0.0117,0.434905,0.941406,0.969349,0.92,0.94403
9,0.0069,0.403152,0.953125,0.970037,0.941818,0.95572
10,0.0093,0.381457,0.955078,0.963235,0.952727,0.957952


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▅▇▇█▇█▇███▇▇▇█
eval/f1,▁▅▇▆█▇█▆███▇▇▇█
eval/loss,█▃▂▃▁▃▂▃▃▃▃▄▃▃▃
eval/precision,▁█▆█▆▇████▇▇██▇
eval/recall,█▁█▃█▆▆▄▅▆▇▇▅▅▆
eval/runtime,█▃▂▂▃▃▃▁▂▁▃▂▁▂▂
eval/samples_per_second,▁▆▇▇▆▆▆█▇█▆▇█▇▇
eval/steps_per_second,▁▆▇▇▆▆▆█▇█▆▇█▇▇
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.95508
eval/f1,0.95811
eval/loss,0.39903
eval/precision,0.95985
eval/recall,0.95636
eval/runtime,19.2983
eval/samples_per_second,26.531
eval/steps_per_second,3.316
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Agent Starting Run: 24u0ras8 with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 5.1185868174421106e-05
[34m[1mwandb[0m: 	weight_decay: 0.4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4328,0.267563,0.888672,0.881119,0.916364,0.898396
2,0.235,0.260266,0.927734,0.919014,0.949091,0.93381
3,0.165,0.374263,0.917969,0.987448,0.858182,0.918288
4,0.092,0.398599,0.931641,0.922535,0.952727,0.937388
5,0.0509,0.492062,0.929688,0.954373,0.912727,0.933086
6,0.0338,0.370329,0.939453,0.948529,0.938182,0.943327
7,0.0194,0.419668,0.939453,0.945255,0.941818,0.943534
8,0.0069,0.676782,0.919922,0.97561,0.872727,0.921305
9,0.0134,0.553346,0.933594,0.961686,0.912727,0.936567
10,0.015,0.510279,0.935547,0.929078,0.952727,0.940754


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▆▅▇▆██▅▇▇▇█▇██
eval/f1,▁▆▄▇▆██▄▇▇▇█▇██
eval/loss,▁▁▃▃▅▃▄█▆▅▆▆▆▆▆
eval/precision,▁▃█▄▆▅▅▇▆▄▇▅▅▅▅
eval/recall,▅█▁█▅▇▇▂▅█▅█▇▇▇
eval/runtime,█▂▁▂▂▁▁▂▂▁▁▁▅▄▂
eval/samples_per_second,▁▇█▇▇██▇▇███▄▅▇
eval/steps_per_second,▁▇█▇▇██▇▇███▄▅▇
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.94141
eval/f1,0.94545
eval/loss,0.55436
eval/precision,0.94545
eval/recall,0.94545
eval/runtime,19.2501
eval/samples_per_second,26.597
eval/steps_per_second,3.325
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 51s7bc2a with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 2.354764491680048e-05
[34m[1mwandb[0m: 	weight_decay: 0.5


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.393,0.247543,0.927734,0.934307,0.930909,0.932605
2,0.2256,0.370686,0.902344,0.959184,0.854545,0.903846
3,0.1815,0.246777,0.949219,0.933798,0.974545,0.953737
4,0.0859,0.333831,0.935547,0.948148,0.930909,0.93945
5,0.0316,0.368051,0.945312,0.962547,0.934545,0.948339
6,0.0239,0.501073,0.939453,0.948529,0.938182,0.943327
7,0.0202,0.463025,0.947266,0.942857,0.96,0.951351
8,0.0034,0.52465,0.939453,0.951852,0.934545,0.943119
9,0.001,0.485491,0.945312,0.949091,0.949091,0.949091
10,0.0097,0.577609,0.931641,0.96875,0.901818,0.934087


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▅▁█▆▇▇█▇▇▅▅▅▇▇▇
eval/f1,▅▁█▆▇▇█▇▇▅▅▅▇▇▇
eval/loss,▁▃▁▂▃▅▅▆▅▆██▇▆▆
eval/precision,▁▆▁▄▇▄▃▅▄█▅▅▅▄▄
eval/recall,▅▁█▅▆▆▇▆▇▄▄▄▆▆▆
eval/runtime,▂▁▂▁▂▁▃▃▂▁▂▂▂▃█
eval/samples_per_second,▇█▇█▇█▆▆▇█▇▇▇▆▁
eval/steps_per_second,▇█▇█▇█▆▆▇▇▆▇▇▆▁
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.94336
eval/f1,0.94718
eval/loss,0.55624
eval/precision,0.94891
eval/recall,0.94545
eval/runtime,19.9116
eval/samples_per_second,25.714
eval/steps_per_second,3.214
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Agent Starting Run: 4uy52mb7 with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 7.668667529887219e-05
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5439,0.449374,0.771484,0.987654,0.581818,0.732265
2,0.597,0.690434,0.537109,0.537109,1.0,0.698856
3,0.7024,0.703398,0.462891,0.0,0.0,0.0
4,0.7039,0.730332,0.462891,0.0,0.0,0.0
5,0.7034,0.69043,0.537109,0.537109,1.0,0.698856
6,0.7,0.693851,0.462891,0.0,0.0,0.0
7,0.6981,0.691892,0.537109,0.537109,1.0,0.698856
8,0.6978,0.698871,0.462891,0.0,0.0,0.0
9,0.6977,0.690524,0.537109,0.537109,1.0,0.698856
10,0.6965,0.6906,0.537109,0.537109,1.0,0.698856


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0,1
eval/accuracy,▇▃▁▁▃▁▃▁▃▃▁▃▁▇█
eval/f1,█▇▁▁▇▁▇▁▇▇▁▇▁██
eval/loss,▁▇▇█▇▇▇▇▇▇▇▇▇▆▂
eval/precision,█▅▁▁▅▁▅▁▅▅▁▅▁▆█
eval/recall,▅█▁▁█▁█▁██▁█▁▇▆
eval/runtime,▆▄▃▁▇█▆▄▆▃▄▇▆▅▄
eval/samples_per_second,▃▄▆█▂▁▃▅▃▆▅▂▃▄▄
eval/steps_per_second,▃▅▆█▂▁▃▅▃▆▅▂▂▄▅
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.80078
eval/f1,0.77922
eval/loss,0.47567
eval/precision,0.96257
eval/recall,0.65455
eval/runtime,19.2482
eval/samples_per_second,26.6
eval/steps_per_second,3.325
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Agent Starting Run: agkd3ui3 with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 9.549614678340753e-05
[34m[1mwandb[0m: 	weight_decay: 0.5


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6733,0.465494,0.773438,0.976048,0.592727,0.737557
2,0.5239,0.461887,0.773438,0.993789,0.581818,0.733945
3,0.4468,0.49189,0.773438,0.993789,0.581818,0.733945
4,0.4588,0.500826,0.777344,0.939891,0.625455,0.751092
5,0.4902,0.466347,0.761719,0.987261,0.563636,0.717593
6,0.4679,0.468961,0.751953,0.993333,0.541818,0.701176
7,0.4675,0.49039,0.763672,0.975309,0.574545,0.723112
8,0.4484,0.496679,0.765625,0.97546,0.578182,0.726027
9,0.4512,0.463564,0.767578,0.993671,0.570909,0.725173
10,0.4495,0.453092,0.767578,0.993671,0.570909,0.725173


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0,1
eval/accuracy,█████▇████▁▁▁▁▁
eval/f1,██████████▁▁▁▁▁
eval/loss,▁▁▂▂▁▁▂▂▁▁███▇▇
eval/precision,██████████▁▁▁▁▁
eval/recall,████▇▇▇▇▇▇▁▁▁▁▁
eval/runtime,▁▃▃▂▄▄▄▃▄█▆▂▁▆▄
eval/samples_per_second,█▆▆▇▅▅▅▆▅▁▃▇█▃▅
eval/steps_per_second,█▆▆▇▅▅▅▆▅▁▃▇█▃▅
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.46289
eval/f1,0.0
eval/loss,0.70133
eval/precision,0.0
eval/recall,0.0
eval/runtime,19.2938
eval/samples_per_second,26.537
eval/steps_per_second,3.317
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Agent Starting Run: oqyqgsh0 with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 3.2023500598577546e-05
[34m[1mwandb[0m: 	weight_decay: 0.4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4014,0.216677,0.919922,0.917857,0.934545,0.926126
2,0.2273,0.473795,0.916016,0.987395,0.854545,0.916179
3,0.1322,0.286297,0.931641,0.910959,0.967273,0.938272
4,0.0719,0.388801,0.947266,0.98062,0.92,0.949343
5,0.0279,0.348431,0.951172,0.940141,0.970909,0.955277
6,0.0135,0.419529,0.945312,0.930314,0.970909,0.950178
7,0.0056,0.540545,0.9375,0.976471,0.905455,0.939623
8,0.0042,0.491684,0.947266,0.976923,0.923636,0.949533
9,0.0134,0.35975,0.958984,0.966912,0.956364,0.961609
10,0.0013,0.499145,0.949219,0.930796,0.978182,0.953901


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▂▁▄▆▇▆▅▆█▆▆▆▆▆▇
eval/f1,▃▁▄▆▇▆▅▆█▇▆▆▆▆▇
eval/loss,▁▇▃▅▄▅█▇▄▇▇▆▆▆▆
eval/precision,▂█▁▇▄▃▇▇▆▃▅▆▆▅▅
eval/recall,▆▁▇▅██▄▅▇█▆▆▆▆▇
eval/runtime,█▃▁▁▂▃▄▃▅▄▂▄▄▃▆
eval/samples_per_second,▁▆██▇▆▅▆▄▅▇▅▅▆▃
eval/steps_per_second,▁▆██▇▆▅▆▃▅▇▅▅▆▃
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.95312
eval/f1,0.95652
eval/loss,0.45226
eval/precision,0.95307
eval/recall,0.96
eval/runtime,19.4154
eval/samples_per_second,26.371
eval/steps_per_second,3.296
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Agent Starting Run: 10wrrp2p with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 4.558718274149668e-05
[34m[1mwandb[0m: 	weight_decay: 0.3


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4126,0.286538,0.902344,0.924528,0.890909,0.907407
2,0.2605,0.451279,0.900391,0.96281,0.847273,0.901354
3,0.133,0.310941,0.925781,0.94052,0.92,0.930147
4,0.1047,0.44061,0.931641,0.954545,0.916364,0.935065
5,0.0469,0.487173,0.919922,0.964286,0.883636,0.922201
6,0.0263,0.481227,0.925781,0.912892,0.952727,0.932384
7,0.0347,0.605711,0.917969,0.979424,0.865455,0.918919
8,0.0181,0.610823,0.931641,0.96875,0.901818,0.934087
9,0.0137,0.486252,0.943359,0.939286,0.956364,0.947748
10,0.0076,0.588732,0.931641,0.951128,0.92,0.935305


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▁▅▆▄▅▄▆█▆▅▆▇▇▇
eval/f1,▂▁▅▆▄▆▄▆█▆▆▆▇▇▇
eval/loss,▁▄▂▄▅▅██▅▇██▇██
eval/precision,▂▆▄▅▆▁█▇▄▅▅▄▅▅▄
eval/recall,▄▁▆▅▃█▂▄█▆▅▆▇▇▇
eval/runtime,▆▅▃▄█▂▅▆▃▁▃▃▁▂▆
eval/samples_per_second,▃▄▆▅▁▇▃▃▆█▆▆█▆▃
eval/steps_per_second,▃▄▆▅▁▇▄▃▅█▆▆█▆▃
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.93555
eval/f1,0.93989
eval/loss,0.62263
eval/precision,0.94161
eval/recall,0.93818
eval/runtime,19.3905
eval/samples_per_second,26.405
eval/steps_per_second,3.301
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6x7h81p2 with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 3.2809530408904146e-05
[34m[1mwandb[0m: 	weight_decay: 0.4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3894,0.248615,0.935547,0.968992,0.909091,0.938086
2,0.2203,0.317757,0.931641,0.987805,0.883636,0.932821
3,0.1409,0.162973,0.949219,0.94306,0.963636,0.953237
4,0.0615,0.271709,0.953125,0.977186,0.934545,0.95539
5,0.0361,0.323551,0.947266,0.966165,0.934545,0.950092
6,0.0286,0.419912,0.945312,0.952381,0.945455,0.948905
7,0.0066,0.365874,0.957031,0.977358,0.941818,0.959259
8,0.0004,0.423603,0.955078,0.980916,0.934545,0.957169
9,0.008,0.377109,0.958984,0.977444,0.945455,0.961183
10,0.0001,0.377544,0.955078,0.95,0.967273,0.958559


0,1
eval/accuracy,▂▁▅▆▅▄▇▇█▇███▇▇
eval/f1,▂▁▆▆▅▅▇▇▇▇▇█▇▇▇
eval/loss,▃▅▁▄▅█▆█▇▇█▇██▇
eval/precision,▅█▁▆▅▂▆▇▆▂▆▅█▇▅
eval/recall,▃▁█▅▅▆▆▅▆█▆▇▆▆▆
eval/runtime,▁▁▁▁▁▂▂▁█▂██▂▂▃
eval/samples_per_second,█████▇▇█▁▇▁▁▇▇▆
eval/steps_per_second,█████▇▇█▁▇▁▁▇▇▆
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.95508
eval/f1,0.95764
eval/loss,0.39092
eval/precision,0.97015
eval/recall,0.94545
eval/runtime,19.4433
eval/samples_per_second,26.333
eval/steps_per_second,3.292
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: jd01chwx with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 5.9193715985850456e-05
[34m[1mwandb[0m: 	weight_decay: 0.4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5115,0.407545,0.826172,0.989474,0.683636,0.808602
2,0.489,0.568742,0.802734,0.988764,0.64,0.777042
3,0.3769,0.46447,0.871094,0.842623,0.934545,0.886207
4,0.2924,0.321199,0.912109,0.935606,0.898182,0.916512
5,0.1836,0.308842,0.923828,0.933824,0.923636,0.928702


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5115,0.407545,0.826172,0.989474,0.683636,0.808602
2,0.489,0.568742,0.802734,0.988764,0.64,0.777042
3,0.3769,0.46447,0.871094,0.842623,0.934545,0.886207
4,0.2924,0.321199,0.912109,0.935606,0.898182,0.916512
5,0.1836,0.308842,0.923828,0.933824,0.923636,0.928702
6,0.1401,0.38814,0.908203,0.963415,0.861818,0.909789
7,0.1064,0.329377,0.919922,0.968,0.88,0.921905
8,0.0877,0.429402,0.912109,0.932331,0.901818,0.916821
9,0.058,0.67025,0.892578,0.970085,0.825455,0.891945
10,0.0483,0.498459,0.919922,0.927007,0.923636,0.925319


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▂▁▅▇▇▆▇▇▆▇▇▇▇██
eval/f1,▂▁▆▇▇▇▇▇▆▇▇▇▇██
eval/loss,▃▆▄▁▁▃▁▃█▅▄▃▃▃▃
eval/precision,██▁▅▅▇▇▅▇▅▆▇▇▅▆
eval/recall,▂▁█▇█▆▇▇▅█▇▇▇██
eval/runtime,▂▁▅▂▃▄▄▃▂▅█▃▇▃▅
eval/samples_per_second,▇█▄▇▆▅▅▆▇▄▁▆▂▆▄
eval/steps_per_second,▇█▄▇▆▅▅▆▇▄▁▆▂▆▄
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.9375
eval/f1,0.94096
eval/loss,0.40468
eval/precision,0.95506
eval/recall,0.92727
eval/runtime,19.3524
eval/samples_per_second,26.457
eval/steps_per_second,3.307
train/epoch,15.0
train/global_step,3840.0


[34m[1mwandb[0m: Agent Starting Run: feo73s78 with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 3.861271607392625e-05
[34m[1mwandb[0m: 	weight_decay: 0.5


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.431,0.263469,0.892578,0.898551,0.901818,0.900181
2,0.24,0.376745,0.896484,0.966387,0.836364,0.896686
3,0.1606,0.441412,0.917969,0.964143,0.88,0.920152
4,0.0871,0.455823,0.927734,0.940741,0.923636,0.93211
5,0.0295,0.492756,0.929688,0.96139,0.905455,0.932584
6,0.0248,0.471471,0.935547,0.984,0.894545,0.937143
7,0.0082,0.442429,0.947266,0.966165,0.934545,0.950092
8,0.005,0.473307,0.941406,0.95539,0.934545,0.944853
9,0.0054,0.792844,0.919922,0.991597,0.858182,0.920078


###  Final model

In [None]:
#chose dataset
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

#tokenize datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/2046 [00:00<?, ? examples/s]

Map:   0%|          | 0/512 [00:00<?, ? examples/s]

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

In [None]:
#select model
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [None]:
#Select metrics for model evaluation

def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    precision = evaluate.load("precision")
    recall = evaluate.load("recall")
    f1 = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy.compute(predictions = predictions, references = labels)
    precision = precision.compute(predictions = predictions, references = labels)
    recall = recall.compute(predictions = predictions, references = labels)
    f1 = f1.compute(predictions = predictions, references = labels)
    return {"accuracy": accuracy["accuracy"],
            "precision": precision["precision"],
            "recall": recall["recall"],
            "f1": f1["f1"]}

In [None]:
#Define arguments for training
training_args = TrainingArguments(output_dir="test_trainer",
                                  report_to = "wandb",
                                  save_strategy = "epoch",
                                  load_best_model_at_end = True,
                                  evaluation_strategy = "epoch",
                                  num_train_epochs = 7,
                                  weight_decay = 0.1,
                                  learning_rate = 0.00006355
                                  )

In [None]:
#Initisialise trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
#Train model
wandb.init(project = "nlp_propaganda")
trainer.train()


VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▁
eval/f1,▁▁
eval/loss,▁▁
eval/precision,▁▁
eval/recall,▁▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁██
train/global_step,▁██

0,1
eval/accuracy,0.87305
eval/f1,0.86762
eval/loss,0.35203
eval/precision,0.8875
eval/recall,0.84861
eval/runtime,18.0554
eval/samples_per_second,28.357
eval/steps_per_second,3.545
train/epoch,2.0
train/global_step,512.0


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.464244,0.767578,1.0,0.525896,0.689295
2,0.661600,0.709687,0.490234,0.490234,1.0,0.657929
3,0.661600,0.69294,0.509766,0.0,0.0,0.0
4,0.672400,0.550726,0.839844,0.809524,0.880478,0.843511
5,0.672400,0.373976,0.847656,0.864979,0.816733,0.840164
6,0.390400,0.396261,0.853516,0.860656,0.836653,0.848485
7,0.390400,0.52388,0.804688,0.728097,0.960159,0.828179


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1792, training_loss=0.5485167077609471, metrics={'train_runtime': 1518.0314, 'train_samples_per_second': 9.435, 'train_steps_per_second': 1.18, 'total_flos': 3768276534865920.0, 'train_loss': 0.5485167077609471, 'epoch': 7.0})

In [None]:
#evaluate
labels_pred = trainer.predict(tokenized_test_dataset)

### Results

In [None]:
#Predictions to dataframe
results_df = pd.read_csv("/content/drive/MyDrive/ANLP/results.csv", index_col = 0)
results_df["label"] = testdf["label"]
results_df["predictions_num_roB1"] = labels_pred[1]
results_df["predictions_roB1"] = labels_pred[0].argmax(-1)
results_df["accuracy_roB1"] = np.where(results_df["binary_label"] != results_df["predictions_roB1"], 0,1)


In [None]:
#Dataframes to store wrong predictions
wrong_predictions = results_df[results_df["accuracy_roB1"] ==0]
wrong_predictions["sentence_"] = [s.translate(str.maketrans('', '', string.punctuation)) for s in wrong_predictions["sentence"].apply(lambda x: x.lower()).apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))]
wrong_predictions["sentence_"] =  [s.translate(str.maketrans("”"," ","")) for s in wrong_predictions["sentence_"]]
wrong_predictions["sentence_"] = [s.translate(str.maketrans("“"," ","")) for s in wrong_predictions["sentence_"] ]

#predicted as not propaganda when propaganda
wrong_predictions_as_not_propaganda = wrong_predictions[wrong_predictions["binary_label"] ==1]
#predicted as propaganda when not propaganda
wrong_predictions_as_propaganda = wrong_predictions[wrong_predictions["binary_label"] ==0 ]
#results_df["predictions_roB1"] = results_df["predictions_roB1"].map({1:"propaganda",0:"not_propaganda"})

right_predictions = results_df[results_df["accuracy_roB1"] ==1]


In [None]:
#Save file
results_df.to_csv("/content/drive/MyDrive/ANLP/results.csv")

In [None]:
#Predictions to dataframe
results_df = pd.read_csv("/content/drive/MyDrive/ANLP/results.csv",index_col=0)


In [None]:
#Metrics
print(sklearn.metrics.classification_report(results_df["predictions_roB1"], results_df["binary_label"]))


              precision    recall  f1-score   support

           0       0.89      0.85      0.87       345
           1       0.83      0.87      0.85       295

    accuracy                           0.86       640
   macro avg       0.86      0.86      0.86       640
weighted avg       0.86      0.86      0.86       640



In [None]:
prop_answers = results_df.groupby(["predictions_roB1","accuracy_roB1"])["tagged_in_context"].count()
prop_answers = prop_answers.reset_index(name="count")
prop_answers["accuracy_roB1"] = prop_answers["accuracy_roB1"].map({1:"Correct",0:"Incorrect"})
prop_answers["predictions_roB1"] = prop_answers["predictions_roB1"].map({1:"Propaganda",0:"Not Propaganda"})
alt.Chart(prop_answers,
    title= "Model predictions "
    ).mark_bar().encode(
    alt.Y("predictions_roB1",
          title = "Label predicted by model"
          ),
    alt.X("count",
          title="Percentage of answers(%)",
          stack = "normalize",
          axis=alt.Axis(format='%')
          ),
    color =alt.Color("accuracy_roB1:O", title = ""))

In [None]:
right_answers =results_df.groupby(["binary_label","accuracy_roB1"])["tagged_in_context"].count()
right_answers = right_answers.reset_index(name ="count")

right_answers["accuracy_roB1"] = right_answers["accuracy_roB1"].map({1:"Correct",0:"Incorrect"})
right_answers["binary_label"] = right_answers["binary_label"].map({1:"Correct",0:"Incorrect"})

alt.Chart(right_answers,
    title= "Percentage of correct predictions per label"
    ).mark_bar().encode(
    y = alt.Y("binary_label:O",
          title = "True label"
          ),
    x = alt.X("count",
          title="Percentage of answers(%)",
          stack = "normalize",
          axis=alt.Axis(format='%')
          ),
    color = alt.Color("accuracy_roB1:O", title = "")
 )

In [None]:
alt.Chart(
    wrong_predictions_as_not_propaganda,
    title= "Propaganda sentences labelled as Not propaganda"
    ).transform_joinaggregate(
    totalCount="count(*)"
    ).transform_calculate(
    PercentOfTotal="1 / datum.totalCount"
    ).mark_bar().encode(
    alt.Y("label",
          title = "True Label"),
    alt.X( "sum(PercentOfTotal):Q",
          title="Percentage of wrong answers(%)",
          axis=alt.Axis(format="%"),
          ),
    color =alt.Color("label", legend = None))



### length of sentence


In [None]:
lendf = results_df
lendf["accuracy_roB1"] =lendf["accuracy_roB1"].map({1:"Correct",0:"Incorrect"})
lendf["binary_label"] =lendf["binary_label"].map({1:"Propaganda",0:"Not_propaganda"})

#Histogram
alt.Chart(
    lendf,
    title = "Percentage of predictions per sentence length"
    ).transform_joinaggregate(
    total='count(*)'
).transform_calculate(
    pct='1 / datum.total'
).mark_bar().encode(
    y=alt.Y("span_length:O",
            title = "Number of words in sentence",
            bin = alt.Bin(step = 5),

            ),
    x = alt.X("sum(pct):Q",axis=alt.Axis(format='%'),
              title = "Number of predictions",
               stack = "normalize"),
    color = "accuracy_roB1:O"
)

In [None]:
alt.Chart(lendf).mark_boxplot(ticks = True, size = 10).encode(
    y= alt.Y('accuracy_roB1:O', axis = None),
    x=alt.X('sentence_length:Q', title = "Numnber of words",),
    color = alt.Color("accuracy_roB1:N"),
    row = alt.Row("binary_label:N",align = "none",title = "", spacing = 0.2)
).configure_view(
    stroke="transparent"
).configure_scale(
    bandPaddingInner=0,
    bandPaddingOuter=0.1,
).configure_header(
    labelOrient = "left",
    labelAlign = "left",
    labelAngle = 0,
)

## Comparison

In [None]:
both_wrong = results_df[(results_df.accuracy_RF1 == "Incorrect") & (results_df.accuracy_roB1 == "Incorrect")]
both_right = results_df[(results_df.accuracy_RF1 == "Correct") & (results_df.accuracy_roB1 == "Correct")]
rob_right = results_df[(results_df.accuracy_RF1 == "Incorrect") & (results_df.accuracy_roB1 == "Correct")]
rf_right = results_df[(results_df.accuracy_RF1 == "Correct") & (results_df.accuracy_roB1 == "Incorrect")]

In [None]:
both_wrong

In [None]:
both_right

In [None]:
rob_right

In [None]:
rf_right

# Task 2 -multiclass

## Baseline

In [None]:
from sklearn.dummy import DummyClassifier


corpus_train = propaganda_train_df["span"]
labels_train = propaganda_train_df["label"].values

corpus_test = propaganda_test_df["span"]
labels_test = propaganda_test_df["label"].values

# Create a dummy classifier that predicts classes randomly
random_clf = DummyClassifier(strategy='uniform', random_state=42)

# Train the classifier on your training data
random_clf.fit(corpus_train, labels_train)

# Test the classifier on your test data
labels_pred = random_clf.predict(corpus_test)
print(sklearn.metrics.classification_report(labels_test, labels_pred))

                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.02      0.02      0.02        43
causal_oversimplification       0.11      0.11      0.11        35
                    doubt       0.14      0.12      0.13        43
exaggeration,minimisation       0.13      0.20      0.16        30
              flag_waving       0.14      0.11      0.12        45
          loaded_language       0.28      0.26      0.27        39
    name_calling,labeling       0.19      0.24      0.21        34
               repetition       0.17      0.15      0.16        40

                 accuracy                           0.15       309
                macro avg       0.15      0.15      0.15       309
             weighted avg       0.15      0.15      0.14       309



## Random Forest

#### Implementation

In [None]:
corpus_train = [s.translate(str.maketrans('', '', string.punctuation)) for s in propaganda_train_df["span"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))]
corpus_train = [s.translate(str.maketrans("”"," ","")) for s in corpus_train]
corpus_train= [s.translate(str.maketrans("“"," ","")) for s in corpus_train]
labels_train = propaganda_train_df["label"].values
corpus_train_sub, corpus_validation, labels_train_sub, labels_validation = train_test_split(corpus_train, labels_train, test_size=0.2, random_state=1)

corpus_test = [s.translate(str.maketrans('', '', string.punctuation)) for s in propaganda_test_df["span"].apply(lambda x: x.lower()).apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))]
corpus_test = [s.translate(str.maketrans("”"," ","")) for s in corpus_test]
corpus_test = [s.translate(str.maketrans("“"," ","")) for s in corpus_test]
labels_test = propaganda_test_df["label"].values


#### Function to perform a run with chosen parameters.


In [None]:
def run_sweep(x_tr, y_tr, x_te, y_te,name, max_depth, n_estimators, n_gram_range,max_features):
  vectorizer = CountVectorizer(analyzer = "word", ngram_range=n_gram_range)
  x_train = vectorizer.fit_transform(x_tr)
  x_test = vectorizer.transform(x_te)
  feature_names = ["span"]

  test_size = 0.19372077488309952

  # train model
  model = RandomForestClassifier(n_estimators = n_estimators,
                                 random_state = 21,
                                 max_features=max_features,
                                 max_depth =max_depth,
                                 )
  model.fit(x_train, y_tr)
  labels =  model.classes_
  model_params = model.get_params()
  model_params["n_gram_range"]=n_gram_range[1]
  # get predictions
  labels_pred = model.predict(x_test)
  labels_probas = model.predict_proba(x_test)
  importances = model.feature_importances_
  indices = np.argsort(importances)[::-1]

  print(sklearn.metrics.classification_report(y_te, labels_pred))

  # start a new wandb run and add your model hyperparameters
  wandb.init(project='NLP_techniques', tags = ["keep"],config=model_params, name = name,settings=wandb.Settings(start_method="thread") )



  #log metrics to wandb
  accuracy = sklearn.metrics.accuracy_score(y_te, labels_pred)
  precision = sklearn.metrics.precision_score(y_te, labels_pred, average = "macro")
  recall = sklearn.metrics.recall_score(y_te, labels_pred, average = "macro")
  f1 = sklearn.metrics.f1_score(y_te, labels_pred, average = "macro")

  wandb.summary["accuracy"] = accuracy
  wandb.summary["f1"] = f1
  wandb.summary["precision"] = precision
  wandb.summary["recall"] = recall

  # [optional] finish the wandb run, necessary in notebooks
  wandb.finish()

In [None]:
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [3,5,7]
}

param_combinations = sorted(param_grid)
param_combinations = [item for item in it.product(*(param_grid[param] for param in param_grid))]


Run multiple parameter combinations

In [None]:
#Unigram
param_combinations
n=0
for param in param_combinations:
  print(param)
  run_sweep(x_tr= corpus_train_sub,
            y_tr = labels_train_sub,
            x_te = corpus_validation,
            y_te = labels_validation,
            name = "RFunigram"+str(n),
            n_estimators = param[0],
            n_gram_range = (1,1),
            max_features = param[1],
            max_depth = param[2],

  )
  n+=1

(200, 'auto', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.33      0.14      0.20        28
causal_oversimplification       0.64      0.17      0.27        40
                    doubt       0.52      0.45      0.48        29
exaggeration,minimisation       1.00      0.03      0.05        40
              flag_waving       0.71      0.50      0.59        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      0.93      0.25        28
               repetition       1.00      0.10      0.18        30

                 accuracy                           0.28       258
                macro avg       0.54      0.29      0.25       258
             weighted avg       0.57      0.28      0.25       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.27519
f1,0.25255
precision,0.54261
recall,0.28996


(200, 'auto', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.40      0.14      0.21        28
causal_oversimplification       0.61      0.28      0.38        40
                    doubt       0.55      0.38      0.45        29
exaggeration,minimisation       0.67      0.10      0.17        40
              flag_waving       0.51      0.53      0.52        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.16      0.93      0.27        28
               repetition       1.00      0.10      0.18        30

                 accuracy                           0.30       258
                macro avg       0.49      0.31      0.27       258
             weighted avg       0.50      0.30      0.28       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.29845
f1,0.27304
precision,0.48734
recall,0.30689


(200, 'auto', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.41      0.25      0.31        28
causal_oversimplification       0.57      0.33      0.41        40
                    doubt       0.55      0.38      0.45        29
exaggeration,minimisation       0.70      0.17      0.28        40
              flag_waving       0.53      0.56      0.54        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.18      0.93      0.30        28
               repetition       0.60      0.10      0.17        30

                 accuracy                           0.33       258
                macro avg       0.44      0.34      0.31       258
             weighted avg       0.46      0.33      0.32       258



0,1
accuracy,0.33333
f1,0.30824
precision,0.44161
recall,0.33959


(200, 'sqrt', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.33      0.14      0.20        28
causal_oversimplification       0.64      0.17      0.27        40
                    doubt       0.52      0.45      0.48        29
exaggeration,minimisation       1.00      0.03      0.05        40
              flag_waving       0.71      0.50      0.59        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      0.93      0.25        28
               repetition       1.00      0.10      0.18        30

                 accuracy                           0.28       258
                macro avg       0.54      0.29      0.25       258
             weighted avg       0.57      0.28      0.25       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.27519
f1,0.25255
precision,0.54261
recall,0.28996


(200, 'sqrt', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.40      0.14      0.21        28
causal_oversimplification       0.61      0.28      0.38        40
                    doubt       0.55      0.38      0.45        29
exaggeration,minimisation       0.67      0.10      0.17        40
              flag_waving       0.51      0.53      0.52        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.16      0.93      0.27        28
               repetition       1.00      0.10      0.18        30

                 accuracy                           0.30       258
                macro avg       0.49      0.31      0.27       258
             weighted avg       0.50      0.30      0.28       258



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669394683337183, max=1.0…

  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.29845
f1,0.27304
precision,0.48734
recall,0.30689


(200, 'sqrt', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.41      0.25      0.31        28
causal_oversimplification       0.57      0.33      0.41        40
                    doubt       0.55      0.38      0.45        29
exaggeration,minimisation       0.70      0.17      0.28        40
              flag_waving       0.53      0.56      0.54        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.18      0.93      0.30        28
               repetition       0.60      0.10      0.17        30

                 accuracy                           0.33       258
                macro avg       0.44      0.34      0.31       258
             weighted avg       0.46      0.33      0.32       258



0,1
accuracy,0.33333
f1,0.30824
precision,0.44161
recall,0.33959


(200, 'log2', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.33      0.07      0.12        28
causal_oversimplification       0.75      0.07      0.14        40
                    doubt       0.62      0.17      0.27        29
exaggeration,minimisation       0.00      0.00      0.00        40
              flag_waving       1.00      0.03      0.06        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.12      1.00      0.21        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.16       258
                macro avg       0.48      0.18      0.11       258
             weighted avg       0.48      0.16      0.11       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.15891
f1,0.11492
precision,0.47844
recall,0.17687


(200, 'log2', 5)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.36      0.14      0.21        28
causal_oversimplification       0.71      0.12      0.21        40
                    doubt       0.43      0.10      0.17        29
exaggeration,minimisation       0.67      0.10      0.17        40
              flag_waving       0.60      0.18      0.27        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.13      0.96      0.23        28
               repetition       0.40      0.07      0.11        30

                 accuracy                           0.20       258
                macro avg       0.41      0.21      0.17       258
             weighted avg       0.44      0.20      0.17       258



0,1
accuracy,0.19767
f1,0.17167
precision,0.41279
recall,0.20984


(200, 'log2', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.33      0.14      0.20        28
causal_oversimplification       0.62      0.20      0.30        40
                    doubt       0.50      0.14      0.22        29
exaggeration,minimisation       0.45      0.12      0.20        40
              flag_waving       0.65      0.38      0.48        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.15      0.96      0.26        28
               repetition       0.38      0.10      0.16        30

                 accuracy                           0.25       258
                macro avg       0.38      0.26      0.23       258
             weighted avg       0.40      0.25      0.23       258



0,1
accuracy,0.24806
f1,0.22619
precision,0.38448
recall,0.25655


(500, 'auto', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.31      0.14      0.20        28
causal_oversimplification       0.67      0.15      0.24        40
                    doubt       0.56      0.34      0.43        29
exaggeration,minimisation       1.00      0.07      0.14        40
              flag_waving       0.74      0.50      0.60        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      0.93      0.24        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.26       258
                macro avg       0.55      0.28      0.25       258
             weighted avg       0.58      0.26      0.25       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.26357
f1,0.24564
precision,0.55074
recall,0.27599


(500, 'auto', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.55      0.21      0.31        28
causal_oversimplification       0.67      0.25      0.36        40
                    doubt       0.50      0.31      0.38        29
exaggeration,minimisation       0.60      0.15      0.24        40
              flag_waving       0.54      0.56      0.55        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.16      0.93      0.27        28
               repetition       0.67      0.07      0.12        30

                 accuracy                           0.30       258
                macro avg       0.46      0.31      0.28       258
             weighted avg       0.48      0.30      0.29       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.30233
f1,0.27929
precision,0.45978
recall,0.30984


(500, 'auto', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.50      0.21      0.30        28
causal_oversimplification       0.67      0.30      0.41        40
                    doubt       0.55      0.38      0.45        29
exaggeration,minimisation       0.70      0.17      0.28        40
              flag_waving       0.53      0.59      0.56        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.17      0.93      0.29        28
               repetition       0.50      0.10      0.17        30

                 accuracy                           0.33       258
                macro avg       0.45      0.34      0.31       258
             weighted avg       0.47      0.33      0.31       258



0,1
accuracy,0.32946
f1,0.30714
precision,0.45204
recall,0.33568


(500, 'sqrt', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.31      0.14      0.20        28
causal_oversimplification       0.67      0.15      0.24        40
                    doubt       0.56      0.34      0.43        29
exaggeration,minimisation       1.00      0.07      0.14        40
              flag_waving       0.74      0.50      0.60        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      0.93      0.24        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.26       258
                macro avg       0.55      0.28      0.25       258
             weighted avg       0.58      0.26      0.25       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.26357
f1,0.24564
precision,0.55074
recall,0.27599


(500, 'sqrt', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.55      0.21      0.31        28
causal_oversimplification       0.67      0.25      0.36        40
                    doubt       0.50      0.31      0.38        29
exaggeration,minimisation       0.60      0.15      0.24        40
              flag_waving       0.54      0.56      0.55        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.16      0.93      0.27        28
               repetition       0.67      0.07      0.12        30

                 accuracy                           0.30       258
                macro avg       0.46      0.31      0.28       258
             weighted avg       0.48      0.30      0.29       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.30233
f1,0.27929
precision,0.45978
recall,0.30984


(500, 'sqrt', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.50      0.21      0.30        28
causal_oversimplification       0.67      0.30      0.41        40
                    doubt       0.55      0.38      0.45        29
exaggeration,minimisation       0.70      0.17      0.28        40
              flag_waving       0.53      0.59      0.56        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.17      0.93      0.29        28
               repetition       0.50      0.10      0.17        30

                 accuracy                           0.33       258
                macro avg       0.45      0.34      0.31       258
             weighted avg       0.47      0.33      0.31       258



0,1
accuracy,0.32946
f1,0.30714
precision,0.45204
recall,0.33568


(500, 'log2', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.00      0.00      0.00        28
causal_oversimplification       0.33      0.03      0.05        40
                    doubt       0.33      0.03      0.06        29
exaggeration,minimisation       0.67      0.05      0.09        40
              flag_waving       1.00      0.03      0.06        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.11      1.00      0.21        28
               repetition       0.00      0.00      0.00        30

                 accuracy                           0.13       258
                macro avg       0.31      0.14      0.06       258
             weighted avg       0.34      0.13      0.06       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.12791
f1,0.05804
precision,0.30595
recall,0.14236


(500, 'log2', 5)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.42      0.18      0.25        28
causal_oversimplification       0.57      0.10      0.17        40
                    doubt       0.40      0.07      0.12        29
exaggeration,minimisation       0.30      0.07      0.12        40
              flag_waving       0.67      0.12      0.20        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.13      1.00      0.23        28
               repetition       0.67      0.07      0.12        30

                 accuracy                           0.19       258
                macro avg       0.39      0.20      0.15       258
             weighted avg       0.40      0.19      0.15       258



0,1
accuracy,0.18605
f1,0.15131
precision,0.39403
recall,0.20086


(500, 'log2', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.45      0.18      0.26        28
causal_oversimplification       0.70      0.17      0.28        40
                    doubt       0.38      0.10      0.16        29
exaggeration,minimisation       0.42      0.12      0.19        40
              flag_waving       0.80      0.35      0.49        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      1.00      0.25        28
               repetition       0.50      0.07      0.12        30

                 accuracy                           0.24       258
                macro avg       0.42      0.25      0.22       258
             weighted avg       0.44      0.24      0.22       258



0,1
accuracy,0.24031
f1,0.21854
precision,0.42363
recall,0.2502


In [None]:
#Bigram
param_combinations
n=0
for param in param_combinations:
  print(param)
  run_sweep(
            x_tr= corpus_train_sub,
            y_tr = labels_train_sub,
            x_te = corpus_validation,
            y_te = labels_validation,
            name = "RFbigram"+str(n),
            n_estimators = param[0],
            n_gram_range = (1,2),
            max_features = param[1],
            max_depth = param[2],

            )
  n+=1

(200, 'auto', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.33      0.11      0.16        28
causal_oversimplification       0.50      0.03      0.05        40
                    doubt       0.40      0.14      0.21        29
exaggeration,minimisation       0.00      0.00      0.00        40
              flag_waving       0.74      0.50      0.60        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.12      0.93      0.22        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.21       258
                macro avg       0.39      0.22      0.17       258
             weighted avg       0.39      0.21      0.16       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.20543
f1,0.16925
precision,0.38696
recall,0.22066




(200, 'auto', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.30      0.11      0.16        28
causal_oversimplification       0.56      0.12      0.20        40
                    doubt       0.45      0.17      0.25        29
exaggeration,minimisation       0.38      0.07      0.12        40
              flag_waving       0.50      0.50      0.50        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      0.93      0.25        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.24       258
                macro avg       0.42      0.25      0.20       258
             weighted avg       0.43      0.24      0.20       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.23643
f1,0.20091
precision,0.4158
recall,0.24685


(200, 'auto', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.33      0.14      0.20        28
causal_oversimplification       0.64      0.17      0.27        40
                    doubt       0.40      0.21      0.27        29
exaggeration,minimisation       0.55      0.15      0.24        40
              flag_waving       0.53      0.50      0.52        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.15      0.93      0.26        28
               repetition       0.75      0.10      0.18        30

                 accuracy                           0.27       258
                macro avg       0.42      0.28      0.24       258
             weighted avg       0.44      0.27      0.25       258



0,1
accuracy,0.26744
f1,0.24177
precision,0.41845
recall,0.27542


(200, 'sqrt', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.33      0.11      0.16        28
causal_oversimplification       0.50      0.03      0.05        40
                    doubt       0.40      0.14      0.21        29
exaggeration,minimisation       0.00      0.00      0.00        40
              flag_waving       0.74      0.50      0.60        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.12      0.93      0.22        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.21       258
                macro avg       0.39      0.22      0.17       258
             weighted avg       0.39      0.21      0.16       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.20543
f1,0.16925
precision,0.38696
recall,0.22066


(200, 'sqrt', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.30      0.11      0.16        28
causal_oversimplification       0.56      0.12      0.20        40
                    doubt       0.45      0.17      0.25        29
exaggeration,minimisation       0.38      0.07      0.12        40
              flag_waving       0.50      0.50      0.50        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      0.93      0.25        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.24       258
                macro avg       0.42      0.25      0.20       258
             weighted avg       0.43      0.24      0.20       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.23643
f1,0.20091
precision,0.4158
recall,0.24685


(200, 'sqrt', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.33      0.14      0.20        28
causal_oversimplification       0.64      0.17      0.27        40
                    doubt       0.40      0.21      0.27        29
exaggeration,minimisation       0.55      0.15      0.24        40
              flag_waving       0.53      0.50      0.52        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.15      0.93      0.26        28
               repetition       0.75      0.10      0.18        30

                 accuracy                           0.27       258
                macro avg       0.42      0.28      0.24       258
             weighted avg       0.44      0.27      0.25       258



VBox(children=(Label(value='0.003 MB of 0.010 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.256948…

0,1
accuracy,0.26744
f1,0.24177
precision,0.41845
recall,0.27542


(200, 'log2', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.00      0.00      0.00        28
causal_oversimplification       0.00      0.00      0.00        40
                    doubt       1.00      0.03      0.07        29
exaggeration,minimisation       0.33      0.03      0.05        40
              flag_waving       0.00      0.00      0.00        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.11      1.00      0.20        28
               repetition       0.00      0.00      0.00        30

                 accuracy                           0.12       258
                macro avg       0.18      0.13      0.04       258
             weighted avg       0.18      0.12      0.04       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.11628
f1,0.03897
precision,0.18045
recall,0.13244


(200, 'log2', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.00      0.00      0.00        28
causal_oversimplification       0.50      0.07      0.13        40
                    doubt       0.75      0.10      0.18        29
exaggeration,minimisation       0.25      0.03      0.05        40
              flag_waving       1.00      0.06      0.11        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.12      1.00      0.21        28
               repetition       0.50      0.03      0.06        30

                 accuracy                           0.15       258
                macro avg       0.39      0.16      0.09       258
             weighted avg       0.40      0.15      0.09       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.14729
f1,0.09263
precision,0.38964
recall,0.16195


(200, 'log2', 7)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.33      0.04      0.06        28
causal_oversimplification       0.50      0.05      0.09        40
                    doubt       0.67      0.14      0.23        29
exaggeration,minimisation       0.20      0.03      0.04        40
              flag_waving       1.00      0.06      0.11        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.12      1.00      0.21        28
               repetition       0.60      0.10      0.17        30

                 accuracy                           0.16       258
                macro avg       0.43      0.18      0.12       258
             weighted avg       0.43      0.16      0.11       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.15891
f1,0.11569
precision,0.42752
recall,0.17593


(500, 'auto', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.33      0.07      0.12        28
causal_oversimplification       1.00      0.12      0.22        40
                    doubt       0.54      0.24      0.33        29
exaggeration,minimisation       0.25      0.03      0.05        40
              flag_waving       0.57      0.50      0.53        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.13      0.93      0.23        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.23       258
                macro avg       0.48      0.24      0.20       258
             weighted avg       0.50      0.23      0.20       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.23256
f1,0.20062
precision,0.47747
recall,0.24476


(500, 'auto', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.43      0.11      0.17        28
causal_oversimplification       0.91      0.25      0.39        40
                    doubt       0.50      0.24      0.33        29
exaggeration,minimisation       0.43      0.07      0.13        40
              flag_waving       0.57      0.50      0.53        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      0.93      0.24        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.26       258
                macro avg       0.50      0.27      0.24       258
             weighted avg       0.52      0.26      0.25       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.26357
f1,0.23937
precision,0.49649
recall,0.2711


(500, 'auto', 7)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.44      0.14      0.22        28
causal_oversimplification       0.75      0.23      0.35        40
                    doubt       0.47      0.24      0.32        29
exaggeration,minimisation       0.64      0.17      0.27        40
              flag_waving       0.53      0.50      0.52        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.15      0.93      0.25        28
               repetition       0.67      0.07      0.12        30

                 accuracy                           0.28       258
                macro avg       0.46      0.28      0.26       258
             weighted avg       0.48      0.28      0.27       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.27907
f1,0.25579
precision,0.45539
recall,0.28493


(500, 'sqrt', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.33      0.07      0.12        28
causal_oversimplification       1.00      0.12      0.22        40
                    doubt       0.54      0.24      0.33        29
exaggeration,minimisation       0.25      0.03      0.05        40
              flag_waving       0.57      0.50      0.53        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.13      0.93      0.23        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.23       258
                macro avg       0.48      0.24      0.20       258
             weighted avg       0.50      0.23      0.20       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.23256
f1,0.20062
precision,0.47747
recall,0.24476


(500, 'sqrt', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.43      0.11      0.17        28
causal_oversimplification       0.91      0.25      0.39        40
                    doubt       0.50      0.24      0.33        29
exaggeration,minimisation       0.43      0.07      0.13        40
              flag_waving       0.57      0.50      0.53        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      0.93      0.24        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.26       258
                macro avg       0.50      0.27      0.24       258
             weighted avg       0.52      0.26      0.25       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.26357
f1,0.23937
precision,0.49649
recall,0.2711


(500, 'sqrt', 7)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.44      0.14      0.22        28
causal_oversimplification       0.75      0.23      0.35        40
                    doubt       0.47      0.24      0.32        29
exaggeration,minimisation       0.64      0.17      0.27        40
              flag_waving       0.53      0.50      0.52        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.15      0.93      0.25        28
               repetition       0.67      0.07      0.12        30

                 accuracy                           0.28       258
                macro avg       0.46      0.28      0.26       258
             weighted avg       0.48      0.28      0.27       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.27907
f1,0.25579
precision,0.45539
recall,0.28493


(500, 'log2', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.00      0.00      0.00        28
causal_oversimplification       0.50      0.03      0.05        40
                    doubt       0.00      0.00      0.00        29
exaggeration,minimisation       0.00      0.00      0.00        40
              flag_waving       1.00      0.03      0.06        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.11      1.00      0.20        28
               repetition       1.00      0.03      0.06        30

                 accuracy                           0.12       258
                macro avg       0.33      0.14      0.05       258
             weighted avg       0.34      0.12      0.04       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.12016
f1,0.04607
precision,0.32633
recall,0.13597


(500, 'log2', 5)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.00      0.00      0.00        28
causal_oversimplification       0.67      0.05      0.09        40
                    doubt       0.67      0.07      0.12        29
exaggeration,minimisation       0.00      0.00      0.00        40
              flag_waving       1.00      0.09      0.16        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.11      1.00      0.21        28
               repetition       1.00      0.03      0.06        30

                 accuracy                           0.14       258
                macro avg       0.43      0.16      0.08       258
             weighted avg       0.44      0.14      0.08       258



0,1
accuracy,0.13953
f1,0.08123
precision,0.43095
recall,0.15507


(500, 'log2', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.33      0.04      0.06        28
causal_oversimplification       0.67      0.05      0.09        40
                    doubt       0.60      0.10      0.18        29
exaggeration,minimisation       0.00      0.00      0.00        40
              flag_waving       0.89      0.24      0.37        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.12      1.00      0.21        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.17       258
                macro avg       0.45      0.19      0.13       258
             weighted avg       0.45      0.17      0.13       258



0,1
accuracy,0.17054
f1,0.13071
precision,0.45113
recall,0.18639


In [None]:
#Trigram
param_combinations
n=0
for param in param_combinations:
  print(param)
  run_sweep(x_tr= corpus_train_sub,
            y_tr = labels_train_sub,
            x_te = corpus_validation,
            y_te = labels_validation,
            name = "RFtrigram"+str(n),
            n_estimators = param[0],
            n_gram_range = (1,3),
            max_features = param[1],
            max_depth = param[2],

            )
  n+=1

(200, 'auto', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.56      0.18      0.27        28
causal_oversimplification       1.00      0.05      0.10        40
                    doubt       0.60      0.21      0.31        29
exaggeration,minimisation       0.25      0.03      0.05        40
              flag_waving       0.81      0.38      0.52        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.13      1.00      0.23        28
               repetition       0.00      0.00      0.00        30

                 accuracy                           0.21       258
                macro avg       0.42      0.23      0.18       258
             weighted avg       0.44      0.21      0.18       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.21318
f1,0.1834
precision,0.41839
recall,0.23035


(200, 'auto', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.62      0.18      0.28        28
causal_oversimplification       0.71      0.12      0.21        40
                    doubt       0.56      0.31      0.40        29
exaggeration,minimisation       0.75      0.15      0.25        40
              flag_waving       0.48      0.35      0.41        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      0.96      0.25        28
               repetition       0.25      0.03      0.06        30

                 accuracy                           0.25       258
                macro avg       0.44      0.26      0.23       258
             weighted avg       0.47      0.25      0.23       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.25194
f1,0.23173
precision,0.44049
recall,0.26431


(200, 'auto', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.56      0.18      0.27        28
causal_oversimplification       0.60      0.15      0.24        40
                    doubt       0.50      0.28      0.36        29
exaggeration,minimisation       0.70      0.17      0.28        40
              flag_waving       0.48      0.38      0.43        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.15      0.96      0.26        28
               repetition       0.20      0.03      0.06        30

                 accuracy                           0.26       258
                macro avg       0.40      0.27      0.24       258
             weighted avg       0.42      0.26      0.24       258



0,1
accuracy,0.25969
f1,0.2361
precision,0.39838
recall,0.26993


(200, 'sqrt', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.56      0.18      0.27        28
causal_oversimplification       1.00      0.05      0.10        40
                    doubt       0.60      0.21      0.31        29
exaggeration,minimisation       0.25      0.03      0.05        40
              flag_waving       0.81      0.38      0.52        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.13      1.00      0.23        28
               repetition       0.00      0.00      0.00        30

                 accuracy                           0.21       258
                macro avg       0.42      0.23      0.18       258
             weighted avg       0.44      0.21      0.18       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.21318
f1,0.1834
precision,0.41839
recall,0.23035


(200, 'sqrt', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.62      0.18      0.28        28
causal_oversimplification       0.71      0.12      0.21        40
                    doubt       0.56      0.31      0.40        29
exaggeration,minimisation       0.75      0.15      0.25        40
              flag_waving       0.48      0.35      0.41        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      0.96      0.25        28
               repetition       0.25      0.03      0.06        30

                 accuracy                           0.25       258
                macro avg       0.44      0.26      0.23       258
             weighted avg       0.47      0.25      0.23       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.25194
f1,0.23173
precision,0.44049
recall,0.26431


(200, 'sqrt', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.56      0.18      0.27        28
causal_oversimplification       0.60      0.15      0.24        40
                    doubt       0.50      0.28      0.36        29
exaggeration,minimisation       0.70      0.17      0.28        40
              flag_waving       0.48      0.38      0.43        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.15      0.96      0.26        28
               repetition       0.20      0.03      0.06        30

                 accuracy                           0.26       258
                macro avg       0.40      0.27      0.24       258
             weighted avg       0.42      0.26      0.24       258



0,1
accuracy,0.25969
f1,0.2361
precision,0.39838
recall,0.26993


(200, 'log2', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.00      0.00      0.00        28
causal_oversimplification       0.00      0.00      0.00        40
                    doubt       1.00      0.03      0.07        29
exaggeration,minimisation       0.00      0.00      0.00        40
              flag_waving       1.00      0.06      0.11        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.11      1.00      0.20        28
               repetition       0.00      0.00      0.00        30

                 accuracy                           0.12       258
                macro avg       0.26      0.14      0.05       258
             weighted avg       0.26      0.12      0.04       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.12016
f1,0.04722
precision,0.26389
recall,0.13666


(200, 'log2', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.25      0.04      0.06        28
causal_oversimplification       0.67      0.05      0.09        40
                    doubt       0.75      0.10      0.18        29
exaggeration,minimisation       0.00      0.00      0.00        40
              flag_waving       0.67      0.06      0.11        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.12      1.00      0.21        28
               repetition       0.00      0.00      0.00        30

                 accuracy                           0.14       258
                macro avg       0.31      0.16      0.08       258
             weighted avg       0.32      0.14      0.08       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.13953
f1,0.08151
precision,0.30607
recall,0.156


(200, 'log2', 7)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.67      0.07      0.13        28
causal_oversimplification       0.50      0.03      0.05        40
                    doubt       0.60      0.10      0.18        29
exaggeration,minimisation       0.00      0.00      0.00        40
              flag_waving       0.89      0.24      0.37        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.12      1.00      0.21        28
               repetition       0.00      0.00      0.00        30

                 accuracy                           0.16       258
                macro avg       0.35      0.18      0.12       258
             weighted avg       0.35      0.16      0.11       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.16279
f1,0.11707
precision,0.34671
recall,0.1794


(500, 'auto', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.50      0.11      0.18        28
causal_oversimplification       1.00      0.03      0.05        40
                    doubt       0.62      0.17      0.27        29
exaggeration,minimisation       0.50      0.03      0.05        40
              flag_waving       0.73      0.32      0.45        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.12      0.96      0.21        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.19       258
                macro avg       0.56      0.21      0.17       258
             weighted avg       0.58      0.19      0.16       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.1938
f1,0.16643
precision,0.55986
recall,0.2105


(500, 'auto', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.50      0.07      0.12        28
causal_oversimplification       0.86      0.15      0.26        40
                    doubt       0.53      0.28      0.36        29
exaggeration,minimisation       0.50      0.05      0.09        40
              flag_waving       0.54      0.41      0.47        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      0.96      0.24        28
               repetition       0.40      0.07      0.11        30

                 accuracy                           0.24       258
                macro avg       0.43      0.25      0.21       258
             weighted avg       0.46      0.24      0.21       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.23643
f1,0.20698
precision,0.43325
recall,0.24875


(500, 'auto', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.44      0.14      0.22        28
causal_oversimplification       0.75      0.23      0.35        40
                    doubt       0.53      0.31      0.39        29
exaggeration,minimisation       0.62      0.12      0.21        40
              flag_waving       0.55      0.50      0.52        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.15      0.93      0.26        28
               repetition       0.40      0.07      0.11        30

                 accuracy                           0.28       258
                macro avg       0.43      0.29      0.26       258
             weighted avg       0.46      0.28      0.26       258



0,1
accuracy,0.27907
f1,0.2571
precision,0.43083
recall,0.28731


(500, 'sqrt', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.50      0.11      0.18        28
causal_oversimplification       1.00      0.03      0.05        40
                    doubt       0.62      0.17      0.27        29
exaggeration,minimisation       0.50      0.03      0.05        40
              flag_waving       0.73      0.32      0.45        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.12      0.96      0.21        28
               repetition       1.00      0.07      0.12        30

                 accuracy                           0.19       258
                macro avg       0.56      0.21      0.17       258
             weighted avg       0.58      0.19      0.16       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.1938
f1,0.16643
precision,0.55986
recall,0.2105


(500, 'sqrt', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.50      0.07      0.12        28
causal_oversimplification       0.86      0.15      0.26        40
                    doubt       0.53      0.28      0.36        29
exaggeration,minimisation       0.50      0.05      0.09        40
              flag_waving       0.54      0.41      0.47        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.14      0.96      0.24        28
               repetition       0.40      0.07      0.11        30

                 accuracy                           0.24       258
                macro avg       0.43      0.25      0.21       258
             weighted avg       0.46      0.24      0.21       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.23643
f1,0.20698
precision,0.43325
recall,0.24875


(500, 'sqrt', 7)
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.44      0.14      0.22        28
causal_oversimplification       0.75      0.23      0.35        40
                    doubt       0.53      0.31      0.39        29
exaggeration,minimisation       0.62      0.12      0.21        40
              flag_waving       0.55      0.50      0.52        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.15      0.93      0.26        28
               repetition       0.40      0.07      0.11        30

                 accuracy                           0.28       258
                macro avg       0.43      0.29      0.26       258
             weighted avg       0.46      0.28      0.26       258



0,1
accuracy,0.27907
f1,0.2571
precision,0.43083
recall,0.28731


(500, 'log2', 3)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.50      0.04      0.07        28
causal_oversimplification       0.00      0.00      0.00        40
                    doubt       0.00      0.00      0.00        29
exaggeration,minimisation       0.00      0.00      0.00        40
              flag_waving       0.00      0.00      0.00        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.11      1.00      0.20        28
               repetition       0.00      0.00      0.00        30

                 accuracy                           0.11       258
                macro avg       0.08      0.13      0.03       258
             weighted avg       0.07      0.11      0.03       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.1124
f1,0.03307
precision,0.07623
recall,0.12946


(500, 'log2', 5)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.50      0.04      0.07        28
causal_oversimplification       0.00      0.00      0.00        40
                    doubt       1.00      0.03      0.07        29
exaggeration,minimisation       0.00      0.00      0.00        40
              flag_waving       1.00      0.03      0.06        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.11      1.00      0.20        28
               repetition       1.00      0.03      0.06        30

                 accuracy                           0.12       258
                macro avg       0.45      0.14      0.06       258
             weighted avg       0.43      0.12      0.05       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.12403
f1,0.05687
precision,0.45139
recall,0.14162


(500, 'log2', 7)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.00      0.00      0.00        28
causal_oversimplification       0.50      0.03      0.05        40
                    doubt       0.50      0.03      0.06        29
exaggeration,minimisation       0.00      0.00      0.00        40
              flag_waving       1.00      0.03      0.06        34
          loaded_language       0.00      0.00      0.00        29
    name_calling,labeling       0.11      1.00      0.20        28
               repetition       1.00      0.03      0.06        30

                 accuracy                           0.12       258
                macro avg       0.39      0.14      0.05       258
             weighted avg       0.39      0.12      0.05       258



  _warn_prf(average, modifier, msg_start, len(result))


0,1
accuracy,0.12403
f1,0.0544
precision,0.389
recall,0.14028


#### Final model

In [None]:
vectorizer = CountVectorizer(analyzer = "word", ngram_range=(1,1))
x_train = vectorizer.fit_transform(corpus_train)
x_test = vectorizer.transform(corpus_test)
feature_names = ["sentence"]

test_size = 0.19372077488309952

# train model
model = RandomForestClassifier(n_estimators = 7,
                                 max_features="auto",
                                 max_depth =200)
model.fit(x_train, labels_train)
labels = model.classes_
model_params = model.get_params()
model_params["n_gram_range"]=2

# get predictions
labels_pred = model.predict(x_test)
labels_probas = model.predict_proba(x_test)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
# start a new wandb run and add your model hyperparameters
wandb.init(project='NLP_RF1_', config=model_params, name = "Final")
# Add additional configs to wandb
wandb.config.update({"test_size" : test_size,
                    "train_len" : len(corpus_train),
                    "test_len" : len(corpus_test)})

# log additional visualisations to wandb
plot_class_proportions(labels_train, labels_test, labels)
plot_learning_curve(model, x_train, labels_train)
plot_roc(labels_test, labels_probas, labels)
plot_precision_recall(labels_test, labels_probas, labels)
plot_feature_importances(model)
wandb.sklearn.plot_classifier(model, x_train, x_test,labels_train, labels_test, labels_pred, labels_probas,labels ,
                                                        model_name="Final", feature_names=None)
print(sklearn.metrics.classification_report(labels_test, labels_pred))

#log metrics to wandb
accuracy = sklearn.metrics.accuracy_score(labels_test, labels_pred)
precision = sklearn.metrics.precision_score(labels_test, labels_pred, average ="macro")
recall = sklearn.metrics.recall_score(labels_test, labels_pred,average ="macro")
f1 = sklearn.metrics.f1_score(labels_test, labels_pred,average ="macro")

wandb.summary["accuracy"] = accuracy
wandb.summary["f1"] = f1
wandb.summary["precision"] = precision
wandb.summary["recall"] = recall

# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting Final.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.
[34m[1mwandb[0m: Logged calibration curve.
[34m[1mwandb[0m: Logged roc curve.
[34m[1mwandb[0m: Logged precision-recall curve.


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.45      0.23      0.31        43
causal_oversimplification       0.41      0.20      0.27        35
                    doubt       0.58      0.33      0.42        43
exaggeration,minimisation       0.27      0.37      0.31        30
              flag_waving       0.69      0.69      0.69        45
          loaded_language       0.29      0.85      0.43        39
    name_calling,labeling       0.50      0.32      0.39        34
               repetition       0.52      0.30      0.38        40

                 accuracy                           0.42       309
                macro avg       0.46      0.41      0.40       309
             weighted avg       0.48      0.42      0.41       309



0,1
accuracy,0.41748
f1,0.3995
precision,0.46444
recall,0.41042


In [None]:
labels_pred =

#### Results

##### Save new result

In [None]:
results_task2_df = propaganda_test_df
results_task2_df.to_csv("/content/drive/MyDrive/ANLP/results_task2.csv", index = 0)

In [None]:
results_df.reset_index()

In [None]:
#Predictions to dataframe
results_df = pd.read_csv("/content/drive/MyDrive/ANLP/results_task2.csv")
results_df["predictions_RF"] = labels_pred
results_df["accuracy_RF"] = np.where(results_df["label"] != results_df["predictions_RF"], "Incorrect","Correct")

wrong_predictions = results_df[results_df["accuracy_RF"] =="Incorrect"]

right_predictions = results_df[results_df["accuracy_RF"] =="Correct"]



In [None]:
results_df.to_csv("/content/drive/MyDrive/ANLP/results_task2.csv")

#### Get old result

In [None]:
results_df = pd.read_csv("/content/drive/MyDrive/ANLP/results_task2.csv", index_col = 0)

In [None]:
print(sklearn.metrics.classification_report(results_df["label"],results_df["predictions_RF"]))

                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.45      0.23      0.31        43
causal_oversimplification       0.41      0.20      0.27        35
                    doubt       0.58      0.33      0.42        43
exaggeration,minimisation       0.27      0.37      0.31        30
              flag_waving       0.69      0.69      0.69        45
          loaded_language       0.29      0.85      0.43        39
    name_calling,labeling       0.50      0.32      0.39        34
               repetition       0.52      0.30      0.38        40

                 accuracy                           0.42       309
                macro avg       0.46      0.41      0.40       309
             weighted avg       0.48      0.42      0.41       309



#### Graphs classes

In [None]:
prop_answers = results_df.groupby(["predictions_RF","accuracy_RF"]).count()
prop_answers = prop_answers.reset_index()

pred_acc = alt.Chart(prop_answers,
    title= "Precision"
    ).mark_bar().encode(
    alt.Y("predictions_RF",
          title = "Label predicted by model"
          ),
    alt.X("sum(tagged_in_context)",
          title="Percentage of predictions(%)",
          stack = "normalize",
          axis=alt.Axis(format='%')
          ),
    color =alt.Color("accuracy_RF:O", title = "Prediction"))
pred_acc

In [None]:
right_answers =results_df.groupby(["label","accuracy_RF"])["tagged_in_context"].count()
right_answers = right_answers.reset_index(name ="count")

label_acc = alt.Chart(right_answers,
    title= "Recall"
    ).mark_bar().encode(
    alt.Y("label",
          title = "True label"
          ),
    alt.X("count",
          title="Percentage of sentences(%)",
          stack = "normalize",
          axis=alt.Axis(format='%')
          ),
    alt.Color("accuracy_RF:O", title = "Prediction", )
 )
label_acc


#### Length of wrong sentences

In [None]:
len_df = results_df

In [None]:
#Histogram
alt.Chart(
    len_df,
    title = "Percentage of predictions per span length"
    ).transform_joinaggregate(
    total='count(*)'
).transform_calculate(
    pct='1 / datum.total'
).mark_bar().encode(
    y=alt.Y("span_length:O",
            title = "Number of words in span",
            bin = alt.Bin(step = 10),
            ),
    x = alt.X("sum(pct):Q",axis=alt.Axis(format='%'),
              title = "Percentage of spans",
               stack = "normalize"),
    color = alt.Color("accuracy_RF:O", title = "Prediction")
)

In [None]:
alt.Chart(len_df).mark_boxplot(ticks = True, size = 10).encode(
    x= alt.X('accuracy_RF:O', axis = None, title = "Prediction"),
    y=alt.Y('span_length:Q', title = "Numnber of words",),
    color = alt.Color("accuracy_RF:N", title = "Prediction"),
    column = alt.Column("label:N",align = "none",title = "True label", spacing = 0.2)
).configure_view(
    stroke="transparent"
).configure_scale(
    bandPaddingInner=0,
    bandPaddingOuter=0.2,
).configure_header(
    labelOrient = "bottom",
    labelAlign = "right",
    labelAngle = -90,
    titleOrient = "bottom"
)

Examples of predictions

#### Boundary predictions

In [None]:
results_df[results_df["prob_prop_RF2"].between(0.46,0.54)]

## RoBERTa

### Default


In [None]:
#Prepare datasets
train_dataset = propaganda_train_df.drop(["binary_label","sentence","tagged_in_context","span_length",	"sentence_length","type","len"], axis = 1)
train_dataset['label'] = train_dataset.label.astype('category')
train_dataset['label'] = train_dataset['label'].cat.codes
train_dataset.rename(columns={"span":"text"}, inplace=True)

#Split training into validation and training
train_dataset, validation_dataset = train_test_split(train_dataset, test_size=0.2, stratify = train_dataset["label"])
print(validation_dataset.groupby("label").count())

print(train_dataset.groupby("label").count())
train_dataset = train_dataset.reset_index(drop=True)
train_dataset = Dataset.from_pandas(train_dataset)


validation_dataset = validation_dataset.reset_index(drop=True)
validation_dataset = Dataset.from_pandas(validation_dataset)


test_dataset = propaganda_test_df.drop(["binary_label","sentence","tagged_in_context","span_length",	"sentence_length","type","len"], axis = 1)
test_dataset['label'] = test_dataset.label.astype('category')
test_dataset['label'] = test_dataset['label'].cat.codes
test_dataset.rename(columns={"span":"text"}, inplace=True)
test_dataset = test_dataset.reset_index(drop=True)
test_dataset = Dataset.from_pandas(test_dataset)


In [None]:
#chose tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

#tokenize datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/1032 [00:00<?, ? examples/s]

Map:   0%|          | 0/258 [00:00<?, ? examples/s]

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

In [None]:
#select model
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=8)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [None]:
def model_init():
  model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=8)
  return model

def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    precision = evaluate.load("precision")
    recall = evaluate.load("recall")
    f1 = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy.compute(predictions = predictions, references = labels)
    precision = precision.compute(predictions = predictions, references = labels, average="macro")
    recall = recall.compute(predictions = predictions, references = labels, average="macro")
    f1 = f1.compute(predictions = predictions, references = labels, average="macro")
    return {"accuracy": accuracy["accuracy"],
            "precision": precision["precision"],
            "recall": recall["recall"],
            "f1": f1["f1"]}


### sweep w&b

In [None]:
sweep_config = {
    'method': 'random'
}

# hyperparameters
parameters_dict = {
    'epochs': {
        'distribution': 'int_uniform',
        'max': 25,
        'min': 10
        },
    'learning_rate': {
        'distribution': 'uniform',
        'max': 0.0001,
        'min': 0.000016952978519689297
        },
    'weight_decay': {
        'values': [0.0, 0.1, 0.2  , 0.3, 0.4, 0.5]
        },
    "train_batch_size": {
      "distribution": "int_uniform",
      "max": 16,
      "min": 4
      }
}


sweep_config['parameters'] = parameters_dict
sweep_id = wandb.sweep(sweep_config, project='NLP_tecniques_roberta')



Create sweep with ID: 1fchmwd0
Sweep URL: https://wandb.ai/pdc/NLP_tecniques_roberta/sweeps/1fchmwd0


In [None]:
def train(config=None):
  with wandb.init(config=config):
    # set sweep configuration
    config = wandb.config


    # set training arguments
    training_args = TrainingArguments(
        output_dir='/content/drive/MyDrive/ANLP/models/task2',
	      report_to='wandb',  # Turn on Weights & Biases logging
        num_train_epochs=config.epochs,
        learning_rate=config.learning_rate,
        weight_decay=config.weight_decay,
        save_total_limit = 2,
        save_strategy = "no",
        load_best_model_at_end=False,
        logging_strategy='epoch',
        metric_for_best_model = "f1",
        per_device_train_batch_size = config.train_batch_size,
        evaluation_strategy="epoch"
    )

    #define training loop
    trainer = Trainer(
        model= model_init(),
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_validation_dataset,
        compute_metrics=compute_metrics,
    )
    # start training loop
    trainer.train()


In [None]:
wandb.agent(sweep_id, train, count=15)

[34m[1mwandb[0m: Agent Starting Run: accodx20 with config:
[34m[1mwandb[0m: 	epochs: 21
[34m[1mwandb[0m: 	learning_rate: 8.707693824609009e-05
[34m[1mwandb[0m: 	train_batch_size: 9
[34m[1mwandb[0m: 	weight_decay: 0.4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9344,1.909027,0.186047,0.049548,0.18915,0.076064
2,1.9814,2.089044,0.124031,0.015504,0.125,0.027586
3,2.0938,2.085886,0.120155,0.015019,0.125,0.026817
4,2.0007,1.871635,0.22093,0.057382,0.218987,0.090155
5,1.8552,1.954969,0.197674,0.059785,0.19697,0.085858
6,1.8957,1.906487,0.193798,0.051068,0.195312,0.078645
7,1.9584,1.963511,0.189922,0.065327,0.195312,0.088408
8,2.0178,1.908227,0.217054,0.063644,0.21875,0.094782
9,1.9257,1.899631,0.232558,0.058152,0.230705,0.092882
10,2.0049,2.133468,0.124031,0.015504,0.125,0.027586


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

VBox(children=(Label(value='0.001 MB of 0.011 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.098544…

0,1
eval/accuracy,▅▁▁▇▆▆▅▇█▁▁▂▂▁▁▁▁▃▄▄▅
eval/f1,▆▁▁▇▇▆▇█▇▁▁▃▁▁▁▁▁▅▅▆█
eval/loss,▂▄▄▁▂▂▂▂▁▅▇█▅▆▇█▇▄▂▃▂
eval/precision,▃▁▁▃▃▃▄▄▃▁▁█▁▁▁▁▁▃▃▃▄
eval/recall,▅▁▁▇▆▆▆▇█▁▁▂▁▁▁▁▁▃▄▄▅
eval/runtime,▅█▄▄▄▃▃▃▁▁▁▁▂▁▁▁▁▂▁▁▁
eval/samples_per_second,▄▁▅▅▅▆▆▆█▇██▇████▇███
eval/steps_per_second,▄▁▅▅▅▆▆▆█▇██▇████▇███
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.18605
eval/f1,0.09872
eval/loss,1.9223
eval/precision,0.07202
eval/recall,0.18311
eval/runtime,12.554
eval/samples_per_second,20.551
eval/steps_per_second,2.629
train/epoch,21.0
train/global_step,2415.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: mqia1sa6 with config:
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 5.455396823725867e-05
[34m[1mwandb[0m: 	train_batch_size: 4
[34m[1mwandb[0m: 	weight_decay: 0.5


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7855,1.357826,0.503876,0.5125,0.502528,0.488556
2,1.2218,1.210639,0.589147,0.617445,0.587799,0.575728
3,0.8412,1.274365,0.631783,0.647435,0.631399,0.632892
4,0.5563,2.131204,0.565891,0.650394,0.564771,0.561147
5,0.4395,2.158435,0.585271,0.594271,0.584004,0.583366
6,0.1815,2.410754,0.635659,0.668044,0.633979,0.639368
7,0.1042,2.3386,0.658915,0.67959,0.658039,0.66415
8,0.0659,2.518712,0.651163,0.66998,0.648901,0.649584
9,0.0266,2.562528,0.651163,0.664629,0.649132,0.648316
10,0.0272,2.554895,0.655039,0.671181,0.652682,0.652662


VBox(children=(Label(value='0.001 MB of 0.020 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.055082…

0,1
eval/accuracy,▁▅▇▄▅▇████
eval/f1,▁▄▇▄▅▇█▇▇█
eval/loss,▂▁▁▆▆▇▇███
eval/precision,▁▅▇▇▄███▇█
eval/recall,▁▅▇▄▅▇████
eval/runtime,▅▃▅▅▄▆█▁▃▃
eval/samples_per_second,▄▆▄▄▅▃▁█▆▆
eval/steps_per_second,▄▆▄▄▅▃▁█▆▅
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███

0,1
eval/accuracy,0.65504
eval/f1,0.65266
eval/loss,2.55489
eval/precision,0.67118
eval/recall,0.65268
eval/runtime,12.5411
eval/samples_per_second,20.572
eval/steps_per_second,2.631
train/epoch,10.0
train/global_step,2580.0


[34m[1mwandb[0m: Agent Starting Run: l19ba8d2 with config:
[34m[1mwandb[0m: 	epochs: 24
[34m[1mwandb[0m: 	learning_rate: 8.800510091587816e-05
[34m[1mwandb[0m: 	train_batch_size: 10
[34m[1mwandb[0m: 	weight_decay: 0.5


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7652,1.438102,0.472868,0.500084,0.470051,0.431568
2,1.2695,1.401651,0.503876,0.569218,0.504691,0.483201
3,0.9256,1.383665,0.565891,0.627106,0.566334,0.547579
4,0.6189,1.397636,0.600775,0.629666,0.599696,0.589519
5,0.4461,1.42683,0.620155,0.629806,0.616673,0.61283
6,0.3055,1.973823,0.608527,0.659119,0.609116,0.611629
7,0.2697,2.319012,0.608527,0.626939,0.605711,0.591956
8,0.201,2.383728,0.612403,0.631484,0.612149,0.602963
9,0.1828,2.310781,0.643411,0.650508,0.642347,0.635857
10,0.083,2.441525,0.647287,0.66022,0.646833,0.641752


VBox(children=(Label(value='0.001 MB of 0.013 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.087375…

0,1
eval/accuracy,▁▂▄▅▆▅▅▆▇▇▇▇▇▇███▇██████
eval/f1,▁▂▄▅▆▆▅▆▇▇▇▇█▇███▇██████
eval/loss,▁▁▁▁▁▅▇▇▇▇▆▇▆█▇▆▇▆▆▆▆▆▆▆
eval/precision,▁▄▆▆▆▇▆▆▇▇▇▇▇▇██████████
eval/recall,▁▂▄▅▆▅▅▆▇▇▇▇▇▇███▇██████
eval/runtime,▁▂▄▁▂▁▂▂▁▂▇█▂▁▃▃▂▂▃▂█▂▃▂
eval/samples_per_second,▇▇▅█▇▇▇▇█▇▂▁▇█▆▆▇▇▆▇▁▇▆▇
eval/steps_per_second,▇▇▅█▇▇▇▇█▇▂▁▇█▆▆▇▇▆▇▁▇▆▇
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████

0,1
eval/accuracy,0.67829
eval/f1,0.67496
eval/loss,2.29846
eval/precision,0.68427
eval/recall,0.67773
eval/runtime,12.5623
eval/samples_per_second,20.538
eval/steps_per_second,2.627
train/epoch,24.0
train/global_step,2496.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 51r1gzn1 with config:
[34m[1mwandb[0m: 	epochs: 25
[34m[1mwandb[0m: 	learning_rate: 4.535439133939471e-05
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	weight_decay: 0.4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8663,1.486685,0.453488,0.523941,0.452661,0.433536
2,1.2838,1.455603,0.457364,0.56738,0.458047,0.443133
3,0.8328,1.556819,0.515504,0.583884,0.513883,0.498374
4,0.496,1.611777,0.554264,0.567765,0.553719,0.549295
5,0.3656,1.901005,0.585271,0.588609,0.584858,0.581785
6,0.2661,2.43929,0.573643,0.60791,0.573716,0.572074
7,0.1681,2.629796,0.577519,0.598969,0.577793,0.569432
8,0.1236,2.595621,0.593023,0.60085,0.593152,0.59209
9,0.1065,2.776947,0.596899,0.602123,0.597207,0.590529
10,0.0698,2.849965,0.596899,0.600969,0.596244,0.589116


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▁▃▅▆▆▆▆▆▆▇▇▇█▇▇▇████▇███
eval/f1,▁▁▃▅▆▆▆▇▆▆▇▇▇█▇▇▇████▇███
eval/loss,▁▁▁▂▃▅▆▆▇▇▆▇█▇▇▇▇▇▇▇▇█▇▇▇
eval/precision,▁▄▄▄▅▆▅▅▆▅▇▆▆▇▇▆▇▇███▇█▇█
eval/recall,▁▁▃▅▆▆▆▆▆▆▇▇▇█▇▇▇████▇███
eval/runtime,▁▂▂▂▁▂██▁▂▂▁▁▁▁▂▁▁▂█▂▂▂▂▂
eval/samples_per_second,█▇▇▇█▇▁▁█▇▇████▆██▇▁▇▇▇▇▇
eval/steps_per_second,█▇▇▇█▇▁▁█▇▇████▇██▇▁▇▇▇▇▇
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.63566
eval/f1,0.63057
eval/loss,2.91141
eval/precision,0.64144
eval/recall,0.63425
eval/runtime,12.6232
eval/samples_per_second,20.439
eval/steps_per_second,2.614
train/epoch,25.0
train/global_step,3225.0


[34m[1mwandb[0m: Agent Starting Run: top8yfx6 with config:
[34m[1mwandb[0m: 	epochs: 21
[34m[1mwandb[0m: 	learning_rate: 8.21386637353798e-05
[34m[1mwandb[0m: 	train_batch_size: 14
[34m[1mwandb[0m: 	weight_decay: 0.2


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7799,1.358973,0.503876,0.558248,0.505308,0.491844
2,1.1463,1.199977,0.585271,0.619239,0.585309,0.581777
3,0.8017,1.261209,0.600775,0.640785,0.599334,0.595157
4,0.4939,1.369892,0.600775,0.652021,0.602054,0.599854
5,0.2984,1.482699,0.627907,0.6298,0.627361,0.621049
6,0.178,1.910311,0.620155,0.63457,0.619442,0.618922
7,0.1378,2.2817,0.577519,0.593854,0.578971,0.576094
8,0.1206,2.287466,0.639535,0.651825,0.638635,0.633885
9,0.1144,2.266023,0.658915,0.68031,0.659877,0.658355
10,0.075,2.372235,0.631783,0.638754,0.632481,0.630548


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▄▅▅▆▆▄▇█▆▆▇▆▇▆█▇█▇██
eval/f1,▁▅▅▅▆▆▄▇█▇▆▇▆▇▆█▇█▇██
eval/loss,▂▁▁▂▂▅▆▆▆▇█▇█████████
eval/precision,▁▄▆▆▅▅▃▆█▆▅▇▅▆▆▇▇█▇██
eval/recall,▁▅▅▅▆▆▄▇█▇▆▇▆▇▆███▇██
eval/runtime,▁▃▁▁▂▂▁▂▂▃▁▁▂█▂▂▃▂█▂▂
eval/samples_per_second,█▆██▇▇▇▇▇▆██▇▁▇▇▆▇▁▇▇
eval/steps_per_second,█▆██▇▇▇▇▇▆██▇▁▇▇▆▇▁▇▇
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.66667
eval/f1,0.66484
eval/loss,2.58056
eval/precision,0.68061
eval/recall,0.66528
eval/runtime,12.5344
eval/samples_per_second,20.583
eval/steps_per_second,2.633
train/epoch,21.0
train/global_step,1554.0


[34m[1mwandb[0m: Agent Starting Run: bbtndr6y with config:
[34m[1mwandb[0m: 	epochs: 16
[34m[1mwandb[0m: 	learning_rate: 8.59719435512324e-05
[34m[1mwandb[0m: 	train_batch_size: 8
[34m[1mwandb[0m: 	weight_decay: 0.4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9973,2.045384,0.143411,0.101555,0.145035,0.05629
2,1.9252,1.911763,0.224806,0.05705,0.22313,0.090653
3,1.9036,1.789457,0.255814,0.161116,0.249909,0.14201
4,1.8626,1.840427,0.228682,0.060085,0.230469,0.094422
5,1.8706,1.860867,0.224806,0.061357,0.226562,0.09458
6,1.8255,1.862252,0.224806,0.117179,0.225873,0.105871
7,1.7844,1.798905,0.228682,0.092307,0.231781,0.120438
8,1.7685,1.784424,0.25969,0.120048,0.255543,0.14996
9,1.7718,1.829002,0.228682,0.121319,0.226625,0.138428
10,1.7359,1.730129,0.263566,0.104779,0.258175,0.144827


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.001 MB of 0.025 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.044960…

0,1
eval/accuracy,▁▅▆▅▅▅▅▆▅▆███▆▇▇
eval/f1,▁▃▆▃▃▄▅▇▆▇████▇▇
eval/loss,█▅▂▃▄▄▃▂▃▁▃▂▁▄▄▄
eval/precision,▃▁▅▁▁▃▂▄▄▃▃▃▄▄█▄
eval/recall,▁▅▆▅▅▅▅▆▅▆███▆▇▇
eval/runtime,▁▁▂▂▂▂▂█▁▂▁▂▂▂▂▃
eval/samples_per_second,██▆▇▇▇▇▁█▇█▇▇▇▇▆
eval/steps_per_second,██▆▇▇▇▇▁█▇█▇▇▇▇▆
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇█████

0,1
eval/accuracy,0.27132
eval/f1,0.1592
eval/loss,1.87699
eval/precision,0.13816
eval/recall,0.26375
eval/runtime,12.6427
eval/samples_per_second,20.407
eval/steps_per_second,2.61
train/epoch,16.0
train/global_step,2064.0


[34m[1mwandb[0m: Agent Starting Run: nwptqd73 with config:
[34m[1mwandb[0m: 	epochs: 24
[34m[1mwandb[0m: 	learning_rate: 9.660994667457556e-05
[34m[1mwandb[0m: 	train_batch_size: 5
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1179,2.083498,0.127907,0.015988,0.125,0.028351
2,2.1026,2.083661,0.124031,0.015504,0.125,0.027586
3,2.1027,2.08132,0.131783,0.016473,0.125,0.02911
4,2.0958,2.09339,0.127907,0.015988,0.125,0.028351
5,2.0973,2.088044,0.124031,0.015504,0.125,0.027586
6,2.0905,2.085271,0.120155,0.015019,0.125,0.026817
7,2.091,2.086215,0.124031,0.015504,0.125,0.027586
8,2.099,2.080493,0.131783,0.016473,0.125,0.02911
9,2.0931,2.081267,0.131783,0.016473,0.125,0.02911
10,2.0914,2.080871,0.127907,0.015988,0.125,0.028351


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

0,1
eval/accuracy,▆▃█▆▃▁▃██▆███▆▆▆▆▆█▆▆███
eval/f1,▆▃█▆▃▁▃██▆███▆▆▆▆▆█▆▆███
eval/loss,▃▃▂█▅▄▄▂▂▂▁▂▂▂▂▁▁▁▁▁▁▁▁▁
eval/precision,▆▃█▆▃▁▃██▆███▆▆▆▆▆█▆▆███
eval/recall,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▂▁▂▁▁▂▁▁█▁▁▁▆▁▁▂▁▁▁▂▂▁▂▁
eval/samples_per_second,▇█▇██▇██▁███▃█▇▇█▇█▇▇█▇▇
eval/steps_per_second,▇█▇██▇██▁███▃█▇▇█▇█▇▇█▇▇
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████

0,1
eval/accuracy,0.13178
eval/f1,0.02911
eval/loss,2.07927
eval/precision,0.01647
eval/recall,0.125
eval/runtime,12.5869
eval/samples_per_second,20.498
eval/steps_per_second,2.622
train/epoch,24.0
train/global_step,4968.0


[34m[1mwandb[0m: Agent Starting Run: pbmg0yp6 with config:
[34m[1mwandb[0m: 	epochs: 19
[34m[1mwandb[0m: 	learning_rate: 2.302700680628132e-05
[34m[1mwandb[0m: 	train_batch_size: 16
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9159,1.571536,0.426357,0.45404,0.426257,0.386021
2,1.3232,1.193614,0.593023,0.587944,0.592255,0.580379
3,0.9057,1.14145,0.620155,0.639863,0.619125,0.614293
4,0.5634,1.148643,0.639535,0.660048,0.638014,0.630803
5,0.3566,1.273822,0.624031,0.638545,0.62481,0.621556
6,0.2237,1.370499,0.600775,0.609708,0.600764,0.599772
7,0.14,1.459301,0.600775,0.604231,0.601133,0.596573
8,0.0854,1.698449,0.635659,0.652793,0.634847,0.636336
9,0.0637,1.757578,0.627907,0.661403,0.626175,0.62692
10,0.0438,1.908319,0.620155,0.639206,0.618769,0.616762


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▆▇█▇▇▇██▇▇▇▇█▇█▇██
eval/f1,▁▆▇██▇▇██▇▇▇▇█▇████
eval/loss,▄▁▁▁▂▂▃▅▅▆▇▇▇▇█████
eval/precision,▁▆▇█▇▆▆██▇▆▇▆▇▆▇▇██
eval/recall,▁▆▇██▇▇██▇▇▇▇█▇████
eval/runtime,▂▂▁▂▂▂▁▂▂▁▁█▂▂▂▂▂█▂
eval/samples_per_second,▇▇█▇▇▇█▇▇██▁▇▇▇▇▇▁▇
eval/steps_per_second,▇▇█▇▇▇█▇▇██▁▇▇▇▇▇▁▇
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇█████

0,1
eval/accuracy,0.62791
eval/f1,0.63068
eval/loss,2.22585
eval/precision,0.649
eval/recall,0.62776
eval/runtime,12.5601
eval/samples_per_second,20.541
eval/steps_per_second,2.627
train/epoch,19.0
train/global_step,1235.0


[34m[1mwandb[0m: Agent Starting Run: 0adygn0h with config:
[34m[1mwandb[0m: 	epochs: 21
[34m[1mwandb[0m: 	learning_rate: 9.421825085201011e-05
[34m[1mwandb[0m: 	train_batch_size: 15
[34m[1mwandb[0m: 	weight_decay: 0.5


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9104,1.617682,0.375969,0.372324,0.377142,0.354268
2,1.4993,1.315271,0.562016,0.595308,0.561058,0.547374
3,1.0473,1.312133,0.531008,0.61196,0.526761,0.518951
4,0.6895,1.396843,0.589147,0.646046,0.589571,0.585704
5,0.4577,1.329407,0.593023,0.603394,0.592544,0.594012
6,0.3029,2.100934,0.554264,0.612838,0.553718,0.546423
7,0.2391,1.957156,0.612403,0.636877,0.611157,0.613103
8,0.1974,2.288209,0.608527,0.651698,0.608584,0.606684
9,0.1378,2.198614,0.627907,0.630181,0.626782,0.622067
10,0.1294,2.613764,0.604651,0.673954,0.604144,0.605183


  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▆▅▆▆▅▇▇▇▇▇████▇▇▇▇▇▇
eval/f1,▁▆▅▆▇▆▇▇▇▇▇████▇▇▇▇▇▇
eval/loss,▃▁▁▁▁▅▄▆▆█▇▇▆▇▇██████
eval/precision,▁▆▇▇▆▇▇▇▇█▇▇█▇▇▇▇▇▇▇▇
eval/recall,▁▆▅▆▆▅▇▇▇▇▇████▇▇▇▇▇▇
eval/runtime,▂▄▇▃▃▆▆▁▁▂▆▇▂▂█▅▅▇▆▄▂
eval/samples_per_second,▇▅▂▆▆▃▃██▇▃▂▇▇▁▄▄▂▃▅▇
eval/steps_per_second,▇▄▂▅▆▃▃██▇▃▂▇▇▁▄▄▂▃▅▇
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.62791
eval/f1,0.62455
eval/loss,2.5818
eval/precision,0.63502
eval/recall,0.62596
eval/runtime,12.4735
eval/samples_per_second,20.684
eval/steps_per_second,2.646
train/epoch,21.0
train/global_step,1449.0


[34m[1mwandb[0m: Agent Starting Run: i6106g4k with config:
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	learning_rate: 4.475850022474699e-05
[34m[1mwandb[0m: 	train_batch_size: 4
[34m[1mwandb[0m: 	weight_decay: 0.5


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7542,1.318525,0.511628,0.609428,0.509763,0.485164
2,1.1655,1.142034,0.604651,0.618111,0.60513,0.595688
3,0.749,1.285448,0.635659,0.668232,0.636055,0.623043
4,0.5061,1.643476,0.647287,0.674449,0.645951,0.647344
5,0.326,1.986536,0.624031,0.641515,0.622854,0.61675
6,0.1876,2.35866,0.643411,0.667054,0.641653,0.639251
7,0.1337,2.569426,0.627907,0.656778,0.62653,0.630322
8,0.1054,2.32677,0.662791,0.668114,0.661014,0.656163
9,0.0609,2.430798,0.658915,0.665714,0.65733,0.65391
10,0.0491,2.461552,0.655039,0.664121,0.654127,0.651415


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▅▇▇▆▇▆█████
eval/f1,▁▅▆▇▆▇▇█████
eval/loss,▂▁▂▃▅▇█▇▇▇██
eval/precision,▁▂▇█▄▇▆▇▇▇▇█
eval/recall,▁▅▇▇▆▇▆█████
eval/runtime,▃▂▂▁▄▃▂▂▆▆█▃
eval/samples_per_second,▆▇▇█▅▆▇▇▃▃▁▆
eval/steps_per_second,▆▇▇█▅▆▇▇▃▃▁▆
train/epoch,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▅▆▆▇▇▇▇███

0,1
eval/accuracy,0.66279
eval/f1,0.66143
eval/loss,2.50115
eval/precision,0.67782
eval/recall,0.66147
eval/runtime,12.5753
eval/samples_per_second,20.516
eval/steps_per_second,2.624
train/epoch,12.0
train/global_step,3096.0


[34m[1mwandb[0m: Agent Starting Run: ts64o1z8 with config:
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 6.638263376997823e-05
[34m[1mwandb[0m: 	train_batch_size: 5
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8163,1.580028,0.434109,0.432534,0.434331,0.382113
2,1.3721,1.467064,0.453488,0.478261,0.455428,0.415722
3,1.0359,1.455697,0.515504,0.584209,0.516566,0.505402
4,0.72,1.565278,0.569767,0.6068,0.568782,0.550169
5,0.5009,1.938022,0.589147,0.602597,0.586475,0.581523
6,0.3289,2.071242,0.596899,0.604545,0.594555,0.587627
7,0.2327,2.290622,0.631783,0.63967,0.630704,0.626343
8,0.1128,2.380417,0.612403,0.610471,0.610787,0.607095
9,0.0783,2.44153,0.596899,0.595818,0.595495,0.593398
10,0.0537,2.45654,0.616279,0.614832,0.61516,0.610622


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.001 MB of 0.013 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.083666…

0,1
eval/accuracy,▁▂▄▆▆▇█▇▇▇
eval/f1,▁▂▅▆▇▇█▇▇█
eval/loss,▂▁▁▂▄▅▇▇██
eval/precision,▁▃▆▇▇▇█▇▇▇
eval/recall,▁▂▄▆▆▇█▇▇▇
eval/runtime,▁▂█▁▂▂▂▁▁▁
eval/samples_per_second,█▇▁█▇▇▇██▇
eval/steps_per_second,█▇▁█▇▇▇███
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███

0,1
eval/accuracy,0.61628
eval/f1,0.61062
eval/loss,2.45654
eval/precision,0.61483
eval/recall,0.61516
eval/runtime,12.5118
eval/samples_per_second,20.621
eval/steps_per_second,2.638
train/epoch,10.0
train/global_step,2070.0


[34m[1mwandb[0m: Agent Starting Run: 63zs5hp4 with config:
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	learning_rate: 6.987478369900612e-05
[34m[1mwandb[0m: 	train_batch_size: 11
[34m[1mwandb[0m: 	weight_decay: 0.2


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9093,1.665966,0.344961,0.305463,0.339552,0.264231
2,1.4666,1.206972,0.581395,0.618574,0.580108,0.563252
3,0.9683,1.265661,0.600775,0.632451,0.601372,0.599539
4,0.6226,1.218516,0.635659,0.653256,0.634609,0.633318
5,0.3762,1.48718,0.616279,0.63004,0.61604,0.603949
6,0.2704,1.80551,0.643411,0.678352,0.64231,0.649263
7,0.1816,2.12045,0.643411,0.667052,0.642696,0.641085
8,0.1368,2.090328,0.631783,0.650818,0.629636,0.628694
9,0.0721,2.065521,0.674419,0.674113,0.673221,0.671437
10,0.0766,2.296896,0.662791,0.682713,0.661946,0.6658


  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.001 MB of 0.021 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.054267…

0,1
eval/accuracy,▁▆▆▇▇▇▇▇███████
eval/f1,▁▆▇▇▇█▇▇███████
eval/loss,▄▁▁▁▃▄▆▆▆▇▇████
eval/precision,▁▇▇▇▇██▇███████
eval/recall,▁▆▆▇▇▇▇▇███████
eval/runtime,▁▁▁▁▂▁▁▂▂▁▂█▆▂▄
eval/samples_per_second,████▇██▇▇█▇▁▃▇▅
eval/steps_per_second,████▇██▇▇█▇▁▃▇▅
train/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███

0,1
eval/accuracy,0.65504
eval/f1,0.65725
eval/loss,2.44169
eval/precision,0.68111
eval/recall,0.6539
eval/runtime,12.8664
eval/samples_per_second,20.052
eval/steps_per_second,2.565
train/epoch,15.0
train/global_step,1410.0


[34m[1mwandb[0m: Agent Starting Run: r902qrno with config:
[34m[1mwandb[0m: 	epochs: 18
[34m[1mwandb[0m: 	learning_rate: 6.705642682871215e-05
[34m[1mwandb[0m: 	train_batch_size: 4
[34m[1mwandb[0m: 	weight_decay: 0.2


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.1143,2.085491,0.127907,0.015988,0.125,0.028351
2,2.0972,2.089026,0.124031,0.015504,0.125,0.027586
3,2.0862,2.147961,0.124031,0.015504,0.125,0.027586
4,2.0654,2.093948,0.124031,0.015504,0.125,0.027586
5,2.1089,2.090335,0.124031,0.015504,0.125,0.027586
6,2.0874,2.109479,0.124031,0.015504,0.125,0.027586
7,1.9972,2.572953,0.124031,0.015504,0.125,0.027586
8,1.9843,2.254955,0.124031,0.015504,0.125,0.027586
9,1.9708,2.364423,0.124031,0.015504,0.125,0.027586
10,1.9597,2.344047,0.124031,0.015504,0.125,0.027586


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0,1
eval/accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁██
eval/f1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁██
eval/loss,▂▂▃▂▂▂█▄▅▅█▇▄▇▆▄▁▁
eval/precision,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁██
eval/recall,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁██
eval/runtime,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▁███████████████▇█
eval/steps_per_second,▁███████████████▇█
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇█████

0,1
eval/accuracy,0.18605
eval/f1,0.0766
eval/loss,1.98941
eval/precision,0.04846
eval/recall,0.18313
eval/runtime,12.5313
eval/samples_per_second,20.588
eval/steps_per_second,2.633
train/epoch,18.0
train/global_step,4644.0


[34m[1mwandb[0m: Agent Starting Run: t8xii04w with config:
[34m[1mwandb[0m: 	epochs: 14
[34m[1mwandb[0m: 	learning_rate: 8.385782089574071e-05
[34m[1mwandb[0m: 	train_batch_size: 14
[34m[1mwandb[0m: 	weight_decay: 0.1


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9022,1.609454,0.403101,0.490875,0.408323,0.371586
2,1.4025,1.363692,0.507752,0.563263,0.506867,0.467856
3,1.0116,1.315979,0.550388,0.572599,0.549574,0.538906
4,0.6547,1.321496,0.616279,0.637468,0.615752,0.615106
5,0.4684,1.49804,0.604651,0.623288,0.606212,0.603456
6,0.3409,1.735492,0.612403,0.642717,0.610454,0.60612
7,0.2427,1.886594,0.639535,0.662689,0.637716,0.632259
8,0.1172,2.344172,0.596899,0.631148,0.593613,0.590018
9,0.1362,2.151412,0.631783,0.651608,0.629741,0.625949
10,0.0843,2.114134,0.658915,0.677006,0.657715,0.655926


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▄▅▇▆▇▇▆▇█████
eval/f1,▁▃▅▇▇▇▇▆▇█████
eval/loss,▃▁▁▁▂▄▅█▇▆▇▇▇▇
eval/precision,▁▄▄▆▆▇▇▆▇█████
eval/recall,▁▄▅▇▆▇▇▆▇█████
eval/runtime,▂▁▅▁█▅▄▄▅▆▃▆▃▃
eval/samples_per_second,▇█▄█▁▄▅▅▄▃▆▃▆▆
eval/steps_per_second,▇█▄█▁▄▅▅▄▃▆▃▆▆
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███

0,1
eval/accuracy,0.66667
eval/f1,0.66138
eval/loss,2.23763
eval/precision,0.67824
eval/recall,0.66493
eval/runtime,12.4517
eval/samples_per_second,20.72
eval/steps_per_second,2.65
train/epoch,14.0
train/global_step,1036.0


[34m[1mwandb[0m: Agent Starting Run: 2cftuedj with config:
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	learning_rate: 6.846828291307634e-05
[34m[1mwandb[0m: 	train_batch_size: 11
[34m[1mwandb[0m: 	weight_decay: 0.4


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.9006,1.555283,0.476744,0.446366,0.479677,0.434194
2,1.378,1.24934,0.542636,0.579564,0.542342,0.525797
3,0.8978,1.194937,0.573643,0.628577,0.572227,0.567592
4,0.565,1.229265,0.616279,0.634383,0.614308,0.607909
5,0.3266,1.482028,0.616279,0.650185,0.613989,0.615566
6,0.201,1.998983,0.616279,0.646973,0.614995,0.61579
7,0.1337,2.259254,0.624031,0.648074,0.622535,0.620707
8,0.0968,2.266238,0.631783,0.64594,0.630599,0.629758
9,0.0582,2.215196,0.624031,0.638898,0.623336,0.623749
10,0.0409,2.192443,0.635659,0.645615,0.635469,0.635314


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▄▅▇▇▇▇▇▇▇██
eval/f1,▁▄▅▇▇▇▇▇▇███
eval/loss,▃▁▁▁▃▆██████
eval/precision,▁▅▇▇████▇███
eval/recall,▁▄▅▇▇▇▇▇▇▇██
eval/runtime,▁▂▃▃█▄▆▃▅▅▄▆
eval/samples_per_second,█▇▆▆▁▅▃▆▄▄▅▃
eval/steps_per_second,█▇▆▆▁▄▃▆▄▄▅▃
train/epoch,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▅▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▅▆▆▇▇▇▇███

0,1
eval/accuracy,0.64341
eval/f1,0.64134
eval/loss,2.22825
eval/precision,0.64736
eval/recall,0.64232
eval/runtime,12.5202
eval/samples_per_second,20.607
eval/steps_per_second,2.636
train/epoch,12.0
train/global_step,1128.0


Error in callback <function _WandbInit._pause_backend at 0x7f07c51c0e50> (for post_run_cell):


BrokenPipeError: ignored

### Final model


#### Training

In [None]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/ANLP/models/task2',
    report_to='wandb',  # Turn on Weights & Biases logging
    num_train_epochs=12,
    learning_rate=0.00004476,
    weight_decay=0.5,
    save_total_limit = 2,
    save_strategy = "epoch",
    load_best_model_at_end=True,
    logging_strategy='epoch',
    metric_for_best_model = "f1",
    per_device_train_batch_size = 4,
    evaluation_strategy="epoch"
)

#define training loop
trainer = Trainer(
    model= model_init(),
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    compute_metrics=compute_metrics,
)
# start training loop
trainer.train()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.868,1.835836,0.267442,0.285718,0.264171,0.172617
2,1.4778,1.395947,0.523256,0.55784,0.522632,0.512627
3,1.006,1.280571,0.593023,0.62406,0.595581,0.584493
4,0.6484,1.694787,0.635659,0.656766,0.631727,0.622359
5,0.4513,2.395609,0.573643,0.658674,0.572446,0.554919
6,0.2779,2.297819,0.651163,0.660231,0.649078,0.643141
7,0.1764,2.357485,0.651163,0.657512,0.649886,0.647499
8,0.1308,2.231292,0.658915,0.664804,0.657113,0.654345
9,0.0847,2.458382,0.658915,0.680627,0.657557,0.653944
10,0.0704,2.484221,0.643411,0.65119,0.641858,0.640328


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
labels_pred = trainer.predict(tokenized_test_dataset)

#### Loading pre-trained model

### Results

In [None]:
codes  = dict(enumerate(propaganda_train_df['label'].astype('category').cat.categories))

In [None]:
#Predictions to dataframe
results_df = pd.read_csv("/content/drive/MyDrive/ANLP/results_task2.csv")
results_df["preidctions_num_roberta"] = labels_pred[0].argmax(-1)
results_df["predictions_roberta"] = results_df["preidctions_num_roberta"].map(codes)
results_df["accuracy_roberta"] = np.where(results_df["label"] != results_df["predictions_roberta"], "Incorrect","Correct")

wrong_predictions = results_df[results_df["accuracy_roberta"] =="Incorrect"]

right_predictions = results_df[results_df["accuracy_roberta"] =="Correct"]



In [None]:
#Save file
results_df.to_csv("/content/drive/MyDrive/ANLP/results_task2.csv")

In [None]:
#Predictions to dataframe
results_df = pd.read_csv("/content/drive/MyDrive/ANLP/results_task2.csv",index_col=0)


In [None]:
print(sklearn.metrics.classification_report(results_df["predictions_roberta"], results_df["label"]))


                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.70      0.68      0.69        44
causal_oversimplification       0.69      0.67      0.68        36
                    doubt       0.65      0.76      0.70        37
exaggeration,minimisation       0.73      0.56      0.64        39
              flag_waving       0.80      0.73      0.77        49
          loaded_language       0.46      0.58      0.51        31
    name_calling,labeling       0.71      0.71      0.71        34
               repetition       0.53      0.54      0.53        39

                 accuracy                           0.66       309
                macro avg       0.66      0.65      0.65       309
             weighted avg       0.67      0.66      0.66       309



In [None]:
results_df[results_df["accuracy_roberta"]==0]

In [None]:
codes  = dict(enumerate(traindf['label'].astype('category').cat.categories))


#### Classes

In [None]:
prop_answers = results_df.groupby(["predictions_roberta","accuracy_roberta"]).count()
prop_answers = prop_answers.reset_index()


In [None]:
alt.Chart(prop_answers,
    title= "Recall "
    ).mark_bar().encode(
    alt.Y("predictions_roberta",
          title = "True label"
          ),
    alt.X("tagged_in_context",
          title="Percentage of sentences(%)",
          stack = "normalize",
          axis=alt.Axis(format='%')
          ),
    color =alt.Color("accuracy_roberta:O", title = "Predictions"))

In [None]:
prop_answers = results_df.groupby(["label","accuracy_roberta"])["tagged_in_context"].count()
prop_answers = prop_answers.reset_index(name = "count")

In [None]:
alt.Chart(prop_answers,
    title= "Precision"
    ).mark_bar().encode(
    alt.Y("label:O",
          title = "Prediction"
          ),
    alt.X("count",
          title="Percentage of predictions(%)",
          stack = "normalize",
          axis=alt.Axis(format='%')
          ),
    alt.Color("accuracy_roberta:O", title = "Prediction")

 )


#### Length

In [None]:
#Histogram
alt.Chart(
    results_df,
    title = "Percentage of predictions per span length"
    ).transform_joinaggregate(
    total='count(*)'
).transform_calculate(
    pct='1 / datum.total'
).mark_bar().encode(
    y=alt.Y("span_length:O",
            title = "Number of words in span",
            bin = alt.Bin(step = 10),
           ),
    x = alt.X("sum(pct):Q",axis=alt.Axis(format='%'),
              title = "Number of spans",
               stack = "normalize"),
    color = "accuracy_roberta:O"
)

In [None]:
alt.Chart(results_df).mark_boxplot(ticks = True, size = 10).encode(
    y= alt.Y('label:O', axis = None),
    x=alt.X('span_length:Q', title = "Number of words",),
    color = alt.Color("accuracy_roberta:N", title = "Prediction"),
    row = alt.Row("binary_label:N",align = "none",title = "", spacing = 0.2)
).configure_view(
    stroke="transparent"
).configure_scale(
    bandPaddingInner=0,
    bandPaddingOuter=0.1,
).configure_header(
    labelOrient = "left",
    labelAlign = "left",
    labelAngle = 0,
)

## Comparison




In [None]:
both_wrong = results_df[(results_df.accuracy_RF == "Incorrect") & (results_df.accuracy_roberta == "Incorrect")]
both_right = results_df[(results_df.accuracy_RF == "Correct") & (results_df.accuracy_roberta == "Correct")]
rob_right = results_df[(results_df.accuracy_RF == "Incorrect") & (results_df.accuracy_roberta == "Correct")]
rf_right = results_df[(results_df.accuracy_RF == "Correct") & (results_df.accuracy_roberta == "Incorrect")]

In [None]:
both_wrong.groupby("label")["tagged_in_context"].count()
both_wrong = both_wrong.groupby("label")["tagged_in_context"].count().reset_index(name = "count")
comparison_df =

Unnamed: 0_level_0,Unnamed: 0,tagged_in_context,binary_label,sentence,span,span_length,len,sentence_length,type,prob_prop_RF1,...,RF1_accuracy,prob_prop_RF2,predictions_RF2,accuracy_RF2,predictions_roB2,accuracy_roB2,predictions_num_roB2,predictions_num_roB1,predictions_roB1,accuracy_roB1
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
appeal_to_fear_prejudice,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
doubt,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
"exaggeration,minimisation",7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
flag_waving,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
loaded_language,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
"name_calling,labeling",9,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
not_propaganda,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
repetition,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4


In [None]:
both_right.groupby("label").count()

Unnamed: 0_level_0,Unnamed: 0,tagged_in_context,binary_label,sentence,span,span_length,len,sentence_length,type,prob_prop_RF1,...,RF1_accuracy,prob_prop_RF2,predictions_RF2,accuracy_RF2,predictions_roB2,accuracy_roB2,predictions_num_roB2,predictions_num_roB1,predictions_roB1,accuracy_roB1
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
not_propaganda,15,15,15,15,15,15,15,15,15,15,...,15,15,15,15,15,15,15,15,15,15


In [None]:
rob_right.groupby("label").count()

Unnamed: 0_level_0,Unnamed: 0,tagged_in_context,binary_label,sentence,span,span_length,len,sentence_length,type,prob_prop_RF1,...,RF1_accuracy,prob_prop_RF2,predictions_RF2,accuracy_RF2,predictions_roB2,accuracy_roB2,predictions_num_roB2,predictions_num_roB1,predictions_roB1,accuracy_roB1
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
doubt,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5
"exaggeration,minimisation",3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
flag_waving,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
loaded_language,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"name_calling,labeling",2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
not_propaganda,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
repetition,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5


In [None]:
rf_right

Unnamed: 0.1,Unnamed: 0,label,tagged_in_context,binary_label,sentence,span,span_length,len,sentence_length,type,...,RF1_accuracy,prob_prop_RF2,predictions_RF2,accuracy_RF2,predictions_roB2,accuracy_roB2,predictions_num_roB2,predictions_num_roB1,predictions_roB1,accuracy_roB1
17,17,not_propaganda,@drawandstrike theory looking plausiblehttps:/...,0,@drawandstrike theory looking plausiblehttps:/...,—,1,78,8,Test,...,0,0.0,not_propaganda,1,repetition,0,8,0,1,0
30,30,not_propaganda,"Rare but deadly, the viral disease is most com...",0,"Rare but deadly, the viral disease is most com...",humans.,1,84,13,Test,...,0,0.0,not_propaganda,1,appeal_to_fear_prejudice,0,0,0,1,0
132,132,not_propaganda,"At least, Israel has a more sympathetic ear in...",0,"At least, Israel has a more sympathetic ear in...",to U.S. interests,3,234,40,Test,...,0,0.0,not_propaganda,1,flag_waving,0,4,0,1,0
211,211,not_propaganda,And I don’t want to go <BOS> around insulting ...,0,And I don’t want to go around insulting people.,around insulting people.,3,49,9,Test,...,0,0.0,not_propaganda,1,doubt,0,2,0,1,0
229,229,not_propaganda,Hence it is a time when they’re supposed to gr...,0,Hence it is a time when they’re supposed to gr...,and kind toward their fellow Muslims.,6,102,18,Test,...,0,0.0,not_propaganda,1,appeal_to_fear_prejudice,0,0,0,1,0
268,268,not_propaganda,ICE arrests 20 in Kansas City during 4-day ope...,0,ICE arrests 20 in Kansas City during 4-day ope...,criminal aliens,2,80,12,Test,...,0,0.0,not_propaganda,1,doubt,0,2,0,1,0
281,281,not_propaganda,"I sold him 120 rounds of .556 tracer, again, s...",0,"I sold him 120 rounds of .556 tracer, again, s...","military,""",1,90,16,Test,...,0,0.0,not_propaganda,1,flag_waving,0,4,0,1,0
291,291,not_propaganda,To <BOS> achieve that and hold onto power the ...,0,To achieve that and hold onto power the prime ...,achieve that and hold onto power the prime min...,28,178,29,Test,...,0,0.0,not_propaganda,1,doubt,0,2,0,1,0
431,431,not_propaganda,The meeting covered <BOS> Carmels of <EOS> the...,0,"The meeting covered Carmels of the Teresian, ...",Carmels of,2,207,32,Test,...,0,0.0,not_propaganda,1,"name_calling,labeling",0,6,0,1,0
442,442,doubt,"As noted above, at this point literally every ...",1,"As noted above, at this point literally every ...",cannot be trusted and should be considered as ...,9,231,35,Test,...,0,0.0,doubt,1,not_propaganda,0,7,1,0,0
