<a href="https://colab.research.google.com/github/Nouran-Khallaf/Arabic-Readability-Corpus/blob/main/roberta_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## Creating Dataset

In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
import numpy as np
import csv
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch


In [None]:
class Dataset:
    """Plain container for a named corpus: train split, test split and label inventory.

    NOTE(review): this class reuses the name ``Dataset`` that the import cell also
    binds to ``torch.utils.data.Dataset``; after this cell runs, the bare name
    refers to this container.
    """

    def __init__(self, name, train, test, label_list):
        # The four values are stored as given; nothing is copied or validated.
        self.name, self.train = name, train
        self.test, self.label_list = test, label_list


In [None]:
!wget http://www.inf.ed.ac.uk/teaching/courses/tts/labs/lab7/tweetsclassification.zip
!unzip tweetsclassification.zip

--2022-06-17 14:03:31--  http://www.inf.ed.ac.uk/teaching/courses/tts/labs/lab7/tweetsclassification.zip
Resolving www.inf.ed.ac.uk (www.inf.ed.ac.uk)... 129.215.33.176
Connecting to www.inf.ed.ac.uk (www.inf.ed.ac.uk)|129.215.33.176|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://www.inf.ed.ac.uk/teaching/courses/tts/labs/lab7/tweetsclassification.zip [following]
--2022-06-17 14:03:31--  https://www.inf.ed.ac.uk/teaching/courses/tts/labs/lab7/tweetsclassification.zip
Connecting to www.inf.ed.ac.uk (www.inf.ed.ac.uk)|129.215.33.176|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 209770 (205K) [application/zip]
Saving to: ‘tweetsclassification.zip.1’


2022-06-17 14:03:32 (569 KB/s) - ‘tweetsclassification.zip.1’ saved [209770/209770]

Archive:  tweetsclassification.zip
replace Tweets.14cat.train? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Tweets.14cat.train      
replace Tweets.14cat.test? [y]es, [n]o

In [None]:
# Both splits are headerless, tab-separated files with three fields per row.
# QUOTE_NONE makes the parser read quote characters inside tweets literally.
train, test = (
    pd.read_csv(
        split_path,
        sep='\t',
        header=None,
        names=['id', 'tweet', 'category'],  # column names assigned at load time
        encoding='latin-1',
        quoting=csv.QUOTE_NONE,
    )
    for split_path in ('Tweets.14cat.train', 'Tweets.14cat.test')
)

In [None]:
# Column holding the tweet text and column holding its category label.
DATA_COLUMN, LABEL_COLUMN = 'tweet', 'category'

# Distinct categories found in the training split.
labels = list(train[LABEL_COLUMN].unique())
print(labels)

# Number of training tweets per category, to check class balance.
print(train[LABEL_COLUMN].value_counts())

# Bundle both splits and the label inventory under one name.
dataset = Dataset(name="example", train=train, test=test, label_list=labels)


['Pets & Animals', 'Comedy', 'Autos & Vehicles', 'Science & Technology', 'News & Politics', 'Gaming', 'Nonprofits & Activism', 'Music', 'Film & Animation', 'Education', 'Travel & Events', 'Entertainment', 'Sports', 'Howto & Style']
Gaming                   220
Autos & Vehicles         210
Howto & Style            207
Sports                   203
Travel & Events          196
Science & Technology     189
Film & Animation         178
Pets & Animals           177
News & Politics          168
Music                    160
Entertainment            159
Comedy                   153
Education                142
Nonprofits & Activism    141
Name: category, dtype: int64


In [None]:
train

Unnamed: 0,id,tweet,category
0,45029314109075046,Furniture for - so cute! gotta show my #grandd...,Pets & Animals
1,45033090867215155,"""#Sunday aww"""": Mr Peebles",Pets & Animals
2,45036625162627481,CATS ART http://t.co/cJre1jn2Bl #creative #fel...,Pets & Animals
3,45086603513077350,RT @Masala_chaai: Keep Calm & Hug your Dog ! #...,Pets & Animals
4,45138968053405286,RT @TheSoulfulEMU: RETWEET if you love your do...,Pets & Animals
...,...,...,...
2498,551069446257655808,Series of Car Window Breakages Under Investiga...,News & Politics
2499,551156371031199744,night Maltese club offer Evans contract: Conv...,News & Politics
2500,551166096598781952,All of today's news headlines in one place. ht...,News & Politics
2501,551617864805781504,ISBPL: Affordable housing scheme for EPFO subs...,News & Politics


In [None]:
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

In [None]:
# preprocessing
import re
import string
# Translation table that deletes every character in string.punctuation.
# Built once here instead of testing each character against the string on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess(sent):
    """Return ``sent`` with ASCII punctuation removed and all letters lower-cased.

    Only the characters in ``string.punctuation`` are deleted; whitespace is left
    untouched, so removing a symbol that stood between two spaces leaves both
    spaces behind. Because ``:``, ``/``, ``.``, ``@`` and ``#`` are deleted too,
    URLs, mentions and hashtags collapse into single alphanumeric tokens.

    Args:
        sent: The text to normalise; must be a ``str``.

    Returns:
        The normalised string.
    """
    return sent.translate(_PUNCTUATION_TABLE).lower()
# Normalise the text column of both splits, writing the result back onto the same frames.
for split in (dataset.train, dataset.test):
    split[DATA_COLUMN] = split[DATA_COLUMN].apply(preprocess)


In [None]:
dataset.train[DATA_COLUMN][:20]

0     furniture for  so cute gotta show my granddog ...
1                                 sunday aww mr peebles
2     cats art httptcocjre1jn2bl creative feline art...
3     rt masalachaai keep calm  hug your dog  petlov...
4     rt thesoulfulemu retweet if you love your dog ...
5     missing cat atlantic gardens httptcoe2mu2yiv6h...
6     rt doggystylin firsttime customers receive a 1...
7                                      rt petsweekly so
8     the first movie with a drakonia included in th...
9     bmw plans to invest up to us1 billion a108 bil...
10    so to oxygen for being a total pimp and bondin...
11                                                   us
12    your twitter conversations fall into one of th...
13    capture in hd capture everything order now htt...
14    hockenheim veterama 2014 parts teile marked ve...
15    african women lag men in activism httptcoaii9x...
16    sytner opens lamborghini dealership in leicest...
17    27off deal 1799 looking into you a tribute

In [None]:
class BERTDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns one text per item into fixed-length ``InputFeatures``.

    The base class is spelled ``torch.utils.data.Dataset`` in full because the bare
    name ``Dataset`` is rebound in this notebook to a plain train/test container
    class, which is not a PyTorch dataset.

    Args:
        text: Indexable collection of texts; each item is passed through ``str()``.
        target: Indexable collection of labels, aligned with ``text``.
        model_name: Checkpoint name handed to ``AutoTokenizer.from_pretrained``.
        max_len: Length every encoded example is truncated or padded to.
        label_map: Mapping from each label in ``target`` to its integer class id.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        super().__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of examples (the length of ``text``)."""
        return len(self.text)

    def __getitem__(self, item):
        """Encode example ``item`` and return it as an ``InputFeatures`` object.

        Raises:
            KeyError: If the example's label is missing from ``label_map``.
        """
        # Collapse every run of whitespace to a single space before tokenizing.
        text = " ".join(str(self.text[item]).split())

        input_ids = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation='longest_first',
        )

        # Right-pad with the tokenizer's pad id up to max_len; the mask is 1 over
        # real tokens and 0 over padding.
        real_length = len(input_ids)
        padding_length = self.max_len - real_length
        input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length
        attention_mask = [1] * real_length + [0] * padding_length

        return InputFeatures(
            input_ids=input_ids,
            attention_mask=attention_mask,
            label=self.label_map[self.target[item]],
        )

In [None]:
# some specs of our model
num_of_epochs=8
model_name = 'roberta-large'
task_name = 'classification'
max_len = 128 #128#256



In [None]:
# Give every category an integer id equal to its position in dataset.label_list.
label_map = dict((label, idx) for idx, label in enumerate(dataset.label_list))
print(label_map)

# Wrap each split's texts and labels in a BERTDataset (train first, then test).
train_dataset, test_dataset = (
    BERTDataset(
        split[DATA_COLUMN].to_list(),
        split[LABEL_COLUMN].to_list(),
        model_name,
        max_len,
        label_map,
    )
    for split in (dataset.train, dataset.test)
)

Could not locate the tokenizer configuration file, will try to use the model config instead.


{'Pets & Animals': 0, 'Comedy': 1, 'Autos & Vehicles': 2, 'Science & Technology': 3, 'News & Politics': 4, 'Gaming': 5, 'Nonprofits & Activism': 6, 'Music': 7, 'Film & Animation': 8, 'Education': 9, 'Travel & Events': 10, 'Entertainment': 11, 'Sports': 12, 'Howto & Style': 13}


loading configuration file https://huggingface.co/roberta-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file https://huggingface.co/roberta-large/resol

In [None]:
def model_init():
    """Build a fresh sequence-classification model from the ``model_name`` checkpoint.

    The classification head gets one output per entry in the notebook-level
    ``label_map``.
    """
    n_classes = len(label_map)
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=n_classes,
        return_dict=True,
    )

In [None]:
def compute_metrics(p):  # p should be of type EvalPrediction
  """Score predictions with macro-averaged F1, precision and recall, plus accuracy.

  Reads ``p.predictions`` (one row of per-class scores per example) and
  ``p.label_ids`` (the reference class ids). The predicted class is the argmax
  along axis 1.

  Returns:
      dict with keys 'macro_f1', 'macro_precision', 'macro_recall', 'accuracy'.
  """
  y_pred = np.argmax(p.predictions, axis=1)
  y_true = p.label_ids
  assert len(y_pred) == len(y_true)

  # The three macro-averaged scores share a call signature, so compute them in one pass.
  macro_scorers = (
      ('macro_f1', f1_score),
      ('macro_precision', precision_score),
      ('macro_recall', recall_score),
  )
  scores = {key: scorer(y_true, y_pred, average='macro') for key, scorer in macro_scorers}
  scores['accuracy'] = accuracy_score(y_true, y_pred)
  return scores

In [None]:

# Training arguments.
#
# Every option is passed to the TrainingArguments constructor so the library's own
# post-init handling sees it (for example, converting the "epoch" and "cosine"
# strings to its enums). Assigning attributes on an already-built instance skips
# that step.
warmup_ratio = 0.1  # fraction of the optimisation steps spent on learning-rate warmup

training_args = TrainingArguments(
    output_dir="./train",
    num_train_epochs=num_of_epochs,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,  # 2e-4
    adam_epsilon=1e-8,
    lr_scheduler_type="cosine",
    # The Trainer derives the integer warmup step count from this ratio itself.
    warmup_ratio=warmup_ratio,
    # Mixed precision needs CUDA; stay in full precision on the CPU fallback path.
    fp16=torch.cuda.is_available(),
    # Evaluate at the end of every epoch.
    # NOTE(review): the recorded run, which assigned the deprecated
    # EvaluationStrategy enum after construction, shows an empty per-epoch
    # metrics table -- confirm that table is populated after re-running.
    evaluation_strategy="epoch",
    # Never write checkpoints during training.
    save_strategy="no",
    seed=42,
    disable_tqdm=False,
    # logging_steps=200,
)

# NOTE(review): overrides a private field to force single-GPU training; kept so
# behaviour on multi-GPU machines is unchanged. Restricting CUDA_VISIBLE_DEVICES
# before the kernel starts would be the supported alternative.
training_args._n_gpu = 1

# Informational estimate of the optimiser steps per epoch and in total.
steps_per_epoch = len(dataset.train) // (
    training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
)
total_steps = steps_per_epoch * num_of_epochs
print(steps_per_epoch)
print(total_steps)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


156
1248


In [None]:
# The Trainer receives a model factory (model_init) rather than a ready-made model.
# NOTE(review): the test split doubles as the evaluation set here.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

loading configuration file https://huggingface.co/roberta-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LA

In [None]:
print(model_name)

roberta-large


In [None]:
%time
trainer.train()

loading configuration file https://huggingface.co/roberta-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LA

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 10 µs


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classi

Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1248, training_loss=0.6547261659915631, metrics={'train_runtime': 638.0086, 'train_samples_per_second': 31.385, 'train_steps_per_second': 1.956, 'total_flos': 4663806613522944.0, 'train_loss': 0.6547261659915631, 'epoch': 8.0})

In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 625
  Batch size = 8


{'epoch': 8.0,
 'eval_accuracy': 0.8,
 'eval_loss': 1.142515778541565,
 'eval_macro_f1': 0.7952967545814499,
 'eval_macro_precision': 0.7970369595477268,
 'eval_macro_recall': 0.7965738274745929,
 'eval_runtime': 4.9624,
 'eval_samples_per_second': 125.946,
 'eval_steps_per_second': 15.92}

In [None]:
result=trainer.predict(test_dataset)


***** Running Prediction *****
  Num examples = 625
  Batch size = 8


In [None]:
pred = np.argmax(result.predictions, axis=1)

In [None]:
mapper={y:x for x,y in label_map.items()}

In [None]:
predictions=[mapper[p]for p in pred]

In [None]:
print(classification_report(test_dataset.target,predictions))

                       precision    recall  f1-score   support

     Autos & Vehicles       0.98      0.88      0.93        51
               Comedy       0.75      0.79      0.77        38
            Education       0.72      0.83      0.77        41
        Entertainment       0.76      0.76      0.76        49
     Film & Animation       0.67      0.63      0.65        46
               Gaming       0.88      0.88      0.88        50
        Howto & Style       0.78      0.90      0.84        40
                Music       0.79      0.78      0.78        40
      News & Politics       0.74      0.68      0.70        37
Nonprofits & Activism       0.85      0.74      0.79        38
       Pets & Animals       0.84      0.96      0.90        45
 Science & Technology       0.71      0.70      0.71        43
               Sports       0.80      0.81      0.80        53
      Travel & Events       0.88      0.83      0.86        54

             accuracy                           0.80 



| Model | Accuracy | macro F1 |
| --- | ----------- | ----|
| Naive Bayes | 0.62 | 0.61|
| Decision Tree | 0.50 | 0.50 |
| Linear SVM | 0.69 | 0.68 |
| Non-linear SVM| 0.67 | 0.67 |
| BiLSTM| 0.72 | 0.71 |
|RoBERTa| 0.80 | 0.80 |
