In [1]:
import pandas as pd
from custom_utils import load_and_concatenate_parquet_files

# Location of the preprocessed training data, relative to the notebook.
TRAINING_DATA_PATH = 'data/preprocessed_big_training_df'

# load_and_concatenate_parquet_files is a project helper from custom_utils;
# judging by its name it merges the parquet files found under this path into
# a single frame — confirm against custom_utils.
df = load_and_concatenate_parquet_files(TRAINING_DATA_PATH)

# Rich rendering of the loaded frame as the cell output.
display(df)

Unnamed: 0,preprocessed_text,label
0,donald trump respond mockery fake swedish atta...,1
1,tweetwavethis time true pantstweetwave anthony...,1
2,rubio prospect trump president worrisome reute...,0
3,trump lifts cyber command status boost cyber d...,0
4,big republican lie economy tear apart minute v...,1
...,...,...
63116,half briton want stay eu polledinburgh reuters...,0
63117,bill hillary clinton inc sale right pricein sp...,1
63118,orlando gunman shoot time autopsy find new yor...,0
63119,lethal gap supreme court handle death penalty ...,0


In [2]:
# Later cells read the text column as 'text' (see tokenize), so rename it once here.
df = df.rename(columns={'preprocessed_text': 'text'})

# Human-readable class name: label 1 -> "real", every other value -> "fake".
# NOTE(review): in the preview above the rows that mention Reuters carry
# label 0 and therefore end up named "fake" — confirm against the dataset
# documentation that these names are not inverted.
is_label_one = df["label"].eq(1)
df["label_names"] = is_label_one.map({True: "real", False: "fake"})

display(df)

Unnamed: 0,text,label,label_names
0,donald trump respond mockery fake swedish atta...,1,real
1,tweetwavethis time true pantstweetwave anthony...,1,real
2,rubio prospect trump president worrisome reute...,0,fake
3,trump lifts cyber command status boost cyber d...,0,fake
4,big republican lie economy tear apart minute v...,1,real
...,...,...,...
63116,half briton want stay eu polledinburgh reuters...,0,fake
63117,bill hillary clinton inc sale right pricein sp...,1,real
63118,orlando gunman shoot time autopsy find new yor...,0,fake
63119,lethal gap supreme court handle death penalty ...,0,fake


In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, AutoTokenizer, AutoModelForSequenceClassification
# Checkpoint name shared by the tokenizer here and by the config/model cell further down.
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_ckpt)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the members of each split are identical on every run;
# without it every kernel restart trains and evaluates on different rows.
SPLIT_SEED = 42

# Stage 1: hold out 30% of the rows, stratified on the class label.
train, holdout = train_test_split(
    df,
    test_size=0.3,
    stratify=df['label'],
    random_state=SPLIT_SEED,
)

# Stage 2: one third of the holdout becomes the validation split and the
# remaining two thirds become the test split (again stratified on the label).
test, validation = train_test_split(
    holdout,
    test_size=1/3,
    stratify=holdout['label'],
    random_state=SPLIT_SEED,
)

train.shape, test.shape, validation.shape

((44184, 3), (12624, 3), (6313, 3))

In [5]:
from datasets import Dataset, DatasetDict
# Wrap each pandas split in a Hugging Face Dataset. preserve_index=False
# keeps the pandas index out of the resulting features.
split_frames = {'train': train, 'test': test, 'validation': validation}
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in split_frames.items()
})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_names'],
        num_rows: 44184
    })
    test: Dataset({
        features: ['text', 'label', 'label_names'],
        num_rows: 12624
    })
    validation: Dataset({
        features: ['text', 'label', 'label_names'],
        num_rows: 6313
    })
})

In [6]:
def tokenize(batch):
    """Tokenize the 'text' field of a batch with the notebook-level tokenizer.

    Sequences are truncated but deliberately NOT padded here. This function is
    mapped over each split as a single batch (batch_size=None), so padding at
    this stage would stretch every example to the longest sequence of the whole
    split. Padding is instead left to the DataCollatorWithPadding handed to the
    Trainer, which pads per training/eval batch.

    Args:
        batch: mapping with a 'text' entry holding the raw strings to encode.

    Returns:
        The tokenizer's encoding for the batch (unpadded, truncated).
    """
    return tokenizer(batch['text'], truncation=True)

In [7]:
# Apply tokenize to every split; batch_size=None hands each split to the
# function as one single batch.
encoded_dataset = dataset.map(
    tokenize,
    batched=True,
    batch_size=None,
)

Map: 100%|██████████| 44184/44184 [00:19<00:00, 2318.62 examples/s]
Map: 100%|██████████| 12624/12624 [00:03<00:00, 3929.69 examples/s]
Map: 100%|██████████| 6313/6313 [00:01<00:00, 4161.01 examples/s]


In [8]:
# Build the name -> id mapping from the training split's two label columns
# (read column-wise rather than row by row), then invert it for id -> name.
train_split = dataset['train']
label2id = dict(zip(train_split['label_names'], train_split['label']))
id2label = {label_id: name for name, label_id in label2id.items()}

label2id, id2label

({'real': 1, 'fake': 0}, {1: 'real', 0: 'fake'})

In [9]:
from peft import PeftModel,PeftConfig,get_peft_model,LoraConfig
import torch
from transformers import AutoConfig, AutoModelForSequenceClassification

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pass the label maps into the model config, then load the checkpoint with
# that config and move it to the selected device.
config = AutoConfig.from_pretrained(model_ckpt, label2id=label2id, id2label=id2label)
base_model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
base_model = base_model.to(device)

# LoRA adapter settings for sequence classification.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=32,                      # rank of the low-rank decomposition
    lora_alpha=64,             # scaling factor
    lora_dropout=0.05,
    target_modules=["query"],  # adapters are targeted at modules named "query"
)

# Wrap the base model so that training goes through the LoRA adapters.
model = get_peft_model(base_model, peft_config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import Trainer, TrainingArguments
# The same per-device batch size is used for training and evaluation.
batch_size = 16

# Directory the Trainer writes into; existing contents may be overwritten
# because overwrite_output_dir is enabled below.
training_dir = "bert_base_train_dir"

# NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in newer
# transformers releases — confirm which spelling the installed version expects.
training_args = TrainingArguments(
    output_dir=training_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',  # evaluate once per epoch
    disable_tqdm=False,           # keep progress bars on
)



In [12]:
from sklearn.metrics import accuracy_score
import evaluate
# `evaluate` accuracy metric object. It is loaded here, but compute_metrics
# below computes accuracy with sklearn's accuracy_score instead.
accuracy = evaluate.load("accuracy")


def compute_metrics(pred):
    """Return the accuracy of a prediction object as ``{"accuracy": value}``.

    Args:
        pred: object exposing ``predictions`` (per-class scores; the argmax
            over the last axis is taken as the predicted class id) and
            ``label_ids`` (the reference class ids).

    Returns:
        A dict with the single key "accuracy".
    """
    predicted_ids = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_ids)}

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 20.3MB/s]


In [13]:
from transformers import DataCollatorWithPadding
# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Train on the 'train' split and evaluate on 'validation'; the 'test' split
# is kept for the prediction cells further down.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(model=model,


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Run the trained model over the held-out test split.
test_split = encoded_dataset['test']
preds_output = trainer.predict(test_split)

In [None]:
# Display the metrics attached to the test-split prediction output.
preds_output.metrics

In [None]:
import numpy as np
# Predicted class id for each test row = index of the highest score.
y_pred = np.argmax(preds_output.predictions, axis=1)

# Reference ids are taken from the prediction output itself, so they are
# aligned with `y_pred` by construction. The encoded dataset stores them in
# a column named 'label' (there is no 'labels' column), so indexing the
# dataset with 'labels' raises KeyError.
y_true = preds_output.label_ids

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report,confusion_matrix
# classification_report pairs target_names with the class ids in sorted
# order. Iterating label2id follows dict insertion order instead (here
# 'real' before 'fake'), which would attach each name to the wrong class.
# Build the names explicitly from id2label in sorted-id order.
report_ids = sorted(id2label)
print(classification_report(
    y_true,
    y_pred,
    labels=report_ids,
    target_names=[id2label[class_id] for class_id in report_ids],
))