In [1]:
!pip install -U transformers --q
!pip install sentencepiece --q
!pip install tokenizers  --q

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import transformers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
# from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [4]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/urdu/AT Utf-8.csv")
df

Unnamed: 0,Title,Label
0,حالیہ ہفتوں میں، کئی ذرائع ابلاغ نے چین کی جان...,Real
1,یہ گرم ! کیا آپ بھی فلپائن میں گرمی کی شدت مح...,Real
2,حکام نے سابق الائنس فار ایڈوانسمنٹ آف پیپلز را...,Real
3,جناب کا یہ ردعمل ویڈیو دیکھیں۔ کلارو تیسرا۔ یہ...,Real
4,ان بہن بھائیوں کے لیے جو ہمیشہ بحث کرتے تھے خا...,Real
...,...,...
29998,بھاپ سے چلنے والی وہیل چیئرز کو روایتی طور پر ...,Fake
29999,امريکا کا طاقتور ترین فون لانچ کر دیا گیا ہے,Fake
30000,سوال و جواب کی ویب سائٹ کورا( یواو آراے) نے اع...,Fake
30001,ہواوے اور زیڈ ٹی ای جاسوسی اور سیکورٹی سکینڈلز...,Fake


In [5]:
df['Label'] = df['Label'].map({'Fake':0,'Real':1})

In [6]:
df = df.sample(frac=1).reset_index()
df = df[['Title','Label']]
df = df[:5000]
df.Label.value_counts()

0    2529
1    2471
Name: Label, dtype: int64

In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model_path = 'urduhack/roberta-urdu-small'
nli_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, num_labels=2)

Some weights of the model checkpoint at urduhack/roberta-urdu-small were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at urduhack/roberta-urdu-small and are new

In [8]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)
df_train.shape, df_val.shape, df_test.shape

((4500, 2), (250, 2), (250, 2))

In [9]:
def tokenize_df(tokenizer, data, has_label=True):
        #convert to list
        fake_list = df['Title'].tolist()
        #returns dictionary with keys: input_ids, attention_mask)
        encoding_dict = tokenizer(fake_list, padding="max_length", max_length=64,
                                  truncation=True, return_token_type_ids=False)
        #add 'label' key if we are making train or validation data
        if has_label: 
            encoding_dict['label'] = df['Label'].tolist()
        #convert dictionary of lists into list of dictionaries
        return [dict(zip(encoding_dict, t)) for t in zip(*encoding_dict.values())]

#apply above function to create lists to use for our data:
X_train_tokenized = tokenize_df(tokenizer, df_train)
X_val_tokenized = tokenize_df(tokenizer, df_val)
X_test_tokenized = tokenize_df(tokenizer, df_test, has_label=False)
train_dataset = X_train_tokenized
eval_dataset = X_val_tokenized
test_dataset = X_test_tokenized

In [10]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
nli_model.to(device)
print(f'Using {device}')

Using cuda


In [11]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall}

training_args = TrainingArguments(
                output_dir="/content/drive/MyDrive/urdu",
                learning_rate=1e-4,
                num_train_epochs=2, 
                report_to="none",
                evaluation_strategy="steps", 
                eval_steps=600,
                save_steps=600,
                logging_steps=600,
                load_best_model_at_end=True
                )
trainer = Trainer(
    model=nli_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics)
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=4)])
trainer.train()

***** Running training *****
  Num examples = 5000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1250


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
600,0.6812,0.597344,0.7342,0.683797,0.829677,0.581546
1200,0.5846,0.551551,0.7182,0.625764,0.910355,0.47673


***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/urdu/checkpoint-600
Configuration saved in /content/drive/MyDrive/urdu/checkpoint-600/config.json
Model weights saved in /content/drive/MyDrive/urdu/checkpoint-600/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/urdu/checkpoint-1200
Configuration saved in /content/drive/MyDrive/urdu/checkpoint-1200/config.json
Model weights saved in /content/drive/MyDrive/urdu/checkpoint-1200/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /content/drive/MyDrive/urdu/checkpoint-1200 (score: 0.5515508651733398).


TrainOutput(global_step=1250, training_loss=0.6303945388793946, metrics={'train_runtime': 217.0119, 'train_samples_per_second': 46.08, 'train_steps_per_second': 5.76, 'total_flos': 328888819200000.0, 'train_loss': 0.6303945388793946, 'epoch': 2.0})

In [12]:
metrics=trainer.evaluate()
print(metrics)

***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8


{'eval_loss': 0.5515508651733398, 'eval_accuracy': 0.7182, 'eval_f1': 0.6257636122177954, 'eval_precision': 0.910355486862442, 'eval_recall': 0.47673006879805746, 'eval_runtime': 20.1194, 'eval_samples_per_second': 248.517, 'eval_steps_per_second': 31.065, 'epoch': 2.0}


In [13]:
predictions = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 5000
  Batch size = 8


In [14]:
y_pred = predictions.predictions
y_pred = [np.argmax(pred) for pred in y_pred]
y_pred[:20]

[0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1]

In [15]:
import numpy as np

In [16]:
df.head(10)

Unnamed: 0,Title,Label
0,ایسوسی ایٹڈ پریس کے سالانہ اجلاس میں اوباما کے...,1
1,"ٹم کین کا کہنا ہے کہ جارج ایلن کا فلیٹ ٹیکس ""م...",0
2,دمشق کی جامع مسجدکے امام کو دہشت گرد دھماکے می...,1
3,کیا ٹرمپ کا ٹیکس پلان کم آمدنی والوں کے لیے کچ...,0
4,دی پرفیکشنسٹ ریلیز کی تاریخ، کاسٹ، پلاٹ، ٹریل...,1
5,ٹرمپ کا کہنا ہے کہ بریگزٹ ایک عظیم چیز ہے، برط...,1
6,بیتھنی فرینکلکے سابق شوہر جیسن ہوپی کی کل مالی...,1
7,جارجاو ر امل کلونی کی غیر متوقع محبت کی کہانی,1
8,شکیرااو ر مالوما کا 'ٹریپ' میوزک ویڈ یوآخر کار...,1
9,امریکی سینیٹ نے جےف سیشنز کو اٹارنی جنرل کی حی...,1


In [17]:
df.Title[8]

"شکیرااو ر مالوما کا 'ٹریپ' میوزک ویڈ یوآخر کار یہاں آ گیا ہے۔"

In [18]:
df.Title[7]

'جارجاو ر امل کلونی کی غیر متوقع محبت کی کہانی'

In [19]:
df.Title[3]

'کیا ٹرمپ کا ٹیکس پلان کم آمدنی والوں کے لیے کچھ کرتا ہے؟'

In [26]:
df.Title[1]

'ٹم کین کا کہنا ہے کہ جارج ایلن کا فلیٹ ٹیکس "منصوبہ" ریک پیری کی تجویز سے زیادہ آمدنی کو کم کرتا ہے۔'

In [20]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


In [27]:
X_test = ['کیا ٹرمپ کا ٹیکس پلان کم آمدنی والوں کے لیے کچھ کرتا ہے؟','جارجاو ر امل کلونی کی غیر متوقع محبت کی کہانی','ٹم کین کا کہنا ہے کہ جارج ایلن کا فلیٹ ٹیکس "منصوبہ" ریک پیری کی تجویز سے زیادہ آمدنی کو کم کرتا ہے۔']
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Load trained model
model_path = "/content/drive/MyDrive/urdu/checkpoint-1200"
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

# Define test trainer
test_trainer = Trainer(model)

# Make prediction
raw_pred, _, _ = test_trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

loading configuration file /content/drive/MyDrive/urdu/checkpoint-1200/config.json
Model config RobertaConfig {
  "_name_or_path": "/content/drive/MyDrive/urdu/checkpoint-1200",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading weights file /content/drive/MyDrive/urdu/checkpoint-1200/pytorch_model.bin
All m

In [28]:
y_pred

array([0, 1, 0])