In [1]:
import numpy as np
import pandas as pd
import evaluate
import torch
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, pipeline, AutoModel, AutoModelForSequenceClassification, AutoTokenizer, pipeline
from sklearn.model_selection import train_test_split
import joblib

In [2]:
df = pd.read_csv("WELFake_Dataset.csv").drop(columns=["Unnamed: 0"], axis=1)
df['text'] = df['title'].astype(str) + " " + df['text']
df = df.drop(columns=["title"], axis=1)
df

Unnamed: 0,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1
1,nan Did they post their votes for Hillary alre...,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",0
4,SATAN 2: Russia unvelis an image of its terrif...,1
...,...,...
72129,Russians steal research on Trump in hack of U....,0
72130,WATCH: Giuliani Demands That Democrats Apolog...,1
72131,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,Trump tussle gives unpopular Mexican leader mu...,0


In [3]:
df["text"] = df["text"].map(str)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

In [5]:
def preprocess_function(examples):
    return tokenizer(examples, truncation=True, padding=True)

In [6]:
X = df["text"].map(preprocess_function)
y = df["label"]

In [7]:
train_encodings, test_encodings, train_labels, test_labels = train_test_split(X, y, test_size=0.25)

In [8]:
def Text_dict(X):
    X_ = {"input_ids" : [], "attention_mask" : []}
    for i in X:
        X_["input_ids"].append(i["input_ids"])
        X_["attention_mask"].append(i["attention_mask"])

    return X_

In [9]:
def Labels_list(y):
    y_ = []
    for i in y:
        y_.append(i)
    return y_

In [10]:
train_encodings_ = Text_dict(train_encodings)
train_encodings = train_encodings_
test_encodings_ = Text_dict(test_encodings)
test_encodings = test_encodings_

In [11]:
train_labels_ = Labels_list(train_labels)
train_labels = train_labels_
test_labels_ = Labels_list(test_labels)
test_labels = test_labels_

In [12]:
class Fraud_EMail_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.encodings["input_ids"][idx])
        target_ids = torch.tensor(self.labels[idx])
        return {"input_ids": input_ids, "labels": target_ids}

    def __len__(self):
        return len(self.labels)

train = Fraud_EMail_Dataset(encodings=train_encodings, labels=train_labels)
test = Fraud_EMail_Dataset(encodings=test_encodings, labels=test_labels)

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
accuracy = evaluate.load("accuracy")

In [15]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [16]:
id2label = {0: "RELIABLE", 1: "UNRELIABLE"}
label2id = {"RELIABLE": 0, "UNRELIABLE": 1}

In [17]:
model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [19]:
training_args = TrainingArguments(
    output_dir="fake_news_detector",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/54100 [00:00<?, ?it/s]

You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.2609, 'learning_rate': 1.9815157116451017e-05, 'epoch': 0.01}
{'loss': 0.1705, 'learning_rate': 1.9630314232902035e-05, 'epoch': 0.02}
{'loss': 0.1093, 'learning_rate': 1.944547134935305e-05, 'epoch': 0.03}
{'loss': 0.1125, 'learning_rate': 1.9260628465804068e-05, 'epoch': 0.04}
{'loss': 0.0698, 'learning_rate': 1.9075785582255083e-05, 'epoch': 0.05}
{'loss': 0.0544, 'learning_rate': 1.88909426987061e-05, 'epoch': 0.06}
{'loss': 0.0644, 'learning_rate': 1.8706099815157116e-05, 'epoch': 0.06}
{'loss': 0.0926, 'learning_rate': 1.8521256931608135e-05, 'epoch': 0.07}
{'loss': 0.0465, 'learning_rate': 1.833641404805915e-05, 'epoch': 0.08}
{'loss': 0.0441, 'learning_rate': 1.8151571164510168e-05, 'epoch': 0.09}
{'loss': 0.1001, 'learning_rate': 1.7966728280961186e-05, 'epoch': 0.1}
{'loss': 0.1226, 'learning_rate': 1.77818853974122e-05, 'epoch': 0.11}
{'loss': 0.0381, 'learning_rate': 1.759704251386322e-05, 'epoch': 0.12}
{'loss': 0.0857, 'learning_rate': 1.7412199630314234e-05, '

In [None]:
trainer.save_model()

In [4]:
from transformers import AutoModel

In [5]:
model = AutoModel.from_pretrained("./fake_news_detector/")

In [6]:
model.push_to_hub("tush9905/fake_news_detector")

pytorch_model.bin:   0%|          | 0.00/554M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tush9905/fake_news_detector/commit/aab2e3b4024474951f368bf8958cd8020c89e5cc', commit_message='Upload model', commit_description='', oid='aab2e3b4024474951f368bf8958cd8020c89e5cc', pr_url=None, pr_revision=None, pr_num=None)