In [10]:
#! pip install transformers datasets

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [12]:
df=pd.read_csv("politifact_fixed.csv")

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [14]:
df.head()

Unnamed: 0.1,Unnamed: 0,type,content_sentance
0,0,0,At a Glance Indicators MicroData Publications ...
1,1,0,CQ Username Password Stay signed in Forgot you...
2,2,0,Login | Contact Us | Site Map | Paid archives ...
3,3,0,Home Democratic Leaders Say House Democrats A...
4,4,0,OMB Home White House Website THE NATION’S FISC...


In [15]:
df=df.drop(columns='Unnamed: 0')

In [16]:
for i in df["content_sentance"]:
  if isinstance(i,int):
    print(i)

In [17]:
df=df.dropna()

In [18]:
df.head()

Unnamed: 0,type,content_sentance
0,0,At a Glance Indicators MicroData Publications ...
1,0,CQ Username Password Stay signed in Forgot you...
2,0,Login | Contact Us | Site Map | Paid archives ...
3,0,Home Democratic Leaders Say House Democrats A...
4,0,OMB Home White House Website THE NATION’S FISC...


In [19]:
from datasets import Dataset

In [20]:
!pip install dataset




[notice] A new release of pip available: 22.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
!pip install datasets




[notice] A new release of pip available: 22.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
df.columns = ['label', 'text']

In [23]:
df

Unnamed: 0,label,text
0,0,At a Glance Indicators MicroData Publications ...
1,0,CQ Username Password Stay signed in Forgot you...
2,0,Login | Contact Us | Site Map | Paid archives ...
3,0,Home Democratic Leaders Say House Democrats A...
4,0,OMB Home White House Website THE NATION’S FISC...
...,...,...
532,1,Facebook USPOLN U.S Political News HOME POLITI...
533,1,Daily USA Update Menu News Politics 2016 Elect...
534,1,GENIUS Sign Up Sign In | | | Facebook Twitter ...
535,1,Home Latest News Top Stories Trending News Gle...


In [24]:
dataset = Dataset.from_pandas(df,preserve_index=False)

In [25]:
train_test_split = dataset.train_test_split(test_size=0.6, seed=42)

In [26]:
test_unsupervised_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

In [27]:
from datasets import DatasetDict

In [28]:
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'test': test_unsupervised_split['train'],
    'unsupervised': test_unsupervised_split['test']
})

In [29]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 206
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 154
    })
    unsupervised: Dataset({
        features: ['label', 'text'],
        num_rows: 155
    })
})

In [30]:
def preprocess_function(examples):
    if isinstance(examples["text"], str):
        return tokenizer(examples["text"], truncation=True)
    else:
        return tokenizer("N/A", truncation=True)

In [31]:
df_tokenized = dataset_dict.map(preprocess_function)

Map: 100%|██████████| 206/206 [00:02<00:00, 86.39 examples/s] 
Map: 100%|██████████| 154/154 [00:01<00:00, 113.22 examples/s]
Map: 100%|██████████| 155/155 [00:00<00:00, 183.98 examples/s]


In [32]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [33]:
import numpy as np

In [34]:
!pip install evaluate




[notice] A new release of pip available: 22.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
import evaluate

accuracy = evaluate.load("accuracy")

In [36]:
def compute_metrics(eval_pred):

    predictions, labels = eval_pred

    predictions = np.argmax(predictions, axis=1)

    return accuracy.compute(predictions=predictions, references=labels)

In [37]:
from transformers import create_optimizer
import tensorflow as tf
tf.config.experimental_run_functions_eagerly(True)

batch_size = 16
num_epochs = 5
batches_per_epoch = len(df_tokenized["train"])
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)


Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


In [38]:
id2label = {0: "Real", 1: "Fake"}
label2id = {"Real": 0, "Fake": 1}

In [39]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)




To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification 

In [40]:
#!pip install datasets

In [41]:
tf_train_set = model.prepare_tf_dataset(
    df_tokenized["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    df_tokenized["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)



In [42]:
import tensorflow as tf

model.compile(optimizer=optimizer)

In [None]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3)

Epoch 1/3

 1/12 [=>............................] - ETA: 22:04 - loss: 0.6858