# Main Library

In [1]:
# Install the third-party packages this notebook imports below.
# NOTE(review): versions are unpinned, so a fresh environment may resolve
# different releases than the recorded run; consider pinning, and consider the
# `%pip` magic so the install targets the running kernel's environment.
!pip install transformers torch datasets



In [2]:
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

2024-06-30 19:01:54.295899: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-30 19:01:54.295991: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-30 19:01:54.462517: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Checkpoint identifier shared by the tokenizer and the classification model
# loaded later in the notebook.
model_name = 'distilbert-base-uncased'

# Reading Data

In [4]:
# Load the WELFake CSV from the Kaggle input mount into a DataFrame.
# NOTE(review): the path is absolute and Kaggle-specific; running elsewhere
# requires pointing it at a local copy of the file.
data = pd.read_csv(r'/kaggle/input/fake-news-classification/WELFake_Dataset.csv')
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [5]:
# Count missing values per column (isna is the canonical spelling of isnull).
data.isna().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [6]:
# Replace missing titles and bodies with empty strings so the later string
# operations never encounter NaN.
data[['title', 'text']] = data[['title', 'text']].fillna('')

# Data Preprocessing

In [7]:
# Build the model input by joining each title and its body with a single space.
# Note: this overwrites 'text', so re-running the cell prepends the title again.
data['text'] = data['title'].str.cat(data['text'], sep=' ')
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...","Bobby Jindal, raised Hindu, uses story of Chri...",0
4,4,SATAN 2: Russia unvelis an image of its terrif...,SATAN 2: Russia unvelis an image of its terrif...,1


In [8]:
# Keep only the model input ('text') and the target ('label'); `columns=`
# already identifies the axis, so no separate axis argument is needed.
unused_columns = ['Unnamed: 0', 'title']
data = data.drop(columns=unused_columns)
data.head()

Unnamed: 0,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,1
1,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",0
4,SATAN 2: Russia unvelis an image of its terrif...,1


In [9]:
# Lower-case the text with the vectorized string accessor instead of a
# per-row Python lambda.
# NOTE(review): the checkpoint used in this notebook is an "uncased" one, whose
# tokenizer presumably lower-cases on its own — confirm whether this step is
# still needed.
data['text'] = data['text'].str.lower()
data.head()

Unnamed: 0,text,label
0,law enforcement on high alert following threat...,1
1,did they post their votes for hillary already?,1
2,unbelievable! obama’s attorney general says mo...,1
3,"bobby jindal, raised hindu, uses story of chri...",0
4,satan 2: russia unvelis an image of its terrif...,1


# Splitting Data Into Training & Testing

In [10]:
# Convert the cleaned DataFrame into a Hugging Face `Dataset` so it can be
# split and tokenized with the `datasets` API.
dataset = Dataset.from_pandas(data)
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 72134
})

In [11]:
# Hold out 20% of the rows for testing. A fixed seed makes the shuffle (and
# therefore every downstream metric) reproducible across kernel restarts.
# Note: `dataset` is rebound from a Dataset to a DatasetDict with 'train' and
# 'test' splits, so this cell cannot be re-run without re-running the one above.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 57707
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 14427
    })
})

# Make Transformer Arguments

In [12]:
# Load the tokenizer that matches the chosen checkpoint (fetched by name via
# `from_pretrained`).
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [13]:
def tokenize_function(examples):
    """Tokenize the ``'text'`` entries of a batch of examples.

    Args:
        examples: Batch mapping that exposes the raw strings under ``'text'``.

    Returns:
        The tokenizer output for the batch, using the "max_length" padding
        strategy with truncation enabled.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Run the tokenizer over the dataset in batches rather than row by row.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/57707 [00:00<?, ? examples/s]

Map:   0%|          | 0/14427 [00:00<?, ? examples/s]

# Building Model

In [14]:
# Load the checkpoint with a two-class sequence-classification head
# (num_labels=2 matches the binary 'label' column).
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Hyper-parameters and output locations for the fine-tuning run.
# NOTE(review): the recorded run stopped at an interactive wandb API-key prompt
# when training started; if unattended "Run All" matters, consider disabling
# that reporting integration here — confirm with the notebook owner.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints are written
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1000               # log the training loss every 1000 steps
)

# Compute Metrics

In [16]:
def compute_metrics(pred):
    """Compute classification accuracy from evaluation outputs.

    Args:
        pred: A two-item iterable ``(predictions, labels)``. ``predictions``
            must support ``argmax(axis=1)`` (one row of class scores per
            example) and ``labels`` holds the true class ids.

    Returns:
        dict: ``{'accuracy': <fraction of examples predicted correctly>}``.
    """
    # Unpack the function's own argument; the previous body referenced an
    # undefined name and raised NameError on the first evaluation.
    scores, labels = pred
    predicted_classes = scores.argmax(axis=1)
    # Cast to a plain float so the metric is a built-in Python number.
    return {'accuracy': float((predicted_classes == labels).mean())}

In [17]:
# Initialize the Trainer: it ties together the model, the training arguments,
# the tokenized train/test splits, and the metric callback used at evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments configured above.
# NOTE(review): in the recorded run this cell paused for a wandb API key.
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
10,0.6935
20,0.6947
30,0.6943
40,0.6791
50,0.6746
60,0.6544
70,0.6074
80,0.5264
90,0.4674
100,0.4061




# Make Prediction

In [None]:
# Evaluate on the held-out split; the returned metrics include the accuracy
# produced by `compute_metrics`. (Fixes the misspelled method name, which
# raised AttributeError.)
trainer.evaluate(tokenized_datasets['test'])

In [None]:
# Run inference on the test split and keep the full prediction output.
pred = trainer.predict(tokenized_datasets['test'])
pred

In [None]:
# Predicted class id for every test example. The second element of the predict
# output (index 1) is the ground-truth labels, not the predictions, so take the
# argmax over the class scores in `.predictions` instead.
prediction = trainer.predict(tokenized_datasets['test']).predictions.argmax(axis=1)
prediction

# Save Model

In [None]:
# Persist the fine-tuned model to a local directory.
# NOTE(review): the directory name says "spam-ham" while this notebook trains a
# fake-news classifier — confirm the intended name before anything depends on it.
trainer.save_model('spam-ham_model')