In [None]:
pip install transformers datasets torch pandas

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
# Load the two pre-processed samples and tag every row with its class
# (0 = fake, 1 = true) before stacking them into a single frame.
fake_df = pd.read_csv('/content/processed_fake_sample.csv')
true_df = pd.read_csv('/content/processed_true_sample.csv')
fake_df['label'] = 0
true_df['label'] = 1
data = pd.concat([fake_df[['text', 'label']], true_df[['text', 'label']]])
# Seed the split so a fresh kernel reproduces the same partition, and stratify on
# the label so both partitions keep the same fake/true ratio.
train_data, val_data = train_test_split(
    data, test_size=0.2, random_state=42, stratify=data['label']
)
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

In [None]:
# Preview the combined fake/true frame (text plus 0/1 label).
data

Unnamed: 0,text,label
0,the self proclaimed feminist apparently has ...,0
1,amateur president donald trump loves trumpcare...,0
2,former house speaker slams obama administratio...,0
3,this story is a perfect example of how muslim ...,0
4,hillary was seen passing up a bottle of water ...,0
...,...,...
14995,"quetta, pakistan/islamabad (reuters) - two sui...",1
14996,(this version of the story was corrected to c...,1
14997,abidjan (reuters) - four moldovan nationals we...,1
14998,washington (reuters) - u.s. house republicans ...,1


In [None]:
# Preview the training partition produced by train_test_split.
train_data

Unnamed: 0,text,label
11473,washington (reuters) - the trump administratio...,1
12643,washington/new york (reuters) - jpmorgan chase...,1
5021,geneva (reuters) - the u.n. s freedom of speec...,1
4107,the polls are not looking good for donald trum...,0
10786,the word nothing-burger has been tossed arou...,0
...,...,...
4301,a new report from realclearpolitics is detaili...,0
2942,there were a lot of arguments online after the...,0
1383,,0
9623,more and more dirt on these two grifters who r...,0


In [None]:
# Preview the validation partition produced by train_test_split.
val_data

Unnamed: 0,text,label
1720,london (reuters) - british prime minister ther...,1
5360,this story is for anyone who thinks king obama...,0
9864,washington (reuters) - the u.s. senate’s numbe...,1
10788,there will be no peace in america until white...,0
9077,london (reuters) - prime minister theresa may ...,1
...,...,...
10183,donald trump has just been screwed over by mem...,0
12434,nairobi (reuters) - kenya s main opposition le...,1
10882,time to exhale james barack obama supporter ja...,0
9060,donald trump has absolutely ravaged the reputa...,0


In [None]:
# Row counts of the two source frames before any down-sampling.
fake_rows, true_rows = len(fake_df), len(true_df)
print("Original length of fake dataset:", fake_rows)
print("Original length of true dataset:", true_rows)

Original length of fake dataset: 15000
Original length of true dataset: 15000


In [None]:
# Keep only the leading fraction of each class to shrink the working set.
reduction_factor = 0.5
new_fake_len = int(reduction_factor * len(fake_df))
new_true_len = int(reduction_factor * len(true_df))
reduced_fake_df = fake_df.iloc[:new_fake_len]
reduced_true_df = true_df.iloc[:new_true_len]
print("Reduced length of fake dataset:", reduced_fake_df.shape[0])
print("Reduced length of true dataset:", reduced_true_df.shape[0])

Reduced length of fake dataset: 7500
Reduced length of true dataset: 7500


In [None]:
# Stack the two reduced frames and renumber the rows 0..N-1.
reduced_data = pd.concat([reduced_fake_df, reduced_true_df], ignore_index=True)
print("Combined dataset length:", reduced_data.shape[0])

Combined dataset length: 15000


Transformer

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd


In [None]:
# The article text is not numeric, so pd.to_numeric(errors='coerce') maps it to NaN
# and fillna(0) then turns it into 0: the models below were fitted on inputs that
# carry no information about the text. Build TF-IDF features from `text` instead.
from sklearn.feature_extraction.text import TfidfVectorizer

labelled_text = reduced_data[['text', 'label']].dropna(subset=['text'])
X = labelled_text['text'].astype(str)
y = labelled_text['label'].to_numpy()
# Split the raw text first so the vocabulary and IDF weights are learned from the
# training partition only; stratify to keep the class ratio in both partitions.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# max_features caps the width of the dense matrices handed to torch.
vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
X_train = torch.tensor(vectorizer.fit_transform(X_train_text).toarray(), dtype=torch.float32)
X_test = torch.tensor(vectorizer.transform(X_test_text).toarray(), dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a (batch, sequence, d_model) tensor.

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine columns, so
        # trim the frequencies to fit instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Registered as a buffer of shape (1, max_len, d_model): it is not a trainable
        # parameter.
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Raises:
            ValueError: If the sequence dimension of ``x`` exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds the precomputed max_len {self.pe.size(1)}"
            )
        x = x + self.pe[:, :seq_len, :]
        return x

class SimpleTransformer(nn.Module):
    """Classify fixed-length feature vectors with a two-layer Transformer encoder.

    Args:
        input_dim: Number of features per sample.
        num_classes: Number of output classes (size of the logit vector).
    """

    def __init__(self, input_dim, num_classes):
        super(SimpleTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, 64)
        self.positional_encoding = PositionalEncoding(64)
        # forward() feeds the encoder a (batch, 1, 64) tensor. Without batch_first=True
        # the encoder reads dim 0 as the sequence axis, so the samples of a batch
        # would attend to one another.
        self.transformer_layer = nn.TransformerEncoderLayer(d_model=64, nhead=4, batch_first=True)
        self.transformer = nn.TransformerEncoder(self.transformer_layer, num_layers=2)
        self.fc = nn.Linear(64, num_classes)

    def forward(self, x):
        """Return class logits for ``x``; assumes ``x`` is (batch, input_dim)."""
        x = self.embedding(x)
        # Each sample becomes a one-token sequence: (batch, 1, 64).
        x = self.positional_encoding(x.unsqueeze(1))
        x = self.transformer(x)
        # Average over the sequence axis to get one vector per sample.
        x = x.mean(dim=1)
        x = self.fc(x)
        return x

In [None]:
# Hyperparameters for the small Transformer classifier.
learning_rate, epochs, batch_size = 0.001, 20, 32

# Model dimensions are read off the training tensors.
input_dim = X_train.shape[1]
num_classes = torch.unique(y_train).numel()

model = SimpleTransformer(input_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Note: this rebinds `train_data`, which earlier cells used for a DataFrame.
train_data = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=batch_size)



In [None]:
# Train for `epochs` passes over `train_loader`.
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Report the mean over all batches of the epoch; the loss of the final batch
    # alone is a noisy signal and is not comparable from epoch to epoch.
    epoch_loss = running_loss / max(len(train_loader), 1)
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}")

Epoch [1/20], Loss: 0.6912
Epoch [2/20], Loss: 0.6706
Epoch [3/20], Loss: 0.6828
Epoch [4/20], Loss: 0.6964
Epoch [5/20], Loss: 0.6871
Epoch [6/20], Loss: 0.6869
Epoch [7/20], Loss: 0.6935
Epoch [8/20], Loss: 0.6932
Epoch [9/20], Loss: 0.6917
Epoch [10/20], Loss: 0.7035
Epoch [11/20], Loss: 0.6970
Epoch [12/20], Loss: 0.6926
Epoch [13/20], Loss: 0.6926
Epoch [14/20], Loss: 0.6956
Epoch [15/20], Loss: 0.6908
Epoch [16/20], Loss: 0.6908
Epoch [17/20], Loss: 0.6934
Epoch [18/20], Loss: 0.6920
Epoch [19/20], Loss: 0.6972
Epoch [20/20], Loss: 0.6841


In [None]:
# Switch to eval mode first so the dropout inside the encoder layers is disabled
# while scoring; the training loop leaves the model in train mode.
model.eval()
with torch.no_grad():
    predictions = model(X_test).argmax(dim=1)
    predictions = predictions.cpu().numpy()  # move to host memory and convert for sklearn
    accuracy = accuracy_score(y_test.cpu().numpy(), predictions)
    print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 49.77%


In [None]:
from torch import nn, optim
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [None]:
# Build the labels before they are needed for `num_labels`, so this cell does not
# depend on a `y` left over from an earlier cell.
# The text is vectorised with TF-IDF: coercing it with pd.to_numeric yields NaN -> 0
# for non-numeric text, which leaves the classifier below with uninformative inputs.
from sklearn.feature_extraction.text import TfidfVectorizer

labelled_text = reduced_data[['text', 'label']].dropna(subset=['text'])
X = labelled_text['text'].astype(str)
y = labelled_text['label'].to_numpy()
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(np.unique(y)))
# Split the raw text first so the TF-IDF vocabulary is learned from the training
# partition only; stratify to keep the class ratio in both partitions.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
X_train = vectorizer.fit_transform(X_train_text).toarray().astype(np.float32)
X_test = vectorizer.transform(X_test_text).toarray().astype(np.float32)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Wrap the splits as tensors: float32 features, int64 class labels.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(targets, dtype=torch.long) for targets in (y_train, y_test)
)
class SimpleNN(nn.Module):
    """Three-layer perceptron: input_dim -> 128 -> 64 -> num_classes, ReLU in between."""

    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_classes)

    def forward(self, x):
        """Return the unnormalised class scores (logits) for ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

In [None]:
# Hyperparameters for the feed-forward baseline.
learning_rate, epochs, batch_size = 0.001, 20, 32

# Model dimensions are read off the training split.
input_dim = X_train.shape[1]
num_classes = len(np.unique(y_train))

model = SimpleNN(input_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Shuffled mini-batches over the training tensors.
train_data = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=batch_size)

def train_model(model, train_loader, optimizer, criterion, epochs=3):
    """Run ``epochs`` passes over ``train_loader``, printing each epoch's mean batch loss.

    Every item yielded by ``train_loader`` is unpacked as an ``(inputs, labels)``
    pair. The model is switched to training mode once, before the first epoch.
    """
    model.train()
    for epoch_number in range(1, epochs + 1):
        batch_losses = []
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        print(f"Epoch {epoch_number}/{epochs}, Loss: {sum(batch_losses, 0.0)/len(train_loader)}")

# NOTE(review): `epochs` is not forwarded, so this runs train_model's default of 3
# epochs rather than the `epochs = 20` set earlier in this cell — confirm which is intended.
train_model(model, train_loader, optimizer, criterion)

Epoch 1/3, Loss: 0.6935948033332825
Epoch 2/3, Loss: 0.6933101765314738
Epoch 3/3, Loss: 0.6933108410835266


In [None]:
# Score the feed-forward baseline on the held-out split with gradients disabled.
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    predictions = outputs.argmax(dim=1)
    accuracy = accuracy_score(y_test, predictions.cpu().numpy())
    print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 49.77%


Transformer using RoBERTa

In [None]:
import pandas as pd
# Name the columns explicitly so this cell runs on a fresh kernel; 'text' and
# 'label' are the columns the first load cell selects.
text_column = 'text'
label_column = 'label'
reduction_factor = 0.5
# Read the CSVs with pandas' default quoting, as the first load cell does.
# quoting=3 (csv.QUOTE_NONE) stops quoted commas inside the article text from being
# protected, and on_bad_lines='skip' then silently drops the rows that no longer parse.
# NOTE(review): the metrics recorded further down (accuracy 87.21%, weighted
# precision 0.76 ~= 0.8721**2) are what a single-class predictor scores on an
# ~87/13 split, which suggests rows were being lost here — check the counts printed below.
fake_df = pd.read_csv('/content/processed_fake_sample.csv')
true_df = pd.read_csv('/content/processed_true_sample.csv')
fake_df['label'] = 0
true_df['label'] = 1

new_fake_len = int(len(fake_df) * reduction_factor)
new_true_len = int(len(true_df) * reduction_factor)
reduced_fake_df = fake_df.head(new_fake_len)
reduced_true_df = true_df.head(new_true_len)

reduced_data = pd.concat([reduced_fake_df, reduced_true_df]).reset_index(drop=True)
print(reduced_data[label_column].value_counts())
X = reduced_data[text_column]
y = reduced_data[label_column]
# Stratify so the train and test partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(y.unique()))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import pandas as pd
# Name the columns explicitly so this cell does not rely on variables left over
# from another cell; 'text' and 'label' are the columns the first load cell selects.
text_column, label_column = 'text', 'label'
# Drop rows without text, keeping each remaining text paired with its own label.
labelled_rows = reduced_data.dropna(subset=[text_column])
# The tokenizer is given a list of strings, so make the dtype explicit.
X = labelled_rows[text_column].astype(str)
y = labelled_rows[label_column]
# Stratify so the train and test partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
def tokenize_data(texts):
    """Tokenize ``texts`` with the notebook-level ``tokenizer`` and return PyTorch tensors.

    ``texts`` must expose ``.tolist()`` (the cells below pass pandas Series). Inputs
    are truncated to at most 512 tokens and padded to a common length.
    """
    sentences = texts.tolist()
    encoded = tokenizer(
        sentences,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    return encoded

# The bare names TensorDataset / DataLoader are used below but never imported in
# the notebook (other cells go through torch.utils.data.*), so import them here.
from torch.utils.data import DataLoader, TensorDataset

# Tokenize both partitions and wrap ids, attention masks and labels as tensors.
train_encodings = tokenize_data(X_train)
test_encodings = tokenize_data(X_test)
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)
# Shuffle only the training batches; keep the test order fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Fine-tune RoBERTa for `epochs` passes, on the GPU when one is available.
learning_rate = 2e-5
epochs = 3
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
# Not used in the loop below: the model returns its own loss when `labels` is passed.
criterion = torch.nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = [item.to(device) for item in batch]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Accumulate per batch so the epoch report is the mean over all batches.
        running_loss += loss.item()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss / len(train_loader)}")


Epoch 1/3, Loss: 0.44081407243555243
Epoch 2/3, Loss: 0.33064686439254065
Epoch 3/3, Loss: 0.32407610389319336


In [None]:
# Collect predictions and labels over the whole test loader, then score accuracy.
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [item.to(device) for item in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # The predicted class is the index of the largest logit.
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 87.21%


In [None]:
# Freeze the RoBERTa backbone and leave only the classification head trainable.
# NOTE(review): no optimizer step appears after this cell in the notebook, so this
# only matters if training is resumed later — confirm intent.
for submodule, trainable in ((model.roberta, False), (model.classifier, True)):
    for param in submodule.parameters():
        param.requires_grad = trainable

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# Re-run inference on the test loader and report accuracy plus weighted
# precision, recall and F1.
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = logits.argmax(dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1 = (
    metric(true_labels, predictions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
print(f"Accuracy: {accuracy * 100:.2f}%")
for metric_name, value in (("Precision", precision), ("Recall", recall), ("F1-score", f1)):
    print(f"{metric_name}: {value:.2f}")


Accuracy: 87.21%
Precision: 0.76
Recall: 0.87
F1-score: 0.81


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Persist the model and the tokenizer to two directories given as relative paths,
# i.e. resolved against the current working directory.
model.save_pretrained('path_to_save_model')
tokenizer.save_pretrained('path_to_save_tokenizer')


('path_to_save_tokenizer/tokenizer_config.json',
 'path_to_save_tokenizer/special_tokens_map.json',
 'path_to_save_tokenizer/vocab.json',
 'path_to_save_tokenizer/merges.txt',
 'path_to_save_tokenizer/added_tokens.json')

In [None]:
# Reload from the same relative directories the save cell wrote to, so the round
# trip does not depend on the working directory being /content. Change both paths
# together if the artefacts are stored elsewhere.
model = RobertaForSequenceClassification.from_pretrained('path_to_save_model')
tokenizer = RobertaTokenizer.from_pretrained('path_to_save_tokenizer')