In [1]:
!pip uninstall -y transformers tokenizers
!pip install transformers==4.45.2 tokenizers==0.20.1 datasets accelerate

Found existing installation: transformers 4.45.2
Uninstalling transformers-4.45.2:
  Successfully uninstalled transformers-4.45.2
Found existing installation: tokenizers 0.20.1
Uninstalling tokenizers-0.20.1:
  Successfully uninstalled tokenizers-0.20.1
Collecting transformers==4.45.2
  Using cached transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers==0.20.1
  Using cached tokenizers-0.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.45.2-py3-none-any.whl (9.9 MB)
Using cached tokenizers-0.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.20.1 transformers-4.45.2


In [2]:
# -------------------- IMPORTS --------------------
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# -------------------- LOAD DATA --------------------
df = pd.read_csv("/content/dataset.csv")

# Remove unwanted column
if "Unnamed: 0" in df.columns:
    df.drop(columns=["Unnamed: 0"], inplace=True)

df.dropna(inplace=True)

texts = df["text_"].tolist()        # KEEPING your text_ column
labels = df["label"].apply(lambda x: 1 if x == "CG" else 0).tolist()

# -------------------- TRAIN-TEST SPLIT --------------------
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.25, stratify=labels, random_state=42
)

# -------------------- TOKENIZER --------------------
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(
        examples["text_"],           # <-- using text_ correctly
        padding="max_length",
        truncation=True,
        max_length=128
    )

# -------------------- HF DATASET --------------------
train_dataset = Dataset.from_dict({"text_": train_texts, "label": train_labels})
test_dataset = Dataset.from_dict({"text_": test_texts,  "label": test_labels})

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# remove raw text column AFTER tokenization
train_dataset = train_dataset.remove_columns(["text_"])
test_dataset  = test_dataset.remove_columns(["text_"])

train_dataset.set_format("torch")
test_dataset.set_format("torch")

# -------------------- MODEL --------------------
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)

# -------------------- METRICS --------------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary"
    )
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

# -------------------- TRAINING ARGS --------------------
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# -------------------- TRAINER --------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# -------------------- TRAIN --------------------
trainer.train()

# -------------------- EVALUATE --------------------
results = trainer.evaluate()
print("\nEVALUATION RESULTS:\n", results)

# -------------------- PREDICTION --------------------
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

print("\nCLASSIFICATION REPORT:\n")
print(classification_report(test_labels, pred_labels, target_names=["Fake", "Genuine"]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/30323 [00:00<?, ? examples/s]

Map:   0%|          | 0/10108 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 Â·Â·Â·Â·Â·Â·Â·Â·Â·Â·


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mworktushar1824[0m ([33mworktushar1824-bennett-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2115,0.17904,0.931737,0.932419,0.923196,0.941828
2,0.1313,0.212087,0.942422,0.943627,0.924288,0.963791
3,0.0648,0.262211,0.941927,0.943563,0.917711,0.970914



EVALUATION RESULTS:
 {'eval_loss': 0.2120867818593979, 'eval_accuracy': 0.942421844083894, 'eval_f1': 0.9436265013560635, 'eval_precision': 0.9242884250474384, 'eval_recall': 0.9637910565888406, 'eval_runtime': 75.6118, 'eval_samples_per_second': 133.683, 'eval_steps_per_second': 8.358, 'epoch': 3.0}

CLASSIFICATION REPORT:

              precision    recall  f1-score   support

        Fake       0.96      0.92      0.94      5054
     Genuine       0.92      0.96      0.94      5054

    accuracy                           0.94     10108
   macro avg       0.94      0.94      0.94     10108
weighted avg       0.94      0.94      0.94     10108



In [5]:
# -------------------- SAVE MODEL & TOKENIZER --------------------
model.save_pretrained("my_bert_model")
tokenizer.save_pretrained("my_bert_model")
!zip -r my_bert_model.zip my_bert_model
from google.colab import files
files.download("my_bert_model.zip")


  adding: my_bert_model/ (stored 0%)
  adding: my_bert_model/tokenizer_config.json (deflated 75%)
  adding: my_bert_model/vocab.txt (deflated 53%)
  adding: my_bert_model/special_tokens_map.json (deflated 42%)
  adding: my_bert_model/config.json (deflated 49%)
  adding: my_bert_model/model.safetensors (deflated 7%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load model
model = BertForSequenceClassification.from_pretrained("my_bert_model")
tokenizer = BertTokenizer.from_pretrained("my_bert_model")

model.eval()

def predict(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()

    if predicted_class == 1:
        return "Genuine (CG)"
    else:
        return "Fake"


In [3]:
print(predict("This is a genuine text."))
print(predict("This looks suspicious and fake."))

NameError: name 'predict' is not defined

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Use labels extracted from your existing code
# test_labels and pred_labels arrays must be available here

accuracy = accuracy_score(test_labels, pred_labels)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, pred_labels, average='binary')

metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
scores = [accuracy, precision, recall, f1]

plt.figure(figsize=(8,5))
bars = plt.bar(metrics, scores, color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])
plt.ylim(0,1)
plt.title('Classification Metrics')
plt.ylabel('Score')

for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.02, f'{yval:.2f}', ha='center', va='bottom')

plt.show()
