In [1]:
!pip install -U transformers peft accelerate bitsandbytes datasets


!pip install feedparser

from google.colab import drive
drive.mount('/content/drive')

Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-non

In [2]:
import os
import gc
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import peft
import bitsandbytes
import feedparser
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from transformers import AutoModelForSequenceClassification

from datasets import Dataset
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType
from transformers import Trainer
from transformers import BitsAndBytesConfig

os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
gc.collect()
torch.cuda.empty_cache()


In [3]:
rss_urls = {
    "BBC": "http://feeds.bbci.co.uk/news/rss.xml",
    "Reuters": "http://feeds.reuters.com/reuters/topNews",
    "NPR": "https://www.npr.org/rss/rss.php?id=1001"
}
real_news = []
for source, url in rss_urls.items():
    feed = feedparser.parse(url)
    for entry in feed.entries:
        real_news.append({"text": entry.title.strip(), "label": 1})
df_real_news = pd.DataFrame(real_news).drop_duplicates(subset="text")


df1 = pd.read_csv("/content/drive/MyDrive/misinfo_project/liar/liar_train.csv")
df2 = pd.read_csv("/content/drive/MyDrive/misinfo_project/fakenewsnet/fakenewsnet_train.csv")
df_train = pd.concat([df1, df2, df_real_news], ignore_index=True)
df_train.dropna(subset=["text", "label"], inplace=True)
df_train["label"] = df_train["label"].astype(int)


df_real = df_train[df_train["label"] == 1]
df_train = pd.concat([df_train, df_real, df_real], ignore_index=True)


In [4]:
model_name = "deepseek-ai/deepseek-llm-7b-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=256)

class_weights = compute_class_weight("balanced", classes=np.array([0, 1]), y=df_train["label"])
class_weights = torch.tensor(class_weights, dtype=torch.float32)

train_dataset = Dataset.from_pandas(df_train).map(tokenize, batched=True)
train_dataset = train_dataset.remove_columns(["text"])
train_dataset.set_format("torch")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

Map:   0%|          | 0/18317 [00:00<?, ? examples/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token


In [6]:



bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)


base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
base_model = prepare_model_for_kbit_training(base_model)

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.08,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)
base_model.config.pad_token_id = tokenizer.pad_token_id
model = get_peft_model(base_model, peft_config)
model.config.pad_token_id = tokenizer.pad_token_id


config.json:   0%|          | 0.00/584 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.6k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/deepseek-llm-7b-base and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
training_args = TrainingArguments(
    output_dir="./deepseek_IT1",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_strategy="no",
    report_to=[],
    disable_tqdm=False,
    gradient_checkpointing=True,
    fp16=True,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05
)


In [8]:
class CustomTrainer(Trainer):
    def __init__(self, *args, use_focal=False, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.use_focal = use_focal
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False,num_items_in_batch=None):
        labels = inputs.pop("labels").to(torch.long)
        outputs = model(**inputs)
        logits = outputs.logits

        if self.use_focal:
            ce_loss = nn.CrossEntropyLoss(reduction="none")(logits, labels)
            pt = torch.exp(-ce_loss)
            loss = 1.5 * (1 - pt) ** 2 * ce_loss
            loss = loss.mean()
        else:
            weight = self.class_weights.to(logits.device).to(logits.dtype)
            loss_fn = nn.CrossEntropyLoss(weight=weight)
            loss = loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss


In [9]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    use_focal=True,
    class_weights=class_weights
)

trainer.train()


  super().__init__(*args, **kwargs)
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
50,0.9265
100,0.9014
150,0.7583
200,0.4475
250,0.2805
300,0.2679
350,0.2807
400,0.2513
450,0.27
500,0.2556


TrainOutput(global_step=6867, training_loss=0.22686363693510617, metrics={'train_runtime': 10759.1683, 'train_samples_per_second': 5.107, 'train_steps_per_second': 0.638, 'total_flos': 5.130255280059187e+17, 'train_loss': 0.22686363693510617, 'epoch': 2.9996724533245986})

In [10]:
save_path = "/content/drive/MyDrive/misinfo_project/models/deepseek_IT1"
trainer.model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)



('/content/drive/MyDrive/misinfo_project/models/deepseek_IT1/tokenizer_config.json',
 '/content/drive/MyDrive/misinfo_project/models/deepseek_IT1/special_tokens_map.json',
 '/content/drive/MyDrive/misinfo_project/models/deepseek_IT1/tokenizer.json')