In [1]:
!pip install peft==0.10.0
!pip install accelerate==0.34.2
!pip install -U huggingface-hub
!pip install sentencepiece
!pip install transformers==4.45.2
!pip install sentence-transformers==3.1.1
!pip install protobuf
!pip install datasets
!pip install bitsandbytes

Collecting peft==0.10.0
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate>=0.21.0 (from peft==0.10.0)
  Downloading accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub>=0.17.0 (from peft==0.10.0)
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting safetensors (from peft==0.10.0)
  Downloading safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading accelerate-1.4.0-py3-none-any.whl (342 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.1/342.1 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.29.1-py3-none-any.whl (468 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.0/468.0 kB[0m [31m25.7 MB/s[

In [None]:
import json
import os
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from datasets import Dataset, DatasetDict
from huggingface_hub import login
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from transformers import LlamaTokenizer, LlamaForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoModelForSequenceClassification, DataCollatorWithPadding
# Authenticate with the Hugging Face Hub. The token is read from the HF_TOKEN
# environment variable so that no credential is stored in the notebook; when the
# variable is unset, `token=None` is passed and `login` asks for the token itself.
login(token=os.environ.get("HF_TOKEN"))

# Read the raw records. The encoding is stated explicitly so that parsing does not
# depend on the platform's default text encoding.
with open('dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

def shorten_article(entry, keep_fraction=0.25):
    """Truncate an article and strip a fixed list of politician-related words.

    The entry is modified in place and also returned.

    Args:
        entry: mapping with an "article" key holding the article text.
        keep_fraction: fraction of the article's characters (counted from the
            start) that is kept before the word removal; defaults to 0.25.

    Returns:
        The same ``entry`` object, with "article" replaced by the shortened,
        cleaned and whitespace-stripped text.
    """
    article = entry["article"]
    # Keep only the leading part of the text (the cut is by characters, not words).
    truncated = article[:int(len(article) * keep_fraction)]
    # Whole-word removal only; the surrounding spaces are left in place, so interior
    # double spaces can remain. Only leading/trailing whitespace is stripped below.
    cleaned_text = re.sub(r'\b(Donald|donald|Trump|trump|former|Former|president|President|Joe|joe|Biden|biden)\b', '', truncated)
    entry["article"] = cleaned_text.strip()
    return entry

# Build the working frame and print it while the labels are still strings.
df = pd.DataFrame(data)
#df = df.sample(frac=0.25,random_state=42)
print(df)

# Encode the four string labels as integer class ids.
label_mapping = {"left": 0, "lean-left": 1, "lean-right": 2, "right": 3}
df['label'] = df['label'].map(label_mapping)

# First cut: 60% train / 40% held out. Second cut: the held-out part is halved
# into validation and test. Both cuts use the same fixed random_state.
train_size, temp_size = 0.6, 0.4
val_size, test_size = 0.5, 0.5
df_train, df_temp = train_test_split(
    df, train_size=train_size, test_size=temp_size, random_state=42
)
df_val, df_test = train_test_split(
    df_temp, train_size=val_size, test_size=test_size, random_state=42
)

# Convert each split to a `datasets.Dataset`, dropping the shuffled pandas index.
dataset_train, dataset_val, dataset_test = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (df_train, df_val, df_test)
)

                                                 article       label
0      If Donald Trump loses November’s election, it ...        left
1      Donald Trump is not a rational choice for cons...        left
2      The outsider candidate has money, a running ma...        left
3      No recent Democratic president has faced such ...        left
4      The conservative justices have shown they are ...        left
...                                                  ...         ...
13898  ATLANTA — A Republican-led group is challengin...  lean-right
13899  PRAIRIE DU CHIEN, Wis. — A day after Vice Pres...  lean-right
13900  Donald Trump says newly released statistics fr...  lean-right
13901  OPINION: Colombian President Gustavo Petro, in...  lean-right
13902  Vice President Kamala Harris is closing out th...  lean-right

[13903 rows x 2 columns]


In [4]:
# Bundle the three splits under the names used for indexing later on.
splits = {'train': dataset_train, 'val': dataset_val, 'test': dataset_test}
dataset = DatasetDict(splits)
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'label'],
        num_rows: 8341
    })
    val: Dataset({
        features: ['article', 'label'],
        num_rows: 2781
    })
    test: Dataset({
        features: ['article', 'label'],
        num_rows: 2781
    })
})

In [5]:
# Relative frequency of each class id in the training split.
df_train.label.value_counts(normalize=True)

label
3    0.333893
1    0.313512
2    0.294689
0    0.057907
Name: proportion, dtype: float64

In [6]:
# Inverse class frequencies, ordered by class id and normalised to sum to 1,
# so that rarer classes receive larger weights.
class_frequencies = df_train.label.value_counts(normalize=True).sort_index()
inverse_frequencies = (1 / class_frequencies).tolist()
class_weights = torch.tensor(inverse_frequencies)
class_weights = class_weights / class_weights.sum()
class_weights

tensor([0.6432, 0.1188, 0.1264, 0.1116])

In [7]:
# Checkpoint identifier passed to `from_pretrained` for both the model and the tokenizer.
model_name = "meta-llama/Llama-3.2-3B-Instruct"

In [8]:
# bitsandbytes settings; passed to `from_pretrained` as `quantization_config`.
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # load the model weights in 4-bit precision
    bnb_4bit_quant_type = 'nf4', # use the NF4 (4-bit NormalFloat) quantization data type
    bnb_4bit_use_double_quant = True, # nested quantization: the quantization constants are quantized as well
    bnb_4bit_compute_dtype = torch.bfloat16 # dtype used for computation on the quantized weights
)

In [9]:
# LoRA adapter settings; only the attention projections listed in `target_modules` get adapters.
lora_config = LoraConfig(
    r = 16, # the dimension (rank) of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # whether to train bias weights; 'none' here (the projections printed below have bias=False)
    task_type = 'SEQ_CLS'
)

In [10]:
# Load the checkpoint as a 4-label sequence classifier using the 4-bit settings above.
# The classification head (`score.weight`) is not in the checkpoint and is newly
# initialised, as the warning in this cell's output states.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=4
)

# Display the module tree.
model

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((

In [11]:
# Run PEFT's preparation step for training a quantized model; `model` is rebound to the result.
model = prepare_model_for_kbit_training(model)
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((

In [12]:
# Wrap the model with the LoRA adapters described by `lora_config`; the printed tree
# shows the targeted projections replaced by `lora.Linear4bit` modules.
model = get_peft_model(model, lora_config)
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
        

In [13]:
# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Reuse the end-of-sequence token for padding (both the id and the token string are set).
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [14]:
# Keep the model's padding id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id
# Turn off `use_cache` for training.
model.config.use_cache = False
# NOTE(review): presumably the standard setting for non-tensor-parallel fine-tuning — confirm.
model.config.pretraining_tp = 1

In [15]:
max_len = 512


def llama_preprocessing_function(examples):
    """Tokenize the `article` field of a batch, truncating each text to `max_len`."""
    articles = examples['article']
    return tokenizer(articles, max_length=max_len, truncation=True)


# Tokenize every split in batches, then have the datasets return torch tensors.
tokenized_datasets = dataset.map(llama_preprocessing_function, batched=True)
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/8341 [00:00<?, ? examples/s]

Map:   0%|          | 0/2781 [00:00<?, ? examples/s]

Map:   0%|          | 0/2781 [00:00<?, ? examples/s]

In [16]:
# Collator that pads the examples of a batch using the tokenizer's padding settings.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
def compute_metrics(pred):
    """Return accuracy and weighted precision/recall/F1 for a prediction object.

    `pred` must expose `label_ids` (true class ids) and `predictions`
    (per-class scores whose argmax over the last axis is the predicted class).
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    # Index 3 of the returned tuple (support) is not used.
    scores = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1],
    }

In [18]:
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
        class_weights: optional per-class weights (a tensor or a sequence of
            floats, one entry per class id) used as the ``weight`` argument of
            the cross-entropy loss. ``None`` means unweighted cross-entropy.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is None:
            self.class_weights = None
        else:
            if isinstance(class_weights, torch.Tensor):
                # Copy an existing tensor with detach().clone(); calling
                # torch.tensor() on a tensor emits a copy-construct UserWarning.
                weights = class_weights.detach().clone().to(torch.float32)
            else:
                weights = torch.tensor(class_weights, dtype=torch.float32)
            self.class_weights = weights.to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the (optionally class-weighted) cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch mapping; its "labels" entry is removed before the
                remaining entries are passed to the model as keyword arguments.
            return_outputs: when true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: accepted so that Trainer versions which pass this
                keyword can call the override; it is not used.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # cross_entropy expects integer class ids.
        labels = inputs.pop("labels").long()

        outputs = model(**inputs)
        logits = outputs.get('logits')

        weight = self.class_weights
        if weight is not None:
            # Match the logits' device and dtype; a no-op when they already agree.
            weight = weight.to(device=logits.device, dtype=logits.dtype)
        # weight=None is plain, unweighted cross-entropy.
        loss = F.cross_entropy(logits, labels, weight=weight)

        return (loss, outputs) if return_outputs else loss


In [19]:
# Training hyper-parameters. Evaluation and checkpointing both happen once per epoch,
# which `load_best_model_at_end` requires so that a checkpoint exists for every
# evaluated state.
training_args = TrainingArguments(
    output_dir = 'sequence_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 2,
    weight_decay = 0.01,
    eval_strategy = 'epoch',  # current name of the deprecated `evaluation_strategy`
    save_strategy = 'epoch',
    load_best_model_at_end = True
)



In [20]:
# Assemble the trainer: LoRA-wrapped model, tokenized train/val splits, padding
# collator, metric function, and the class weights consumed by CustomTrainer's loss.
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['val'],
    tokenizer = tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics,
    class_weights = class_weights
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)


In [None]:
# `F` is referenced by CustomTrainer.compute_loss, so it has to be bound before training runs.
import torch.nn.functional as F
# Run training; the returned object carries the `metrics` that are logged in a later cell.
train_result = trainer.train()



In [46]:
# Class id -> class name, used to turn predicted ids back into label strings.
# NOTE: this rebinds `label_mapping`, which an earlier cell defines as name -> id.
label_mapping = {0 : 'left', 1: 'lean-left', 2: 'lean-right', 3: 'right'}
# Class name -> class id (the direction used when scoring predictions).
maptwo = {"left": 0,"lean-left": 1,"lean-right":2,"right":3}


def make_predictions(model, df, batch_size=32, max_length=512):
    """Classify every article in `df` and store the predicted label names in place.

    Args:
        model: classifier whose output provides per-class 'logits'.
        df: DataFrame with an `article` column of strings. A `predictions`
            column holding label names is added to (or overwritten on) this frame.
        batch_size: number of articles tokenized and scored per forward pass.
        max_length: token limit passed to the tokenizer's truncation.

    Returns:
        None; the result is written to ``df['predictions']``.
    """
    sentences = df.article.tolist()
    if not sentences:
        # Nothing to score: torch.cat would raise on an empty list.
        df['predictions'] = []
        return

    # Send the inputs to wherever the model's parameters are, instead of guessing.
    device = next(model.parameters()).device

    # Inference must run in eval mode (this disables the LoRA dropout layers);
    # the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    predicted_ids = []
    try:
        with torch.no_grad():
            for start in range(0, len(sentences), batch_size):
                batch_sentences = sentences[start:start + batch_size]
                inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
                inputs = {k: v.to(device) for k, v in inputs.items()}
                logits = model(**inputs)['logits']
                # Reduce to class ids and move to the CPU right away so logits
                # do not pile up on the accelerator.
                predicted_ids.append(logits.argmax(dim=1).cpu())
    finally:
        model.train(was_training)

    class_ids = torch.cat(predicted_ids, dim=0).tolist()
    df['predictions'] = [label_mapping[class_id] for class_id in class_ids]

In [47]:
def get_performance_metrics(df_test):
    """Print evaluation metrics for a frame with `label` and `predictions` columns.

    `label` holds the true class ids; `predictions` holds label names, which are
    converted to class ids through `maptwo` before scoring. Nothing is returned.
    """
    truth = df_test.label.round()
    predicted = df_test.predictions.map(maptwo)
    print(f"comparing test {truth} and pred {predicted}")

    print("Confusion Matrix:")
    matrix = confusion_matrix(truth, predicted)
    print(matrix)

    print("\nClassification Report:")
    report = classification_report(truth, predicted)
    print(report)

    balanced = balanced_accuracy_score(truth, predicted)
    print("Balanced Accuracy Score:", balanced)
    plain = accuracy_score(truth, predicted)
    print("Accuracy Score:", plain)

In [48]:
# get_performance_metrics needs balanced_accuracy_score and classification_report,
# which are only imported here.
from sklearn.metrics import balanced_accuracy_score, accuracy_score, classification_report, confusion_matrix
# Adds a `predictions` column to df_val in place.
# NOTE(review): df_val is also the Trainer's eval_dataset; df_test is not scored
# in the cells shown here.
make_predictions(model,df_val)

get_performance_metrics(df_val)
df_val

comparing test 12950    2
13157    2
13350    2
6300     2
5896     2
        ..
7646     1
1035     3
11838    0
2943     3
7727     1
Name: label, Length: 718, dtype: int64 and pred 12950    2
13157    2
13350    2
6300     2
5896     2
        ..
7646     1
1035     3
11838    0
2943     3
7727     1
Name: predictions, Length: 718, dtype: int64
Confusion Matrix:
[[ 48   0   0   0]
 [  1 245   5   0]
 [  0   2 198   5]
 [  0   0   4 210]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        48
           1       0.99      0.98      0.98       251
           2       0.96      0.97      0.96       205
           3       0.98      0.98      0.98       214

    accuracy                           0.98       718
   macro avg       0.98      0.98      0.98       718
weighted avg       0.98      0.98      0.98       718

Balanced Accuracy Score: 0.9808144218203547
Accuracy Score: 0.9763231197771588


Unnamed: 0,article,label,predictions
12950,At least two Jan. 6 defendants got out from be...,2,lean-right
13157,Vice President Kamala Harris and her Democrati...,2,lean-right
13350,The Supreme Court has settled the issue of whe...,2,lean-right
6300,Octogenarian President Biden has difficulty fu...,2,lean-right
5896,Democratic lawmakers have been complaining to ...,2,lean-right
...,...,...,...
7646,News Analysis This presidential campaign is pl...,1,lean-left
1035,Democrat California Gov. Gavin Newsom has said...,3,right
11838,Harris trounced Trump in August fundraising. C...,0,left
2943,"Zyahna Bryant, the Black Lives Matter and “fat...",3,right


In [24]:
# Record the training-set size next to the metrics returned by trainer.train(),
# then log the metrics and persist both them and the trainer state.
max_train_samples = len(dataset_train)
metrics = train_result.metrics
metrics["train_samples"] = max_train_samples
for record in (trainer.log_metrics, trainer.save_metrics):
    record("train", metrics)
trainer.save_state()

***** train metrics *****
  epoch                    =        2.0
  total_flos               = 92898810GF
  train_loss               =     0.1911
  train_runtime            = 2:04:59.32
  train_samples            =       5741
  train_samples_per_second =      1.531
  train_steps_per_second   =      0.191


In [55]:
# Write the trained model to the "saved_model" directory.
trainer.save_model("saved_model")