In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import numpy as np

In [4]:
# Load data
df = pd.read_csv("cleaned_data.csv", encoding='ISO-8859-1')
df['text'].fillna("Missing text", inplace=True)

# Clean labels
valid_labels = ['Left Wing', 'Right Wing', 'Neutral']
df['label'] = df['label'].apply(lambda x: x if x in valid_labels else np.nan)
df['label'] = df['label'].fillna('Neutral')

# Split data
train_df, remaining_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(remaining_df, test_size=0.5, random_state=42)

# Encode labels
encoder = LabelEncoder()
train_df['label'] = encoder.fit_transform(train_df['label'].astype(str))
val_df['label'] = encoder.transform(val_df['label'].astype(str))
test_df['label'] = encoder.transform(test_df['label'].astype(str))
# Check transformed labels
print("Encoded labels:", train_df['label'].unique())


Encoded labels: [0 1 2]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].fillna("Missing text", inplace=True)


In [5]:
tokenizer = AutoTokenizer.from_pretrained('nghuyong/ernie-1.0')

class TextDataset(Dataset):
    def __init__(self, df):
        self.tokenizer = tokenizer
        self.data = self.tokenize_data(df)
    
    def tokenize_data(self, df):
        texts = df['text'].astype(str).tolist()
        labels = df['label'].tolist()
        
        tokenized = self.tokenizer(texts, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
        tokenized['labels'] = torch.tensor(labels, dtype=torch.long)
        return tokenized

    def __len__(self):
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.data.items()}




In [6]:
model_id = 'nghuyong/ernie-1.0'
config = AutoConfig.from_pretrained(model_id, num_labels=3, id2label={0: 'LeftWing', 1: 'Neutral', 2: 'RightWing'})
model = AutoModelForSequenceClassification.from_pretrained(model_id, config=config)


  return self.fget.__get__(instance, owner)()
Some weights of ErnieForSequenceClassification were not initialized from the model checkpoint at nghuyong/ernie-1.0 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [11]:
# Prepare datasets
train_dataset = TextDataset(train_df)
val_dataset = TextDataset(val_df)
repository_id = "harshal-11/Ernie-PoliticalBias-Finetune"
# Training arguments
training_args = TrainingArguments(
    output_dir=repository_id,
    evaluation_strategy="epoch",
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)



In [12]:

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.587,0.553135,0.760728,0.753185,0.767947,0.745438
2,0.7662,0.502756,0.794798,0.783904,0.830055,0.762649
3,0.4928,0.478204,0.802081,0.790782,0.815507,0.777604
4,0.414,0.513858,0.817945,0.804287,0.833522,0.787751
5,0.2473,0.551125,0.821847,0.810301,0.819338,0.803303


TrainOutput(global_step=19225, training_loss=0.47948870035699814, metrics={'train_runtime': 1038.7102, 'train_samples_per_second': 148.059, 'train_steps_per_second': 18.509, 'total_flos': 2.023210625619456e+16, 'train_loss': 0.47948870035699814, 'epoch': 5.0})

In [13]:
trainer.evaluate()

{'eval_loss': 0.47820448875427246,
 'eval_accuracy': 0.8020806241872562,
 'eval_f1': 0.7907815576095528,
 'eval_precision': 0.8155065727351354,
 'eval_recall': 0.7776039158713802,
 'eval_runtime': 7.0763,
 'eval_samples_per_second': 543.366,
 'eval_steps_per_second': 34.058,
 'epoch': 5.0}

In [14]:
import os

# Replace 'your_token' with the actual token you copied from Hugging Face.
os.environ['HF_TOKEN'] = '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'

# Use this environment variable when you create the `Trainer` or call `push_to_hub`.

In [15]:
tokenizer.save_pretrained(training_args.output_dir)
trainer.create_model_card()

# Push the tokenizer, model, and model card to the hub
trainer.push_to_hub(commit_message="Training completed")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/399M [00:00<?, ?B/s][A[A
training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s][A

training_args.bin: 100%|██████████| 4.98k/4.98k [00:00<00:00, 34.0kB/s][A[A


model.safetensors:   2%|▏         | 9.52M/399M [00:00<00:07, 51.0MB/s][A[A

model.safetensors:   4%|▍         | 16.0M/399M [00:00<00:14, 26.1MB/s][A[A

model.safetensors:   5%|▌         | 21.5M/399M [00:00<00:11, 32.7MB/s][A[A

model.safetensors:   7%|▋         | 26.4M/399M [00:00<00:10, 36.6MB/s][A[A

model.safetensors:   8%|▊         | 32.0M/399M [00:01<00:13, 27.4MB/s][A[A

model.safetensors:   9%|▉         | 37.1M/399M [00:01<00:11, 32.0MB/s][A[A

model.safetensors:  11%|█         | 42.3M/399M [00:01<00:09, 36.3MB/s][A[A

model.safetensors:  12%|█▏        | 47.4M/399M [00:01<00:08, 40.0MB/s][A[A

model.safetensors:  13%|█▎        | 52.1M/399M [00:01<00:11, 30.7MB/s][A[A

model.safetensors:  14%|█

CommitInfo(commit_url='https://huggingface.co/harshal-11/Ernie-PoliticalBias-Finetune/commit/4b79e0021812e91507d7b940bd6fb8b0deb1a010', commit_message='Training completed', commit_description='', oid='4b79e0021812e91507d7b940bd6fb8b0deb1a010', pr_url=None, pr_revision=None, pr_num=None)