In [1]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [2]:
# Shared tokenizer for the uncased DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased'
)




In [7]:
# Load the raw CSV, clean text/labels, split 80/10/10 and integer-encode labels.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("cleaned_data.csv", encoding='ISO-8859-1')

# Assign the filled column back instead of calling fillna(inplace=True) on
# df['text']: the chained in-place form acts on a temporary object and stops
# working in pandas 3.0.
df['text'] = df['text'].fillna("Missing text")  # Replace nulls with a placeholder string

# Check data types in the text column
print(df['text'].apply(type).value_counts())

# Display unique values before cleaning
print("Unique labels before cleaning:", df['label'].unique())

# Clean labels: keep only the valid categories, everything else becomes NaN.
valid_labels = ['Left Wing', 'Right Wing', 'Neutral']
df['label'] = df['label'].where(df['label'].isin(valid_labels), np.nan)

# NOTE(review): every missing or invalid label is relabelled 'Neutral'. The
# invalid values seen in the data are text fragments and a timestamp, which
# look like malformed CSV rows; dropping them may be safer than training on
# them as 'Neutral' -- confirm.
df['label'] = df['label'].fillna('Neutral')

# Display unique values after cleaning
print("Unique labels after cleaning:", df['label'].unique())

# Split data into training (80%) and remaining data (20%)
train_df, remaining_df = train_test_split(df, test_size=0.2, random_state=42)

# Split remaining data evenly into validation and test sets
val_df, test_df = train_test_split(remaining_df, test_size=0.5, random_state=42)

# Work on independent copies so the column assignments below modify these
# frames directly and do not trigger SettingWithCopyWarning.
train_df = train_df.copy()
val_df = val_df.copy()
test_df = test_df.copy()

encoder = LabelEncoder()

# Fit the encoder on the training data only, then reuse it for the other splits
train_df['label'] = encoder.fit_transform(train_df['label'].astype(str))
val_df['label'] = encoder.transform(val_df['label'].astype(str))
test_df['label'] = encoder.transform(test_df['label'].astype(str))

# Check transformed labels
print("Encoded labels:", train_df['label'].unique())


text
<class 'str'>    38448
Name: count, dtype: int64
Unique labels before cleaning: ['Right Wing' 'Left Wing' nan ' whenever I leave the West'
 ' and that I may be better off then they are because I still have elders that I can go to who will make me feel at home for a while as they cleanse me. Sometimes I find myself wondering'
 '01/25/2022 18:45:00' 'Neutral']
Unique labels after cleaning: ['Right Wing' 'Left Wing' 'Neutral']
Encoded labels: [0 1 2]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].fillna("Missing text", inplace=True)  # Replace nulls with a placeholder string


In [30]:
# class TextDataset(Dataset):
#     def __init__(self, encodings,labels):
#         self.tokenizer = tokenizer
#         self.texts = dataframe['text'].tolist()
#         self.labels = dataframe['label'].tolist()

#     def __getitem__(self, idx):
        
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
#         return item
#     def __len__(self):
#         return len(self.labels)


In [31]:
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# def tokenize_data(df, tokenizer):
#     texts = df['text'].astype(str).tolist()  # Convert text data to string
#     labels = df['label'].tolist()  # Extract labels

#     # Tokenize the text
#     tokenized = tokenizer(texts, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
#     tokenized['labels'] = torch.tensor(labels, dtype=torch.long)  # Add labels to the tokenized data

#     return tokenized

In [32]:
from torch.utils.data import Dataset
from transformers import DistilBertTokenizer
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
class TextDataset(Dataset):
    """Dataset that tokenizes a whole DataFrame once and serves per-row tensors.

    Args:
        df: DataFrame with a 'text' column and a 'label' column whose values
            can be converted to a ``torch.long`` tensor.
        tokenizer: Optional tokenizer callable. When omitted, the
            'distilbert-base-uncased' ``DistilBertTokenizer`` is loaded, as
            before. Passing an existing tokenizer avoids reloading it for
            every dataset instance.
        max_length: Length every sequence is padded/truncated to
            (default 256, the previously hard-coded value).

    Raises:
        KeyError: If ``df`` lacks the 'text' or 'label' column.
    """

    def __init__(self, df, tokenizer=None, max_length=256):
        if tokenizer is None:
            # Fall back to the original behaviour when no tokenizer is supplied.
            tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = self.tokenize_data(df)

    def tokenize_data(self, df):
        """Tokenize ``df['text']`` and attach ``df['label']`` under the 'labels' key."""
        missing = [col for col in ('text', 'label') if col not in df.columns]
        if missing:
            # Fail early with a clear message instead of deep inside pandas.
            raise KeyError(f"TextDataset requires columns {missing} in the DataFrame")

        texts = df['text'].astype(str).tolist()  # Ensure text data is in string format
        labels = df['label'].tolist()  # Extract labels

        # Tokenize the text data in a single batch call
        tokenized = self.tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        tokenized['labels'] = torch.tensor(labels, dtype=torch.long)

        return tokenized

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        """Return a dict holding the ``idx``-th slice of every stored field."""
        return {key: val[idx] for key, val in self.data.items()}


In [33]:
from torch.utils.data import DataLoader

# Tokenize the training split up front.
train_dataset = TextDataset(df=train_df)
# Shuffled loader that yields two examples per batch.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=2)


In [34]:
# Tokenize the validation split up front.
val_dataset = TextDataset(df=val_df)





In [35]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores, or a tuple whose first element holds them).

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    labels = pred.label_ids
    scores = pred.predictions
    # predictions may arrive as a tuple of arrays; presumably the class scores
    # come first in that case -- confirm for models other than this one.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(-1)
    # zero_division=0 keeps the same values as the default but without the
    # UndefinedMetricWarning when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [36]:
# Repository id; doubles as the local output directory for checkpoints.
repository_id = "harshal-11/DistillBERT-Political-Finetune"

training_args = TrainingArguments(
    # Output locations
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    # Optimisation
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Log every 10 steps
    logging_strategy="steps",
    logging_steps=10,
)


In [37]:
# Name of the pretrained checkpoint the model configuration is loaded from.
model_id = 'distilbert-base-uncased'

In [38]:
from transformers import AutoConfig

# Class names listed in id order. The position of each name must match the
# integer id the label encoder assigned to that class (LabelEncoder orders its
# classes alphabetically: 'Left Wing', 'Neutral', 'Right Wing').
class_names = ['LeftWing', 'Neutral', 'RightWing']  # replace with your actual class names
num_labels = len(class_names)

# Build both directions of the mapping. Passing only id2label leaves the
# config's label2id on the auto-generated LABEL_<i> names, so the saved
# config would contain two mappings that disagree.
id2label = {i: name for i, name in enumerate(class_names)}
label2id = {name: i for i, name in id2label.items()}

config = AutoConfig.from_pretrained(
    model_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

number of labels: 3
the labels: ['LeftWing', 'Neutral', 'RightWing']




In [39]:
# Load the pretrained weights from the same checkpoint the config was built
# from (model_id), so the two cannot drift apart if the checkpoint is changed.
# The classification head is newly initialised and has to be trained.
model = DistilBertForSequenceClassification.from_pretrained(model_id, config=config)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# Bundle the model, hyper-parameters, datasets and metric function.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [41]:
# Fine-tune the model with the settings in training_args (5 epochs, with
# evaluation and checkpointing once per epoch).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5547,0.391648,0.837971,0.828685,0.840992,0.819487
2,0.5061,0.495741,0.852276,0.840907,0.85504,0.830913
3,0.1709,0.573231,0.857477,0.844851,0.854147,0.837563
4,0.0965,0.693301,0.855917,0.843764,0.847762,0.840618
5,0.063,0.846217,0.858257,0.844365,0.845563,0.843268


TrainOutput(global_step=19225, training_loss=0.25989047006426574, metrics={'train_runtime': 572.0776, 'train_samples_per_second': 268.827, 'train_steps_per_second': 33.606, 'total_flos': 1.018626227394048e+16, 'train_loss': 0.25989047006426574, 'epoch': 5.0})

In [42]:
# Score the model on val_dataset (the Trainer's eval_dataset).
# NOTE: training_args sets load_best_model_at_end=True, so this evaluates the
# reloaded best checkpoint, not the last epoch. The recorded numbers match
# the epoch-1 row of the training log, which had the lowest validation loss.
trainer.evaluate()

{'eval_loss': 0.39164817333221436,
 'eval_accuracy': 0.8379713914174253,
 'eval_f1': 0.8286850094263224,
 'eval_precision': 0.8409915832009255,
 'eval_recall': 0.8194867492438821,
 'eval_runtime': 3.7256,
 'eval_samples_per_second': 1032.055,
 'eval_steps_per_second': 64.688,
 'epoch': 5.0}

In [43]:
import os
from getpass import getpass

# SECURITY: never hard-code a Hugging Face access token in the notebook. A
# token committed in a notebook is compromised and must be revoked in the
# Hugging Face account settings.
# Take the token from the environment, or prompt for it without echoing.
if not os.environ.get('HF_TOKEN'):
    os.environ['HF_TOKEN'] = getpass('Hugging Face access token: ')

# Use this environment variable when you create the `Trainer` or call `push_to_hub`.

In [44]:
# Write the tokenizer files into the Trainer's output directory. The Trainer
# was created without a tokenizer, so it is saved here explicitly.
tokenizer.save_pretrained(training_args.output_dir)
# Create the model card through the Trainer.
trainer.create_model_card()

# Push the tokenizer, model, and model card to the hub
trainer.push_to_hub(commit_message="Training completed")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]
model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s][A

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s][A[A
training_args.bin: 100%|██████████| 4.98k/4.98k [00:00<00:00, 33.5kB/s][A

model.safetensors:   4%|▎         | 9.43M/268M [00:00<00:05, 49.7MB/s][A
model.safetensors:   5%|▌         | 14.4M/268M [00:00<00:05, 44.4MB/s][A
model.safetensors:   7%|▋         | 18.8M/268M [00:00<00:12, 20.7MB/s][A
model.safetensors:  10%|▉         | 25.9M/268M [00:00<00:07, 30.5MB/s][A
model.safetensors:  12%|█▏        | 32.0M/268M [00:01<00:08, 28.2MB/s][A
model.safetensors:  14%|█▍        | 37.4M/268M [00:01<00:06, 33.1MB/s][A
model.safetensors:  16%|█▌        | 42.4M/268M [00:01<00:06, 36.6MB/s][A
model.safetensors:  18%|█▊        | 47.1M/268M [00:01<00:05, 39.3MB/s][A
model.safetensors:  19%|█▉        | 51.8M/268M [00:01<00:07, 29.4MB/s][A
model.safetensors:  22%|██▏       | 57.6M/268M [00:01<00:05, 35.3M

CommitInfo(commit_url='https://huggingface.co/harshal-11/DistillBERT-Political-Finetune/commit/4e218bdcc3c69844bcb8aab1bf218e7942292222', commit_message='Training completed', commit_description='', oid='4e218bdcc3c69844bcb8aab1bf218e7942292222', pr_url=None, pr_revision=None, pr_num=None)