In [21]:
!pip install datasets



In [22]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from torch.nn import CrossEntropyLoss
from transformers import pipeline
from safetensors.torch import save_file

In [26]:
df = pd.read_csv("Train.csv", engine='python', on_bad_lines='skip')
columns_to_keep = ["text", "label_model"]
df = df[columns_to_keep]

In [27]:
df=df[:10000]
df.count()

Unnamed: 0,0
text,4872
label_model,4876


In [28]:
# Step 2: Clean the text column
def clean_text(text):
    text = str(text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text.lower()

df["text_cleaned"] = df["text"].apply(clean_text)

# Step 3: Tokenize text using Hugging Face tokenizer
tokenizer_name = "distilbert-base-uncased"  # Smaller model for faster training
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def tokenize_function(examples):
    return tokenizer(
        examples["text_cleaned"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

In [29]:
df.head()

Unnamed: 0,text,label_model,text_cleaned
0,"It starts with pain, followed by hate\nFueled ...",LABEL_9,it starts with pain followed by hate fueled by...
1,Freedom!\nAlone again again alone\nPatiently w...,LABEL_9,freedom alone again again alone patiently wait...
2,"Biting the hand that feeds you, lying to the v...",LABEL_9,biting the hand that feeds you lying to the vo...
3,You say you know just who I am\nBut you can't ...,LABEL_9,you say you know just who i am but you cant im...
4,My heart is beating faster can't control these...,LABEL_9,my heart is beating faster cant control these ...


In [30]:
# Step 4: Encode target labels
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["label_model"])

# Prepare the dataset for Hugging Face
dataset = Dataset.from_pandas(df[["text_cleaned", "label_encoded"]])
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text_cleaned"])
tokenized_datasets = tokenized_datasets.rename_column("label_encoded", "labels")
tokenized_datasets.set_format("torch")

# Split the dataset into train and validation
def train_test_split(dataset, train_ratio=0.8):
    train_size = int(len(dataset) * train_ratio)
    return dataset.select(range(train_size)), dataset.select(range(train_size, len(dataset)))

train_dataset, val_dataset = train_test_split(tokenized_datasets)

Map:   0%|          | 0/4876 [00:00<?, ? examples/s]

In [31]:
# Step 5: Address Imbalanced Data
class_counts = df["label_encoded"].value_counts().to_dict()
class_weights = torch.tensor([1.0 / class_counts[i] for i in range(len(class_counts))]).to(torch.float32)

def compute_loss_with_weights(outputs, labels):
    loss_fn = CrossEntropyLoss(weight=class_weights)
    return loss_fn(outputs.logits, labels)

# Step 6: Load model and set up training
model = AutoModelForSequenceClassification.from_pretrained(
    tokenizer_name,
    num_labels=len(label_encoder.classes_)
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=4e-5,  # Increased due to larger batch size
    per_device_train_batch_size=64,  # Increased for faster training
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=2,  # Simulating a batch size of 128 if needed
    num_train_epochs=1,  # Keeping it low for faster execution
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    fp16=True  # Mixed precision for speed
)




In [35]:
# Fixed compute_metrics function
def compute_metrics(pred):
    logits, labels = pred
    logits = torch.tensor(logits)  # Convert logits to a PyTorch tensor
    preds = torch.argmax(logits, axis=-1)
    labels = torch.tensor(labels)  # Ensure labels are also tensors
    accuracy = (preds == labels).float().mean().item()
    return {"accuracy": accuracy}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [36]:
# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.768869,0.657787


TrainOutput(global_step=30, training_loss=1.7153807322184245, metrics={'train_runtime': 3132.9517, 'train_samples_per_second': 1.245, 'train_steps_per_second': 0.01, 'total_flos': 127173238456320.0, 'train_loss': 1.7153807322184245, 'epoch': 0.9836065573770492})

In [37]:
# Save the fine-tuned model in the required format
model_save_path = "/content/drive/MyDrive/MLDS_Trained_Model/test"
trainer.save_model(model_save_path)


