In [None]:
# ============================================================
# 📌 1. INSTALL DEPENDENCIES (COLAB)
# ============================================================

# Colab/IPython shell magic: installs the packages imported below, plus
# `accelerate`. No versions are pinned, so pip resolves whatever is current.
!pip install transformers datasets scikit-learn pandas numpy accelerate

# Tabular data handling.
import pandas as pd
import numpy as np
# Hugging Face dataset container and the model/tokenizer/training classes.
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# Turns the string labels into integer class ids.
from sklearn.preprocessing import LabelEncoder
# NOTE(review): `torch` is not referenced by name in this cell; presumably
# imported to confirm the backend is present — confirm before removing.
import torch
import json
# Colab-only helper used further down for the file upload and the downloads.
from google.colab import files

print("âœ” Dependencies installed.")


# ============================================================
# 📌 2. UPLOAD YOUR CSV FILE
# ============================================================

# Show the expected file name, then open the Colab upload widget.
print("Combined Data.csv")
uploaded = files.upload()

# `uploaded` is a mapping keyed by file name. It can be empty (for example when
# the picker is dismissed), in which case indexing the first key would raise a
# bare IndexError; fail with an actionable message instead.
if not uploaded:
    raise ValueError(
        "No file was uploaded; re-run this cell and select the CSV file."
    )

# Use the first uploaded file as the dataset. Colab may rename duplicates
# (e.g. "Combined Data (5).csv"), so the name is read back rather than assumed.
csv_filename = next(iter(uploaded))
print("âœ” File uploaded:", csv_filename)


# ============================================================
# 📌 3. LOAD DATASET
# ============================================================

# Read the uploaded CSV into a DataFrame.
df = pd.read_csv(csv_filename)

# Expected columns:
#   statement -> input text
#   status    -> output label
# Keep only those two columns, drop rows missing either value, and renumber the
# rows so the index is contiguous again after the drop.
df = df[['statement', 'status']].dropna().reset_index(drop=True)

print("âœ” Dataset loaded with shape:", df.shape)
# A bare `df.head()` in the middle of a cell is evaluated and thrown away (only
# the last expression of a cell is displayed), so print the preview explicitly.
print(df.head())


# ============================================================
# 📌 4. ENCODE LABELS
# ============================================================

# Map every distinct `status` value to an integer class id stored in `label`.
le = LabelEncoder()
df["label"] = le.fit_transform(df["status"])

# The encoder assigns ids by position in `le.classes_`, so enumerating the
# classes yields the same mapping as transforming them again. Keys are forced to
# plain `str` so json.dump also works when the labels are not strings.
label_mapping = {str(cls): idx for idx, cls in enumerate(le.classes_)}

print("âœ” Label Mapping:")
# A bare `label_mapping` expression mid-cell is never displayed; print it.
print(label_mapping)


# Save label mapping file so predictions can be decoded outside this notebook.
with open("text_model_label_mapping.json", "w", encoding="utf-8") as f:
    json.dump(label_mapping, f, indent=4)

print("âœ” Label mapping saved â†’ text_model_label_mapping.json")


# ============================================================
# 📌 5. PREPARE HUGGINGFACE DATASET
# ============================================================

# Convert the text/label columns into a Hugging Face Dataset.
# preserve_index=False: rows were removed with dropna() earlier, so the
# DataFrame index may have gaps; without this flag from_pandas would carry that
# index along as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df[['statement', 'label']], preserve_index=False)

# Checkpoint used for both the tokenizer and (later) the classifier weights.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(batch):
    """Tokenize the ``statement`` field of a batch.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        batch: Mapping that provides the texts under the ``"statement"`` key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = batch["statement"]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize in batches, drop the raw text, and expose the target under the
# column name `labels`, returned as torch tensors.
dataset = dataset.map(preprocess, batched=True)
dataset = dataset.remove_columns(["statement"])
dataset = dataset.rename_column("label", "labels")
dataset.set_format("torch")

# Hold out 10% for evaluation. The seed is fixed so the same rows end up in the
# held-out set on every run; without it, results are not comparable across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = dataset["train"]
test_ds = dataset["test"]

print("âœ” Dataset tokenized and split.")


# ============================================================
# 📌 6. LOAD MODEL
# ============================================================

# One output unit per distinct label seen by the encoder.
num_labels = len(le.classes_)

# Store the human-readable class names in the model config so the saved model
# reports real label names instead of generic LABEL_0, LABEL_1, ... and can be
# used without the separate JSON mapping file.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

print("âœ” Model loaded with", num_labels, "output classes.")


# ============================================================
# 📌 7. TRAINING SETUP (Colab compatible)
# ============================================================

def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids``.
            Assumes ``predictions`` holds one score per class in its last
            axis (the classifier logits) — TODO confirm if the model changes.

    Returns:
        dict with a single ``"accuracy"`` entry in the range [0, 1].
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float((predicted == eval_pred.label_ids).mean())}


training_args = TrainingArguments(
    output_dir="text_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    logging_steps=50,
    learning_rate=3e-5,
    do_eval=True,
    # Without an explicit strategy the Trainer never evaluates during training
    # and `eval_steps` is silently unused (the training log showed only
    # "Training Loss"). NOTE: older transformers releases spell this argument
    # `evaluation_strategy`.
    eval_strategy="steps",
    save_steps=5000,
    eval_steps=5000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    # Report accuracy in addition to the evaluation loss.
    compute_metrics=compute_metrics,
)

print("âœ” Trainer initialized.")


# ============================================================
# 📌 8. TRAIN MODEL
# ============================================================

# Run the fine-tuning loop configured on the Trainer.
trainer.train()

print("âœ” Training complete!")

# Score the final weights on the held-out split the Trainer was given; otherwise
# that split is created but the finished model is never measured against it.
eval_metrics = trainer.evaluate()
print("Evaluation metrics:", eval_metrics)


# ============================================================
# 📌 9. SAVE MODEL + TOKENIZER
# ============================================================

# Write the fine-tuned weights first, then the tokenizer files, into the same
# directory so the folder can be loaded as a single unit.
for artifact in (model, tokenizer):
    artifact.save_pretrained("text_model")

print("âœ” Model saved to text_model/")


# ============================================================
# 📌 10. ZIP AND DOWNLOAD MODEL
# ============================================================

!zip -r text_model.zip text_model
files.download("text_model.zip")
files.download("text_model_label_mapping.json")

print("ðŸŽ‰ All files ready for download!")


âœ” Dependencies installed.
Combined Data.csv


Saving Combined Data.csv to Combined Data (5).csv
âœ” File uploaded: Combined Data (5).csv
âœ” Dataset loaded with shape: (52681, 2)
âœ” Label Mapping:
âœ” Label mapping saved â†’ text_model_label_mapping.json


Map:   0%|          | 0/52681 [00:00<?, ? examples/s]

âœ” Dataset tokenized and split.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


âœ” Model loaded with 7 output classes.
âœ” Trainer initialized.


Step,Training Loss
50,1.566
100,1.2537
150,1.1231
200,0.983
250,0.9191
300,0.8339
350,0.7797
400,0.7975
450,0.8275
500,0.8258


âœ” Training complete!
âœ” Model saved to text_model/
updating: text_model/ (stored 0%)
updating: text_model/runs/ (stored 0%)
updating: text_model/runs/Nov15_07-40-53_a74a79127f12/ (stored 0%)
updating: text_model/runs/Nov15_07-40-53_a74a79127f12/events.out.tfevents.1763192470.a74a79127f12.489.0 (deflated 70%)
updating: text_model/checkpoint-15000/ (stored 0%)
updating: text_model/checkpoint-15000/scheduler.pt (deflated 61%)
updating: text_model/checkpoint-15000/rng_state.pth (deflated 26%)
updating: text_model/checkpoint-15000/optimizer.pt (deflated 23%)
updating: text_model/checkpoint-15000/trainer_state.json (deflated 78%)
updating: text_model/checkpoint-15000/config.json (deflated 54%)
updating: text_model/checkpoint-15000/training_args.bin (deflated 53%)
updating: text_model/checkpoint-15000/model.safetensors (deflated 8%)
updating: text_model/tokenizer.json (deflated 71%)
updating: text_model/tokenizer_config.json (deflated 75%)
updating: text_model/checkpoint-5000/ (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

ðŸŽ‰ All files ready for download!
