In [2]:
import re

def clean_text(text):
    """Normalize a raw text string for tokenization.

    The text is lower-cased, every character that is neither a word
    character (letters, digits, underscore) nor whitespace is removed,
    runs of whitespace are collapsed to a single space, and leading and
    trailing whitespace is stripped.

    Args:
        text: The raw string. ``None`` and float NaN (what pandas yields
            for an empty CSV cell) are treated as missing text.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


In [3]:
from sklearn.preprocessing import LabelEncoder

def encode_labels(labels):
    """Fit a fresh ``LabelEncoder`` on ``labels`` and encode them.

    Args:
        labels: The collection of labels to fit on and transform.

    Returns:
        A ``(encoded, encoder)`` tuple: the result of ``fit_transform`` and
        the fitted encoder, which callers can keep to map predictions back
        to the original labels.
    """
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(labels), label_encoder


In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from preprocess import clean_text
from utils import encode_labels
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load data
df = pd.read_csv("data/sample_data.csv")

# Rows with a missing text or label cannot be used for supervised training;
# drop them before cleaning and encoding. reset_index returns a fresh frame,
# so the column assignments below operate on an independent DataFrame.
df = df.dropna(subset=["text", "label"]).reset_index(drop=True)
df["text"] = df["text"].apply(clean_text)

# Encode labels (the fitted encoder is kept so it can be saved with the model)
labels, encoder = encode_labels(df["label"])
df["label"] = labels

# Train-test split: 80/20 with a fixed seed so the split is reproducible
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# preserve_index=False keeps the shuffled pandas index from being stored
# as an extra column in the datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Load tokenizer + model
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the ``text`` column of a batch, truncating over-long inputs.

    Padding is intentionally NOT done here: ``Dataset.map(batched=True)``
    would pad each map batch to its own longest sequence, leaving rows of
    different lengths across map batches. Padding is applied per training
    batch by the data collator passed to the Trainer instead.
    """
    return tokenizer(batch["text"], truncation=True)


train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    # One output logit per distinct encoded label
    num_labels=df["label"].nunique()
)

# Training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="models",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=2
)

# Imported here so this cell stays self-contained
from transformers import DataCollatorWithPadding

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Pads every batch to its own longest sequence, so rows of any
    # tokenized length can be batched together.
    data_collator=DataCollatorWithPadding(tokenizer)
)

trainer.train()

# Save model + label encoder
model.save_pretrained("models/")
tokenizer.save_pretrained("models/")
import joblib
# The fitted encoder is saved next to the model so predictions can be mapped
# back to the original label values at inference time.
joblib.dump(encoder, "models/label_encoder.pkl")

print("Model training completed and saved in /models folder.")


In [None]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import joblib
from preprocess import clean_text

def predict(text, model_dir="models/"):
    """Classify a single text with the fine-tuned model saved on disk.

    The tokenizer, model and label encoder are loaded from ``model_dir``
    on every call.

    Args:
        text: Raw input string; it is passed through ``clean_text`` before
            tokenization.
        model_dir: Directory holding the saved tokenizer, model and
            ``label_encoder.pkl``. Defaults to ``"models/"``.

    Returns:
        The original label obtained by inverse-transforming the predicted
        class index with the saved encoder.
    """
    import os

    tokenizer = DistilBertTokenizerFast.from_pretrained(model_dir)
    model = DistilBertForSequenceClassification.from_pretrained(model_dir)
    # NOTE(review): joblib.load unpickles the file, which can execute code;
    # only point model_dir at artifacts you trust.
    encoder = joblib.load(os.path.join(model_dir, "label_encoder.pkl"))

    # Make inference mode explicit before running the forward pass
    model.eval()

    text = clean_text(text)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Take the argmax over the class dimension rather than over the
    # flattened tensor, so the index is always a class index.
    predicted_class = logits.argmax(dim=-1).item()

    return encoder.inverse_transform([predicted_class])[0]

# Example: classify one sample sentence and print it alongside its predicted label
if __name__ == "__main__":
    sample_text = "I feel very anxious and worried about everything."
    result = predict(sample_text)
    for caption, value in (("Input:", sample_text), ("Predicted Class:", result)):
        print(caption, value)


Epoch 1: loss=0.58, accuracy=0.82  
Epoch 2: loss=0.32, accuracy=0.91  

Model training completed and saved in /models folder.
