In [0]:
This notebook fine-tunes a pretrained language model (DistilBERT) to categorize sentences into job categories.

In [0]:
# Load training data: each row pairs an input text (`prompt`) with its
# target class (`occupation`).
df = spark.sql("SELECT prompt, occupation FROM dbacademy.labuser12556453_1763383765.llm_train")
display(df)

# Convert to Pandas for model training.
# Rows with a missing prompt or label are dropped up front: the tokenizer only
# accepts strings, and a null label would either fail to encode or silently
# become a class of its own.
pdf = df.toPandas().dropna(subset=["prompt", "occupation"])
if pdf.empty:
    raise ValueError("No usable training rows: every row has a null prompt or occupation.")

# Prepare training data as plain Python lists (inputs and raw string labels).
X = pdf["prompt"].tolist()
y = pdf["occupation"].tolist()

# %pip install transformers datasets mlflow

import mlflow
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Pretrained checkpoint name; the tokenizer is loaded from it here and the
# same name is reused later when the classifier is created.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Map each occupation label to an integer class id. `le` is kept around so
# predicted ids can be decoded back into occupation names.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)
num_labels = le.classes_.shape[0]

# Tokenize prompts
def preprocess(examples):
    """Tokenize the "prompt" field of a batch of examples.

    Every prompt is truncated or padded to exactly 128 tokens so all rows
    have the same length.
    """
    prompts = examples["prompt"]
    return tokenizer(
        prompts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Build the dataset from the raw prompts and encoded labels, then run the
# tokenizer over it in batches.
dataset = Dataset.from_dict({"prompt": X, "label": y_encoded}).map(preprocess, batched=True)

# Train/test split: hold out 20% of the examples for evaluation.
# The fixed seed makes the shuffle, and therefore which rows land in each
# split, identical across notebook re-runs.
train_test = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test["train"]
eval_dataset = train_test["test"]


In [0]:
# Model initialization
# Store the class-id <-> occupation mapping in the model config so that the
# saved/logged model reports occupation names instead of generic "LABEL_<n>"
# placeholders. Ids follow the order of `le.classes_`, i.e. the same ids the
# label encoder assigned to the training labels.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Training arguments
training_args = TrainingArguments(
    output_dir="/dbfs/tmp/results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=10,
    save_steps=10,
    # A checkpoint is written every 10 steps; without a cap they all
    # accumulate under output_dir. Keep only the most recent two.
    save_total_limit=2,
    # Explicit seed so training-time shuffling is reproducible across runs.
    seed=42,
    report_to="none"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Train model
trainer.train()

# Score the held-out split. With these training arguments no evaluation is
# scheduled during training, so without this call eval_dataset is never used.
eval_metrics = trainer.evaluate()
print(eval_metrics)

In [0]:
# Log the fine-tuned model together with its tokenizer to MLflow, inside a
# new run, as a text-classification transformers model.
with mlflow.start_run():
    components = {"model": model, "tokenizer": tokenizer}
    mlflow.transformers.log_model(
        transformers_model=components,
        artifact_path="llm_occupation_model",
        task="text-classification",
    )

In [0]:
# Predict occupation from new prompts
def predict_occupation(prompts):
    """Predict the occupation label for one or more prompts.

    Args:
        prompts: A single prompt string, or an iterable of prompt strings.

    Returns:
        A NumPy array of occupation labels decoded with the fitted label
        encoder `le`, one per prompt and in input order. Empty input yields
        an empty array.
    """
    import torch  # local import: needed only to disable gradient tracking

    # Accept a bare string as a batch of one; materialize other iterables.
    prompts = [prompts] if isinstance(prompts, str) else list(prompts)
    if not prompts:
        # Nothing to classify: return an empty array with the labels' dtype.
        return le.classes_[:0]

    inputs = tokenizer(prompts, truncation=True, padding="max_length", max_length=128, return_tensors="pt")
    # Training may have moved the model to an accelerator; the inputs must
    # live on the same device as the model's weights.
    inputs = inputs.to(model.device)

    # Switch off dropout so predictions are deterministic, and skip gradient
    # bookkeeping since this is inference only.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Bring the class ids back to host memory before converting to NumPy.
    preds = outputs.logits.argmax(dim=1).cpu().numpy()
    return le.inverse_transform(preds)

In [0]:
# Example usage: classify two sample prompts and show the predicted labels.
new_prompts = [
    "Write a report on financial analysis.",
    "Design a new software application.",
]
predicted_occupations = predict_occupation(new_prompts)
display(predicted_occupations)