In [None]:
# Install the libraries imported by later cells: transformers (tokenizer, model,
# Trainer), datasets (Dataset) and scikit-learn (split, label encoding, metrics).
# --quiet keeps pip's output short.
%pip install transformers datasets scikit-learn --quiet

In [None]:
# Install accelerate, version 0.26.0 or newer.
# NOTE(review): presumably a requirement of the transformers Trainer used in the
# training cell below - confirm, and consider pinning an exact version.
%pip install 'accelerate>=0.26.0'

In [None]:
# Install pyspark, which provides the SparkSession imported by the data-loading cell.
# NOTE(review): if this notebook runs on a cluster that already ships Spark, this
# install may be unnecessary - confirm before keeping it.
%pip install pyspark

In [None]:
from pyspark.sql import SparkSession
import pandas as pd

# Read the prompt/occupation pairs out of the Delta table through the Spark
# session (an existing session is reused, otherwise one is created).
spark = SparkSession.builder.getOrCreate()
train_query = "SELECT prompt, occupation FROM dbacademy.labuser11975435_1759780254.train"
df = spark.sql(train_query)

# Materialise the query result as a pandas DataFrame so the Hugging Face
# tooling in the following cells can consume it.
df_pd = df.toPandas()

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Hold out 20% of the rows for validation. The split is stratified on the
# occupation column and uses a fixed seed so it is repeatable.
train_df, val_df = train_test_split(
    df_pd,
    test_size=0.2,
    random_state=42,
    stratify=df_pd['occupation'],
)

# Wrap each pandas split in a Hugging Face Dataset
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

In [None]:
# Install PyTorch; it is imported as `torch` by the inference cell at the end.
# NOTE(review): presumably also the backend the transformers model runs on - confirm.
%pip install torch

In [None]:
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder

# Checkpoint shared by the tokenizer here and the classifier created later
model_checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Turn occupation names into integer class ids. The encoder is fitted on the
# training split only and then reused, unchanged, for the validation split.
le = LabelEncoder()
train_labels = le.fit_transform(train_dataset['occupation'])
val_labels = le.transform(val_dataset['occupation'])

# Attach the ids under the 'labels' column
train_dataset = train_dataset.add_column('labels', train_labels)
val_dataset = val_dataset.add_column('labels', val_labels)

# Tokenize prompts
def tokenize_function(examples):
    """Tokenize the ``'prompt'`` entry of a batch with the notebook's tokenizer.

    Each prompt is truncated and padded so that it is exactly 128 tokens long.

    Args:
        examples: Mapping with a ``'prompt'`` entry holding the text to encode.

    Returns:
        The tokenizer's output for those prompts.
    """
    prompts = examples['prompt']
    encoding_options = {'truncation': True, 'padding': 'max_length', 'max_length': 128}
    return tokenizer(prompts, **encoding_options)

# Apply the tokenizer to both splits, a batch of rows at a time
map_options = {'batched': True}
train_dataset = train_dataset.map(tokenize_function, **map_options)
val_dataset = val_dataset.map(tokenize_function, **map_options)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# One output class per occupation seen by the label encoder
num_labels = len(le.classes_)

# Store the class-id <-> occupation-name mapping in the model config, so the
# saved checkpoint is self-describing and does not report generic LABEL_n names.
# The ids follow the order of le.classes_, i.e. the ids the encoder produced.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Training configuration, collected in one place so the knobs are easy to scan
training_config = {
    # where checkpoints and logs are written
    'output_dir': './results',
    'logging_dir': './logs',
    'logging_steps': 10,
    'report_to': 'none',
    # optimisation settings
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'num_train_epochs': 2,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    # evaluate and checkpoint once per epoch, then restore the checkpoint with
    # the best accuracy when training finishes
    'eval_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'accuracy',
}
training_args = TrainingArguments(**training_config)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row is
            the argmax of its logits along the last axis.

    Returns:
        dict with ``'accuracy'`` and the weighted-average ``'f1'``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted_ids)
    weighted_f1 = f1_score(references, predicted_ids, average='weighted')
    return {'accuracy': accuracy, 'f1': weighted_f1}

# Bring the model, its configuration, both data splits and the metric
# function together in a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning; left as the last expression so the cell shows its result
trainer.train()

In [None]:
import joblib

# Score the trained model on the validation split and report the metrics
results = trainer.evaluate()
print('Validation Results:', results)

# Persist everything needed for later inference in one directory: the model and
# tokenizer first (both written with save_pretrained), then the fitted label
# encoder next to them.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./llm_occupation_model')
joblib.dump(le, './llm_occupation_model/label_encoder.joblib')
print('Model and label encoder saved.')

In [None]:
import pandas as pd
import torch

# Select 10 example prompts from the validation set
example_prompts = val_df['prompt'].iloc[:10].tolist()

# Tokenize the example prompts with the same settings used for training
inputs = tokenizer(example_prompts, truncation=True, padding='max_length', max_length=128, return_tensors='pt')

# Move inputs to the same device as the model
device = model.device
inputs = {k: v.to(device) for k, v in inputs.items()}

# Put the model in evaluation mode explicitly so dropout is disabled no matter
# which cell ran last (e.g. when this cell is executed right after training).
model.eval()

# Get model predictions; gradients are not needed for inference
with torch.no_grad():
    outputs = model(**inputs)
    preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()

# Decode predicted class ids back to occupation names
predicted_occupations = le.inverse_transform(preds)

# Display prompts and their predicted occupations
results_df = pd.DataFrame({
    'prompt': example_prompts,
    'predicted_occupation': predicted_occupations
})
display(results_df)