In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import numpy as np
from sklearn.metrics import classification_report

# Read the question/category table from disk
data = pd.read_csv("Movie_Questions_Base.csv")

# Model input is the question text; the target is its category
X = data["question"]
y = data["category"]

# Turn the category names into integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for testing, stratified on the encoded labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# One frame per split, using the 'text' / 'label' column names the later cells expect
train_data = X_train.to_frame(name="text").assign(label=y_train)
test_data = X_test.to_frame(name="text").assign(label=y_test)

# Tokenizer shipped with the DistilBERT checkpoint that is fine-tuned further down
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Every sequence is truncated and padded to a fixed length of 128 tokens.

    Args:
        examples: Mapping with a ``'text'`` key (a batch when used with ``map(batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples['text'], **encoding_options)

def _to_trainer_dataset(frame):
    """Convert a ``text``/``label`` DataFrame into a tokenized, torch-formatted Dataset.

    Args:
        frame: DataFrame with a ``'text'`` column and a ``'label'`` column.

    Returns:
        A ``Dataset`` holding the tokenizer outputs plus a ``'labels'`` column,
        formatted to yield PyTorch tensors.
    """
    # The split frames keep the shuffled index of the original data; without
    # preserve_index=False it would be stored as an extra '__index_level_0__' column.
    dataset = Dataset.from_pandas(frame, preserve_index=False)

    # Tokenize in batches using the notebook's tokenize_function
    dataset = dataset.map(tokenize_function, batched=True)

    # Drop the raw text and rename the target column to 'labels'
    dataset = dataset.remove_columns(['text']).rename_column("label", "labels")
    dataset.set_format("torch")
    return dataset


# Build both splits through the same pipeline so they cannot drift apart
train_dataset = _to_trainer_dataset(train_data)
test_dataset = _to_trainer_dataset(test_data)



  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 747/747 [00:00<00:00, 2578.51 examples/s]
Map: 100%|██████████| 187/187 [00:00<00:00, 4616.50 examples/s]


In [3]:
# Map class indices to the original category names so the model config (and
# therefore any checkpoint saved from it) carries readable labels instead of
# the generic LABEL_0..LABEL_n placeholders.
id2label = {index: str(name) for index, name in enumerate(label_encoder.classes_)}
label2id = {name: index for index, name in id2label.items()}

# Load the pre-trained model. The checkpoint's classifier head has a different
# number of outputs, so ignore_mismatched_sizes=True lets a fresh head be created.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# Hyper-parameters and bookkeeping options for the fine-tuning run
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint on the same per-epoch schedule
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # After training, reload the best checkpoint as judged by the "accuracy" metric
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Fixed seed for the run
    seed=42,
)

# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Return the fraction of examples whose arg-max logit equals the reference label.

    Args:
        eval_pred: A ``(logits, labels)`` pair; it is unpacked into exactly two items.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the mean of the
        element-wise match between predicted and reference class ids.
    """
    scores, references = eval_pred
    hits = np.argmax(scores, axis=-1) == references
    return {"accuracy": hits.mean()}

# Wire the model, data splits, tokenizer and metric callback into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # The held-out split doubles as the evaluation set during training
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run fine-tuning; left as the last expression so the cell displays its return value
trainer.train()



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  8%|▊         | 11/141 [00:01<00:18,  6.88it/s]

{'loss': 1.2464, 'grad_norm': 4.882848262786865, 'learning_rate': 1.8581560283687945e-05, 'epoch': 0.21}


 15%|█▍        | 21/141 [00:03<00:16,  7.26it/s]

{'loss': 1.0259, 'grad_norm': 5.986934661865234, 'learning_rate': 1.716312056737589e-05, 'epoch': 0.43}


 22%|██▏       | 31/141 [00:04<00:15,  7.31it/s]

{'loss': 0.8713, 'grad_norm': 4.915693283081055, 'learning_rate': 1.5744680851063832e-05, 'epoch': 0.64}


 29%|██▉       | 41/141 [00:06<00:14,  7.11it/s]

{'loss': 0.6996, 'grad_norm': 2.589223623275757, 'learning_rate': 1.4326241134751775e-05, 'epoch': 0.85}


                                                
 33%|███▎      | 47/141 [00:07<00:11,  7.89it/s]

{'eval_loss': 0.5043683648109436, 'eval_accuracy': 0.893048128342246, 'eval_runtime': 0.5088, 'eval_samples_per_second': 367.564, 'eval_steps_per_second': 23.587, 'epoch': 1.0}


 36%|███▌      | 51/141 [00:08<00:24,  3.74it/s]

{'loss': 0.6382, 'grad_norm': 5.007660865783691, 'learning_rate': 1.2907801418439719e-05, 'epoch': 1.06}


 43%|████▎     | 61/141 [00:10<00:11,  7.20it/s]

{'loss': 0.4935, 'grad_norm': 2.2139081954956055, 'learning_rate': 1.1489361702127662e-05, 'epoch': 1.28}


 50%|█████     | 71/141 [00:11<00:09,  7.51it/s]

{'loss': 0.3853, 'grad_norm': 2.2463111877441406, 'learning_rate': 1.0070921985815602e-05, 'epoch': 1.49}


 57%|█████▋    | 81/141 [00:12<00:08,  7.09it/s]

{'loss': 0.3125, 'grad_norm': 2.139033794403076, 'learning_rate': 8.652482269503547e-06, 'epoch': 1.7}


 65%|██████▍   | 91/141 [00:14<00:06,  7.35it/s]

{'loss': 0.2514, 'grad_norm': 4.168227672576904, 'learning_rate': 7.234042553191491e-06, 'epoch': 1.91}


                                                
 67%|██████▋   | 94/141 [00:15<00:05,  7.92it/s]

{'eval_loss': 0.2076019048690796, 'eval_accuracy': 0.9679144385026738, 'eval_runtime': 0.509, 'eval_samples_per_second': 367.355, 'eval_steps_per_second': 23.574, 'epoch': 2.0}


 72%|███████▏  | 101/141 [00:16<00:07,  5.48it/s]

{'loss': 0.2418, 'grad_norm': 1.7944506406784058, 'learning_rate': 5.815602836879432e-06, 'epoch': 2.13}


 79%|███████▊  | 111/141 [00:18<00:04,  6.95it/s]

{'loss': 0.2441, 'grad_norm': 2.823513984680176, 'learning_rate': 4.397163120567377e-06, 'epoch': 2.34}


 86%|████████▌ | 121/141 [00:19<00:02,  7.18it/s]

{'loss': 0.1807, 'grad_norm': 3.9511022567749023, 'learning_rate': 2.978723404255319e-06, 'epoch': 2.55}


 93%|█████████▎| 131/141 [00:21<00:01,  7.18it/s]

{'loss': 0.174, 'grad_norm': 3.066305637359619, 'learning_rate': 1.5602836879432626e-06, 'epoch': 2.77}


 99%|█████████▉| 140/141 [00:22<00:00,  7.11it/s]

{'loss': 0.1349, 'grad_norm': 2.0415124893188477, 'learning_rate': 1.4184397163120568e-07, 'epoch': 2.98}


                                                 
100%|██████████| 141/141 [00:23<00:00,  7.11it/s]

{'eval_loss': 0.15967796742916107, 'eval_accuracy': 0.9625668449197861, 'eval_runtime': 0.4889, 'eval_samples_per_second': 382.479, 'eval_steps_per_second': 24.544, 'epoch': 3.0}


100%|██████████| 141/141 [00:24<00:00,  5.72it/s]

{'train_runtime': 24.6511, 'train_samples_per_second': 90.909, 'train_steps_per_second': 5.72, 'train_loss': 0.49021066101730293, 'epoch': 3.0}





TrainOutput(global_step=141, training_loss=0.49021066101730293, metrics={'train_runtime': 24.6511, 'train_samples_per_second': 90.909, 'train_steps_per_second': 5.72, 'total_flos': 74217507130368.0, 'train_loss': 0.49021066101730293, 'epoch': 3.0})

In [4]:
# Score the model on the evaluation split and show the metric dictionary
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Run inference on the test split and take the highest-scoring class per row
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(axis=1)

# Translate both the reference ids and the predicted ids back to category names
true_categories, predicted_categories = (
    label_encoder.inverse_transform(class_ids)
    for class_ids in (y_test, predicted_labels)
)

# Per-category precision / recall / F1 summary
report = classification_report(true_categories, predicted_categories)
print("Classification Report:")
print(report)

100%|██████████| 12/12 [00:00<00:00, 32.28it/s]


Evaluation Results: {'eval_loss': 0.2076019048690796, 'eval_accuracy': 0.9679144385026738, 'eval_runtime': 0.6131, 'eval_samples_per_second': 305.025, 'eval_steps_per_second': 19.574, 'epoch': 3.0}


100%|██████████| 12/12 [00:00<00:00, 30.98it/s]

Classification Report:
                precision    recall  f1-score   support

       Factual       0.93      0.96      0.94        52
    Multimedia       1.00      1.00      1.00        41
Recommendation       1.00      1.00      1.00        42
     Unrelated       0.96      0.92      0.94        52

      accuracy                           0.97       187
     macro avg       0.97      0.97      0.97       187
  weighted avg       0.97      0.97      0.97       187






In [5]:
# Write the model and its tokenizer into one shared directory
SAVE_DIR = "./movie_question_classifier"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)


('./movie_question_classifier\\tokenizer_config.json',
 './movie_question_classifier\\special_tokens_map.json',
 './movie_question_classifier\\vocab.txt',
 './movie_question_classifier\\added_tokens.json')

In [18]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import numpy as np

# Restore the classifier and tokenizer from the directory the save cell wrote to
SAVED_MODEL_DIR = "./movie_question_classifier"
loaded_model = DistilBertForSequenceClassification.from_pretrained(SAVED_MODEL_DIR)
loaded_tokenizer = DistilBertTokenizer.from_pretrained(SAVED_MODEL_DIR)

# Function to perform inference
def classify_question(question):
    """Predict the class index of a question with the reloaded classifier.

    Uses the notebook-level ``loaded_tokenizer`` and ``loaded_model``.

    Args:
        question: Text to classify; passed straight to ``loaded_tokenizer``.

    Returns:
        The index of the highest logit for the first row of the batch, as a
        NumPy integer.
    """
    # Tokenize the input question (truncated to at most 128 tokens, PyTorch tensors)
    inputs = loaded_tokenizer(question, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Keep the inputs on the same device as the model so the forward pass also
    # works when the model has been moved off the CPU.
    inputs = inputs.to(loaded_model.device)

    # Perform inference without tracking gradients
    with torch.no_grad():
        logits = loaded_model(**inputs).logits

    # .cpu() is a no-op for CPU tensors and is required before .numpy() otherwise
    predicted_label = np.argmax(logits.cpu().numpy(), axis=1)[0]
    return predicted_label

# Example inference: classify one hand-written question with classify_question
new_question = "What is the box office for The Godfather and The Godfather II"
predicted_label = classify_question(new_question)

# Map label index to category.
# NOTE: this uses `label_encoder`, which is fitted in the data-preparation cell,
# so this cell only works in a kernel where that cell has already been run.
predicted_category = label_encoder.inverse_transform([predicted_label])
print(f"Predicted Category: {predicted_category[0]}")


Predicted Category: Recommendation
