In [29]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [30]:
cd /content/gdrive/MyDrive/NLP/Assignment 3

/content/gdrive/MyDrive/NLP/Assignment 3


In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from datasets import load_metric
import numpy as np

In [32]:
# Load the dataset
df = pd.read_csv('music_sentiment_dataset.csv')

In [33]:
# Display the first 5 rows
df.head()

Unnamed: 0,User_ID,User_Text,Sentiment_Label,Recommended_Song_ID,Song_Name,Artist,Genre,Tempo (BPM),Mood,Energy,Danceability
0,U1,Way ball purpose public experience recently re...,Sad,S1,Someone Like You,Adele,Pop,67,Melancholic,Low,Low
1,U2,Save officer two myself a.,Happy,S2,Happy,Pharrell Williams,Pop,160,Joyful,High,High
2,U3,Decade ahead everyone environment themselves a...,Relaxed,S3,Clair de Lune,Debussy,Classical,60,Soothing,Low,Low
3,U4,Best change letter citizen try ask quality pro...,Happy,S4,Happy,Pharrell Williams,Pop,160,Joyful,High,High
4,U5,Worker player chance kind actually.,Happy,S5,Happy,Pharrell Williams,Pop,160,Joyful,High,High


In [34]:
# Map sentiments to numerical labels
sentiment_mapping = {'Happy': 0, 'Sad': 1, 'Motivated': 2, 'Relaxed': 3}
df['label'] = df['Sentiment_Label'].map(sentiment_mapping)

In [35]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [36]:
# Tokenize text
def tokenize_function(examples):
    # Ensure the input is treated as a list of strings for batch processing
    return tokenizer(examples.tolist(), padding='max_length', truncation=True, max_length=128)

# Pass the 'User_Text' column (as a pandas Series, which can be converted to a list) to the function
tokenized_data = tokenize_function(df['User_Text'])

In [37]:
from transformers import BertForSequenceClassification, DistilBertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader

In [38]:
# Create custom dataset
class MusicSentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [39]:
# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['User_Text'], df['label'], test_size=0.2)

In [40]:
# Tokenize
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

In [41]:
# Create datasets
train_dataset = MusicSentimentDataset(train_encodings, train_labels.tolist())
val_dataset = MusicSentimentDataset(val_encodings, val_labels.tolist())

In [42]:
# Load teacher model (original BERT)
teacher_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# Load student model (DistilBERT)
student_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=4)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
from transformers import TrainingArguments, Trainer
import numpy as np
from datasets import load_metric

In [45]:
# Define distillation trainer
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        self.teacher.eval()

    # Added num_items_in_batch to the method signature
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        outputs_stu = model(**inputs)
        loss_ce = outputs_stu.loss
        logits_stu = outputs_stu.logits

        with torch.no_grad():
            outputs_tea = self.teacher(**inputs)
            logits_tea = outputs_tea.logits

        loss_fct = torch.nn.KLDivLoss(reduction='batchmean')
        loss_kd = loss_fct(
            torch.nn.functional.log_softmax(logits_stu/2.0, dim=-1),
            torch.nn.functional.softmax(logits_tea/2.0, dim=-1)) * (2.0 ** 2)

        loss = 0.7 * loss_ce + 0.3 * loss_kd
        return (loss, outputs_stu) if return_outputs else loss

In [46]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,                   # Increase from 3 to 5+ for better accuracy
    per_device_train_batch_size=8,        # Smaller batch may help generalization
    per_device_eval_batch_size=8,
    warmup_steps=100,                     # Reduced for smaller data
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch"
)

In [47]:
# Initialize trainer
trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    teacher_model=teacher_model
)

API key: 090b8324bd8b423d6d5429a151560b24b898b7c9

In [48]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.0019,0.97836
2,0.951,0.992912
3,0.7581,1.064177
4,0.5677,1.234452
5,0.4132,1.263115


TrainOutput(global_step=500, training_loss=0.7526095323562622, metrics={'train_runtime': 888.5839, 'train_samples_per_second': 4.502, 'train_steps_per_second': 0.563, 'total_flos': 11384323104000.0, 'train_loss': 0.7526095323562622, 'epoch': 5.0})

In [49]:
!pip install evaluate



In [50]:
import evaluate

accuracy_metric = evaluate.load("accuracy")

In [51]:
# Evaluate the model

accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    return accuracy

In [52]:
trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    teacher_model=teacher_model
)

results = trainer.evaluate()
print(results)

{'eval_loss': 1.2631150484085083, 'eval_model_preparation_time': 0.0208, 'eval_accuracy': 0.27, 'eval_runtime': 31.8398, 'eval_samples_per_second': 6.281, 'eval_steps_per_second': 0.785}


In [53]:
# Quantize the model
quantized_model = torch.quantization.quantize_dynamic(
    student_model, {torch.nn.Linear}, dtype=torch.qint8)

In [54]:
train_results = trainer.evaluate(eval_dataset=train_dataset)
print(f"Training Accuracy: {train_results['eval_accuracy'] * 100:.2f}%")


Training Accuracy: 98.88%


In [55]:
def recommend_song(text, model, tokenizer, df):
    # Tokenize input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Remove token_type_ids if present, as DistilBERT doesn't use it
    if 'token_type_ids' in inputs:
        del inputs['token_type_ids']

    # Get prediction
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted sentiment
    predicted_label = torch.argmax(outputs.logits).item()
    sentiment = list(sentiment_mapping.keys())[list(sentiment_mapping.values()).index(predicted_label)]

    # Filter songs by predicted sentiment
    recommended_songs = df[df['Sentiment_Label'] == sentiment]

    # Return random song from matching sentiment
    if not recommended_songs.empty:
        return recommended_songs.sample(1)[['Song_Name', 'Artist', 'Genre']].to_dict('records')[0]
    else:
        return {"Song_Name": "No recommendation", "Artist": "", "Genre": ""}


In [56]:
# Example usage
sample_text = "I'm feeling Happy today!"
recommendation = recommend_song(sample_text, quantized_model, tokenizer, df)
print(recommendation)

{'Song_Name': 'Uptown Funk', 'Artist': 'Bruno Mars', 'Genre': 'Funk'}
