In [None]:
# Install required package

!pip install datasets
!pip install transformers
!pip install accelerate -U

In [55]:
#Import required package

from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, Trainer
from datasets import DatasetDict, load_dataset
import torch
from pprint import pprint
import sys
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier

In [56]:
# Download the CSV from Google Drive and parse it with the `csv` dataset builder.

dataset_url = "https://drive.google.com/uc?export=download&id=1o-T99eEn_meMxH2u_Yc1J_BzWTs76Yzm"
movie_data = load_dataset(
    "csv",
    data_files=dataset_url,
    sep=",",
)

In [57]:
print(f"Number of Rows in the Dataset {len(movie_data['train'])}")

Number of Rows in the Dataset 186930


In [58]:
# The data that we are going to train on are pairs of Text and Labels.
# Preview the first five rows of the raw dataset. `movie_data` is used here because
# `movies_data` (the split dataset) is only created further down the notebook.

# Slicing returns a dict of column name -> list, so the rows are read only once
# instead of materialising the full column on every loop iteration.
preview = movie_data['train'][:5]

for i, (label, text) in enumerate(zip(preview['label'], preview['text'])):
    print(f"Data Point {i}")
    print("-" * 50)
    print(f"Label: {label}\n")
    print(f"Text: \n{text}")
    print('\n')

Data Point 0
--------------------------------------------------
Label: like

Text: 
Given the watching history of User A13H2JJ3GEMJN1:
1. "Indiana Jones: Raiders of the Lost Ark VHS"
2. "Shanghai Noon"
3. "Life With Father VHS"
4. "Hollow Man"
Would you recommend the Movie/TV "Jumanji VHS" to the User?


Data Point 1
--------------------------------------------------
Label: like

Text: 
Given the watching history of User AIBSC3DOO3UHR:
1. "Transformers: Prime - Season One"
2. "Transformers: Prime - Season Two"
3. "G.I. Joe: Retaliation"
4. "Apollo 18"
Would you recommend the Movie/TV "Star Trek Into Darkness" to the User?


Data Point 2
--------------------------------------------------
Label: like

Text: 
Given the watching history of User A3JJ21YCMGSKGH:
1. "Im No Angel VHS"
2. "Sextette VHS"
3. "Myra Breckenridge"
4. "Intimate Portrait: Mae West VHS"
Would you recommend the Movie/TV "Night After Night VHS" to the User?


Data Point 3
-------------------------------------------------

In [63]:
# Class balance: "like" is roughly 3.3 times as frequent as "dislike".
# `to_pandas()` builds a DataFrame without changing the output format of
# `movie_data`, so there is no format to reset afterwards (and nothing is left
# in pandas mode if this cell is interrupted).

print(movie_data['train'].to_pandas()['label'].value_counts())


like       143911
dislike     43019
Name: label, dtype: int64


In [64]:
# Split the data into 8:1:1 Train:Validation:Test set.
# A fixed seed makes the shuffled split reproducible across kernel restarts.
SPLIT_SEED = 42

test_split = movie_data['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)

# 1/9 of the remaining 90% equals 10% of the full dataset.
train_val_split = test_split['train'].train_test_split(test_size=1/9, seed=SPLIT_SEED)

final_splits = DatasetDict({
    'train': train_val_split['train'],
    'validation': train_val_split['test'],  # the "test" part of the second split is the validation set
    'test': test_split['test']
})

movies_data = final_splits

# Remove unnecessary columns: keep only the ones listed below.
columns_to_remove = [column for column in movies_data["train"].column_names
                     if column not in ["text", "label", "input_ids", "attention_mask"]]

# Drop the columns directly instead of running an identity `map` over every row.
if columns_to_remove:
    movies_data = movies_data.remove_columns(columns_to_remove)


Map:   0%|          | 0/149544 [00:00<?, ? examples/s]

Map:   0%|          | 0/18693 [00:00<?, ? examples/s]

Map:   0%|          | 0/18693 [00:00<?, ? examples/s]

In [65]:
# Select the pretrained checkpoint and the device to run on.
# No bare `AutoModel` is loaded here: the only model used further down is the
# `AutoModelForSequenceClassification` built from `model_ckpt`, which rebinds
# `model` before it is ever read.
model_ckpt = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [66]:
# Tokenize the whole dataset

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize(batch):
    """Tokenize the "text" column of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but are not padded
    here. Padding every example to the longest sequence of the whole split wastes
    memory and compute; the Trainer is given the tokenizer and pads each batch
    to its own longest sequence instead.

    Args:
        batch: Mapping with a "text" entry holding a list of strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["text"], truncation=True)

# Without global padding there is no need to process a whole split as one batch,
# so the default batch size of `map` is used.
movies_encoded = movies_data.map(tokenize, batched=True)

Map:   0%|          | 0/149544 [00:00<?, ? examples/s]

Map:   0%|          | 0/18693 [00:00<?, ? examples/s]

Map:   0%|          | 0/18693 [00:00<?, ? examples/s]

In [67]:
def label_to_number(label):
    """Convert a string label into its integer class id.

    Args:
        label: Either 'like' or 'dislike'.

    Returns:
        1 for 'like', 0 for 'dislike'.

    Raises:
        ValueError: If the label is neither 'like' nor 'dislike'.
    """
    if label == 'like':
        return 1
    if label == 'dislike':
        return 0
    raise ValueError('Invalid label')

# Replace the string label of every example, in each split, with its integer id.
def _encode_label(example):
    """Return the updated 'label' field for a single example."""
    return {'label': label_to_number(example['label'])}

movies_encoded = movies_encoded.map(_encode_label, batched=False)

Map:   0%|          | 0/149544 [00:00<?, ? examples/s]

Map:   0%|          | 0/18693 [00:00<?, ? examples/s]

Map:   0%|          | 0/18693 [00:00<?, ? examples/s]

In [68]:
# Display the tokenized, label-encoded DatasetDict to check its splits and columns.
movies_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 149544
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 18693
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 18693
    })
})

In [69]:
# Binary task: "dislike" (0) vs "like" (1).
num_labels = 2

# Build the classifier on top of the pretrained checkpoint, then move it to the device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
## Prepare the training arguments
batch_size = 64

# Log once per epoch; never let this fall to 0 when the train split is smaller
# than one batch.
logging_steps = max(1, len(movies_encoded["train"]) // batch_size)

# Give the fine-tuned model its own name. Using the bare checkpoint id as
# `output_dir` creates a local directory called "distilbert-base-uncased", which
# then shadows the Hub checkpoint in later `from_pretrained(model_ckpt)` calls,
# and pushes the result to a Hub repo named like the base model.
model_name = f"{model_ckpt}-finetuned-movie-recommendation"

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=5,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True,
                                  log_level="error")

In [71]:
# Define the metric

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the predicted class is the argmax over the last axis).

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [72]:
# Log in to the Hugging Face Hub. The training arguments set `push_to_hub=True`,
# so credentials are needed; `notebook_login()` shows an interactive login widget.

from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
## Train the model

# Wire the model, arguments, data splits, tokenizer and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=movies_encoded["train"],
    eval_dataset=movies_encoded["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:

# Materialise the train and validation splits as DataFrames for the baseline.
# `to_pandas()` leaves the output format of `movies_encoded` untouched, so the
# dataset cannot be left stuck in pandas mode if this cell fails half-way.
df_train = movies_encoded['train'].to_pandas()
df_valid = movies_encoded['validation'].to_pandas()

# Feature columns and target used by the baseline classifier.
feature_columns = ['input_ids', 'attention_mask']

X_train = df_train[feature_columns]
y_train = df_train['label']

X_valid = df_valid[feature_columns]
y_valid = df_valid['label']


In [None]:
## Baseline for comparison: always predict the most frequent training label.

dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

# Accuracy of the majority-vote baseline on the validation split.
dummy_clf.score(X_valid, y_valid)