1. install transformers

In [None]:
!pip install transformers

2. main body of the training code

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import os
import pickle
import random
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import datetime
import transformers as tformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import pandas as pd
import torch
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Relative paths used later (dataset pickle, checkpoint to resume from) resolve against this Drive folder.
os.chdir('/content/drive/MyDrive/Dataset')

def train_model(model, train_loader, optimizer, criterion, num_epochs, writer):
    """Train ``model`` on batches of raw plot text for ``num_epochs`` epochs.

    Every batch of plot strings is encoded with the module-level ``tokenizer``,
    truncated, zero-padded to the longest sequence of the batch and passed to
    ``model`` together with the matching attention mask.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable of ``(plots, genres_labels)`` batches, where
            ``plots`` is a sequence of strings and ``genres_labels`` a tensor.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, genres_labels.float())``.
        num_epochs: Number of passes over ``train_loader``.
        writer: Logger exposing ``add_scalar`` and ``close``; it is closed
            when training finishes.
    """
    # DistilBERT only has 512 position embeddings, so longer inputs must be
    # truncated to 512 tokens (the evaluation code uses the same limit).
    max_sequence_length = 512

    model.train()

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for plots, genres_labels in train_loader:
            optimizer.zero_grad()

            # Encode each plot and cut it down to the model's maximum length.
            token_ids = [
                tokenizer.encode(plot, add_special_tokens=True)[:max_sequence_length]
                for plot in plots
            ]

            # Right-pad with id 0 up to the longest sequence in this batch.
            max_len = max(len(seq) for seq in token_ids)
            padded = np.array([seq + [0] * (max_len - len(seq)) for seq in token_ids])
            # Padding positions (id 0) are masked out of attention.
            attention_mask = np.where(padded != 0, 1, 0)

            input_ids = torch.tensor(padded)
            attention_mask = torch.tensor(attention_mask)

            outputs = model(input_ids, attention_mask)
            genres_labels = genres_labels.to(device)
            loss = criterion(outputs, genres_labels.float())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        avg_epoch_loss = epoch_loss / len(train_loader)
        writer.add_scalar('Training Loss', avg_epoch_loss, epoch+1)

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_epoch_loss:.4f}")
    writer.close()

def load_8k_set(pickle_filename='8k_dataset.pkl', batch_size=64,
                train_size=7000, test_size=1000):
    """Load the pickled movie dataset and wrap it in train/test DataLoaders.

    Each pickled entry is expected to be a mapping with a ``'plot'`` string
    and a ``'genre'`` label vector; every entry becomes a
    ``(plot, genre_tensor)`` pair.

    Args:
        pickle_filename: Path of the pickle file to read.
        batch_size: Batch size used by both loaders.
        train_size: Number of leading entries used for training.
        test_size: Number of entries following the training slice used for
            testing. If the file holds fewer entries, the slices are simply
            shorter.

    Returns:
        ``(train_loader, test_loader)``; only the training loader shuffles.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # dataset files produced by a trusted source.
    with open(pickle_filename, 'rb') as file:
        small_set = pickle.load(file)

    processed_data8k = [
        (movie['plot'], torch.tensor(movie['genre'])) for movie in small_set
    ]

    train_split = processed_data8k[:train_size]
    test_split = processed_data8k[train_size:train_size + test_size]

    train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_split, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader
# This model class represents the whole text pipeline.
class DistilBert(nn.Module):
    """Frozen pretrained encoder followed by a trainable classification head.

    The encoder is created from the module-level ``model_class`` and
    ``pretrained_weights``; all of its parameters have ``requires_grad``
    switched off. The head maps a ``c_in``-dimensional feature vector to 23
    sigmoid outputs.
    """

    def __init__(self, c_in):
        super().__init__()
        # The head is built before the encoder is loaded, and keeps the
        # attribute name ``fc`` so existing checkpoints still match.
        head_layers = [
            nn.Linear(c_in, 512, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(512, 320, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(320, 120, bias=True),
            nn.ReLU(inplace=True),
            nn.Linear(120, 23, bias=True),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*head_layers)
        self.BERTog = model_class.from_pretrained(pretrained_weights)

        self.fc = self.fc.to(device)
        self.BERTog = self.BERTog.to(device)

        # Freeze the encoder: none of its weights receive gradients.
        for weight in self.BERTog.parameters():
            weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the head's output for a batch of token ids and their mask."""
        ids_on_device = input_ids.to(device)
        mask_on_device = attention_mask.to(device)
        encoder_out = self.BERTog(ids_on_device, attention_mask=mask_on_device)

        # First element of the encoder output, taken at sequence position 0
        # (presumably the [CLS] token, since special tokens are added upstream).
        first_token_features = encoder_out[0][:, 0, :]
        return self.fc(first_token_features)

num_classes = 23
# hidden_dim = ...
num_epochs = 10
learning_rate = 0.00005
# to create a loader object
train_loader, test_loader = load_8k_set()

# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (tformer.DistilBertModel, tformer.DistilBertTokenizer, 'distilbert-base-uncased')
# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model_og = model_class.from_pretrained(pretrained_weights)
# tokenizer = AutoTokenizer.from_pretrained("zayedupal/movie-genre-prediction_distilbert-base-uncased")
# model_og = AutoModelForSequenceClassification.from_pretrained("zayedupal/movie-genre-prediction_distilbert-base-uncased")

# create a model object
model = DistilBert(768)
#################main training code#############################
# cost function and optimization choice
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#tensorboard launching
writer = SummaryWriter()
model_checkpoint_path = 'model_checkpoint_2023-08-27_21-18-33.pth'  # load previous results
if os.path.exists(model_checkpoint_path):
    model.load_state_dict(torch.load(model_checkpoint_path))
    print(f"Model parameters loaded from '{model_checkpoint_path}'")

train_model(model, train_loader, optimizer, criterion, num_epochs, writer)
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_checkpoint_path = f'/content/model_checkpoint_{current_time}.pth'

# name the checkpoints with current date and time
torch.save(model.state_dict(), model_checkpoint_path)
print(f"Model parameters saved to '{model_checkpoint_path}'")
%load_ext tensorboard
%tensorboard --logdir=runs
###############################


3.1 Testing - load essential functions and run inference on the test set

In [None]:
from sklearn.metrics import f1_score
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/Dataset')
threshold = 0.5


def binarize_predictions(predictions, threshold):
    """Turn each tensor of scores into an int tensor of 0/1 decisions.

    An element becomes 1 when it is greater than or equal to ``threshold``
    and 0 otherwise. Returns a new list with one tensor per input tensor.
    """
    return [(scores >= threshold).int() for scores in predictions]
model_checkpoint_path = 'model_checkpoint_2023-08-27_21-18-33.pth'
model = DistilBert(768)
# map_location=device lets a checkpoint saved on a GPU runtime be loaded on a
# CPU-only runtime as well.
model.load_state_dict(torch.load(model_checkpoint_path, map_location=device))
model.to(device)

# Run the model over the test set and collect raw outputs and labels per batch.
model.eval()
test_predictions = []
test_labels = []
# Sequences are truncated to 512 tokens before being passed to the model.
max_sequence_length = 512
with torch.no_grad():
    for (plots, genres_labels) in test_loader:
        # Encode each plot and cut it down to the maximum length.
        tokenized = [
            tokenizer.encode(plot, add_special_tokens=True)[:max_sequence_length]
            for plot in plots
        ]
        # Right-pad with id 0 up to the longest sequence in this batch.
        max_len = max(len(seq) for seq in tokenized)
        padded = np.array([seq + [0]*(max_len-len(seq)) for seq in tokenized])
        # Padding positions (id 0) are masked out of attention.
        attention_mask = np.where(padded != 0, 1, 0)
        input_ids = torch.tensor(padded)
        attention_mask = torch.tensor(attention_mask)
        outputs = model(input_ids, attention_mask)
        # Keep results on the CPU so they can be concatenated with the labels
        # and converted to NumPy without holding GPU memory.
        test_predictions.append(outputs.cpu())
        test_labels.append(genres_labels)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


3.2 Testing - compute F1-score

In [None]:
# NOTE(review): labels_debug / predictions_debug are not defined in the cells
# visible here — presumably kept from an earlier debugging run; confirm they
# exist before running this cell, since they replace the values collected above.
test_labels = labels_debug
test_predictions = predictions_debug
# The Chinese text in the message below means "test_labels at the start".
print('一开始test_labels\n\n\n',test_labels)
threshold = 0.5


def binarize_predictions(predictions, threshold):
    """Return a list of int tensors: 1 where a score is >= ``threshold``, else 0."""
    return [(batch_scores >= threshold).int() for batch_scores in predictions]
# Threshold every batch of scores, then merge the per-batch tensors along the
# batch dimension so labels and predictions line up row by row.
test_predictions = binarize_predictions(test_predictions, threshold)
test_predictions = torch.cat(test_predictions, dim=0)
test_labels = torch.cat(test_labels, dim=0)
print('test_labels:\n\n', test_labels)
print('test_label type:\n\n',type(test_labels))
print('test_label shape:\n\n', np.array(test_labels).shape)
print('test_predictions:\n\n', test_predictions)
# This line reports the predictions, so it is labelled as such.
print('test_predictions type:\n\n',type(test_predictions))
# Move the predictions to the CPU before converting them to NumPy.
test_predictions = test_predictions.cpu()
print('test_predictions shape:\n\n',np.array(test_predictions).shape)

# F1 over all labels with three different averaging schemes.
f1 = f1_score(test_labels, test_predictions, average="weighted")
print('weighted F1-score:', f1)
microf1 = f1_score(test_labels, test_predictions, average="micro")
print('micro F1-score:', microf1)
macrof1 = f1_score(test_labels, test_predictions, average="macro")
print('macro F1-score:', macrof1)

一开始test_labels


 [tensor([[0, 1, 0,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32), tensor([[0, 0, 0,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0],
        [0, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32), tensor([[0, 1, 1,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [1, 0, 1,  ..., 0, 0, 0]], dtype=torch.int32), tensor([[1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [1, 0, 1,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0]], dtype=torch.int32), tensor([[0, 0, 0,  ..., 0, 0,

The code that builds the 8k dataset, containing plot and genre


In [None]:
from google.colab import drive
drive.mount('/content/drive')
import os
import h5py
import numpy as np
import os
import json
import shutil
import pickle
from PIL import Image
# Relative paths used below (HDF5 file, raw JSON folder, output pickle) resolve against this Drive folder.
os.chdir('/content/drive/MyDrive/Dataset')
# Read-only handle to the HDF5 file; the 'images', 'imdb_ids' and 'genres' datasets are read from it below.
file = h5py.File('multimodal_imdb.hdf5', 'r')
import matplotlib.pyplot as plt
import torch
import cv2
from PIL import Image
import requests
import random
# NOTE: a `global` statement at module level has no effect; `classes` is a module-level name either way.
global classes
# Genre names. NOTE(review): presumably listed in the same order as the columns of the genre label vectors — confirm.
classes = ["Drama", "Comedy", "Romance", "Thriller", "Crime", "Action", "Adventure", "Horror", "Documentary", "Mystery", "Sci-Fi", "Music", "Fantasy", "Family", "Biography", "War", "History", "Animation", "Musical", "Western", "Sport", "Short", "Film-Noir"]

import json
import shutil
import pickle
from PIL import Image

# Folder holding one "<imdb_id>.json" metadata file per movie.
dataset2_folder = 'mmimdb-rawdata/dataset/'

# Datasets read from the HDF5 file.
data_I = file['images']
data_id = file['imdb_ids']
data_g = file['genres']
new_dataset = []
# Randomly pick 8000 sample indices from the full dataset.
selected_indices = random.sample(range(len(data_I)), 8000)

processed_data = []
# The .hdf5 file does not contain the natural-language plot description, so
# each selected sample is combined with the plot and director taken from its
# JSON file. Samples whose JSON lacks a director or a plot are skipped, so
# fewer than 8000 entries may end up in the pickle.
for movie_index in selected_indices:

    # Reorder the axes from (0, 1, 2) to (1, 2, 0) and min-max scale to 0..255.
    raw_image = np.array(data_I[movie_index])
    image_float = np.transpose(raw_image, (1, 2, 0)).astype(np.float32)
    lowest = np.min(image_float)
    value_range = np.max(image_float) - lowest
    if value_range > 0:
        normalized_image = (image_float - lowest) / value_range
    else:
        # A constant image would divide by zero (NaN); store it as all zeros.
        normalized_image = np.zeros_like(image_float)
    data_I_1 = (normalized_image * 255.0).astype(np.uint8)

    movie_id = str(data_id[movie_index].decode('utf-8'))

    data_g_1 = data_g[movie_index]

    json_filename = f"{movie_id}.json"
    json_filepath = os.path.join(dataset2_folder, json_filename)
    # Explicit encoding so the result does not depend on the platform default.
    with open(json_filepath, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)

    # Both fields are required; skip the movie when either is missing.
    if 'director' not in json_data or 'plot' not in json_data:
        continue
    director = json_data['director'][0]['name']
    plot = json_data['plot'][0]

    # Structure of one entry of the rearranged 8k dataset, later saved in a pickle file.
    processed_entry = {
        'image': data_I_1,
        'genre': data_g_1,
        'director': director,
        'plot': plot,
        'imdb_id': movie_id,
        'hdf5_index': movie_index
    }

    processed_data.append(processed_entry)


pickle_filename = '8k_dataset.pkl'
with open(pickle_filename, 'wb') as pickle_file:
    pickle.dump(processed_data, pickle_file)

print("it's been saved as:", pickle_filename)

