In [1]:
import numpy as numpy
import math
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import torch
import os

In [2]:
# Read the train and test movie tables with one shared pipeline. Each file is
# '::'-delimited with three fields; the movie id becomes the index (renamed
# 'ID') and the '|'-joined genre string is split into a list of genre names.
movies_train, movies_test = (
    pd.read_csv(
        split_path,
        sep='::',
        engine='python',  # a multi-character separator needs the python parser
        names=['movieid', 'title', 'genre'],
        encoding='ISO-8859-1',
        index_col=False,
    )
    .set_index('movieid')
    .assign(genre=lambda frame: frame['genre'].str.split('|'))
    .rename_axis('ID')
    for split_path in (
        'ml1m/content/dataset/movies_train.dat',
        'ml1m/content/dataset/movies_test.dat',
    )
)

In [3]:
# Load the genre vocabulary: one genre name per line, file order preserved.
# `genre_all` keeps the raw lines; `genres` holds them with surrounding
# whitespace (including the newline) removed.
with open('ml1m/content/dataset/genres.txt', 'r') as genre_file:
    genre_all = list(genre_file)
genres = list(map(str.strip, genre_all))

In [4]:
def preprocess(df, path='ml1m/content/dataset/ml1m-images', genres=genres) -> pd.DataFrame:
    """Attach poster paths and multi-hot genre labels to a movie table.

    Parameters
    ----------
    df : pd.DataFrame
        Movie table whose index labels are used to build the image file names
        and whose ``genre`` column holds, per row, a collection of genre names.
    path : str
        Directory containing the images; each row gets ``<path>/<index label>.jpg``.
    genres : list of str
        Ordered genre vocabulary. Position ``k`` of each label vector is 1 when
        ``genres[k]`` appears in the row's ``genre`` value, otherwise 0.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh 0..n-1 index in which ``genre`` is replaced by
        the ``img_path`` and ``label`` columns.

    Notes
    -----
    The input frame is not modified, so the calling cell can be re-run safely
    (the previous in-place ``drop`` removed ``genre`` from the caller's frame
    and made a second call fail).
    """
    out = df.copy()
    # Build the paths straight from the index labels; unlike a row-wise
    # DataFrame.apply this also behaves for an empty frame.
    out['img_path'] = [os.path.join(path, str(movie_id) + '.jpg') for movie_id in out.index]
    out['label'] = out['genre'].apply(
        lambda row_genres: [1 if genre in row_genres else 0 for genre in genres]
    )
    return out.drop(columns=['genre']).reset_index(drop=True)

# Run the same preprocessing over both splits (train first, then test).
trainset, testset = map(preprocess, (movies_train, movies_test))

In [5]:
# Drop repeated titles, then renumber the rows. Later cells address rows as
# trainset['title'][i] for i in range(len(trainset)), which is a label lookup:
# without renumbering after the drop, any removed duplicate leaves a gap in
# the index and those lookups raise KeyError or miss the last rows.
trainset = (
    trainset
    .drop_duplicates(subset=['title'])
    .reset_index(drop=True)
)

# Remove a trailing "(YYYY)" release year from the title. Matching the pattern
# instead of blindly slicing off six characters gives the same result for
# titles that end in a year, leaves other titles intact, and makes the cell
# safe to re-run. Anything before the year (e.g. a separating space) is kept.
trainset['title'] = trainset['title'].str.replace(r'\(\d{4}\)$', '', regex=True)
trainset

Unnamed: 0,title,img_path,label
0,Washington Square,ml1m/content/dataset/ml1m-images\1650.jpg,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,"Net, The",ml1m/content/dataset/ml1m-images\185.jpg,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Batman Returns,ml1m/content/dataset/ml1m-images\1377.jpg,"[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
3,"Boys from Brazil, The",ml1m/content/dataset/ml1m-images\3204.jpg,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Dear Jesse,ml1m/content/dataset/ml1m-images\1901.jpg,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
3101,Analyze This,ml1m/content/dataset/ml1m-images\2539.jpg,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3102,"Face in the Crowd, A",ml1m/content/dataset/ml1m-images\3038.jpg,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3103,Heaven's Burning,ml1m/content/dataset/ml1m-images\1832.jpg,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, ..."
3104,Yankee Zulu,ml1m/content/dataset/ml1m-images\657.jpg,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [6]:
# Drop repeated titles, then renumber the rows. Later cells address rows as
# testset['title'][i] for i in range(len(testset)), which is a label lookup:
# without renumbering after the drop, any removed duplicate leaves a gap in
# the index and those lookups raise KeyError or miss the last rows.
testset = (
    testset
    .drop_duplicates(subset=['title'])
    .reset_index(drop=True)
)

# Remove a trailing "(YYYY)" release year from the title. Matching the pattern
# instead of blindly slicing off six characters gives the same result for
# titles that end in a year, leaves other titles intact, and makes the cell
# safe to re-run. Anything before the year (e.g. a separating space) is kept.
testset['title'] = testset['title'].str.replace(r'\(\d{4}\)$', '', regex=True)
testset

Unnamed: 0,title,img_path,label
0,"Great Muppet Caper, The",ml1m/content/dataset/ml1m-images\3397.jpg,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Doctor Zhivago,ml1m/content/dataset/ml1m-images\2067.jpg,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
2,Frankenstein Meets the Wolf Man,ml1m/content/dataset/ml1m-images\2651.jpg,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,For Your Eyes Only,ml1m/content/dataset/ml1m-images\2989.jpg,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,"Mirror, The (Zerkalo)",ml1m/content/dataset/ml1m-images\3415.jpg,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
...,...,...,...
772,"Inheritors, The (Die Siebtelbauern)",ml1m/content/dataset/ml1m-images\2309.jpg,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
773,"Karate Kid, Part II, The",ml1m/content/dataset/ml1m-images\2421.jpg,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, ..."
774,"League of Their Own, A",ml1m/content/dataset/ml1m-images\3255.jpg,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
775,Algiers,ml1m/content/dataset/ml1m-images\974.jpg,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."


In [7]:
# Move a trailing ', The' to the front of the title ("Net, The" -> "The Net").
# Done as one vectorized replace rather than trainset['title'][i] = ...:
#   * chained indexing assigns through a temporary Series (SettingWithCopy
#     warning, and no effect at all under pandas copy-on-write);
#   * [i] is a label lookup, so it breaks when the index is not 0..n-1;
#   * trailing whitespace after ', The' (e.g. left behind when the year was
#     cut off) no longer hides the match; it is kept at the end of the title.
# Titles where ', The' is followed by other text are left unchanged, as before.
trainset['title'] = trainset['title'].str.replace(
    r'^(.*), The(\s*)$', r'The \1\2', regex=True
)

# Placeholder column, filled with generated text later in the notebook.
trainset['context'] = ''

In [8]:
# Move a trailing ', The' to the front of the title ("Net, The" -> "The Net").
# Done as one vectorized replace rather than testset['title'][i] = ...:
#   * chained indexing assigns through a temporary Series (SettingWithCopy
#     warning, and no effect at all under pandas copy-on-write);
#   * [i] is a label lookup, so it breaks when the index is not 0..n-1;
#   * trailing whitespace after ', The' (e.g. left behind when the year was
#     cut off) no longer hides the match; it is kept at the end of the title.
# Titles where ', The' is followed by other text are left unchanged, as before.
testset['title'] = testset['title'].str.replace(
    r'^(.*), The(\s*)$', r'The \1\2', regex=True
)

# Placeholder column, filled with generated text later in the notebook.
testset['context'] = ''

In [9]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Tokenizer and seq2seq model are loaded from the same checkpoint name; the
# model is moved to the chosen device right after loading.
tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-Flan-T5-248M")
model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-Flan-T5-248M").to(device)
print(device)

cuda


In [10]:
# Ask the model for a short plot description of every training title and
# store the decoded answer in the 'context' column.
quote = 'What is the story of the movie {}?'
model.eval()

# Iterate over the title values themselves and assign the whole column once.
# The previous trainset['title'][i] / trainset['context'][i] = ... pattern was
# a label lookup (fails when the index is not 0..n-1) combined with a chained
# assignment (SettingWithCopy warning; no effect under pandas copy-on-write).
contexts = []
for title in tqdm(trainset['title'], total=len(trainset)):
    input_text = quote.format(title)
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    # Sampling is enabled with a low temperature and no seed is set in this
    # cell, so repeated runs are not guaranteed to produce identical text.
    outputs = model.generate(input_ids, max_length=256, do_sample=True, temperature=0.09)
    # NOTE(review): decode() is called with its defaults, so any special tokens
    # the model emits stay in the stored text. Confirm that is wanted downstream.
    contexts.append(tokenizer.decode(outputs[0]))

trainset['context'] = contexts

  0%|          | 0/3106 [00:00<?, ?it/s]




100%|██████████| 3106/3106 [1:00:45<00:00,  1.17s/it]


In [11]:
# Persist the enriched training table as a '|'-separated file without the
# row index.
trainset.to_csv(
    path_or_buf='trainset.csv',
    index=False,
    sep="|",
)

In [12]:
# Ask the model for a short plot description of every test title and store
# the decoded answer in the 'context' column.
quote = 'What is the story of the movie {}?'
model.eval()

# Iterate over the title values themselves and assign the whole column once.
# The previous testset['title'][i] / testset['context'][i] = ... pattern was
# a label lookup (fails when the index is not 0..n-1) combined with a chained
# assignment (SettingWithCopy warning; no effect under pandas copy-on-write).
contexts = []
for title in tqdm(testset['title'], total=len(testset)):
    input_text = quote.format(title)
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    # Sampling is enabled with a low temperature and no seed is set in this
    # cell, so repeated runs are not guaranteed to produce identical text.
    outputs = model.generate(input_ids, max_length=256, do_sample=True, temperature=0.09)
    # NOTE(review): decode() is called with its defaults, so any special tokens
    # the model emits stay in the stored text. Confirm that is wanted downstream.
    contexts.append(tokenizer.decode(outputs[0]))

testset['context'] = contexts

100%|██████████| 777/777 [15:33<00:00,  1.20s/it]


In [13]:
# Persist the enriched test table as a '|'-separated file without the row
# index.
testset.to_csv(
    path_or_buf='testset.csv',
    index=False,
    sep="|",
)