In [1]:
import numpy as numpy
import math
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import torch

In [2]:
users = pd.read_csv('ml1m/content/dataset/users.dat', sep='::',
                        engine='python',
                        names=['userid', 'gender', 'age', 'occupation', 'zip']).set_index('userid')
ratings = pd.read_csv('ml1m/content/dataset/ratings.dat', engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
movies_train = pd.read_csv('ml1m/content/dataset/movies_train.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_test = pd.read_csv('ml1m/content/dataset/movies_test.dat', engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='ISO-8859-1', index_col=False).set_index('movieid')
movies_train['genre'] = movies_train.genre.str.split('|')
movies_train.index.name = 'ID'
movies_test['genre'] = movies_test.genre.str.split('|')
movies_test.index.name = 'ID'

In [3]:
with open('ml1m/content/dataset/genres.txt', 'r') as f:
    genre_all = f.readlines()
genres = [genre.strip() for genre in genre_all]

In [4]:
def preprocess(df, genres=genres) -> pd.DataFrame:
    df['label'] = df.genre.apply(lambda x: [1 if genre in x else 0 for genre in genres])
    df.drop(columns=['genre'], inplace=True)
    df.rename(columns={'title': 'text'}, inplace=True)
    df = df.reset_index(drop=True)
    return df

trainset = preprocess(movies_train)
testset = preprocess(movies_test)

In [5]:
# df1 = trainset + testset
df1 = pd.concat([trainset, testset], ignore_index=True)
df1 = df1.reset_index(drop=True)
df2 = pd.read_csv('movie.csv')
df2 = df2.drop(columns=['movieId'])
df2['genre'] = df2.genres.str.split('|')
df2 = preprocess(df2)
df2.drop(columns='genres', inplace=True)

In [6]:
# df contains all the movies, no duplicates from df1 and df2
df = pd.concat([df1, df2], ignore_index=True)
df = df.drop_duplicates(subset=['text'])
df = df.reset_index(drop=True)
df['text'] = df['text'].str[:-6]
df

Unnamed: 0,text,label
0,Washington Square,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,"Net, The","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Batman Returns,"[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
3,"Boys from Brazil, The","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Dear Jesse,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
27800,Kein Bund für's Leben,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
27801,"Feuer, Eis & Dosenbier","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
27802,The Pirates,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
27803,Rentun Ruusu,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
for i in range(len(df)):
    # Check if the last characters are ', The' then move it to the front
    if df['text'][i][-5:] == ', The':
        df['text'][i] = 'The ' + df['text'][i][:-5]

df['context'] = ''

In [8]:
tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-T5-738M")
model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-T5-738M")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [9]:
quote = 'What is the story of the movie {}?'
model.eval()

for i in tqdm(range(len(df))):
    input_text = quote.format(df['text'][i])
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = model.generate(input_ids, max_length=1024, do_sample=True, temperature=0.09)
    df['context'][i] = tokenizer.decode(outputs[0])

  0%|          | 0/27805 [00:00<?, ?it/s]

  0%|          | 10/27805 [00:35<27:09:20,  3.52s/it]


KeyboardInterrupt: 

In [None]:
df