# Setup

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column layout of u.item: five descriptive fields followed by one flag column per genre.
metadata_cols = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL']
genre_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
i_cols = metadata_cols + genre_cols

# The file is pipe-delimited, has no header row, and is read as latin-1.
items = pd.read_csv(
    '../../Datasets/ml-100k/Text/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

In [3]:
# Show the loaded table (pandas elides the middle rows and columns in the rendering).
items

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Peek at the title column; these strings are what get_summary later receives as search queries.
items["movie title"]

0                                Toy Story (1995)
1                                GoldenEye (1995)
2                               Four Rooms (1995)
3                               Get Shorty (1995)
4                                  Copycat (1995)
                          ...                    
1677                            Mat' i syn (1997)
1678                             B. Monkey (1998)
1679                         Sliding Doors (1998)
1680                          You So Crazy (1994)
1681    Scream of Stone (Schrei aus Stein) (1991)
Name: movie title, Length: 1682, dtype: object

# Extract Summaries

In [33]:
import imdb
# import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

# Shared IMDb client; get_summary uses it to search titles and fetch movie records.
ia = imdb.IMDb()

In [62]:
def _tag_text(soup, tag_name, attrs):
    """Return the text of the first tag matching ``attrs``, or None when no such tag exists."""
    tag = soup.find(tag_name, attrs=attrs)
    return tag.text if tag is not None else None


def get_summary(movie_name):
    """Look up a movie on IMDb and collect its metadata and plot summary.

    The first search hit for ``movie_name`` is used. Cast, director and
    runtime come from the IMDb client record; the plot summary, rating and
    rating count are scraped from the movie's web page.

    Args:
        movie_name: Title to search for.

    Returns:
        A tuple ``(url, summ, cast, director, runtime, rating, users)``.
        ``cast`` is a ``|``-joined string of up to five names (empty when
        no cast is listed). ``director``, ``runtime``, ``summ``, ``rating``
        and ``users`` are None when the record or the page does not provide
        them, so one missing field no longer discards the whole movie.

    Raises:
        IndexError: If the search returns no results.
    """
    results = ia.search_movie(movie_name)
    if not results:
        raise IndexError(f"no IMDb search results for {movie_name!r}")
    mv = results[0]
    url = ia.get_imdbURL(mv)
    movie = ia.get_movie(mv.movieID)

    # Close the HTTP response deterministically instead of leaking the connection.
    request = Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    with urlopen(request) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Not every record carries every key, so look each one up defensively.
    cast_members = movie.get('cast') or []
    cast = "|".join(person['name'] for person in cast_members[:5])
    directors = movie.get('director') or []
    director = directors[0]['name'] if directors else None
    runtime = movie.get('runtimes')

    summ = _tag_text(soup, "div", {'data-testid': 'storyline-plot-summary'})
    # NOTE(review): these class names look build-generated and may stop matching
    # when the page markup changes; the fields are then returned as None.
    rating = _tag_text(soup, "span", {'class': 'AggregateRatingButton__RatingScore-sc-1ll29m0-1 iTLWoV'})
    users = _tag_text(soup, "div", {'class': 'AggregateRatingButton__TotalRatingAmount-sc-1ll29m0-3 jkCVKJ'})
    return (url, summ, cast, director, runtime, rating, users)

In [64]:
# Try the scraper on a single title (this issues live requests to IMDb).
get_summary("top gun")

https://www.imdb.com/title/tt1745960/
None


In [None]:
# Enrich every row of `items` with scraped IMDb data. The scrape is best-effort:
# a movie whose lookup fails is reported and skipped so the rest still runs.
for index, row in tqdm(items.iterrows(), total=items.shape[0]):
    # Bind the title before the try block so the error report can always use it.
    movie_name = row['movie title']
    try:
        url, summ, cast, director, runtime, rating, users = get_summary(movie_name)
        # If the runtime arrives as a list/tuple, keep its first entry so a
        # single scalar is stored in the cell.
        if isinstance(runtime, (list, tuple)):
            runtime = runtime[0] if runtime else None
        items.loc[index, "IMDb URL"] = url
        items.loc[index, "Summary"] = summ
        items.loc[index, "Cast"] = cast
        items.loc[index, "Director"] = director
        items.loc[index, "Rating"] = rating
        items.loc[index, "Runtime"] = runtime
        items.loc[index, "No. of ratings"] = users
    except Exception as exc:
        # Catch Exception rather than everything: KeyboardInterrupt must still
        # be able to stop this long-running loop. Report why the title failed.
        print(f"{movie_name}: {exc!r}")

In [None]:
# items.to_csv('../Datasets/ml-100k/Text/items.csv')

# XLNet Embeddings

In [None]:
import re
import torch
import transformers
import pandas as pd
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
from transformers import XLNetTokenizer, XLNetModel, AdamW, get_linear_schedule_with_warmup

In [None]:
# Load the item table from items.csv (presumably the table enriched by the scraping loop).
# NOTE(review): this path starts with '../' while the u.item read earlier in this
# notebook uses '../../' — confirm which one matches the notebook's location.
# NOTE(review): the to_csv call that would write this file is commented out earlier
# in the notebook — confirm items.csv exists before running this cell.
items = pd.read_csv("../Datasets/ml-100k/Text/items.csv")

In [None]:
def clean_text(text):
    """Normalize free text before tokenization.

    Removes ``@mentions`` and http(s) URLs, replaces every character other
    than ASCII letters, digits and ``. ! ? '`` with a space, and collapses
    runs of spaces into one.

    Args:
        text: Value to clean; it is converted with ``str()`` first, so
            non-string inputs are cleaned as their string form.

    Returns:
        The cleaned string.
    """
    text = re.sub(r"@[A-Za-z0-9]+", ' ', str(text))
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    # The range must be A-Z and a-z separately: "A-z" also spans the ASCII
    # symbols between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    # Tabs and other whitespace are turned into spaces by this step as well.
    text = re.sub(r"[^a-zA-Z.!?'0-9]", ' ', text)
    text = re.sub(r" +", ' ', text)
    return text

In [None]:
# Clean every summary in place before tokenization. clean_text stringifies its
# input, so a missing summary (NaN) becomes the literal text 'nan'.
items['Summary'] = items['Summary'].apply(clean_text)

In [None]:
from transformers import XLNetTokenizer, XLNetModel

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Tokenizer and encoder come from the same pretrained checkpoint.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased').to(device)

In [None]:
def get_embeddings(input_txt, max_length=20):
    """Encode one text with XLNet and return its last-layer hidden states.

    The tokenizer does the truncation and padding itself. XLNet's tokenizer
    appends its special tokens (``<sep>``, ``<cls>``) at the end of the
    sequence and pads on the left, so the final position of the result is
    the ``<cls>`` summary token for short and long texts alike. Padding or
    truncating at the end afterwards would replace or cut off that token.

    Args:
        input_txt: Text to embed.
        max_length: Fixed number of tokens per sequence, special tokens
            included. Longer texts are truncated, shorter ones padded.

    Returns:
        A numpy array of shape ``(1, max_length, hidden_size)``.
    """
    encodings = tokenizer(
        input_txt,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,        # without this, max_length does not shorten long inputs
        padding="max_length",   # pads with the tokenizer's own pad token and padding side
        return_tensors='pt',
        return_token_type_ids=False,
        return_attention_mask=True,
    )
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    with torch.no_grad():
        # Call the module rather than .forward() so registered hooks still run.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state.cpu().numpy()

    torch.cuda.empty_cache()
    return last_hidden_states

In [None]:
# Build one vector per summary, in the same order as the rows of `items`.
embeddings = []
for input_txt in tqdm(items['Summary'], total = items['Summary'].shape[0]):
    # get_embeddings returns a 3-D array; keep row 0 and the LAST sequence position.
    # NOTE(review): this is only the intended summary vector if the last position
    # holds a real token rather than padding — confirm against get_embeddings.
    embedding = get_embeddings(input_txt)[0, -1, :]
    embeddings.append(embedding)

In [None]:
# Sanity check: one vector was appended per summary, so this should equal the number of rows in items.
len(embeddings)

In [None]:
import csv

# Write one embedding per row. newline="" hands line-ending control to the csv
# module; without it, platforms that translate "\n" on write (e.g. Windows)
# end up with an extra blank line after every row.
with open("../Datasets/ml-100k/Text/embeddings.csv", "w", newline="") as f:
    wr = csv.writer(f)
    wr.writerows(embeddings)