In [1]:
import pandas as pd
import numpy as np
import json
import re
import wikipedia
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
data_path = '../data_raw/'

In [3]:
movies_raw = pd.read_parquet(data_path +'movies.pq')  # инфа о фильмах -- названия и жанры 
ratings_raw = (
    pd.read_parquet(data_path + 'ratings.pq')
    .drop(columns='timestamp')
)  # инфа о рейтингах -- пользователи, рейтинги
tags_raw = (
    pd.read_parquet(data_path + 'tags.pq')
    .drop(columns='timestamp')
)  # тэги -- какой пользователь и какой тег поставил фильму
links_raw = pd.read_parquet(data_path + 'links.pq')  # метчинг айдишников с другими датасетами
tmdb_movies_raw = pd.read_parquet(data_path + 'tmdb_5000_movies.pq')  # инфа о фильмах с tmdb
tmdb_credits_raw = pd.read_parquet(data_path + 'tmdb_5000_credits.pq')  # инфа о создателях фильмов с tmdb
genome_data = pd.read_parquet(data_path + 'genome.pq')

### TMDB preprocessing

In [4]:
tmdb_movies = tmdb_movies_raw[
    ['id', 'title', 'genres', 'release_date', 'production_countries', 'runtime', 'revenue', 'popularity', 'vote_average', 'vote_count']
]
tmdb_movies.loc[:, 'genres'] = tmdb_movies.genres.transform(lambda y: json.loads(y, object_hook=lambda x: x['name']))
tmdb_movies.loc[:, 'production_countries'] = tmdb_movies.production_countries.transform(lambda y: json.loads(y, object_hook=lambda x: x['name']))
tmdb_movies.loc[:, 'release_date'] = tmdb_movies.release_date.transform(lambda x: str(x).split('-')[0])
tmdb_movies = tmdb_movies.rename(columns={'id': 'movie_id', 'release_date': 'year'})
tmdb_movies

Unnamed: 0,movie_id,title,genres,year,production_countries,runtime,revenue,popularity,vote_average,vote_count
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",2009,"[United States of America, United Kingdom]",162.0,2787965087,150.437577,7.2,11800
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",2007,[United States of America],169.0,961000000,139.082615,6.9,4500
2,206647,Spectre,"[Action, Adventure, Crime]",2015,"[United Kingdom, United States of America]",148.0,880674609,107.376788,6.3,4466
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",2012,[United States of America],165.0,1084939099,112.312950,7.6,9106
4,49529,John Carter,"[Action, Adventure, Science Fiction]",2012,[United States of America],132.0,284139100,43.926995,6.1,2124
...,...,...,...,...,...,...,...,...,...,...
4798,9367,El Mariachi,"[Action, Crime, Thriller]",1992,"[Mexico, United States of America]",81.0,2040920,14.269792,6.6,238
4799,72766,Newlyweds,"[Comedy, Romance]",2011,[],85.0,0,0.642552,5.9,5
4800,231617,"Signed, Sealed, Delivered","[Comedy, Drama, Romance, TV Movie]",2013,[United States of America],120.0,0,1.444476,7.0,6
4801,126186,Shanghai Calling,[],2012,"[United States of America, China]",98.0,0,0.857008,5.7,7


In [5]:
tmdb_credits = tmdb_credits_raw[['movie_id', 'cast', 'crew']]
tmdb_credits.loc[:, 'cast'] = tmdb_credits.cast.transform(lambda y: json.loads(y, object_hook=lambda x: x['name'])[:10])
tmdb_credits.loc[:, 'crew'] = tmdb_credits.crew.transform(lambda y: json.loads(y))
tmdb_credits.loc[:, 'crew'] = tmdb_credits.crew.transform(lambda x: [el for el in x if el['job'] == 'Director'])
tmdb_credits = tmdb_credits[tmdb_credits.crew.apply(len) >= 1]
tmdb_credits.loc[:, 'crew'] = tmdb_credits.crew.transform(lambda x: [el['name'] for el in x])
tmdb_credits = tmdb_credits.rename(columns={'crew': 'director'})
tmdb_credits

Unnamed: 0,movie_id,cast,director
0,19995,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,"[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]
...,...,...,...
4798,9367,"[Carlos Gallardo, Jaime de Hoyos, Peter Marqua...",[Robert Rodriguez]
4799,72766,"[Edward Burns, Kerry Bishé, Marsha Dietlein, C...",[Edward Burns]
4800,231617,"[Eric Mabius, Kristin Booth, Crystal Lowe, Geo...",[Scott Smith]
4801,126186,"[Daniel Henney, Eliza Coupe, Bill Paxton, Alan...",[Daniel Hsia]


In [6]:
tmdb_data = tmdb_movies.merge(tmdb_credits, on='movie_id')
tmdb_data

Unnamed: 0,movie_id,title,genres,year,production_countries,runtime,revenue,popularity,vote_average,vote_count,cast,director
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",2009,"[United States of America, United Kingdom]",162.0,2787965087,150.437577,7.2,11800,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",2007,[United States of America],169.0,961000000,139.082615,6.9,4500,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]",2015,"[United Kingdom, United States of America]",148.0,880674609,107.376788,6.3,4466,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",2012,[United States of America],165.0,1084939099,112.312950,7.6,9106,"[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"[Action, Adventure, Science Fiction]",2012,[United States of America],132.0,284139100,43.926995,6.1,2124,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]
...,...,...,...,...,...,...,...,...,...,...,...,...
4768,9367,El Mariachi,"[Action, Crime, Thriller]",1992,"[Mexico, United States of America]",81.0,2040920,14.269792,6.6,238,"[Carlos Gallardo, Jaime de Hoyos, Peter Marqua...",[Robert Rodriguez]
4769,72766,Newlyweds,"[Comedy, Romance]",2011,[],85.0,0,0.642552,5.9,5,"[Edward Burns, Kerry Bishé, Marsha Dietlein, C...",[Edward Burns]
4770,231617,"Signed, Sealed, Delivered","[Comedy, Drama, Romance, TV Movie]",2013,[United States of America],120.0,0,1.444476,7.0,6,"[Eric Mabius, Kristin Booth, Crystal Lowe, Geo...",[Scott Smith]
4771,126186,Shanghai Calling,[],2012,"[United States of America, China]",98.0,0,0.857008,5.7,7,"[Daniel Henney, Eliza Coupe, Bill Paxton, Alan...",[Daniel Hsia]


In [7]:
# tmdb_data.to_parquet(data_path + 'tmdb_data.pq')

### Wiki data

In [8]:
movies = movies_raw.merge(links_raw, on='movieId')[['movieId', 'tmdbId', 'title', 'genres']]
movies = movies.merge(tmdb_movies, left_on='tmdbId', right_on='movie_id')
movies

Unnamed: 0,movieId,tmdbId,title_x,genres_x,movie_id,title_y,genres_y,year,production_countries,runtime,revenue,popularity,vote_average,vote_count
0,1,862.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862,Toy Story,"[Animation, Comedy, Family]",1995,[United States of America],81.0,373554033,73.640445,7.7,5269
1,10,710.0,GoldenEye (1995),Action|Adventure|Thriller,710,GoldenEye,"[Adventure, Action, Thriller]",1995,"[United Kingdom, United States of America]",130.0,352194034,59.824565,6.6,1174
2,11,9087.0,"American President, The (1995)",Comedy|Drama|Romance,9087,The American President,"[Comedy, Drama, Romance]",1995,[United States of America],106.0,107879496,11.056763,6.5,195
3,14,10858.0,Nixon (1995),Drama,10858,Nixon,"[History, Drama]",1995,[United States of America],192.0,13681765,3.770161,7.1,71
4,15,1408.0,Cutthroat Island (1995),Action|Adventure|Romance,1408,Cutthroat Island,"[Action, Adventure]",1995,"[France, Germany, Italy, United States of Amer...",119.0,10017322,7.029308,5.7,136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4597,200562,248402.0,A Fine Step (2014),Drama,248402,A Fine Step,[Drama],2014,[],90.0,0,0.654340,4.1,7
4598,201050,206213.0,Zombie Hunter (2013),Action|Comedy|Sci-Fi|Thriller,206213,Zombie Hunter,"[Comedy, Action, Science Fiction, Thriller]",2013,[United States of America],93.0,0,3.418372,3.5,34
4599,203797,24227.0,Excessive Force (1993),Action,24227,Excessive Force,[Action],1993,[United States of America],87.0,1200000,1.279106,4.5,10
4600,204288,51130.0,Open Secret (1948),Crime|Mystery|Thriller,51130,Open Secret,"[Crime, Mystery, Thriller]",1948,[United States of America],68.0,0,0.186401,7.0,2


In [9]:
def get_wikipedia_page_name(raw_name):
    names = wikipedia.search(raw_name)
    if len(names) == 0:
      return ''
    else:
      return names[0]

def get_movie_plot(page_name):
    try:
      try:
        movie_page_content = str(wikipedia.page(page_name, auto_suggest=False).content)
      except wikipedia.DisambiguationError as e:
        for option in e.options:
          if 'film' in option:
            movie_page_content = str(wikipedia.page(option, auto_suggest=False).content)
        return None
    except (wikipedia.PageError, KeyError):
      return None
    re_groups = re.search("Plot ==(.*?)=+ [A-Z]", str(movie_page_content).replace('\n', ''))
    if re_groups:
      return re_groups.group(1)
    else:
      return None

In [None]:
movies['wikipedia_page_name'] = movies['title_x'].progress_apply(lambda name: get_wikipedia_page_name(name))
movies['movie_plot'] = movies['wikipedia_page_name'].progress_apply(lambda page_name: get_movie_plot(page_name))
print(f'There are {movies["movie_plot"].isna().sum()} NaN movie plots')
movies

In [None]:
movies = movies.drop(columns='title_x').rename(columns={'title_y': 'title'})

In [None]:
movies.loc[:, 'genres_x'] = movies.genres_x.apply(lambda x: x.split('|'))
movies.loc[:, 'genres_y'] = movies.genres_y.apply(list)
movies['genres'] = (movies.genres_x + movies.genres_y).apply(set).apply(list)
movies.drop(columns=['genres_x', 'genres_y'], inplace=True)

In [None]:
def remove_no_genres(x):
    if ('(no genres listed)' in x) and len(x) > 1:
        return [el for el in x if el != '(no genres listed)']
    return x

def remove_foreign(x):
    if ('Foreign' in x):
        if len(x) > 1:
            return [el for el in x if el != 'Foreign']
        return ['(no genres listed)']
    return x
    
movies.loc[:, 'genres'] = movies.genres.apply(remove_no_genres).apply(remove_foreign)
movies[movies.genres.apply(str).str.contains('(no genres listed)')]

  movies_wiki[movies_wiki.genres.apply(str).str.contains('(no genres listed)')]


Unnamed: 0,movieId,tmdbId,movie_id,title,year,runtime,revenue,popularity,vote_average,vote_count,wikipedia_page_name,movie_plot,genres
4247,132981,137955.0,137955,Crowsnest,2012,84.0,0,0.057564,4.8,12,"Crowsnest Pass, Alberta",,[(no genres listed)]
4349,141747,335874.0,335874,Childless,2015,90.0,0,0.243853,4.5,2,Childlessness,,[(no genres listed)]
4394,146410,325140.0,325140,Hum To Mohabbat Karega,2000,0.0,0,0.001186,0.0,0,Hum To Mohabbat Karega,"Rajiv ""Raju"" Bhatnagar is a waiter in a popula...",[(no genres listed)]
4461,153396,162396.0,162396,The Big Swap,1998,0.0,0,0.627763,0.0,0,The Big Swap,A group of friends meet and plan to swap partn...,[(no genres listed)]
4486,158563,294550.0,294550,The Outrageous Sophie Tucker,2014,96.0,0,0.021419,0.0,0,Sophie Tucker,,[(no genres listed)]
4535,163605,48382.0,48382,"The Book of Mormon Movie, Volume 1: The Journey",2003,120.0,1672730,0.031947,5.0,2,The Book of Mormon Movie,The film is based on the first two books of th...,[(no genres listed)]
4585,190841,176074.0,176074,Reality Show,2015,92.0,0,0.472123,5.5,2,Reality television,,[(no genres listed)]


In [None]:
# movies_wiki.loc[:, 'genres'] = movies_wiki.genres.apply(set)
def remove_duplicate_music(x):
    if ('Music' in x):
        tmp = [el for el in x if el != 'Music']
        if ('Musical' in x):
            return tmp
        else:
            return tmp + ['Musical']
    return x

def remove_duplicate_scifi(x):
    if ('Science Fiction' in x):
        tmp = [el for el in x if el != 'Science Fiction']
        if ('Sci-Fi' in x):
            return tmp
        else:
            return tmp + ['Sci-Fi']
    return x
    
movies.loc[:, 'genres'] = movies.genres.apply(remove_duplicate_music).apply(remove_duplicate_scifi)
np.unique(movies.genres.sum())

array(['(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family',
       'Fantasy', 'Film-Noir', 'History', 'Horror', 'IMAX', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'TV Movie', 'Thriller', 'War',
       'Western'], dtype='<U18')

In [None]:
production_country = tmdb_data[['movie_id', 'production_countries']]

In [None]:
movies = movies.merge(production_country, left_on='tmdbId', right_on='movie_id')

In [None]:
# movies_wiki.loc[:, 'movie_plot'] = movies_wiki.movie_plot.apply(lambda x: x if x != '' else None)
movies = movies.drop(columns=['wikipedia_page_name', 'movie_id_y']).rename(columns={'movie_id_x': 'movie_id'})
movies

Unnamed: 0,movieId,tmdbId,movie_id,title,year,runtime,revenue,popularity,vote_average,vote_count,movie_plot,genres,production_countries
0,1,862.0,862,Toy Story,1995,81.0,373554033,73.640445,7.7,5269,,"[Fantasy, Adventure, Comedy, Children, Animati...",[United States of America]
1,10,710.0,710,GoldenEye,1995,130.0,352194034,59.824565,6.6,1174,"In 1986, MI6 agents James Bond and Alec Trevel...","[Adventure, Action, Thriller]","[United Kingdom, United States of America]"
2,11,9087.0,9087,The American President,1995,106.0,107879496,11.056763,6.5,195,Popular Democratic President Andrew Shepherd p...,"[Comedy, Romance, Drama]",[United States of America]
3,14,10858.0,10858,Nixon,1995,192.0,13681765,3.770161,7.1,71,"In 1972, the White House Plumbers break into T...","[History, Drama]",[United States of America]
4,15,1408.0,1408,Cutthroat Island,1995,119.0,10017322,7.029308,5.7,136,"In 1668 Jamaica, Morgan Adams having escaped a...","[Adventure, Action, Romance]","[France, Germany, Italy, United States of Amer..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4588,200562,248402.0,248402,A Fine Step,2014,90.0,0,0.654340,4.1,7,,[Drama],[]
4589,201050,206213.0,206213,Zombie Hunter,2013,93.0,0,3.418372,3.5,34,"The film opens with a news report on ""Natas"", ...","[Comedy, Action, Thriller, Sci-Fi]",[United States of America]
4590,203797,24227.0,24227,Excessive Force,1993,87.0,1200000,1.279106,4.5,10,Detective Terry McCain (Thomas Ian Griffith) i...,[Action],[United States of America]
4591,204288,51130.0,51130,Open Secret,1948,68.0,0,0.186401,7.0,2,Newlyweds Paul Lester (Ireland) and his wife N...,"[Crime, Mystery, Thriller]",[United States of America]


In [None]:
movies = movies.merge(tmdb_data, left_on='movieId', right_on='movie_id')
movies

Unnamed: 0,movieId,tmdbId,movie_id_x,title,year,runtime,revenue,popularity,vote_average,vote_count,movie_plot,genres,production_countries,movie_id_y,cast,director
0,11,9087.0,9087,The American President,1995,106.0,107879496,11.056763,6.5,195,Popular Democratic President Andrew Shepherd p...,"[Comedy, Romance, Drama]",[United States of America],11,"[Mark Hamill, Harrison Ford, Carrie Fisher, Pe...",[George Lucas]
1,14,10858.0,10858,Nixon,1995,192.0,13681765,3.770161,7.1,71,"In 1972, the White House Plumbers break into T...","[History, Drama]",[United States of America],14,"[Kevin Spacey, Annette Bening, Thora Birch, We...",[Sam Mendes]
2,16,524.0,524,Casino,1995,178.0,116112375,40.066880,7.8,1307,"In 1973, sports handicapper and Mafia associat...","[Crime, Drama]","[France, United States of America]",16,"[Björk, Catherine Deneuve, David Morse, Peter ...",[Lars von Trier]
3,18,5.0,5,Four Rooms,1995,98.0,4300000,22.876230,6.5,530,"On New Year's Eve, bellhop Sam (Marc Lawrence)...","[Crime, Comedy]",[United States of America],18,"[Bruce Willis, Gary Oldman, Ian Holm, Milla Jo...",[Luc Besson]
4,19,9273.0,9273,Ace Ventura: When Nature Calls,1995,90.0,212385533,34.391792,6.1,1099,"In the Himalayas, after a failed rescue missio...","[Crime, Adventure, Comedy]",[United States of America],19,"[Brigitte Helm, Alfred Abel, Gustav Fröhlich, ...",[Fritz Lang]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,89492,60308.0,60308,Moneyball,2011,133.0,110206216,46.180421,7.0,1381,"Billy Beane, the Oakland Athletics general man...",[Drama],[United States of America],89492,"[Paul Rudd, Leslie Mann, Jason Segel, Maude Ap...",[Judd Apatow]
319,115210,228150.0,228150,Fury,2014,135.0,211817906,139.575085,7.4,3946,"In April of 1945, the Allies make their final ...","[War, Action, Drama]","[United Kingdom, United States of America, China]",115210,"[Ross Noble, Tommy Knight, Gemma-Leah Devereux...",[Conor McMahon]
320,116977,100042.0,100042,Dumb and Dumber To,2014,110.0,169837010,67.767785,5.4,1127,For twenty years a catatonic Lloyd Christmas h...,[Comedy],[United States of America],116977,"[Charlie Sheen, Wayne Brady, Hilary Duff, Eva ...",[Lawrence Kasanoff]
321,123553,252360.0,252360,In the Name of the King III,2013,85.0,0,4.304341,3.3,19,,"[Fantasy, Adventure, Action, Drama]","[Bulgaria, Canada]",123553,"[Lily Collins, Jamie Campbell Bower, Kevin Zeg...",[Harald Zwart]


In [None]:
# movies.to_parquet(data_path + 'movies_data.pq')

### Сводим фильмы с тегами

In [None]:
tag_relevance_threshold = 0.3

In [None]:
movies = movies_raw.merge(genome, on='movieId')
movies = movies[movies.relevance > tag_relevance_threshold]
movies

Unnamed: 0,movieId,title,genres,tag,relevance
10,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3d,0.58025
18,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,action,0.66250
19,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,action packed,0.30075
20,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,adaptation,0.31675
28,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,adventure,0.89375
...,...,...,...,...,...
15584410,206499,Between Two Ferns: The Movie (2019),Comedy,visually appealing,0.38400
15584420,206499,Between Two Ferns: The Movie (2019),Comedy,weapons,0.43075
15584422,206499,Between Two Ferns: The Movie (2019),Comedy,weed,0.35375
15584423,206499,Between Two Ferns: The Movie (2019),Comedy,weird,0.36950


In [None]:
all_tags = pd.Series(movies.tag.unique())
all_tags.shape

(1128,)

In [None]:
stemmer = SnowballStemmer('english')
stemmed_tags = movies.tag.apply(stemmer.stem)
unique_stammed = stemmed_tags.unique()
unique_stammed.shape

(1078,)

In [None]:
# nltk.download('averaged_perceptron_tagger')
tags_with_pos = all_tags.apply(str.split).apply(nltk.pos_tag)

Получается много пересекающихся тегов, в том числе потому, что многие похожие по смыслу теги записываются в несколько слов, с ошибками и т.д. Надо придумать, че с этим сделать  
Как идея -- попробовать посчитать встречаемость слов (может быть, до и после стемминга?)

In [None]:
tags_list = list(map(lambda x: x.split(), list(all_tags)))
words = []
for element in tags_list:
    words += element

In [None]:
unique, counts = np.unique(np.array(words), return_counts=True)
word_counts = dict(zip(unique, counts))

Возможно, время выставления также стоит учитывать, т.к. тогда мы сможем каким-то образом учитывать изменение в предпочтениях пользователя. Хотя, с другой стороны, для mvp нам это явно не поможет (это скорее улучшение для существующих пользователей). В общем, пока выкинем, потом придумаем