In [1]:
import pandas as pd
import numpy as np
import json
import re
import wikipedia
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
data_path = '../data_raw/'

In [3]:
movies_raw = pd.read_parquet(data_path +'movies.pq')  # инфа о фильмах -- названия и жанры 
links_raw = pd.read_parquet(data_path + 'links.pq')  # метчинг айдишников с другими датасетами
tmdb_data = pd.read_parquet(data_path + 'tmdb_data.pq')  # инфа о фильмах с tmdb

In [4]:
movies = movies_raw.merge(links_raw, on='movieId')[['movieId', 'tmdbId', 'title', 'genres']]
movies = movies.merge(tmdb_data, left_on='tmdbId', right_on='movie_id').drop(columns='movie_id')

In [5]:
movies.loc[:, 'genres_x'] = movies.genres_x.apply(lambda x: x.split('|'))
movies.loc[:, 'genres_y'] = movies.genres_y.apply(list)
movies['genres'] = (movies.genres_x + movies.genres_y).apply(set).apply(list)
movies.drop(columns=['genres_x', 'genres_y'], inplace=True)

In [6]:
def remove_no_genres(x):
    if ('(no genres listed)' in x) and len(x) > 1:
        return [el for el in x if el != '(no genres listed)']
    return x

def remove_foreign(x):
    if ('Foreign' in x):
        if len(x) > 1:
            return [el for el in x if el != 'Foreign']
        return ['(no genres listed)']
    return x
    
movies.loc[:, 'genres'] = movies.genres.apply(remove_no_genres).apply(remove_foreign)

In [7]:
# movies_wiki.loc[:, 'genres'] = movies_wiki.genres.apply(set)
def remove_duplicate_music(x):
    if ('Music' in x):
        tmp = [el for el in x if el != 'Music']
        if ('Musical' in x):
            return tmp
        else:
            return tmp + ['Musical']
    return x

def remove_duplicate_scifi(x):
    if ('Science Fiction' in x):
        tmp = [el for el in x if el != 'Science Fiction']
        if ('Sci-Fi' in x):
            return tmp
        else:
            return tmp + ['Sci-Fi']
    return x
    
movies.loc[:, 'genres'] = movies.genres.apply(remove_duplicate_music).apply(remove_duplicate_scifi)
np.unique(movies.genres.sum())

array(['(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family',
       'Fantasy', 'Film-Noir', 'History', 'Horror', 'IMAX', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'TV Movie', 'Thriller', 'War',
       'Western'], dtype='<U18')

In [8]:
movies

Unnamed: 0,movieId,tmdbId,title_x,title_y,year,production_countries,runtime,revenue,popularity,vote_average,vote_count,cast,director,genres
0,1,862.0,Toy Story (1995),Toy Story,1995,[United States of America],81.0,373554033,73.640445,7.7,5269,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter],"[Family, Comedy, Children, Animation, Adventur..."
1,10,710.0,GoldenEye (1995),GoldenEye,1995,"[United Kingdom, United States of America]",130.0,352194034,59.824565,6.6,1174,"[Pierce Brosnan, Sean Bean, Izabella Scorupco,...",[Martin Campbell],"[Action, Thriller, Adventure]"
2,11,9087.0,"American President, The (1995)",The American President,1995,[United States of America],106.0,107879496,11.056763,6.5,195,"[Michael Douglas, Annette Bening, Michael J. F...",[Rob Reiner],"[Romance, Drama, Comedy]"
3,14,10858.0,Nixon (1995),Nixon,1995,[United States of America],192.0,13681765,3.770161,7.1,71,"[Anthony Hopkins, Joan Allen, Powers Boothe, E...",[Oliver Stone],"[History, Drama]"
4,15,1408.0,Cutthroat Island (1995),Cutthroat Island,1995,"[France, Germany, Italy, United States of Amer...",119.0,10017322,7.029308,5.7,136,"[Geena Davis, Matthew Modine, Frank Langella, ...",[Renny Harlin],"[Action, Romance, Adventure]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4588,200562,248402.0,A Fine Step (2014),A Fine Step,2014,[],90.0,0,0.654340,4.1,7,"[Luke Perry, Anna Claire Sneed, Leonor Varela]",[Jonathan Meyers],[Drama]
4589,201050,206213.0,Zombie Hunter (2013),Zombie Hunter,2013,[United States of America],93.0,0,3.418372,3.5,34,"[Martin Copping, Danny Trejo, Clare Niederprue...",[Kevin King],"[Comedy, Action, Thriller, Sci-Fi]"
4590,203797,24227.0,Excessive Force (1993),Excessive Force,1993,[United States of America],87.0,1200000,1.279106,4.5,10,"[Thomas Ian Griffith, Lance Henriksen, James E...",[Jon Hess],[Action]
4591,204288,51130.0,Open Secret (1948),Open Secret,1948,[United States of America],68.0,0,0.186401,7.0,2,"[John Ireland, Jane Randolph, Sheldon Leonard,...",[John Reinhardt],"[Mystery, Thriller, Crime]"


In [35]:
def get_wikipedia_page_name(raw_name):
    names = wikipedia.search(raw_name)
    if len(names) == 0:
      return ''
    else:
      return names[0]

def get_movie_plot(page_name):
    try:
      try:
        movie_page_content = str(wikipedia.page(page_name, auto_suggest=False).content)
      except wikipedia.DisambiguationError as e:
        for option in e.options:
          if 'film' in option:
            movie_page_content = str(wikipedia.page(option, auto_suggest=False).content)
        return None
    except (wikipedia.PageError, KeyError):
      return None
    re_groups = re.search("Plot ==(.*?)=+ [A-Z]", str(movie_page_content).replace('\n', ''))
    if re_groups:
      return re_groups.group(1)
    else:
      return None

In [12]:
movies['wikipedia_page_name'] = movies['title_x'].progress_apply(lambda name: get_wikipedia_page_name(name))
movies['movie_plot'] = movies['wikipedia_page_name'].progress_apply(lambda page_name: get_movie_plot(page_name))
print(f'There are {movies["movie_plot"].isna().sum()} NaN movie plots')
movies

  0%|          | 0/4593 [00:00<?, ?it/s]

  0%|          | 0/4593 [00:00<?, ?it/s]



  lis = BeautifulSoup(html).find_all('li')


There are 453 NaN movie plots


Unnamed: 0,movieId,tmdbId,title_x,title_y,year,production_countries,runtime,revenue,popularity,vote_average,vote_count,cast,director,genres,wikipedia_page_name,movie_plot
0,1,862.0,Toy Story (1995),Toy Story,1995,[United States of America],81.0,373554033,73.640445,7.7,5269,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter],"[Family, Comedy, Children, Animation, Adventur...",Toy Story,"A group of living toys, who assume lifelessnes..."
1,10,710.0,GoldenEye (1995),GoldenEye,1995,"[United Kingdom, United States of America]",130.0,352194034,59.824565,6.6,1174,"[Pierce Brosnan, Sean Bean, Izabella Scorupco,...",[Martin Campbell],"[Action, Thriller, Adventure]",GoldenEye,"In 1986, MI6 agents James Bond and Alec Trevel..."
2,11,9087.0,"American President, The (1995)",The American President,1995,[United States of America],106.0,107879496,11.056763,6.5,195,"[Michael Douglas, Annette Bening, Michael J. F...",[Rob Reiner],"[Romance, Drama, Comedy]",The American President,Popular Democratic President Andrew Shepherd p...
3,14,10858.0,Nixon (1995),Nixon,1995,[United States of America],192.0,13681765,3.770161,7.1,71,"[Anthony Hopkins, Joan Allen, Powers Boothe, E...",[Oliver Stone],"[History, Drama]",Nixon (film),"In 1972, the White House Plumbers break into T..."
4,15,1408.0,Cutthroat Island (1995),Cutthroat Island,1995,"[France, Germany, Italy, United States of Amer...",119.0,10017322,7.029308,5.7,136,"[Geena Davis, Matthew Modine, Frank Langella, ...",[Renny Harlin],"[Action, Romance, Adventure]",Cutthroat Island,"In 1668 Jamaica, Morgan Adams having escaped a..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4588,200562,248402.0,A Fine Step (2014),A Fine Step,2014,[],90.0,0,0.654340,4.1,7,"[Luke Perry, Anna Claire Sneed, Leonor Varela]",[Jonathan Meyers],[Drama],A Fine Balance,
4589,201050,206213.0,Zombie Hunter (2013),Zombie Hunter,2013,[United States of America],93.0,0,3.418372,3.5,34,"[Martin Copping, Danny Trejo, Clare Niederprue...",[Kevin King],"[Comedy, Action, Thriller, Sci-Fi]",Zombie Hunter (film),"The film opens with a news report on ""Natas"", ..."
4590,203797,24227.0,Excessive Force (1993),Excessive Force,1993,[United States of America],87.0,1200000,1.279106,4.5,10,"[Thomas Ian Griffith, Lance Henriksen, James E...",[Jon Hess],[Action],Excessive Force (film),Detective Terry McCain (Thomas Ian Griffith) i...
4591,204288,51130.0,Open Secret (1948),Open Secret,1948,[United States of America],68.0,0,0.186401,7.0,2,"[John Ireland, Jane Randolph, Sheldon Leonard,...",[John Reinhardt],"[Mystery, Thriller, Crime]",Open Secret,Newlyweds Paul Lester (Ireland) and his wife N...


In [37]:
movies['wikipedia_page_name'] = movies.progress_apply(
    lambda row: get_wikipedia_page_name(row['title_y']) 
    if not row['movie_plot'] 
    else row['wikipedia_page_name'],
    axis=1
)
movies['movie_plot'] = movies.progress_apply(
    lambda row: get_movie_plot(row['wikipedia_page_name']) 
    if not row['movie_plot'] 
    else row['movie_plot'],
    axis=1
)
print(f'There are {movies["movie_plot"].isna().sum()} NaN movie plots')
movies

  0%|          | 0/4593 [00:00<?, ?it/s]

In [49]:
def get_new_wikipedia_page_name(raw_name):
    names = wikipedia.search(raw_name)
    if len(names) == 0:
      return ''
    else:
      for name in names:
        if '(film)' in name:
          return name
      return names[0]

movies['wikipedia_page_name'] = movies.progress_apply(
    lambda row: get_new_wikipedia_page_name(row['title_y']) 
    if not row['movie_plot'] 
    else row['wikipedia_page_name'],
    axis=1
)
movies['movie_plot'] = movies.progress_apply(
    lambda row: get_movie_plot(row['wikipedia_page_name']) 
    if not row['movie_plot'] 
    else row['movie_plot'],
    axis=1
)
print(f'There are {movies["movie_plot"].isna().sum()} NaN movie plots')
movies

  0%|          | 0/4593 [00:00<?, ?it/s]

  0%|          | 0/4593 [00:00<?, ?it/s]



  lis = BeautifulSoup(html).find_all('li')


There are 264 NaN movie plots


Unnamed: 0,movieId,tmdbId,title_x,title_y,year,production_countries,runtime,revenue,popularity,vote_average,vote_count,cast,director,genres,wikipedia_page_name,movie_plot
0,1,862.0,Toy Story (1995),Toy Story,1995,[United States of America],81.0,373554033,73.640445,7.7,5269,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter],"[Family, Comedy, Children, Animation, Adventur...",Toy Story,"A group of living toys, who assume lifelessnes..."
1,10,710.0,GoldenEye (1995),GoldenEye,1995,"[United Kingdom, United States of America]",130.0,352194034,59.824565,6.6,1174,"[Pierce Brosnan, Sean Bean, Izabella Scorupco,...",[Martin Campbell],"[Action, Thriller, Adventure]",GoldenEye,"In 1986, MI6 agents James Bond and Alec Trevel..."
2,11,9087.0,"American President, The (1995)",The American President,1995,[United States of America],106.0,107879496,11.056763,6.5,195,"[Michael Douglas, Annette Bening, Michael J. F...",[Rob Reiner],"[Romance, Drama, Comedy]",The American President,Popular Democratic President Andrew Shepherd p...
3,14,10858.0,Nixon (1995),Nixon,1995,[United States of America],192.0,13681765,3.770161,7.1,71,"[Anthony Hopkins, Joan Allen, Powers Boothe, E...",[Oliver Stone],"[History, Drama]",Nixon (film),"In 1972, the White House Plumbers break into T..."
4,15,1408.0,Cutthroat Island (1995),Cutthroat Island,1995,"[France, Germany, Italy, United States of Amer...",119.0,10017322,7.029308,5.7,136,"[Geena Davis, Matthew Modine, Frank Langella, ...",[Renny Harlin],"[Action, Romance, Adventure]",Cutthroat Island,"In 1668 Jamaica, Morgan Adams having escaped a..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4588,200562,248402.0,A Fine Step (2014),A Fine Step,2014,[],90.0,0,0.654340,4.1,7,"[Luke Perry, Anna Claire Sneed, Leonor Varela]",[Jonathan Meyers],[Drama],Luke Perry,
4589,201050,206213.0,Zombie Hunter (2013),Zombie Hunter,2013,[United States of America],93.0,0,3.418372,3.5,34,"[Martin Copping, Danny Trejo, Clare Niederprue...",[Kevin King],"[Comedy, Action, Thriller, Sci-Fi]",Zombie Hunter (film),"The film opens with a news report on ""Natas"", ..."
4590,203797,24227.0,Excessive Force (1993),Excessive Force,1993,[United States of America],87.0,1200000,1.279106,4.5,10,"[Thomas Ian Griffith, Lance Henriksen, James E...",[Jon Hess],[Action],Excessive Force (film),Detective Terry McCain (Thomas Ian Griffith) i...
4591,204288,51130.0,Open Secret (1948),Open Secret,1948,[United States of America],68.0,0,0.186401,7.0,2,"[John Ireland, Jane Randolph, Sheldon Leonard,...",[John Reinhardt],"[Mystery, Thriller, Crime]",Open Secret,Newlyweds Paul Lester (Ireland) and his wife N...


In [63]:
movies.dropna().to_parquet(data_path + 'movies_data.pq')

In [79]:
movies = pd.read_parquet(data_path + 'movies_wiki.pq')

In [80]:
movies = (
    movies
    .drop(columns=['title_x', 'wikipedia_page_name'])
    .rename(columns={
        'title_y': 'title', 
        'movieId': 'movielens_id', 
        'tmdbId': 'tmdb_id',
        'popularity': 'tmdb_popularity_score',
        'vote_average': 'tmdb_rating_avg',
        'vote_count': 'tmdb_votes_count'
    })
)
movies

Unnamed: 0,movielens_id,tmdb_id,title,year,production_countries,runtime,revenue,tmdb_popularity_score,tmdb_rating_avg,tmdb_votes_count,cast,director,genres,movie_plot
0,1,862.0,Toy Story,1995,[United States of America],81.0,373554033,73.640445,7.7,5269,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter],"[Family, Comedy, Children, Animation, Adventur...","A group of living toys, who assume lifelessnes..."
1,10,710.0,GoldenEye,1995,"[United Kingdom, United States of America]",130.0,352194034,59.824565,6.6,1174,"[Pierce Brosnan, Sean Bean, Izabella Scorupco,...",[Martin Campbell],"[Action, Thriller, Adventure]","In 1986, MI6 agents James Bond and Alec Trevel..."
2,11,9087.0,The American President,1995,[United States of America],106.0,107879496,11.056763,6.5,195,"[Michael Douglas, Annette Bening, Michael J. F...",[Rob Reiner],"[Romance, Drama, Comedy]",Popular Democratic President Andrew Shepherd p...
3,14,10858.0,Nixon,1995,[United States of America],192.0,13681765,3.770161,7.1,71,"[Anthony Hopkins, Joan Allen, Powers Boothe, E...",[Oliver Stone],"[History, Drama]","In 1972, the White House Plumbers break into T..."
4,15,1408.0,Cutthroat Island,1995,"[France, Germany, Italy, United States of Amer...",119.0,10017322,7.029308,5.7,136,"[Geena Davis, Matthew Modine, Frank Langella, ...",[Renny Harlin],"[Action, Romance, Adventure]","In 1668 Jamaica, Morgan Adams having escaped a..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4587,199952,367961.0,Savva. Heart of the Warrior,2015,[Russia],85.0,0,2.625594,6.4,7,"[Maksim Chukharyov, Konstantin Khabenskiy, Mik...",[Maksim Fadeev],"[Fantasy, Animation, Adventure]","A mother tells her young son, Savva, a bed-tim..."
4589,201050,206213.0,Zombie Hunter,2013,[United States of America],93.0,0,3.418372,3.5,34,"[Martin Copping, Danny Trejo, Clare Niederprue...",[Kevin King],"[Comedy, Action, Thriller, Sci-Fi]","The film opens with a news report on ""Natas"", ..."
4590,203797,24227.0,Excessive Force,1993,[United States of America],87.0,1200000,1.279106,4.5,10,"[Thomas Ian Griffith, Lance Henriksen, James E...",[Jon Hess],[Action],Detective Terry McCain (Thomas Ian Griffith) i...
4591,204288,51130.0,Open Secret,1948,[United States of America],68.0,0,0.186401,7.0,2,"[John Ireland, Jane Randolph, Sheldon Leonard,...",[John Reinhardt],"[Mystery, Thriller, Crime]",Newlyweds Paul Lester (Ireland) and his wife N...


In [82]:
movies.to_parquet(data_path + 'movies_data.pq')