In [None]:
import re
from typing import Optional

import numpy as np
import pandas as pd
import wikipedia
import wikipediaapi
from pwiki.wiki import Wiki

from tqdm import tqdm
tqdm.pandas()

In [None]:
# Load the movie table from 'movies.csv' and preview its first rows.
# NOTE(review): the preview shows an `Unnamed: 0` column, which suggests the
# CSV was at some point written with its index included — confirm whether
# that column is needed.
movies_ml = pd.read_csv('movies.csv')
movies_ml.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
def get_movie_description(title: str) -> Optional[str]:
    """
    Retrieves movie plot from Wikipedia with a title.

    Up to three lookup titles are derived from ``title``: the title as given,
    the text before the first ``" ("`` with ``" (film)"`` appended, and that
    text on its own. Each lookup title is passed to every loader in turn
    (``wikipedia_loader``, ``pywiki_loader``, ``apiwiki_loader``) and the
    first usable result is returned.

    Args:
        title: Movie title, e.g. ``"Toy Story (1995)"``.

    Returns:
        The stripped body of the first ``== Plot ==`` section found in a
        loader's output, or the whole non-empty text returned by
        ``apiwiki_loader``. ``None`` when no loader yields anything usable or
        when ``title`` is not a non-empty string.
    """
    # Missing values coming from pandas (NaN/None) have no `.split`.
    if not isinstance(title, str) or not title.strip():
        return None

    data_loaders = {'wiki': wikipedia_loader, 'pywiki': pywiki_loader, 'apiwiki': apiwiki_loader}

    base_title = title.split(' (')[0]
    # dict.fromkeys keeps the first occurrence of each title in order, so a
    # title without a " (" suffix is not looked up twice.
    candidate_titles = [
        candidate
        for candidate in dict.fromkeys([title, f"{base_title} (film)", base_title])
        if candidate.strip()
    ]

    # The section body runs up to the next heading at the start of a line (or
    # the end of the text). Anchoring on "\n=" keeps an "=" that appears
    # inside the plot text from cutting the plot short.
    plot_pattern = re.compile(r'==\s*Plot\s*==\n(.*?)(?=\n=|$)', re.DOTALL)

    for candidate in candidate_titles:
        for loader_name, loader in data_loaders.items():
            try:
                content = loader(candidate)
            except Exception:
                # Best effort: a failed lookup just moves on to the next
                # loader/title. KeyboardInterrupt and SystemExit are not
                # Exception subclasses, so a long scrape can still be stopped.
                continue

            if not isinstance(content, str) or not content:
                continue

            # This loader's text is returned whole, without looking for a
            # "== Plot ==" section.
            if loader_name == 'apiwiki':
                return content

            plot_match = plot_pattern.search(content)
            if plot_match:
                return plot_match.group(1).strip()
    return None

def wikipedia_loader(title: str) -> str:
    """
    Look up ``title`` with ``wikipedia.page`` and return the page's ``content``.

    Any exception raised by the lookup propagates to the caller.
    """
    page = wikipedia.page(title)
    return page.content

def pywiki_loader(title: str) -> str:
    """
    Return ``page_text(title)`` from a freshly constructed ``pwiki`` ``Wiki``.

    A new ``Wiki()`` is created on every call; errors propagate to the caller.
    """
    return Wiki().page_text(title)

def apiwiki_loader(title: str) -> str:
    """
    Return the ``text`` of the page ``wikipediaapi`` resolves for ``title``.

    A new ``wikipediaapi.Wikipedia`` client is built on every call. Its two
    positional arguments are presumably the user-agent string and the
    language code — confirm against the installed wikipediaapi version.
    """
    client = wikipediaapi.Wikipedia('Diploma (m.tsalyk@ucu.edu.ua)', 'en')
    page = client.page(title)
    return page.text

def scrap_descriptions(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill the ``description`` column for every row where it is missing.

    The column is created (all ``None``) when absent. For rows whose
    description is NA, ``get_movie_description`` is applied to the ``title``
    column through ``progress_map``, so ``tqdm.pandas()`` must have been
    called beforehand. Rows that already have a description are left alone.

    Args:
        df: Frame with a ``title`` column; it is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    if 'description' not in df.columns:
        df['description'] = None

    missing = df['description'].isna()
    fetched = df.loc[missing, 'title'].progress_map(get_movie_description)
    df.loc[missing, 'description'] = fetched
    return df

# Fill in the missing descriptions and rebind movies_ml to the returned frame.
# The recorded run took about 47 minutes for 1389 titles, so expect this cell
# to be slow.
movies_ml = scrap_descriptions(movies_ml)
movies_ml



  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 1389/1389 [46:55<00:00,  2.03s/it]


Unnamed: 0,movieId,title,genres,description
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"Sentient when humans are not around, a group o..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,"In 1969, Alan Parrish lives with his parents, ..."
2,3,Grumpier Old Men (1995),Comedy|Romance,The feud between Max and John has cooled and t...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"Four friends (Savannah, Robin, Bernadine, and ..."
4,5,Father of the Bride Part II (1995),Comedy,"Four years after the events of the first film,..."
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,The story is based on the sinking of the Titan...
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,"In the present, Izuna and Tet play a game of c..."
9739,193585,Flint (2017),Drama,Flint is a 2017 television drama film based on...
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,"6 years ago, the most violent struggle in Yoko..."


In [None]:
# Show the rows whose description is still missing.
movies_ml[movies_ml['description'].isna()]

Unnamed: 0,movieId,title,genres,description
69,77,Nico Icon (1995),Documentary,
110,128,Jupiter's Wife (1994),Documentary,
185,217,"Babysitter, The (1995)",Drama|Thriller,
247,285,Beyond Bedlam (1993),Drama|Horror,
255,294,"Perez Family, The (1995)",Comedy|Romance,
...,...,...,...,...
9695,184791,Fred Armisen: Standup for Drummers (2018),Comedy,
9726,190209,Jeff Ross Roasts the Border (2017),Comedy,
9730,190221,Hommage à Zgougou (et salut à Sabine Mamou) (2...,Documentary,
9736,193579,Jon Stewart Has Left the Building (2015),Documentary,


In [None]:
# Write the table to 'movies.csv' without the index. This is the same path
# the notebook reads its input from, so the input file is overwritten.
movies_ml.to_csv('movies.csv', index=False)