# Task #1: Get info Box (store in Python dictionary)

### Import necessary libaries

In [36]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

### Load the webpage

In [None]:
req = requests.get(url='https://en.wikipedia.org/wiki/WALL-E')
# Convert to a beautiful soup object
soup = bs(req.content)

# Print out the HTML
contents = soup.prettify()
print(contents)

In [22]:
info_box = soup.find(class_='infobox vevent')
info_rows = info_box.find_all('tr')
for row in info_rows:
    print(row.prettify())

AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
def get_content_value(row_data):
    if row_data.find('li'):
        return [li.get_text('|', strip=True).replace('\xa0', ' ') for li in row_data.find_all('li')]
    else:
        return row_data.get_text(' ', strip=True).replace('\xa0', ' ')

movie_info: dict = {}

for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find('th').get_text('|', strip=True)
    elif index == 1:
        continue
    else:
        header = row.find('th')
        if header:
            content_key = row.find('th').get_text('|', strip=True)
            content_value = get_content_value(row.find('td'))
            movie_info[content_key] = content_value

movie_info

# Task #2: Get info box for all movies

In [34]:
r = requests.get(url='https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

# Convert to a beautiful soup object
soup = bs(r.content)

contents = soup.prettify()

In [None]:
movies = soup.select('.wikitable.sortable i')
movies[0:10]
movies[0]

In [27]:
'''
" ", strip=True:
    " " - каким образом разделить объединенные строки
    strip=True - удалить пробелы в начале и конце строк
    Напримре: Productioncompany --> Production company
'''
def get_content_value(row_data):
    if row_data.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row_data.find_all('li')]
    
    # Данный elif находит тег <br>, из-за которого не получалось на выходе получить лист
    # Например, без данного elif мы получали: 'Starring': 'Fess Parker Jeffrey Hunter John Lupton Jeff York Slim Pickens',
    # После добавления данного elif получили: 'Starring': ['Fess Parker', 'Jeffrey Hunter', 'John Lupton', 'Jeff York', 'Slim Pickens']
    elif row_data.find('br'):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(' ', strip=True).replace('xa0', ' ')


# Функция очищает тэги <sup> - который отображает текст в виде верхнего индекса
# и <span> - который отображает дату в формате "yyyy.mm.dd"
def clean_tags(soup):
    for tag in soup.find_all(['sup', 'span']):
        tag.decompose()


def get_info_box(url: str) -> dict:
    req = requests.get(url=url)
    # Convert to a beautiful soup object
    soup = bs(req.content)

    info_box = soup.find(class_='infobox vevent')
    info_rows = info_box.find_all('tr')
    
    clean_tags(soup=soup)

    movie_info = {}

    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find('th').get_text(' ', strip=True)
        elif index == 1:
            continue
        else:
            header = row.find('th')
            # print(header)
            if header:
                content_key = row.find('th').get_text(' ', strip=True)
                content_value = get_content_value(row.find('td'))
                movie_info[content_key] = content_value

    return movie_info

In [None]:
get_info_box(url='https://en.wikipedia.org/wiki/Zorro_(1957_TV_series)#Theatrical')

In [None]:
%timeit
req = requests.get(url='https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

# Convert to a beautiful soup object
soup = bs(req.content)

base_path = 'https://en.wikipedia.org/'

movies = soup.select('.wikitable.sortable i a')
# movies[0:10]

movie_info_list: list = []
for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']

        # В список добавляем словарь, который возвращает функция get_info_box
        movie_info_list.append(get_info_box(full_path))
    except Exception as e:
        print(movie.get_text())
        print(e)

In [30]:
len(movie_info_list)

541

### Save/Reload Movie data

In [31]:
# Сохранить данные в JSON формат
import json
def save_data(title, data):
    with open(file=title, mode='w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)   
        

In [32]:
save_data('disney_data.json', movie_info_list)

In [33]:
# Загрузить данные из JSON формата
import json
def load_data(title):
    with open(title, mode='r', encoding='utf-8') as f:
        return json.load(f)

In [41]:
movie_info_list = load_data(title='disney_data.json')
df = pd.DataFrame(movie_info_list)
df.loc[540]

title                                                         Tinker Bell
Directed by             [Bradley Raymond (1 & 3), Klay Hall (2), Peggy...
Story by                                                              NaN
Based on                                                              NaN
Produced by                                                           NaN
Music by                                               Joel McNeely (1–6)
Production company                                     DisneyToon Studios
Distributed by                  [Walt Disney Studios, Home Entertainment]
Release dates           [1, : October 28, 2008, 2, :, 3, :, 4, :, 5, :...
Running time                                                [515 minutes]
Country                                                     United States
Language                                                          English
Budget                                                                NaN
Box office                            

# Task #3: Clean data

### Subtasks
- ~~Clean up references (remove [1] [2] etc)~~ функция clean_tags выше
- Convert running time into an integer
- Convert dates into datetime object
- Split up the long strings
- Convert Budget and Box office to numbers

In [None]:
# Split up the long strings


In [85]:
print(len(movie_info_list))

535


In [95]:
df = pd.DataFrame(movie_info_list)

df

Unnamed: 0,title,Directed by,Story by,Based on,Produced by,Music by,Production company,Distributed by,Release dates,Running time,...,Cinematography,Release date,Written by,Edited by,Languages,Narrated by,Screenplay by,Countries,Color process,Production companies
0,Snow White and the Seven Dwarfs,"[Perce Pearce, William Cottrell, Larry Morey, ...","[Ted Sears, Richard Creedon, Otto Englander, D...","["", Snow White, "", by the, Brothers Grimm]",Walt Disney,"[Frank Churchill, Leigh Harline, Paul Smith]",Walt Disney Productions,RKO Radio Pictures,"[December 21, 1937 ( Carthay Circle Theatre ),...",83 minutes,...,,,,,,,,,,
1,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...","[Ted Sears, Otto Englander, Webb Smith, Willia...","[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Leigh Harline, Paul J. Smith]",Walt Disney Productions,RKO Radio Pictures,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,...,,,,,,,,,,
2,Fantasia,"[Samuel Armstrong, James Algar, Bill Roberts, ...","[Joe Grant, Dick Huemer]",,"[Walt Disney, Ben Sharpsteen]",See plot,Walt Disney Productions,RKO Radio Pictures,,126 minutes,...,James Wong Howe,"[November 13, 1940]",,,,,,,,
3,The Reluctant Dragon,"[Alfred Werker, (live action), Hamilton Luske,...",,,Walt Disney,"[Frank Churchill, Larry Morey]",Walt Disney Productions,RKO Radio Pictures,,74 minutes,...,Bert Glennon,"[June 27, 1941]","[Live-action:, Ted Sears, Al Perkins, Larry Cl...",Paul Weatherwax,,,,,,
4,Dumbo,"[Ben Sharpsteen, Norman Ferguson, Wilfred Jack...","[Joe Grant, Dick Huemer]","[Dumbo, the Flying Elephant, by, Helen Aberson...",Walt Disney,"[Frank Churchill, Oliver Wallace]",Walt Disney Productions,RKO Radio Pictures,"[October 23, 1941 (New York City), October 31,...",64 minutes,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,The Little Mermaid: Ariel's Beginning,Peggy Holmes,"[Jule Selbo, Jenny Wingfield]",[The Little Mermaid by Hans Christian Andersen...,Kendra Halland,James Dooley,,Walt Disney Studios Home Entertainment,,77 minutes,...,,[26 August 2008],,John Royer,,,"[Robert Reece, Evan Spiliotopoulos]","[Canada, Australia, United States, Philippines]",,"[DisneyToon Studios, Walt Disney Pictures]"
349,Beverly Hills Chihuahua,Raja Gosnell,Jeff Bushell,,"[David Hoberman, Todd Lieberman, John Jacobs, ...",Heitor Pereira,,"[Walt Disney Studios, Motion Pictures]","[September 18, 2008 ( El Capitan Theatre ), Oc...",91 minutes,...,Phil Méheux,,,Sabrina Plisco,"[English, Spanish]",,"[Analisa LaBianco, Jeff Bushell]",,,"[Walt Disney Pictures, Mandeville Films, Smart..."
350,Morning Light,Mark Monroe,"[Roy E. Disney, Leslie DeMeuse, Thomas J. Poll...",,Morgan Sackett,"[Ric Markmann, Dan Pinella, Chris Wagner]",Walt Disney Pictures,"[Walt Disney Studios, Motion Pictures]",,100 minutes,...,John Brooks,"[October 17, 2008]",Mark Morone,Paul Crowder,,,,,,
351,High School Musical 3: Senior Year,Kenny Ortega,,"[Characters, by Peter Barsocchini]","[Bill Borden, Barry Rosenbush]",David Lawrence,,Walt Disney Studios Motion Pictures,"[October 17, 2008 (London), October 24, 2008 (...",112 minutes,...,Daniel Aranyò,,Peter Barsocchini,Don Brochu,,,,,,"[Walt Disney Pictures, Borden & Rosenbush Ente..."


In [102]:
df['Release date'] = pd.to_datetime(df['Release date'][0])
df.head()

Unnamed: 0,title,Directed by,Story by,Based on,Produced by,Music by,Production company,Distributed by,Release dates,Running time,...,Cinematography,Release date,Written by,Edited by,Languages,Narrated by,Screenplay by,Countries,Color process,Production companies
0,Snow White and the Seven Dwarfs,"[Perce Pearce, William Cottrell, Larry Morey, ...","[Ted Sears, Richard Creedon, Otto Englander, D...","["", Snow White, "", by the, Brothers Grimm]",Walt Disney,"[Frank Churchill, Leigh Harline, Paul Smith]",Walt Disney Productions,RKO Radio Pictures,"[December 21, 1937 ( Carthay Circle Theatre ),...",83 minutes,...,,NaT,,,,,,,,
1,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...","[Ted Sears, Otto Englander, Webb Smith, Willia...","[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Leigh Harline, Paul J. Smith]",Walt Disney Productions,RKO Radio Pictures,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,...,,NaT,,,,,,,,
2,Fantasia,"[Samuel Armstrong, James Algar, Bill Roberts, ...","[Joe Grant, Dick Huemer]",,"[Walt Disney, Ben Sharpsteen]",See plot,Walt Disney Productions,RKO Radio Pictures,,126 minutes,...,James Wong Howe,NaT,,,,,,,,
3,The Reluctant Dragon,"[Alfred Werker, (live action), Hamilton Luske,...",,,Walt Disney,"[Frank Churchill, Larry Morey]",Walt Disney Productions,RKO Radio Pictures,,74 minutes,...,Bert Glennon,NaT,"[Live-action:, Ted Sears, Al Perkins, Larry Cl...",Paul Weatherwax,,,,,,
4,Dumbo,"[Ben Sharpsteen, Norman Ferguson, Wilfred Jack...","[Joe Grant, Dick Huemer]","[Dumbo, the Flying Elephant, by, Helen Aberson...",Walt Disney,"[Frank Churchill, Oliver Wallace]",Walt Disney Productions,RKO Radio Pictures,"[October 23, 1941 (New York City), October 31,...",64 minutes,...,,NaT,,,,,,,,
