# Task #1: Get info Box (store in Python dictionary)

### Import necessary libraries

In [106]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

### Load the webpage

In [None]:
# Download the WALL-E article that the following cells parse.
req = requests.get(url='https://en.wikipedia.org/wiki/WALL-E')
# Convert to a beautiful soup object. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers are installed
# (and to match the other BeautifulSoup calls in this notebook).
soup = bs(req.content, 'html.parser')

# Print out the HTML
contents = soup.prettify()
print(contents)

In [None]:
# Locate the element carrying the classes "infobox vevent" and collect all of
# its table rows.
info_box = soup.find(class_='infobox vevent')
info_rows = info_box.find_all('tr')
# Dump the HTML of every row to inspect its structure before parsing.
for row in info_rows:
    print(row.prettify())

In [None]:
def get_content_value(row_data):
    """Extract the value of an infobox cell.

    Args:
        row_data: The ``<td>`` element of an infobox row, or ``None`` when
            the row has no data cell.

    Returns:
        A list with one string per ``<li>`` item (text fragments joined by
        ``'|'``) when the cell contains a list, otherwise the cell text as a
        single string (fragments joined by a space). Non-breaking spaces are
        replaced by regular spaces. ``None`` when ``row_data`` is ``None``.
    """
    # ``row.find('td')`` yields None for rows that only have a header cell.
    if row_data is None:
        return None

    list_items = row_data.find_all('li')
    if list_items:
        return [
            item.get_text('|', strip=True).replace('\xa0', ' ')
            for item in list_items
        ]
    return row_data.get_text(' ', strip=True).replace('\xa0', ' ')

# Collect the infobox rows of the page loaded above into a dictionary.
movie_info: dict = {}

for index, row in enumerate(info_rows):
    if index == 0:
        # The first row carries the title in its header cell.
        movie_info['title'] = row.find('th').get_text('|', strip=True)
    elif index == 1:
        # The second row is skipped (NOTE(review): presumably the poster image).
        continue
    else:
        header = row.find('th')
        data = row.find('td')
        # Only rows with both a header and a data cell are key/value pairs;
        # handing a missing <td> (None) to the helper would raise AttributeError.
        if header is not None and data is not None:
            content_key = header.get_text('|', strip=True)
            content_value = get_content_value(data)
            movie_info[content_key] = content_value

movie_info

# Task #2: Get info box for all movies

In [67]:
# Download the list of Walt Disney Pictures films.
r = requests.get(url='https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

# Convert to a beautiful soup object. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers are installed
# (and to match the other BeautifulSoup calls in this notebook).
soup = bs(r.content, 'html.parser')

contents = soup.prettify()

In [None]:
# Select every <a> nested in an <i> inside an element that has both the
# "wikitable" and "sortable" classes.
# NOTE(review): presumably these are the film-title links — confirm on the page.
movies = soup.select('.wikitable.sortable i a')
movies

In [79]:
def get_content_value(row_data):
    """Extract the value of an infobox cell as a string or a list of strings.

    Text fragments are joined with a single space and stripped of leading and
    trailing whitespace, so words split across tags stay separated
    (e.g. "Production company" rather than "Productioncompany").

    Args:
        row_data: The ``<td>`` element of an infobox row, or ``None`` when
            the row has no data cell.

    Returns:
        * one string per ``<li>`` item when the cell contains a list;
        * one string per text fragment when the cell contains ``<br>`` tags,
          e.g. ``['Fess Parker', 'Jeffrey Hunter', 'John Lupton']`` instead
          of ``'Fess Parker Jeffrey Hunter John Lupton'``;
        * the whole cell text as a single string otherwise;
        * ``None`` when ``row_data`` is ``None``.

        Non-breaking spaces are replaced by regular spaces in every case.
    """
    # ``row.find('td')`` yields None for rows that only have a header cell.
    if row_data is None:
        return None

    list_items = row_data.find_all('li')
    if list_items:
        return [
            item.get_text(' ', strip=True).replace('\xa0', ' ')
            for item in list_items
        ]

    # Entries separated by <br> would otherwise collapse into one long string.
    if row_data.find('br'):
        # Normalise non-breaking spaces here too, like the other branches do.
        return [text.replace('\xa0', ' ') for text in row_data.stripped_strings]

    return row_data.get_text(' ', strip=True).replace('\xa0', ' ')


def clean_tags(soup):
    """Remove every ``<sup>`` and ``<span>`` element from *soup*, in place.

    ``<sup>`` renders text as superscript, and ``<span>`` is the tag that
    carries the date in "yyyy.mm.dd" form; both are dropped together with
    their contents.
    """
    unwanted = soup.find_all(['sup', 'span'])
    for element in unwanted:
        element.decompose()


def get_info_box(relative_path, url: str) -> dict:
    """Download a film article and return its infobox as a dictionary.

    Args:
        relative_path: Link of the article as found on the list page; stored
            unchanged under the ``'URL'`` key.
        url: Absolute URL that is actually requested.

    Returns:
        A dict with ``'URL'``, ``'title'`` and one entry per infobox row that
        has both a header (``<th>``) and a data (``<td>``) cell. An empty
        dict when the page has no ``infobox vevent`` element.
    """
    req = requests.get(url=url)
    # Convert to a beautiful soup object, naming the parser explicitly.
    soup = bs(req.content, 'html.parser')

    info_box = soup.find(class_='infobox vevent')
    if not info_box:
        return {}  # No infobox on this page.

    info_rows = info_box.find_all('tr')

    # Only the infobox is read below, so clean that subtree rather than the
    # whole document.
    clean_tags(soup=info_box)

    movie_info = {}

    for index, row in enumerate(info_rows):
        if index == 0:
            # The first row carries the film title in its header cell.
            movie_info['URL'] = relative_path
            movie_info['title'] = row.find('th').get_text(' ', strip=True)
        elif index == 1:
            # The second row is skipped (NOTE(review): presumably the poster).
            continue
        else:
            header = row.find('th')
            data = row.find('td')
            # Skip rows that are not key/value pairs. A row with a header but
            # no data cell used to raise AttributeError, which made the
            # caller discard the entire film.
            if header is None or data is None:
                continue
            content_key = header.get_text(' ', strip=True)
            movie_info[content_key] = get_content_value(data)

    return movie_info

In [None]:
%timeit
from urllib.parse import urljoin

# Download the list of Walt Disney Pictures films.
req = requests.get(url='https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

# Convert to a beautiful soup object
soup = bs(req.content, 'html.parser')

base_path = 'https://en.wikipedia.org/'

movies = soup.select('.wikitable.sortable i a')


movie_info_list: list = []

# TODO: stop at the "Upcoming" section so that unreleased films are skipped.
for index, movie in enumerate(movies):
    # Progress indicator: print every tenth index.
    if index % 10 == 0:
        print(index)

    try:
        relative_path = movie['href']
        # urljoin avoids the doubled slash that plain concatenation produces
        # ('https://en.wikipedia.org/' + '/wiki/...').
        full_path = urljoin(base_path, relative_path)

        # Append the dictionary returned by get_info_box to the list.
        movie_info_list.append(get_info_box(relative_path, full_path))
    except Exception as e:
        # Best effort: report the failing film and continue with the next one.
        print(movie.get_text())
        print(e)

In [None]:
# get_info_box requires both arguments; relative_path is stored under 'URL'.
get_info_box(
    relative_path='/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',
    url='https://en.wikipedia.org/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',
)

In [105]:
# Number of infobox dictionaries currently held in movie_info_list.
len(movie_info_list)

553

### Save/Reload Movie data

In [100]:
# Save the data in JSON format
import json


def save_data(title, data):
    """Write *data* to the file *title* as pretty-printed UTF-8 JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) with a
    two-space indent.

    Args:
        title: Path of the file to create or overwrite.
        data: Any JSON-serializable object.

    Raises:
        TypeError, ValueError: Propagated from ``json.dumps`` when *data*
            cannot be serialized; the target file is left untouched then.
    """
    # Serialize before opening: mode 'w' truncates the file immediately, so a
    # failure in the middle of json.dump would leave a partial file behind.
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    with open(file=title, mode='w', encoding='utf-8') as f:
        f.write(serialized)
        

In [101]:
# Write the collected records to disney_data.json.
save_data('disney_data.json', movie_info_list)

In [102]:
# Load the data from a JSON file
import json


def load_data(title):
    """Read the UTF-8 encoded JSON file *title* and return the decoded object."""
    with open(title, mode='r', encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)

In [111]:
# Reload the previously saved records from disney_data.json.
movie_info_list = load_data(title='disney_data.json')
# df = pd.DataFrame(movie_info_list)
# df

# Task #3: Clean data

### Subtasks
- ~~Clean up references (remove [1] [2] etc)~~ handled by the `clean_tags` function above
- Convert running time into an integer
- Convert dates into datetime object
- ~~Split up the long strings~~
- Convert Budget and Box office to numbers

In [None]:
# Split up the long strings


In [124]:
# Check the container type of the reloaded data.
type(movie_info_list)

list

In [None]:
# Inspect the raw "Running time" values; records without that key show 'N/A'.
[movie.get('Running time', 'N/A') for movie in movie_info_list]

In [143]:
def minutes_to_integer(running_time):
    """Convert a scraped running time into a whole number of minutes.

    Args:
        running_time: A string such as ``'83 minutes'``, a list of such
            strings (only the first entry is used), or a placeholder such as
            ``'N/A'`` / ``None`` when no running time is available.

    Returns:
        The integer the (first) running-time string starts with, or ``None``
        when the value is missing, empty, or does not start with a number.
    """
    import re  # local import keeps this cell runnable on its own

    if isinstance(running_time, list):
        # Unwrap lists first; a substring test such as "'\n' in value" would
        # silently turn into a membership test on a list.
        running_time = running_time[0] if running_time else None
    if not isinstance(running_time, str):
        return None

    # Take the leading digits whatever follows them (' ', '\n', '\xa0', ...).
    match = re.match(r'\s*(\d+)', running_time)
    return int(match.group(1)) if match else None

# Add an integer "Running time (int)" field to every record; records without
# a "Running time" key are converted from the 'N/A' placeholder.
for movie in movie_info_list:
    raw_running_time = movie.get('Running time', 'N/A')
    movie['Running time (int)'] = minutes_to_integer(raw_running_time)

In [None]:
# Spot-check the tenth record from the end of the list.
movie_info_list[-10]

In [None]:
# Build a DataFrame from the list of record dictionaries and display it.
df = pd.DataFrame(movie_info_list)
df

In [None]:
def _first_release_date(value):
    """Return the first release-date string of a scraped value, or None."""
    if isinstance(value, list):
        value = value[0] if value else None
    if not isinstance(value, str):
        # Missing values (NaN / None) have nothing to parse.
        return None
    # Drop any trailing parenthesised note, which the date parser cannot read.
    return value.split('(')[0].strip()


# Convert every row on its own. Parsing only `df['Release date'][0]` would
# assign the first film's date(s) to the entire column.
df['Release date'] = pd.to_datetime(
    df['Release date'].map(_first_release_date),
    format='mixed',   # rows may use different date layouts
    errors='coerce',  # unparseable values become NaT instead of raising
)
df.head()

In [None]:
# Find the rows in which at least one column contains a newline character
# ('\n'), testing the string form of every cell element-wise.
has_newline = df.map(lambda cell: '\n' in str(cell))
rows_with_newline = df[has_newline.any(axis=1)]
rows_with_newline