# Task #1: Get info Box (store in Python dictionary)

### Import necessary libraries

In [85]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

### Load the webpage

In [None]:
# Download the WALL-E article and fail fast on an HTTP error so that an
# error page is never parsed as if it were the article.
req = requests.get(url='https://en.wikipedia.org/wiki/WALL-E')
req.raise_for_status()
# Convert to a beautiful soup object. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers are installed
# (and matches the 'html.parser' choice used by the later cells).
soup = bs(req.content, 'html.parser')

# Print out the HTML
contents = soup.prettify()
print(contents)

In [None]:
# Find the element whose class is 'infobox vevent' (the film infobox).
info_box = soup.find(class_='infobox vevent')
# Collect every <tr> inside the infobox; later cells iterate over info_rows.
info_rows = info_box.find_all('tr')
# Print the HTML of each row for manual inspection.
for row in info_rows:
    print(row.prettify())

In [None]:
def get_content_value(row_data):
    """Extract the text of an infobox value cell.

    Args:
        row_data: The ``<td>`` tag of an infobox row, or ``None`` when the
            row has no value cell.

    Returns:
        ``None`` when ``row_data`` is ``None``; a list with one string per
        ``<li>`` (text fragments inside an item joined by ``'|'``) when the
        cell contains list items; otherwise the whole cell text as a single
        string with fragments joined by a space. Non-breaking spaces are
        replaced by regular spaces.
    """
    # ``row.find('td')`` is None for header-only rows; report "no value"
    # instead of failing with AttributeError.
    if row_data is None:
        return None

    items = row_data.find_all('li')
    if items:
        return [li.get_text('|', strip=True).replace('\xa0', ' ') for li in items]
    return row_data.get_text(' ', strip=True).replace('\xa0', ' ')

# Collected infobox data: label text -> value (string or list of strings).
movie_info: dict = {}

for index, row in enumerate(info_rows):
    if index == 0:
        # The first row's header cell is stored as the title.
        movie_info['title'] = row.find('th').get_text('|', strip=True)
    elif index == 1:
        # The second row is skipped.
        continue
    else:
        header = row.find('th')
        value_cell = row.find('td')
        # Only rows that have both a label and a value cell are stored;
        # header-only rows would otherwise crash the value extraction.
        if header and value_cell:
            content_key = header.get_text('|', strip=True)
            content_value = get_content_value(value_cell)
            movie_info[content_key] = content_value

movie_info

# Task #2: Get info box for all movies

In [67]:
# Download the list of Walt Disney Pictures films and fail fast on an HTTP
# error so that an error page is never parsed as if it were the list.
r = requests.get(url='https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')
r.raise_for_status()

# Convert to a beautiful soup object. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers are installed
# (and matches the 'html.parser' choice used by the later cells).
soup = bs(r.content, 'html.parser')

contents = soup.prettify()

In [None]:
# Select every <a> nested inside an <i> within elements matching
# '.wikitable.sortable', then display the result.
movies = soup.select('.wikitable.sortable i a')
movies

In [78]:
def get_content_value(row_data):
    """Extract the text of an infobox value cell.

    ``get_text(' ', strip=True)`` joins the cell's text fragments with a
    space and strips whitespace at the start and end of each fragment, e.g.
    ``Productioncompany`` becomes ``Production company``.

    Args:
        row_data: The ``<td>`` tag of an infobox row, or ``None`` when the
            row has no value cell.

    Returns:
        ``None`` when ``row_data`` is ``None``; a list of strings when the
        cell contains ``<li>`` items or ``<br>`` separators; otherwise the
        whole cell text as one string. Non-breaking spaces are replaced by
        regular spaces in every case.
    """
    # ``row.find('td')`` is None for header-only rows; report "no value"
    # instead of failing with AttributeError.
    if row_data is None:
        return None

    items = row_data.find_all('li')
    if items:
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in items]

    # Cells that separate entries with <br> would otherwise collapse into one
    # string, e.g. 'Fess Parker Jeffrey Hunter John Lupton'; splitting on the
    # stripped text nodes yields ['Fess Parker', 'Jeffrey Hunter', ...].
    if row_data.find('br'):
        return [text.replace('\xa0', ' ') for text in row_data.stripped_strings]

    return row_data.get_text(' ', strip=True).replace('\xa0', ' ')


def clean_tags(soup):
    """Remove every ``<sup>`` and ``<span>`` element from ``soup`` in place.

    ``<sup>`` renders text as superscript, and ``<span>`` is the tag that
    shows the date in "yyyy.mm.dd" format.
    """
    unwanted = soup.find_all(['sup', 'span'])
    for element in unwanted:
        element.decompose()


def get_info_box(relative_path, url: str, timeout: float = 30) -> dict:
    """Download a film page and return its infobox as a dictionary.

    Args:
        relative_path: The link the page was reached through; stored
            unchanged under the ``'URL'`` key.
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot hang the whole scrape.

    Returns:
        A dict with ``'URL'``, ``'title'`` and one entry per infobox row that
        has both a header and a value cell. An empty dict is returned when
        the page has no element with class ``'infobox vevent'``.
    """
    req = requests.get(url=url, timeout=timeout)
    # Convert to a beautiful soup object using the built-in 'html.parser'.
    soup = bs(req.content, 'html.parser')

    info_box = soup.find(class_='infobox vevent')
    if not info_box:
        return {}  # no infobox on this page

    info_rows = info_box.find_all('tr')

    # Drop <sup>/<span> elements before any text is read from the rows.
    clean_tags(soup=soup)

    movie_info = {}

    for index, row in enumerate(info_rows):
        if index == 0:
            # The first row's header cell is stored as the title.
            movie_info['URL'] = relative_path
            movie_info['title'] = row.find('th').get_text(' ', strip=True)
        elif index == 1:
            # The second row is skipped.
            continue
        else:
            header = row.find('th')
            value_cell = row.find('td')
            # Header-only rows have no value to store and would otherwise
            # crash the value extraction.
            if header and value_cell:
                content_key = header.get_text(' ', strip=True)
                movie_info[content_key] = get_content_value(value_cell)

    return movie_info

In [131]:
import time
import requests
from bs4 import BeautifulSoup as bs

start_time = time.time()

req = requests.get(url='https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

# Parse the response into a BeautifulSoup object
soup = bs(req.content, 'html.parser')

base_path = 'https://en.wikipedia.org/'

# Locate the main content container(s)
mw_body_content = soup.select('.mw-content-ltr.mw-parser-output')

movie_info_list = []
seen_paths = set()        # hrefs already scraped, so no page is fetched twice
reached_upcoming = False  # set once the "Upcoming" heading is encountered

for elements in mw_body_content:
    # Walk <h2> headings and tables in document order so that only tables
    # located before the "Upcoming" heading are scraped.
    for node in elements.find_all(['h2', 'table']):
        if node.name == 'h2':
            if 'Upcoming' in node.get_text():
                print("Encountered 'Upcoming', stopping movie processing.")
                reached_upcoming = True
                break
            continue

        # Only sortable wikitables hold the film listings.
        table_classes = node.get('class') or []
        if not {'wikitable', 'sortable'}.issubset(table_classes):
            continue

        for movie in node.select('i a'):
            try:
                relative_path = movie['href']
                if relative_path in seen_paths:
                    continue
                seen_paths.add(relative_path)

                index = len(seen_paths) - 1
                if index % 10 == 0:
                    print(f"Processing movie {index}: {movie.get_text()}")

                # Join with exactly one slash between host and path.
                full_path = base_path.rstrip('/') + '/' + relative_path.lstrip('/')

                # Append the dictionary returned by get_info_box
                movie_info_list.append(get_info_box(relative_path, full_path))

            except Exception as e:
                # Best-effort scrape: report the failure and keep going.
                print(f"Error processing movie {movie.get_text()}: {e}")

    # Leave the outer loop as well once "Upcoming" has been reached
    if reached_upcoming:
        break

# Measure the execution time
end_time = time.time()
print(f"Execution time: {end_time - start_time} seconds")

# Result (list of films)
print(f"Processed {len(movie_info_list)} movies.")

Processing movie 0: Snow White and the Seven Dwarfs
Processing movie 10: Song of the South
Processing movie 20: The Sword and the Rose
Processing movie 30: Davy Crockett and the River Pirates
Processing movie 40: The Shaggy Dog
Processing movie 50: One Hundred and One Dalmatians
Processing movie 60: The Legend of Lobo
Processing movie 70: The Three Lives of Thomasina
Processing movie 80: Follow Me, Boys!
Processing movie 90: The Horse in the Gray Flannel Suit
Processing movie 100: Scandalous John
Processing movie 110: One Little Indian
Processing movie 120: One of Our Dinosaurs Is Missing
Processing movie 130: The Many Adventures of Winnie the Pooh
Processing movie 140: Unidentified Flying Oddball
Processing movie 150: The Fox and the Hound
Processing movie 160: The Journey of Natty Gann
Processing movie 170: The Little Mermaid
Processing movie 180: The Mighty Ducks
Processing movie 190: Blank Check
Processing movie 200: Heavyweights
Processing movie 210: Tom and Huck
Processing movie 

In [None]:
# get_info_box requires relative_path as well as url; calling it with url
# alone raises TypeError.
get_info_box(
    relative_path='/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',
    url='https://en.wikipedia.org/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',
)

### Save/Reload Movie data

In [134]:
# Save the data in JSON format
import json
def save_data(title, data):
    """Write ``data`` to the file ``title`` as UTF-8 encoded JSON.

    The output is indented by two spaces and non-ASCII characters are
    written as-is rather than escaped.
    """
    with open(title, 'w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=2, ensure_ascii=False)
        

In [135]:
# Write the scraped film dictionaries to disney_data.json.
save_data('disney_data.json', movie_info_list)

In [113]:
# Load the data from JSON format
import json
def load_data(title):
    """Read the UTF-8 encoded JSON file ``title`` and return the parsed data."""
    with open(title, 'r', encoding='utf-8') as in_file:
        raw_text = in_file.read()
    return json.loads(raw_text)

In [114]:
# Reload the previously saved film dictionaries from disney_data.json.
movie_info_list = load_data(title='disney_data.json')
# df = pd.DataFrame(movie_info_list)
# df

# Task #3: Clean data

### Subtasks
- ~~Clean up references (remove [1] [2] etc)~~ handled by the `clean_tags` function above
- ~~Convert running time into an integer~~
- Convert dates into datetime object
- ~~Split up the long strings~~
- ~~Convert Budget and Box office to numbers~~

In [124]:
# Check which container type the reloaded data has.
type(movie_info_list)

list

In [None]:
# List every film's raw 'Running time' value ('N/A' when the key is missing).
[movie.get('Running time', 'N/A') for movie in movie_info_list]

In [143]:
def minutes_to_integer(running_time):
    """Convert a raw 'Running time' value to a whole number of minutes.

    Args:
        running_time: The scraped value: a string such as ``'83 minutes'``,
            a list of such strings (only the first entry is used), or the
            ``'N/A'`` placeholder used for films without a running time.

    Returns:
        The first integer found in the text, or ``None`` when the value is
        ``'N/A'``, empty, or contains no digits.
    """
    # Imported locally so the function works regardless of which notebook
    # cells have already been run.
    import re

    if running_time == 'N/A' or not running_time:
        return None

    if isinstance(running_time, list):
        running_time = running_time[0]

    # Searching for the first run of digits covers both the space-separated
    # and the newline-separated forms, and tolerates leading text.
    match = re.search(r'\d+', str(running_time))
    return int(match.group()) if match else None

# Add a numeric 'Running time (int)' entry to every film dictionary; films
# without a 'Running time' key are passed the 'N/A' placeholder.
for movie in movie_info_list:
    movie['Running time (int)'] = minutes_to_integer(movie.get('Running time', 'N/A'))

In [None]:
# Display films 1 through 9 to spot-check the converted values.
movie_info_list[1:10]

In [None]:
# Print every film's raw 'Budget' value ('N/A' when the key is missing).
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

In [68]:
# Convert Budget and Box office to numbers
import re
# Scale words that may follow a dollar figure.
amounts = r'thousand|million|billion'
# One or more digits, optional ",ddd" groups, then any number of dots and
# trailing digits (e.g. "6", "1,500,000", "2.5").
number = r"\d+(,\d{3})*\.*\d*"

# "$<number>", an optional range separator ("-", " to " or an en dash)
# optionally followed by a second number, one whitespace character, and a
# scale word (e.g. "$6 million").
word_re = rf'\${number}(-|\sto\s|–)?({number})?\s({amounts})'
# A bare "$<number>" with no scale word (e.g. "$1,500,000").
value_re = rf'\${number}'

def word_to_value(word):
    """Return the numeric multiplier for a scale word.

    Accepts ``'thousand'``, ``'million'`` or ``'billion'``; any other word
    raises ``KeyError``.
    """
    multipliers = dict(thousand=10 ** 3, million=10 ** 6, billion=10 ** 9)
    return multipliers[word]


def parse_word_syntax(string):
    """Convert text such as ``'$6 million'`` into a float amount.

    The first number found in ``string`` (commas removed) is multiplied by
    the value of the first scale word found, matched case-insensitively.
    """
    magnitude = float(re.search(number, string).group().replace(',', ''))
    unit = re.search(amounts, string, flags=re.I).group().lower()
    return magnitude * word_to_value(word=unit)


def parse_value_syntax(string) -> float:
    """Convert text such as ``'$1,500,000'`` into a float amount.

    The first number found in ``string`` is used.
    """
    matched = re.search(number, string)
    # Commas are stripped out before the conversion to float.
    digits = matched.group().replace(",", '')
    return float(digits)


# MAIN FUNCTION
def money_conversion(money):
    """Convert a raw money value (e.g. a 'Budget' entry) into a float.

    Args:
        money: The scraped value: a string, a list of strings (only the
            first entry is used), or the ``'N/A'`` placeholder used for
            films without the field.

    Returns:
        The amount as a float, or ``None`` when the value is ``'N/A'``, an
        empty list, not text, or contains no dollar amount.
    """
    if money == 'N/A':
        return None

    if isinstance(money, list):
        # An empty list carries no value to parse.
        if not money:
            return None
        money = money[0]

    # Anything that is not text (e.g. None) cannot be searched.
    if not isinstance(money, str):
        return None

    # Prefer the "$6 million" form; fall back to the bare "$1,500,000" form.
    word_syntax = re.search(word_re, money, flags=re.I)
    if word_syntax:
        return parse_word_syntax(string=word_syntax.group())

    value_syntax = re.search(value_re, money)
    if value_syntax:
        return parse_value_syntax(string=value_syntax.group())

    return None


In [58]:
# Spot-check the conversion on the 'Budget' of the film at index 40.
money_conversion(movie_info_list[40]['Budget'])

$6 million


6000000.0

In [69]:
# Add a numeric 'Budget (float)' entry to every film dictionary; films
# without a 'Budget' key are passed the 'N/A' placeholder.
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', 'N/A'))
    # movie['Box office (float)'] = money_conversion(movie.get('Box office', 'N/A'))

In [None]:
# Measure execution time
# NOTE(review): start_time is recorded here but the elapsed time is never
# computed or printed in this cell.
import time

start_time = time.time()

req = requests.get(url='https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

# Parse the response into a BeautifulSoup object
soup = bs(req.content, 'html.parser')

# Locate the main content container(s)
mw_body_content = soup.select('.mw-content-ltr.mw-parser-output')


# Print the link text of every <a> nested inside an <i> within elements
# matching '.wikitable.sortable'.
for elements in mw_body_content:
    # heading = elements.select('.mw-heading.mw-heading2 > h2')
    movies = elements.select('.wikitable.sortable i a')
    for movie in movies:
        print(movie.get_text())
        

In [None]:
# Build a DataFrame from the list of film dictionaries and display it.
df = pd.DataFrame(movie_info_list)
df

In [None]:
import re

# Matches "December 21, 1937" as well as "21 December 1937".
_RELEASE_DATE_RE = re.compile(r'[A-Z][a-z]+ \d{1,2}, \d{4}|\d{1,2} [A-Z][a-z]+ \d{4}')


def _release_date_text(value):
    """Return the first date-looking substring of a raw 'Release date' cell.

    List-valued cells contribute only their first entry. ``None`` is
    returned when the cell is empty, not text, or contains no date.
    """
    if isinstance(value, list):
        value = value[0] if value else None
    if not isinstance(value, str):
        return None
    match = _RELEASE_DATE_RE.search(value.replace('\xa0', ' '))
    return match.group() if match else None


# Parse each row's own release date. Parsing only row 0 and assigning the
# result to the whole column gives every film the first film's date.
# The result goes into a new column, following the 'Running time (int)' /
# 'Budget (float)' naming, so the raw text is kept and the cell can be re-run.
df['Release date (datetime)'] = pd.to_datetime(
    df['Release date'].map(_release_date_text),
    format='mixed',   # rows may mix "Month D, YYYY" and "D Month YYYY"
    errors='coerce',  # unparseable text becomes NaT instead of raising
)
df.head()

In [None]:
# Find rows where any column contains a newline character ('\n'), checking
# each cell's string form element-wise with df.map (the original note
# referred to .applymap):
rows_with_newline = df[df.map(lambda x: '\n' in str(x)).any(axis=1)]
rows_with_newline