# Task #1: Get info Box (store in Python dictionary)

### Import necessary libaries

In [2]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

### Load the webpage

In [None]:
req = requests.get(url='https://en.wikipedia.org/wiki/WALL-E')
# Convert to a beautiful soup object
soup = bs(req.content)

# Print out the HTML
contents = soup.prettify()
print(contents)

In [None]:
info_box = soup.find(class_='infobox vevent')
info_rows = info_box.find_all('tr')
for row in info_rows:
    print(row.prettify())

In [None]:
def get_content_value(row_data):
    if row_data.find('li'):
        return [li.get_text('|', strip=True).replace('\xa0', ' ') for li in row_data.find_all('li')]
    else:
        return row_data.get_text(' ', strip=True).replace('\xa0', ' ')

movie_info: dict = {}

for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find('th').get_text('|', strip=True)
    elif index == 1:
        continue
    else:
        header = row.find('th')
        if header:
            content_key = row.find('th').get_text('|', strip=True)
            content_value = get_content_value(row.find('td'))
            movie_info[content_key] = content_value

movie_info

# Task #2: Get info box for all movies

In [67]:
r = requests.get(url='https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

# Convert to a beautiful soup object
soup = bs(r.content)

contents = soup.prettify()

In [None]:
movies = soup.select('.wikitable.sortable i a')
movies

In [78]:
'''
    " ", strip=True:
    " " - каким образом разделить объединенные строки
    strip=True - удалить пробелы в начале и конце строк
    Напримре: Productioncompany --> Production company
'''
def get_content_value(row_data):
    if row_data.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row_data.find_all('li')]
    
    # Данный elif находит тег <br>, из-за которого не получалось на выходе получить лист
    # Например, без данного elif мы получали: 'Starring': 'Fess Parker Jeffrey Hunter John Lupton Jeff York Slim Pickens',
    # После добавления данного elif получили: 'Starring': ['Fess Parker', 'Jeffrey Hunter', 'John Lupton', 'Jeff York', 'Slim Pickens']
    elif row_data.find('br'):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(' ', strip=True).replace('\xa0', ' ')


# Функция очищает тэги <sup> - который отображает текст в виде верхнего индекса
# и <span> - который отображает дату в формате "yyyy.mm.dd"
def clean_tags(soup):
    for tag in soup.find_all(['sup', 'span']):
        tag.decompose()


def get_info_box(relative_path, url: str) -> dict:
    req = requests.get(url=url)
    # Convert to a beautiful soup object
    soup = bs(req.content, 'html.parser')  # Указываем парсер 'html.parser'

    info_box = soup.find(class_='infobox vevent')
    if not info_box:
        return {} # Возвращаем пустой словарь, если инфобокс не найден
    
    info_rows = info_box.find_all('tr')
    
    clean_tags(soup=soup)

    movie_info = {}

    for index, row in enumerate(info_rows):
        if index == 0: # Извлекаем название фильма
            movie_info['URL'] = relative_path
            movie_info['title'] = row.find('th').get_text(' ', strip=True)
            # movie_info['link'] = [a['href'] for a in row.find_all('a', href=True)]
            
        elif index == 1:  # Пропускаем вторую строку, если нужно
            continue
        else:
            header = row.find('th')
            # print(header)
            if header:
                content_key = row.find('th').get_text(' ', strip=True)
                content_value = get_content_value(row.find('td'))
                movie_info[content_key] = content_value

    return movie_info

In [None]:
import time

start_time = time.time()

req = requests.get(url='https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

# Преобразуем ответ в объект BeautifulSoup
soup = bs(req.content, 'html.parser')

base_path = 'https://en.wikipedia.org/'

# Находим основной блок с контентом
mw_body_content = soup.select('.mw-content-ltr.mw-parser-output')

movie_info_list = []

# Проходим по всем элементам в основном блоке
for elements in mw_body_content:
    # Находим все заголовки h2
    headings = elements.select('h2')

    for head in headings:
        # Если встречаем заголовок "Upcoming", прекращаем выполнение
        if 'Upcoming' in head.get_text():
            print("Encountered 'Upcoming', stopping movie processing.")
            break

        # Ищем таблицы с фильмами перед заголовком "Upcoming"
        movies = elements.select('.wikitable.sortable i a')

        # Обрабатываем фильмы, если они есть
        for index, movie in enumerate(movies):
            if index % 10 == 0:
                print(f"Processing movie {index}: {movie.get_text()}")
            try:
                relative_path = movie['href']
                full_path = base_path + relative_path
                title = movie.get_text()

                # В список добавляем словарь, который возвращает функция get_info_box
                movie_info_list.append(get_info_box(relative_path, full_path))
                
            except Exception as e:
                print(f"Error processing movie {movie.get_text()}: {e}")
    
    # Прерываем внешний цикл после заголовка "Upcoming"
    if 'Upcoming' in head.get_text():
        break

# Измеряем время выполнения
end_time = time.time()
print(f"Execution time: {end_time - start_time} seconds")

# Результат (список фильмов)
print(f"Processed {len(movie_info_list)} movies.")

In [None]:
get_info_box(url='https://en.wikipedia.org/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)')

### Save/Reload Movie data

In [74]:
# Сохранить данные в JSON формат
import json
def save_data(title, data):
    with open(file=title, mode='w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    

In [135]:
save_data('disney_data.json', movie_info_list)

In [3]:
# Загрузить данные из JSON формата
import json
def load_data(title):
    with open(title, mode='r', encoding='utf-8') as f:
        return json.load(f)

In [4]:
movie_info_list = load_data(title='disney_data.json')
# df = pd.DataFrame(movie_info_list)
# df

# Task #3: Clean data

### Subtasks
- ~~Clean up references (remove [1] [2] etc)~~ функция clean_tags выше
- ~~Convert running time into an integer~~
- ~~Convert dates into datetime object~~
- ~~Split up the long strings~~
- ~~Convert Budget and Box office to numbers~~

In [5]:
type(movie_info_list)

list

In [None]:
[movie.get('Running time', 'N/A') for movie in movie_info_list]

In [7]:
def minutes_to_integer(running_time):
    if running_time == 'N/A':
        return None
    elif '\n' not in running_time:
        if isinstance(running_time, list):
            return int(running_time[0].split(' ')[0])
        else:
            return int(running_time.split(' ')[0])
    else:
        if isinstance(running_time, list):
            return int(running_time[0].split('\n')[0])
        else:
            return int(running_time.split('\n')[0])

for movie in movie_info_list:
    movie['Running time (int)'] = minutes_to_integer(movie.get('Running time', 'N/A'))

In [None]:
movie_info_list[1:10]

In [None]:
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

In [9]:
# Convert Budget and Box office to numbers
import re
amounts = r'thousand|million|billion'
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf'\${number}(-|\sto\s|–)?({number})?\s({amounts})'
value_re = rf'\${number}'

def word_to_value(word):
    value_dict: dict = {'thousand' : 1_000, 'million' : 1_000_000, 'billion' : 1_000_000_000}
    return value_dict[word]


def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(',', ''))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word=word)
    return value * word_value


def parse_value_syntax(string) -> float:
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ''))
    return value
    # Strip out commas before solution


# ОСНОВНАЯ ФУНКЦИЯ
def money_conversion(money):

    if money == 'N/A':
        return None

    if isinstance(money, list):
        money = money[0]

    # print(money)

    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)
    

    if word_syntax:
       return parse_word_syntax(string=word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(string=value_syntax.group())

    else:
        return None


In [None]:
money_conversion(movie_info_list['Budget'])

In [12]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', 'N/A'))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', 'N/A'))

In [None]:
movie_info_list

In [17]:
print([movie.get('Release date', 'N/A') for movie in movie_info_list])

['N/A', 'N/A', ['November 13, 1940'], ['June 27, 1941'], 'N/A', 'N/A', 'N/A', ['July 17, 1943'], 'N/A', 'N/A', 'N/A', ['September 27, 1947'], ['May 27, 1948'], 'N/A', ['October 5, 1949'], 'N/A', 'N/A', 'N/A', 'N/A', ['February 5, 1953'], ['July 23, 1953 (United States)'], ['November 9, 1953'], 'N/A', ['August 17, 1954'], ['December 23, 1954'], ['May 25, 1955'], ['June 22, 1955'], ['September 14, 1955'], ['December 22, 1955'], ['June 8, 1956'], ['July 18, 1956'], ['September 4, 1956'], ['December 20, 1956'], ['June 19, 1957'], ['August 28, 1957'], ['December 25, 1957'], ['July 8, 1958'], ['August 12, 1958'], ['December 25, 1958'], ['January 29, 1959'], ['March 19, 1959'], 'N/A', ['November 10, 1959'], ['January 21, 1960 ( Sarasota, FL )'], ['February 24, 1960'], ['May 19, 1960'], 'N/A', 'N/A', ['November 1, 1960'], ['December 21, 1960'], ['January 25, 1961'], ['March 16, 1961'], ['June 21, 1961'], ['July 12, 1961'], ['July 17, 1961'], ['December 14, 1961'], ['April 5, 1962'], ['May 17, 

In [None]:
# Convert dates to datetime object
# June 28, 2023

from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    return date.split('(')[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
    if date == 'N/A':
        return None
    
    date_str = clean_date(date=date)
    print(date_str)

    fmts = ['%B %d, %Y', '%d %B %Y', ]

    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except Exception as e:
            print(e)
    return None

# Для проверки
# for date in dates:
#     print(date_conversion(date))
#     print()

for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [None]:
movie_info_list[50]

In [63]:
import pickle

def save_data_pickle(name, data):
    with open(name, mode='wb') as f:
        pickle.dump(data, f)

In [2]:
import pickle

def load_data_pickle(name):
    with open(name, mode='rb') as f:
        return pickle.load(f)

In [None]:
save_data_pickle('disney_movie_data_cleaned_more.pickle', movie_info_list)

In [6]:
a = load_data_pickle(name='disney_movie_data_cleaned_more.pickle')

In [None]:
a == movie_info_list

### Task #4: Attach IMDB//Metascore Tomatoes scores

In [45]:
movie_info_list = load_data_pickle(name='disney_movie_data_cleaned_more.pickle')

In [None]:
movie_info_list[-60]

In [None]:
import requests
import urllib
import os

def get_omdb_info(title):
    
    # api_key = os.environ['OMDB_API_KEY'] = "3e0ef61b"
    api_key = os.environ.get('OMDB_API_KEY')

    base_url = 'http://www.omdbapi.com/?'
    parameters = {'apikey' : api_key, 't' : title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None

# info = get_omdb_info('into the woods')
# info
# get_rotten_tomato_score(info)

In [None]:
movie_info_list

In [56]:
for movie in movie_info_list:
    title = movie.get('title', None)
    # print(title)
    omdb_info = get_omdb_info(title=title)
    movie['imdb'] = omdb_info.get('imdbRating', None)
    movie['metascore'] = omdb_info.get('Metascore', None)
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info=omdb_info)


In [58]:
movie_info_list[-55]

{'URL': '/wiki/Turning_Red',
 'title': 'Turning Red',
 'Directed by': 'Domee Shi',
 'Screenplay by': ['Julia Cho', 'Domee Shi'],
 'Story by': ['Domee Shi', 'Julia Cho', 'Sarah Streicher'],
 'Produced by': 'Lindsey Collins',
 'Starring': ['Rosalie Chiang',
  'Sandra Oh',
  'Ava Morse',
  'Hyein Park',
  'Maitreyi Ramakrishnan',
  'Orion Lee',
  'Wai Ching Ho',
  'Tristan Allerick Chen',
  'James Hong'],
 'Cinematography': ['Mahyar Abousaeedi', 'Jonathan Pytko'],
 'Edited by': ['Nicholas C. Smith', 'Steve Bloom'],
 'Music by': ['Ludwig Göransson',
  '. (Score)',
  'Billie Eilish',
  'and',
  "Finneas O'Connell",
  '(songs)'],
 'Production company': 'Pixar Animation Studios',
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release dates': ['March 1, 2022 ( El Capitan Theatre )',
  'March 11, 2022 (United States; Disney+ )',
  'February 9, 2024 (United States and United Kingdom; theatrical)'],
 'Running time': '100 minutes',
 'Country': 'United States',
 'Language': 'Engli

In [64]:
save_data_pickle(name='disney_movie_data_final.pickle',data=movie_info_list)

### Task #5: Save data as JSON & CSV

In [65]:
movie_info_list[50]

{'URL': '/wiki/One_Hundred_and_One_Dalmatians',
 'title': 'One Hundred and One Dalmatians',
 'Directed by': ['Wolfgang Reitherman', 'Hamilton Luske', 'Clyde Geronimi'],
 'Screenplay by': 'Bill Peet (uncredited)',
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Produced by': 'Walt Disney',
 'Starring': ['Rod Taylor',
  "J. Pat O'Malley",
  'Betty Lou Gerson',
  'Martha Wentworth',
  'Ben Wright',
  'Cate Bauer',
  'Dave Frankham',
  'Fred Worlock',
  'Lisa Davis',
  'Tom Conway',
  'Tudor Owen',
  'George Pelling',
  'Ramsay Hill',
  'Sylvia Marriott',
  'Queenie Leonard',
  'Marjorie Bennett',
  'Micky Maga',
  'Barbara Beaird',
  'Mimi Gibson',
  'Sandra Abbott',
  'Thurl Ravenscroft',
  'Bill Lee',
  'Max Smith',
  'Bob Stevens',
  'Paul Wexler',
  'Mary Wickes',
  'Barbara Luddy',
  'Lisa Daniels',
  'Helene Stanley',
  'Don Barclay',
  'Dal McKennon',
  'Jeanne Bruns'],
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Musi

In [67]:
# Делаем копию
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [69]:
# import datetime

for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime('%B %d, %Y')
    else:
        movie['Release date (datetime)'] = None

In [72]:
movie_info_copy[20]

{'URL': '/wiki/The_Sword_and_the_Rose',
 'title': 'The Sword and the Rose',
 'Directed by': 'Ken Annakin',
 'Screenplay by': 'Lawrence Edward Watkin',
 'Based on': ['When Knighthood Was in Flower',
  'by',
  'Charles Major',
  '(1856-1913)',
  '(of',
  'Shelbyville, Indiana',
  ', in 1896)',
  'James B. Fagan',
  '(play)'],
 'Produced by': ['Perce Pearce', 'Walt Disney'],
 'Starring': ['Glynis Johns',
  'James Robertson Justice',
  'Richard Todd',
  'Michael Gough',
  'Jane Barrett',
  'Peter Copley',
  'Ernest Jay',
  'Jean Mercure',
  'D. A. Clarke-Smith',
  'Gérard Oury',
  'Fernand Fabre',
  'Gaston Richer',
  'Rosalie Crutchley',
  'Bryan Coleman'],
 'Cinematography': 'Geoffrey Unsworth',
 'Edited by': 'Gerald Thomas',
 'Music by': 'Clifton Parker',
 'Production company': 'RKO-Walt Disney British Productions Limited',
 'Distributed by': 'RKO Radio Pictures Ltd.',
 'Release date': ['July 23, 1953 (United States)'],
 'Running time': '92 minutes',
 'Countries': ['United Kingdom', 'Un

In [75]:
save_data(title='disney_data_final.json', data=movie_info_copy)

In [None]:
# df['Release date'] = pd.to_datetime(df['Release date'][0])
# df.head()

In [None]:
# искать символ переноса строки ('\n') по всем столбцам с помощью .applymap():
# rows_with_newline = df[df.map(lambda x: '\n' in str(x)).any(axis=1)]
# rows_with_newline