# Imports

In [56]:
import json
import os
import re
from pprint import pprint
from time import sleep

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup

# URLs and request headers

In [65]:
# Base address of the site; site-relative links are appended to it.
DOMAIN = 'https://www.csfd.cz'
# Ranking listing that gets scraped; its first page is requested without an offset.
ratings_url = DOMAIN + '/zebricky/filmy/nejlepsi'

# Further pages are addressed with a `?from=<offset>` query, in steps of
# PAGE_STEP up to and including LAST_PAGE_OFFSET.
PAGE_STEP = 100
LAST_PAGE_OFFSET = 900
urls = [ratings_url] + [
    f'{ratings_url}/?from={offset}'
    for offset in range(PAGE_STEP, LAST_PAGE_OFFSET + 1, PAGE_STEP)
]

# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36'}


# get_movies

In [58]:
def get_movies(content):
    """Parse one ranking page into a list of film records.

    Parameters
    ----------
    content : bytes or str
        HTML of a ranking page, handed to BeautifulSoup as-is.

    Returns
    -------
    list of dict
        One dict per ``<article>`` inside the first ``<section class="box">``,
        with the keys ``no``, ``title``, ``rating_avg``, ``rating_total``,
        ``year``, ``countries``, ``genres``, ``duration``, ``directors`` and
        ``actors``. ``duration`` comes from ``get_duration`` (called once per
        film with the title link's href); ``actors`` is ``None`` when the
        article has no second ``film-creators`` paragraph.
    """
    page = BeautifulSoup(content, 'html.parser')
    entries = page.find('section', class_='box').find_all('article')

    parsed = []
    for entry in entries:
        # Rank is rendered with a trailing dot, e.g. "12." -> 12.
        rank_text = entry.find('span', class_='film-title-user').text.strip()
        no = int(rank_text.replace('.', ''))

        link = entry.find('a', class_='film-title-name')
        title = link.text.strip()

        # Percent sign dropped and decimal comma turned into a dot before float().
        avg_text = entry.find('div', class_='rating-average').text
        rating_avg = float(avg_text.replace('%', '').replace(',', '.'))

        # Only the div's own first text node is used; non-breaking spaces are removed.
        total_node = entry.find('div', class_='rating-total').find(string=True, recursive=False)
        rating_total = int(total_node.text.replace('\xa0', '').strip())

        # Year is wrapped in parentheses in the first span.info of the article.
        year_text = entry.find('span', class_='info').text.strip()
        year = int(year_text.replace('(', '').replace(')', ''))

        origins = entry.find('p', class_='film-origins-genres')
        countries = origins.find('span', class_='info-country').text.strip().split(" / ")
        genre_node = origins.find('span', class_='info').find(string=True, recursive=False)
        genres = genre_node.text.replace(', ', '').strip().split(" / ")

        duration = get_duration(link.attrs['href'])

        credit_blocks = entry.find_all('p', class_='film-creators')
        directors = get_creators(credit_blocks[0])
        try:
            actors = get_creators(credit_blocks[1])
        except IndexError:
            print(f"No actors found on index {no-1}, saved None as value.")
            actors = None

        record = {
            'no': no,
            'title': title,
            'rating_avg': rating_avg,
            'rating_total': rating_total,
            'year': year,
            'countries': countries,
            'genres': genres,
            'duration': duration,
            'directors': directors,
            'actors': actors,
        }
        parsed.append(record)

    return parsed

# get_creators

In [59]:
def get_creators(creators):
    """Return the stripped text of every ``<a>`` found inside ``creators``.

    Parameters
    ----------
    creators : object exposing ``find_all``
        In this notebook, a ``<p class="film-creators">`` tag.

    Returns
    -------
    list of str
        Whitespace-stripped link texts in the order ``find_all('a')`` yields
        them; an empty list when there are no links.
    """
    # Build a fresh plain list instead of overwriting the elements of the
    # find_all() result in place.
    return [link.text.strip() for link in creators.find_all('a')]

# get_duration

In [60]:
def get_duration(href, timeout=30):
    """Fetch a film's detail page and extract its duration as an integer.

    Parameters
    ----------
    href : str
        Site-relative link to the film page; it is appended to ``DOMAIN``.
    timeout : float, optional
        Seconds ``requests.get`` may wait on the server before giving up.

    Returns
    -------
    int or None
        The leading 1-3 digit number of the text that follows the first
        ``<span>`` inside ``div.film-info-content > div.origin`` (presumably
        the running time in minutes - confirm against the site). ``None`` is
        returned, after printing a message, when the request fails, the
        response status is not OK, or the page lacks the expected layout.
    """
    try:
        response = requests.get(DOMAIN + href, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        # Network failure or timeout: report it the same way as a bad status.
        print(f'An error occured during get_duration({href})')
        print(error)
        return None
    finally:
        # Pause after every attempt so consecutive film pages are not
        # requested back to back.
        sleep(5)

    if not response.ok:
        print(f'An error occured during get_duration({href})')
        print(response.status_code)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        duration_text = (
            soup.find('div', class_='film-info-content')
            .find('div', class_='origin')
            .find('span')
            .next_sibling.text.strip()
        )
    except AttributeError:
        # A node in the lookup chain was missing (find()/next_sibling gave None).
        print(f'No duration found in get_duration({href}), saved None as value.')
        return None

    match = re.match(r'\d{1,3}', duration_text)
    if match is None:
        # The text after the span does not start with a number.
        print(f'No duration found in get_duration({href}), saved None as value.')
        return None
    return int(match.group())

# Main: download all ranking pages and save them as JSON

In [66]:
# Scrape every ranking page in `urls`, then store the collected film records
# as one JSON file.
all_movies = []
folder = './data/'
json_movies = 'csfd_movies.json'

# Create the output folder before downloading anything: the scrape sleeps
# between requests, so discovering a missing folder only at write time
# would throw away the whole run.
os.makedirs(folder, exist_ok=True)

for url in urls:
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of handing an error page to the parser.
    response.raise_for_status()
    movies = get_movies(response.content)
    # extend() appends in place; `all_movies + movies` recopied the list each page.
    all_movies.extend(movies)
    sleep(5)
    print(f'Data from {url} downloaded.')

with open(os.path.join(folder, json_movies), 'w', encoding='utf-8') as file:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(all_movies, file, indent=4, ensure_ascii=False)
# Report success only once the file has been closed.
print(f'Dictionary saved into {json_movies}.')

print('All done!')

Data from https://www.csfd.cz/zebricky/filmy/nejlepsi downloaded.
Data from https://www.csfd.cz/zebricky/filmy/nejlepsi/?from=100 downloaded.
No actors found on index 274, saved None as value.
Data from https://www.csfd.cz/zebricky/filmy/nejlepsi/?from=200 downloaded.
Data from https://www.csfd.cz/zebricky/filmy/nejlepsi/?from=300 downloaded.
Data from https://www.csfd.cz/zebricky/filmy/nejlepsi/?from=400 downloaded.
Data from https://www.csfd.cz/zebricky/filmy/nejlepsi/?from=500 downloaded.
Data from https://www.csfd.cz/zebricky/filmy/nejlepsi/?from=600 downloaded.
No actors found on index 738, saved None as value.
Data from https://www.csfd.cz/zebricky/filmy/nejlepsi/?from=700 downloaded.
Data from https://www.csfd.cz/zebricky/filmy/nejlepsi/?from=800 downloaded.
Data from https://www.csfd.cz/zebricky/filmy/nejlepsi/?from=900 downloaded.
Dictionary saved into csfd_movies.json.
All done!
