In [27]:
from bs4 import BeautifulSoup
import requests
import logging
import multiprocessing.dummy as mp
import os

Пишем функции для получения URL фильмов

In [28]:
def get_urls_from_afisha(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    films = soup.findAll('li', {'class': 'lists__li'})
    return set(filter(None, map(lambda x: get_urls_function(x, 'name'), films)))


def get_urls_function(x, class_name):
    href_list = x.findAll('a', {'class': class_name})
    if len(href_list) != 0:
        current_url = href_list[0]['href']
        if current_url.find("https://afisha.tut.by/film") != -1:
            return href_list[0]['href']


def get_urls_from_online_cinema(base_url):
    return get_urls_from_online_cinema_page(base_url, set())


def get_urls_from_online_cinema_page(url, result_data):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    films = soup.findAll('li', {'class': 'lists__li'})
    urls = set(filter(None, map(lambda x: get_urls_function(x, 'media'), films)))
    result_data = result_data.union(urls)
    next_button = soup.find('li', {'class', 'p-next'}).find('a')
    if next_button is not None:
        next_url = next_button['href']
        return get_urls_from_online_cinema_page(next_url, result_data)
    else:
        return result_data


def get_urls_from_afisha_and_online_cinema(base_url_afisha, base_url_online_films):
    online = get_urls_from_online_cinema(base_url_online_films)
    afisha = get_urls_from_afisha(base_url_afisha)
    return afisha.union(online)


Получаем все ссылки на фильмы c афишы и онлайн кинотеатра

In [29]:


logging.basicConfig(level=logging.INFO)    
base_url_afisha = 'https://afisha.tut.by/film/'
base_url_online_cinema='https://afisha.tut.by/online-cinema/'
urlsOfFilms=get_urls_from_afisha_and_online_cinema(base_url_afisha,base_url_online_cinema)
logging.info("Number of films "+str(len(urlsOfFilms)))

INFO:root:Number of films 88


Пишем функции для парсинга отдельных элементов страницы

In [30]:
def get_film_name(parsed_film_page):
    return parsed_film_page.findAll('h1', {'class': 'title'})[0].getText().rstrip()


def is_premiere(parsed_film_page):
    label = parsed_film_page.find('div', {'class': 'title__labels'}).findAll('span', {'class': 'label'})
    for val in label:
        if val.getText() == "Премьера":
            return True
    return False


def get_film_age_rank(parsed_film_page):
    label = parsed_film_page.find('div', {'class': 'title__labels'}).findAll('span', {'class': 'label'})
    for val in label:
        if val.getText().find("+") != -1:
            return val.getText()
    return "none"


def get_native_film_name(parsed_film_page):
    label = parsed_film_page.find('div', {'class': 'sub_title'})
    if label is not None:
        return label.getText()
    return "none"


def get_poster_url(parsed_film_page):
    return parsed_film_page.find('img', {'class': 'main_image'})["src"]


def get_genre(parsed_film_page):
    genres = parsed_film_page.findAll('a', {'itemprop': 'genre'})
    genres_dict = {}
    for genre in genres:
        genres_dict[genre.getText()] = {"name": genre.getText(), "url": genre["href"]}
    return genres_dict


def get_set_values(text):
    return set(text.split(", "))


def get_table_field_set(parsed_film_page, field_name):
    td = parsed_film_page.find('td', {'class': field_name})
    if td is not None:
        text = td.getText()
        return get_set_values(text)
    return set()


def get_table_field_single(parsed_film_page, field_name):
    td = parsed_film_page.find('td', {'class': field_name})
    if td is not None:
        return td.getText()
    return "none"


def get_table_field_single_with_url(parsed_film_page, field_name):
    td = parsed_film_page.find('td', {'class': field_name})
    if td is not None:
        a_tag = td.find('a')
        if a_tag is not None:
            return {"data": a_tag.getText(), "url": a_tag["href"]}
        return {"data": td.getText(), "url": "none"}
    return {"data": "none", "url": "none"}


def get_year_of_film(parsed_film_page):
    return get_table_field_single_with_url(parsed_film_page, 'year')


def get_duration(parsed_film_page):
    return get_table_field_single(parsed_film_page, 'duration')


def get_country(parsed_film_page):
    return set(map(lambda x: x.strip(),get_table_field_set(parsed_film_page, 'author')))


def get_showing_end_date(parsed_film_page):
    end_date = get_table_field_single(parsed_film_page, 'date')
    if end_date != "none":
        return end_date.split("В прокате до ")[1]
    return end_date


def get_afisha_rating(parsed_film_page):
    rating = parsed_film_page.find('span', {'class': 'rating-big__value'})
    if rating is not None:
        return rating.getText()
    return "none"


def get_not_afisha_rating(parsed_film_page, label):
    rating_tag = parsed_film_page.find('td', {'class': 'IMDb'})
    if rating_tag is not None:
        ratings = rating_tag.findAll("b")
        for rating in ratings:
            sibling = rating.findPreviousSibling('p')
            if sibling.getText() == label:
                return rating.getText()
    return "none"


def get_imdb_rating(parsed_film_page):
    return get_not_afisha_rating(parsed_film_page, 'Рейтинг IMDb')


def get_kinopoisk_rating(parsed_film_page):
    return get_not_afisha_rating(parsed_film_page, 'Кинопоиск')


def get_ratings(parsed_film_page):
    return {"afisha": get_afisha_rating(parsed_film_page), "imdb": get_imdb_rating(parsed_film_page),
            "kinopoisk": get_afisha_rating(parsed_film_page)}


def persons_from_string(p_tag, selector_text):
    persons = {}
    a_tags = p_tag.findAll('a')
    for a_tag in a_tags:
        name = a_tag.getText()
        persons[name] = {"name": name, "additional_information": a_tag['href']}
        a_tag.extract()
    names = filter(lambda x: x != "", p_tag.getText().split(selector_text + " ")[1].split(", "))
    for name in names:
        persons[name] = {"name": name, "additional_information": "none"}
    return persons


def get_director(parsed_film_page):
    previous_sibling = parsed_film_page.find('table', {'class': 'movie_info'})
    p_tag = previous_sibling.findNextSibling('p')
    if p_tag is not None:
        if p_tag.getText().find("Режиссер:") != -1:
            return persons_from_string(p_tag, "Режиссер:")
    return {}


def get_actors(parsed_film_page):
    previous_sibling = parsed_film_page.find('table', {'class': 'movie_info'})
    p_tags = previous_sibling.findNextSiblings('p', limit=2)
    for p_tag in p_tags:
        if p_tag is not None:
            if p_tag.getText().find("В ролях:") != -1:
                return persons_from_string(p_tag, "В ролях:")
    return {}


def get_description(parsed_film_page):
    description_div = parsed_film_page.find('div', {'itemprop': 'description'})
    if description_div is not None:
        siblings = description_div.findAll()
        for sibling in siblings:
            sibling.extract()
        return description_div.getText().replace("\n", "").replace("\t", "").strip()
    return "none"


def get_film_videos(parsed_film_page):
    media_div = parsed_film_page.find('div', {'id': 'event-photos'})
    if media_div is not None:
        result = set()
        video_frames = media_div.findAll('a', {'data-action': 'iframe'})
        for video_frame in video_frames:
            result.add(video_frame["href"])
        return result
    return set()


def get_film_images(parsed_film_page):
    media_div = parsed_film_page.find('div', {'id': 'event-photos'})
    if media_div is not None:
        result = set()
        video_frames = media_div.findAll('img')
        for video_frame in video_frames:
            result.add(video_frame["src"])
        return result
    return set()



Пишем функцию для парсинга страницы

In [31]:
def parse_film_page(url):
    logging.info(url)
    response_of_film_page = requests.get(url)
    parsed_film_page = BeautifulSoup(response_of_film_page.text, 'html.parser')
    name = get_film_name(parsed_film_page)
    data = {'url': url, 'name': name}  # set name and url of film
    data['is_premiere'] = is_premiere(parsed_film_page)  # is this film premiere
    data['age_rank'] = get_film_age_rank(parsed_film_page)  # age rank  (16+ and etc.)
    data['native_film_name'] = get_native_film_name(parsed_film_page)  # name of film in original
    data['poster_url'] = get_poster_url(parsed_film_page)  # url to film poster
    data['genres'] = get_genre(parsed_film_page)  # genres of film
    data['year'] = get_year_of_film(parsed_film_page)  # year of creation
    data['duration'] = get_duration(parsed_film_page)  # duration
    data['countries'] = get_country(parsed_film_page)  # countries 
    data['showing_end_date'] = get_showing_end_date(parsed_film_page)  # end date of showing in theaters
    data['desription'] = get_description(parsed_film_page)  # text description of film
    data['ratings'] = get_ratings(parsed_film_page)  # ratings
    data['directors'] = get_director(parsed_film_page)  # directors of film and link for additional director information
    data['actors'] = get_actors(parsed_film_page)  # actors  and link for additional actors information
    data['film_videos'] = get_film_videos(parsed_film_page)  # urls to videos of film (trailer and etc.)
    data['film_images'] = get_film_images(parsed_film_page)  # urls to images of film 
    return name, data


Пишем функцию для парсинга страниц

In [32]:
def parse_all_pages(urls):
    cores_num=os.cpu_count()
    logging.info("***************************START*****************************")
    p=mp.Pool(cores_num)
    parsed_pages=dict(p.map(parse_film_page,urls))
    logging.info("***************************END*****************************")
    return parsed_pages


Парсим страницы

In [33]:
data=parse_all_pages(urlsOfFilms)
logging.info("Prepared data for "+str(len(data))+" films")


INFO:root:***************************START*****************************
INFO:root:https://afisha.tut.by/film/maleficent_mistress_of_evil_ru_sub/
INFO:root:https://afisha.tut.by/film/joker-film/
INFO:root:https://afisha.tut.by/film/metallica_i_simfonicheskiy_orkestr_san_francisko_s_m/
INFO:root:https://afisha.tut.by/film/gemini/?utm_source=afisha.tut.by&utm_medium=event_recirculation&utm_campaign=top3
INFO:root:https://afisha.tut.by/film/metallica_i_simfonicheskiy_orkestr_san_francisko_s_m/?utm_source=afisha.tut.by&utm_medium=films&utm_campaign=premiere_block
INFO:root:https://afisha.tut.by/film/ty-umeesh-khranit-sekrety/
INFO:root:https://afisha.tut.by/film/once-upon-a-time-in-hollywood/
INFO:root:https://afisha.tut.by/film/k-zvezdam/
INFO:root:https://afisha.tut.by/film/muzey-prado-kollekciya-chudes/
INFO:root:https://afisha.tut.by/film/terminator_dark_fate_ru_sub/
INFO:root:https://afisha.tut.by/film/kokosha_malenkiy_drakon/
INFO:root:https://afisha.tut.by/film/vertinskiy-odinokiy-st

Выполняем следующее задание (от 05.10.2019)

Загружаем словарь в Spark DataFrame

In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

import pandas as pd

In [35]:
spark = SparkSession.builder \
     .appName("load data") \
     .getOrCreate()

sc = spark.sparkContext

In [36]:


pdDF = pd.DataFrame(data).transpose()
pdDF[['film_images','film_videos','countries']]=pdDF[['film_images','film_videos','countries']]\
    .applymap(list)

movies_df = spark.createDataFrame(pdDF)


Задание: Посчитайте сколько фильмов в каждой возрастной категории.


In [37]:
movies_df.groupBy("age_rank").count().orderBy("count",ascending=False).show()

+--------+-----+
|age_rank|count|
+--------+-----+
|    none|   51|
|     16+|   10|
|     18+|    9|
|     12+|    8|
|      6+|    2|
|      0+|    2|
+--------+-----+



Задание: Посчитайте сколько фильмов по странам.



In [38]:
movie_count_df=movies_df.withColumn("country",explode(movies_df.countries))
movie_count_df.groupBy('country').count().orderBy("count",ascending=False).show()

+--------------+-----+
|       country|count|
+--------------+-----+
|           США|   18|
|Великобритания|   10|
|        Россия|   10|
|      Германия|    5|
|       Испания|    5|
|         Китай|    4|
|        Италия|    4|
|       Франция|    3|
|     Аргентина|    2|
|      Бразилия|    1|
|       Мексика|    1|
|   Корея Южная|    1|
|      Норвегия|    1|
|        Япония|    1|
|      Ирландия|    1|
|       Эстония|    1|
|       Бельгия|    1|
|        Грузия|    1|
|    Португалия|    1|
+--------------+-----+



Задание:
Посчитайте сколько фильмов по актерам

In [39]:
movies_df.select("name", explode(movies_df.actors).alias("actor_name", "data"))\
    .groupBy('actor_name').count().orderBy("count",ascending=False).show(1000)


+--------------------+-----+
|          actor_name|count|
+--------------------+-----+
|         Эль Фаннинг|    3|
|            Джуд Лоу|    2|
|       Глория Фостер|    2|
|         Маркус Чонг|    2|
|      Лоренс Фишбёрн|    2|
|        Келли Рорбах|    2|
|        Селена Гомес|    2|
|    Джулиан Араханга|    2|
|        Ребекка Холл|    2|
|      Джо Пантольяно|    2|
|    Александр Петров|    2|
|         Лив Шрайбер|    2|
|        Хьюго Уивинг|    2|
|           Брэд Питт|    2|
|       Тимоти Шаламе|    2|
|          Мэтт Доран|    2|
|          Киану Ривз|    2|
|      Кэрри-Энн Мосс|    2|
|       Кристин Гёрке|    1|
|          Ли Сон-гюн|    1|
|    Джонни Ли Миллер|    1|
|     Дамиан Алькасар|    1|
|    Хульета Эгуррола|    1|
|          Йен Шрагер|    1|
|      Кигэн-Майкл Ки|    1|
|   Леонардо ДиКаприо|    1|
|          Зоя Бербер|    1|
|        Скут МакНэри|    1|
|      Малика Лапшина|    1|
|     Phillip Johnson|    1|
|        Алена Чехова|    1|
|      Григори