In [1]:
import os
from datetime import datetime

import psycopg2

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

from hdfs import InsecureClient

In [2]:
# Build (or reuse) the local Spark session for this notebook. The PostgreSQL
# JDBC driver jar is placed on the driver classpath via extraClassPath.
spark = (
    SparkSession.builder
    .config(
        'spark.driver.extraClassPath',
        '/home/user/shared_folder/postgresql-42.2.23.jar',
    )
    .master('local')
    .appName("homework_6")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/08/05 16:19:26 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load Pagila tables to Bronze HDFS

In [4]:
# Connection parameters for the Pagila PostgreSQL database, passed as keyword
# arguments to psycopg2.connect().
# Each value can be overridden through the standard libpq environment variable
# so that real credentials do not have to be committed with the notebook; the
# fallbacks keep the original values so existing runs behave the same.
# NOTE(review): the fallback password is still a literal - prefer exporting
# PGPASSWORD and dropping the default once every environment sets it.
pg_creds = {
    'host': os.environ.get('PGHOST', '192.168.88.138'),
    'port': os.environ.get('PGPORT', '5432'),
    'database': os.environ.get('PGDATABASE', 'pagila'),
    'user': os.environ.get('PGUSER', 'pguser'),
    'password': os.environ.get('PGPASSWORD', 'secret'),
}

In [5]:
# Names of the Pagila tables that are exported to the bronze layer.
tables_to_load = (
    'actor',
    'category',
    'film',
    'film_actor',
    'film_category',
    'customer',
    'address',
    'city',
    'inventory',
    'rental',
)

In [6]:
# Base URL handed to the HDFS client (InsecureClient).
# NOTE(review): presumably the NameNode WebHDFS endpoint - confirm host/port.
hdfs_url = "http://127.0.0.1:50070/"

In [7]:
# HDFS client used for writing the exported CSV files; acts as user 'user'.
client_hdfs = InsecureClient(url=hdfs_url, user='user')

In [8]:
# Today's date as YYYY-MM-DD; used as the per-run directory name under bronze.
current_date = f"{datetime.now():%Y-%m-%d}"

In [9]:
# Export every table in tables_to_load from PostgreSQL to
# /bronze/pagila/<table>/<current_date>/<table>.csv on HDFS (CSV with header).
#
# One connection is shared by all tables and closed explicitly: psycopg2's
# `with connection:` only commits / rolls back the transaction, it does not
# close the connection, so connecting once per table leaked connections.
pg_connection = psycopg2.connect(**pg_creds)
try:
    for table_name in tables_to_load:
        bronze_dir = os.path.join('/', 'bronze', 'pagila', table_name, current_date)
        csv_path = os.path.join(bronze_dir, table_name + '.csv')

        # `with pg_connection` ends the transaction for this table;
        # the cursor context manager closes the cursor afterwards.
        with pg_connection, pg_connection.cursor() as cursor:
            # overwrite=True replaces a file left by an earlier run on the same
            # day, instead of deleting the file while it is being written.
            with client_hdfs.write(csv_path, overwrite=True) as csv_file:
                cursor.copy_expert(f"COPY {table_name} TO STDOUT WITH HEADER CSV", csv_file)
finally:
    pg_connection.close()

In [10]:
# Read the exported CSV files into Spark DataFrames

In [11]:
def _read_bronze_csv(table_name):
    """Read one table's bronze CSV for ``current_date`` into a DataFrame.

    The file is expected at
    ``/bronze/pagila/<table_name>/<current_date>/<table_name>.csv``.
    The first line is treated as the header and column types are inferred.

    Args:
        table_name: Name of the exported Pagila table, e.g. ``'actor'``.

    Returns:
        The DataFrame produced by ``spark.read.load`` for that file.
    """
    csv_path = os.path.join(
        '/', 'bronze', 'pagila', table_name, current_date, table_name + '.csv'
    )
    return spark.read.load(
        csv_path,
        header="true",
        inferSchema="true",
        format="csv",
    )


# One DataFrame per exported table; names are used by the queries below.
actor_df = _read_bronze_csv('actor')
category_df = _read_bronze_csv('category')
film_df = _read_bronze_csv('film')
film_actor_df = _read_bronze_csv('film_actor')
film_category_df = _read_bronze_csv('film_category')
customer_df = _read_bronze_csv('customer')
address_df = _read_bronze_csv('address')
city_df = _read_bronze_csv('city')
inventory_df = _read_bronze_csv('inventory')
rental_df = _read_bronze_csv('rental')

    


In [12]:
# Show the number of films in each category, sorted in descending order.

In [60]:
# Attach the category row to every film/category link, keeping only the
# category name (one output row per film-category pair).
films_with_category = film_category_df.join(
    category_df,
    on=film_category_df.category_id == category_df.category_id,
    how='inner',
)
result_1 = films_with_category.select(category_df.name.alias('category_name'))

# Count rows per category and print them, largest count first.
films_per_category = result_1.groupBy(result_1.category_name).count()
films_per_category.orderBy(F.desc("count")).show()

+-------------+-----+
|category_name|count|
+-------------+-----+
|       Sports|   74|
|      Foreign|   73|
|       Family|   69|
|  Documentary|   68|
|    Animation|   66|
|       Action|   64|
|          New|   63|
|        Drama|   62|
|       Sci-Fi|   61|
|        Games|   61|
|     Children|   60|
|       Comedy|   58|
|       Travel|   57|
|     Classics|   57|
|       Horror|   56|
|        Music|   51|
+-------------+-----+

