## Создание сессии Spark

In [None]:
import findspark
# Let findspark locate the local Spark installation and prepare this Python
# process so that the `pyspark` import in the next cell works.
findspark.init()
# Last expression of the cell: its value (the Spark location findspark
# resolved) is displayed as the cell output when run in a notebook.
findspark.find()

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Build (or reuse) the Spark session used by every cell below.
# NOTE(review): the master is "local[1]", so the executor sizing,
# dynamic-allocation and shuffle-service settings presumably have no effect
# in this mode -- confirm whether they are meant for a cluster run.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("task_47")
    .config("spark.executor.memory", "10g")
    .config("spark.executor.cores", 5)
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.maxExecutors", 5)
    .config("spark.shuffle.service.enabled", "true")
    .getOrCreate()
)

## Задание 4.8.2

#### Исходные данные задания -> Spark DataFrames

In [None]:
import datetime as DT
import pandas as pd

# Inclusive bounds of the calendar; they may be supplied in either order.
start_date = DT.datetime(2023, 8, 1)
end_date = DT.datetime(2023, 8, 31)

first_day = min(start_date, end_date)
last_day = max(start_date, end_date)
days = pd.date_range(first_day, last_day)

# Week 1 is the Monday-based week that contains the first day. Counting
# 7-day buckets from that Monday gives the same numbering as subtracting ISO
# week numbers, but stays correct when the range crosses an ISO-year boundary
# (where ISO week numbers wrap from 52/53 back to 1).
week_origin = first_day - DT.timedelta(days=first_day.weekday())

# `res`: every day of the range as a 'YYYY-MM-DD' string.
res = days.strftime('%Y-%m-%d').tolist()
# `weeks`: 1-based week index of each day, kept as a string because it is
# later joined against the string `week` key of the week-label table.
weeks = [str((day - week_origin).days // 7 + 1) for day in days]
# Rows of (day, week) ready to be loaded into a DataFrame.
res_days_p = tuple(zip(res, weeks))

# Load the (day, week) rows into Spark; both columns are strings.
data_schema = ['day', 'week']
res_days_s = spark.createDataFrame(res_days_p, schema=data_schema)

In [None]:
# Sanity check of the calendar DataFrame: print its schema, then preview rows.
res_days_s.printSchema()
res_days_s.show()

In [None]:
# Human-readable date span for each week index (week index kept as a string).
week_str_p = (
    ('1', '01.08—06.08'),
    ('2', '07.08—13.08'),
    ('3', '14.08—20.08'),
    ('4', '21.08—27.08'),
    ('5', '28.08—31.08'),
)
data_schema = ['week', 'week_str']
week_str_s = spark.createDataFrame(week_str_p, schema=data_schema)

In [None]:
# Sanity check of the week-label DataFrame: print its schema, then preview rows.
week_str_s.printSchema()
week_str_s.show()

In [None]:
# Demand figure for each (product, location) pair.
demand_p = (
    ('1', '01', 100),
    ('1', '02', 110),
    ('2', '01', 120),
    ('2', '02', 90),
    ('3', '01', 70),
    ('3', '02', 80),
)
data_schema = ['product', 'location', 'demand']
demand_s = spark.createDataFrame(demand_p, schema=data_schema)

In [None]:
# Sanity check of the demand DataFrame: print its schema, then preview rows.
demand_s.printSchema()
demand_s.show()

In [None]:
# Stock figure for each (product, location) pair. Product '3' has no rows
# here; the query further down treats a missing stock as 0.
stock_p = (
    ('1', '01', 1000),
    ('1', '02', 400),
    ('2', '01', 300),
    ('2', '02', 250),
)
data_schema = ['product', 'location', 'stock']
stock_s = spark.createDataFrame(stock_p, schema=data_schema)

In [None]:
# Sanity check of the stock DataFrame: print its schema, then preview rows.
stock_s.printSchema()
stock_s.show()

#### Решение задания

In [None]:
# Expose each input DataFrame to Spark SQL under the table name the query uses.
for _view_name, _frame in (
    ('res_days', res_days_s),
    ('week_str', week_str_s),
    ('demand', demand_s),
    ('stock', stock_s),
):
    _frame.createOrReplaceTempView(_view_name)

In [None]:
# Weekly sales and remaining stock per (product, location).
#
# Pipeline of CTEs:
#   tmp_1  demand left-joined with stock; a missing stock row counts as 0.
#   tmp_2  every (product, location) row repeated for every calendar day.
#   tmp_3  dem  = running total of demand up to and including the day.
#   tmp_4  clm  = (stock - dem) of the previous day, i.e. what is left when
#                 the day starts; NULL on the first day of each partition.
#   tmp_5  res  = units sold that day: the full demand while clm > demand,
#                 the remainder clm while 0 < clm <= demand, otherwise 0.
#   tmp_6  clm_1 = smallest clm within the (product, location, week).
# The final SELECT attaches the week label and aggregates per week.
#
# Both aggregates carry explicit aliases so the result schema does not depend
# on Spark's auto-generated expression names (`sum(res)`, `avg(clm_1)`).
#
# NOTE(review): lag() has no default, so clm is NULL on the first day and
# `res` falls into the `else 0` branch -- first-day sales are always 0.
# clm is also not clamped at 0, so stock_at_end can be negative. Confirm
# against the task statement whether either is intended.
sql = """
with tmp_1 as (
select d.*, coalesce(s.stock, 0) as stock
from demand d
    left join stock s using(product, location)
),

tmp_2 as (
select tmp_1.*, res_days.*
from tmp_1 cross join res_days
),

tmp_3 as (
select tmp_2.*,
    sum (demand) over (partition by product, location order by day) as dem
from tmp_2
),

tmp_4 as (
select tmp_3.*,
    lag(stock - dem, 1) over (partition by product, location order by day) as clm
from tmp_3
),

tmp_5 as (
select tmp_4.*,
    case
        when clm > demand then demand
        when clm > 0 and clm <= demand then clm
        else 0
    end as res
from tmp_4
),

tmp_6 as (
select tmp_5.*,
    min(tmp_5.clm) over (partition by tmp_5.product, tmp_5.location, tmp_5.week) as clm_1
from tmp_5
)

select
    w.week_str,
    tmp_6.product,
    tmp_6.location,
    sum(tmp_6.res) as sales,
    avg(tmp_6.clm_1) as stock_at_end
from tmp_6
    join week_str w on tmp_6.week = w.week
group by w.week_str, tmp_6.product, tmp_6.location
order by 1, 2, 3
"""

In [None]:
# Run the query, give the output columns their final names and print the
# whole table without truncating cell values.
print('Итоговая таблица: \n\n')
res = spark.sql(sql)
# NOTE: withColumnRenamed leaves the frame untouched when the source column
# does not exist, so a name mismatch here would go unnoticed.
for _old_name, _new_name in (
    ('week_str', 'week_dates'),
    ('sum(res)', 'sales'),
    ('avg(clm_1)', 'stock_at_end'),
):
    res = res.withColumnRenamed(_old_name, _new_name)
res.show(truncate=False)

In [None]:
# Shut the Spark session down. The call parentheses are required: a bare
# `spark.stop` only evaluates the bound method and leaves the session running.
spark.stop()