In [1]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')
import time
from pyspark.sql import SparkSession, DataFrameWriter
import pyspark.sql.functions as F

In [3]:
# Create (or reuse) the Spark session for this notebook.
# "local[*]" runs one worker thread per logical core; plain "local" would
# execute every stage on a single thread.
spark = (
    SparkSession.builder
    .master("local[*]")
    .enableHiveSupport()
    .appName("extract-transform")
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [4]:
# Load the raw event data from the parquet directory.
df = spark.read.parquet('data_in/competition_data_only_target_pqt/')

In [5]:
# Register the DataFrame as a session-local temporary view so it can be
# queried from Spark SQL under the table name "mts".
df.createOrReplaceTempView(name="mts")

In [6]:
# Load the training labels and expose them to Spark SQL as the view "target".
target = spark.read.parquet('data_in/public_train.pqt')
target.createOrReplaceTempView(name="target")

In [7]:
# Build one feature row per user in `target` by left-joining per-user
# aggregates computed from the `mts` view.
#
# Every CTE groups by user_id, so each contributes at most one row per user
# and the left joins cannot multiply rows of `target`. A user with no matching
# rows in a CTE (e.g. no 'night' activity) gets NULL for that CTE's columns.
#
# The final SELECT projects the columns of all 19 CTEs. The first six output
# columns (user_id, age, is_male and the sub_1 aggregates) keep their original
# names and order; the remaining feature columns are appended after them.
# The CTEs carry no ORDER BY: ordering inside a CTE is not preserved by the
# joins/aggregations that consume it.
feature_query = """
with
-- Overall request_cnt statistics over all rows of the user.
sub_1 as (
    select user_id,
           max(request_cnt) as max_request_cnt,
           round(avg(request_cnt), 3) as avg_request_cnt,
           count(request_cnt) as count_request_cnt
    from mts
    group by user_id
),
-- The same statistics restricted to a single part_of_day.
sub_2 as (
    select user_id,
           max(request_cnt) as max_night_request_cnt,
           round(avg(request_cnt), 3) as avg_night_request_cnt,
           count(request_cnt) as count_night_request_cnt
    from mts
    where part_of_day = 'night'
    group by user_id
),
sub_3 as (
    select user_id,
           max(request_cnt) as max_day_request_cnt,
           round(avg(request_cnt), 3) as avg_day_request_cnt,
           count(request_cnt) as count_day_request_cnt
    from mts
    where part_of_day = 'day'
    group by user_id
),
sub_4 as (
    select user_id,
           max(request_cnt) as max_morning_request_cnt,
           round(avg(request_cnt), 3) as avg_morning_request_cnt,
           count(request_cnt) as count_morning_request_cnt
    from mts
    where part_of_day = 'morning'
    group by user_id
),
sub_5 as (
    select user_id,
           max(request_cnt) as max_evening_request_cnt,
           round(avg(request_cnt), 3) as avg_evening_request_cnt,
           count(request_cnt) as count_evening_request_cnt
    from mts
    where part_of_day = 'evening'
    group by user_id
),
-- Statistics of the per-date request_cnt totals (all parts of day).
sub_6 as (
    select user_id,
           max(sum_date_request_cnt) as max_sum_date_request_cnt,
           min(sum_date_request_cnt) as min_sum_date_request_cnt,
           round(avg(sum_date_request_cnt), 3) as avg_sum_date_request_cnt
    from (
        select user_id, date, sum(request_cnt) as sum_date_request_cnt
        from mts
        group by user_id, date
    ) as t1
    group by user_id
),
-- Per-date totals restricted to a single part_of_day.
sub_7 as (
    select user_id,
           max(sum_date_request_cnt) as max_sum_date_day_request_cnt,
           min(sum_date_request_cnt) as min_sum_date_day_request_cnt,
           round(avg(sum_date_request_cnt), 3) as avg_sum_date_day_request_cnt
    from (
        select user_id, date, sum(request_cnt) as sum_date_request_cnt
        from mts
        where part_of_day = 'day'
        group by user_id, date
    ) as t2
    group by user_id
),
sub_8 as (
    select user_id,
           max(sum_date_request_cnt) as max_sum_date_night_request_cnt,
           min(sum_date_request_cnt) as min_sum_date_night_request_cnt,
           round(avg(sum_date_request_cnt), 3) as avg_sum_date_night_request_cnt
    from (
        select user_id, date, sum(request_cnt) as sum_date_request_cnt
        from mts
        where part_of_day = 'night'
        group by user_id, date
    ) as t3
    group by user_id
),
sub_9 as (
    select user_id,
           max(sum_date_request_cnt) as max_sum_date_morning_request_cnt,
           min(sum_date_request_cnt) as min_sum_date_morning_request_cnt,
           round(avg(sum_date_request_cnt), 3) as avg_sum_date_morning_request_cnt
    from (
        select user_id, date, sum(request_cnt) as sum_date_request_cnt
        from mts
        where part_of_day = 'morning'
        group by user_id, date
    ) as t4
    group by user_id
),
sub_10 as (
    select user_id,
           max(sum_date_request_cnt) as max_sum_date_evening_request_cnt,
           min(sum_date_request_cnt) as min_sum_date_evening_request_cnt,
           round(avg(sum_date_request_cnt), 3) as avg_sum_date_evening_request_cnt
    from (
        select user_id, date, sum(request_cnt) as sum_date_request_cnt
        from mts
        where part_of_day = 'evening'
        group by user_id, date
    ) as t5
    group by user_id
),
-- Number of distinct dates per user, overall and per part_of_day.
sub_11 as (
    select user_id, count(date) as count_date
    from (select user_id, date from mts group by user_id, date) as t6
    group by user_id
),
sub_12 as (
    select user_id, count(date) as count_day_date
    from (
        select user_id, date from mts
        where part_of_day = 'day'
        group by user_id, date
    ) as t7
    group by user_id
),
sub_13 as (
    select user_id, count(date) as count_night_date
    from (
        select user_id, date from mts
        where part_of_day = 'night'
        group by user_id, date
    ) as t8
    group by user_id
),
sub_14 as (
    select user_id, count(date) as count_morning_date
    from (
        select user_id, date from mts
        where part_of_day = 'morning'
        group by user_id, date
    ) as t9
    group by user_id
),
sub_15 as (
    select user_id, count(date) as count_evening_date
    from (
        select user_id, date from mts
        where part_of_day = 'evening'
        group by user_id, date
    ) as t10
    group by user_id
),
-- Number of distinct part_of_day values per (user, date), summarised per user.
sub_16 as (
    select user_id,
           avg(count_part_of_day_date) as avg_count_part_of_day_date,
           max(count_part_of_day_date) as max_count_part_of_day_date,
           min(count_part_of_day_date) as min_count_part_of_day_date
    from (
        select user_id, date, count(part_of_day) as count_part_of_day_date
        from (
            select user_id, date, part_of_day
            from mts
            group by user_id, date, part_of_day
        ) as t11
        group by user_id, date
    ) as t12
    group by user_id
),
-- Gap between consecutive distinct dates of a user. The first date of each
-- user has no predecessor, so its lag_date is NULL and is skipped by
-- avg/max/min.
-- NOTE(review): int(date - lag(date)) is kept exactly as written; its result
-- depends on the type of `date` and on the Spark version. If `date` is a DATE
-- column, datediff(date, lag(date) over (...)) is the portable spelling --
-- confirm against the input schema before switching.
sub_17 as (
    select user_id,
           avg(lag_date) as avg_lag_date,
           max(lag_date) as max_lag_date,
           min(lag_date) as min_lag_date
    from (
        select user_id,
               int(date - lag(date) over (partition by user_id order by date)) as lag_date
        from (select user_id, date from mts group by user_id, date) as t13
    ) as t14
    group by user_id
),
-- Number of distinct (non-null) regions / cities per user.
sub_18 as (
    select user_id, count(region_name) as count_region_name
    from (select user_id, region_name from mts group by user_id, region_name) as t15
    group by user_id
),
sub_19 as (
    select user_id, count(city_name) as count_city_name
    from (select user_id, city_name from mts group by user_id, city_name) as t16
    group by user_id
)

select user_id, age, is_male,
       -- sub_1
       max_request_cnt, avg_request_cnt, count_request_cnt,
       -- sub_2 .. sub_5
       max_night_request_cnt, avg_night_request_cnt, count_night_request_cnt,
       max_day_request_cnt, avg_day_request_cnt, count_day_request_cnt,
       max_morning_request_cnt, avg_morning_request_cnt, count_morning_request_cnt,
       max_evening_request_cnt, avg_evening_request_cnt, count_evening_request_cnt,
       -- sub_6 .. sub_10
       max_sum_date_request_cnt, min_sum_date_request_cnt, avg_sum_date_request_cnt,
       max_sum_date_day_request_cnt, min_sum_date_day_request_cnt, avg_sum_date_day_request_cnt,
       max_sum_date_night_request_cnt, min_sum_date_night_request_cnt, avg_sum_date_night_request_cnt,
       max_sum_date_morning_request_cnt, min_sum_date_morning_request_cnt, avg_sum_date_morning_request_cnt,
       max_sum_date_evening_request_cnt, min_sum_date_evening_request_cnt, avg_sum_date_evening_request_cnt,
       -- sub_11 .. sub_15
       count_date, count_day_date, count_night_date, count_morning_date, count_evening_date,
       -- sub_16
       avg_count_part_of_day_date, max_count_part_of_day_date, min_count_part_of_day_date,
       -- sub_17
       avg_lag_date, max_lag_date, min_lag_date,
       -- sub_18, sub_19
       count_region_name, count_city_name
from target
left join sub_1 using (user_id)
left join sub_2 using (user_id)
left join sub_3 using (user_id)
left join sub_4 using (user_id)
left join sub_5 using (user_id)
left join sub_6 using (user_id)
left join sub_7 using (user_id)
left join sub_8 using (user_id)
left join sub_9 using (user_id)
left join sub_10 using (user_id)
left join sub_11 using (user_id)
left join sub_12 using (user_id)
left join sub_13 using (user_id)
left join sub_14 using (user_id)
left join sub_15 using (user_id)
left join sub_16 using (user_id)
left join sub_17 using (user_id)
left join sub_18 using (user_id)
left join sub_19 using (user_id)
"""

# spark.sql only analyses the query here; execution is deferred until an
# action (such as a write) is triggered on data_learn.
data_learn = spark.sql(feature_query)

In [9]:
%%time
data_learn.write.parquet(path="data_out/data_for_learn_parquet_last/", mode="overwrite")

CPU times: user 28.3 ms, sys: 16.5 ms, total: 44.7 ms
Wall time: 3min 28s
