## Импорты

In [1]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')
import time
from pyspark.sql import SparkSession, DataFrameWriter
import pyspark.sql.functions as F

from pyspark.sql.functions import isnan, isnull

In [2]:
# Create (or reuse an already running) Spark session for this notebook,
# with Hive support enabled and the application name "extract-transform".
# NOTE(review): master "local" runs Spark with a single worker thread;
# "local[*]" would use all available cores — confirm which is intended.
spark = (
    SparkSession.builder
    .master("local")
    .enableHiveSupport()
    .appName("extract-transform")
    .getOrCreate()
)
# Last expression of the cell, so the notebook displays the session object.
spark

In [3]:
# Read the Parquet dataset from the input directory into a Spark DataFrame.
df = (
    spark.read
    .format("parquet")
    .load('data_in/competition_data_only_target_pqt/')
)

In [4]:
# Print the column names, data types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- region_name: string (nullable = true)
 |-- city_name: string (nullable = true)
 |-- url_host: string (nullable = true)
 |-- date: date (nullable = true)
 |-- part_of_day: string (nullable = true)
 |-- request_cnt: long (nullable = true)
 |-- user_id: long (nullable = true)



In [5]:
# Register the DataFrame as a temporary view named "mts" so that it can be
# queried as a SQL table through spark.sql.
df.createOrReplaceTempView("mts")

## Создаем признаки на основе request_cnt

### Создаем признаки на основе request_cnt и агрегирующих функций макс, среднее, подсчет

In [7]:
%%time
# Per-user aggregates of request_cnt over all rows of the "mts" view:
# maximum, mean rounded to 3 decimals, and count. Only the first 20 rows
# are printed; the result is not stored.
overall_stats_query = (
    "select user_id, max(request_cnt) as max_request_cnt, "
    "round(avg(request_cnt), 3) as avg_request_cnt, count(request_cnt)as count_request_cnt "
    "from mts group by user_id"
)
spark.sql(overall_stats_query).show(20)

+-------+---------------+---------------+-----------------+
|user_id|max_request_cnt|avg_request_cnt|count_request_cnt|
+-------+---------------+---------------+-----------------+
| 366198|              6|          2.352|             2017|
| 166581|             10|           1.77|             1424|
| 414978|              4|          1.351|               57|
| 158464|              6|          1.626|              878|
|  97699|              6|          1.676|              982|
|  79418|              6|           1.57|              790|
| 324572|              6|          1.745|             1381|
|  38510|              6|          1.654|              593|
| 102613|              4|          1.235|               98|
| 266206|              5|          1.287|              254|
| 299953|              6|          1.348|              417|
| 134319|              4|          1.627|              308|
| 116139|              6|          1.678|             1479|
| 152419|              6|          1.889

In [8]:
%%time
# Per-user aggregates of request_cnt (max, mean rounded to 3 decimals, count)
# restricted to rows where part_of_day = 'night'. Users with no rows for this
# part of the day do not appear in the result. Prints the first 20 rows.
night_stats_query = (
    "select user_id, max(request_cnt) as max_night_request_cnt, "
    "round(avg(request_cnt), 3) as avg_night_request_cnt, count(request_cnt)as count_night_request_cnt "
    "from mts where part_of_day = 'night' group by user_id"
)
spark.sql(night_stats_query).show(20)

+-------+---------------+---------------+-----------------+
|user_id|max_request_cnt|avg_request_cnt|count_request_cnt|
+-------+---------------+---------------+-----------------+
| 366198|              6|          2.297|              249|
| 166581|              8|          1.742|              163|
| 158464|              2|          1.091|               11|
|  97699|              5|          1.552|               67|
|  79418|              4|           1.25|               12|
| 324572|              2|          1.088|               34|
|  38510|              4|           1.38|               92|
| 102613|              1|            1.0|                4|
| 266206|              2|          1.025|               40|
| 299953|              1|            1.0|               13|
| 134319|              2|          1.214|               14|
| 116139|              5|           1.47|              181|
| 152419|              6|          2.135|              251|
| 296974|              6|          1.921

In [10]:
# List the distinct part_of_day values present in the "mts" view
# (at most 4 rows are printed).
distinct_parts_query = "select distinct part_of_day from mts"
spark.sql(distinct_parts_query).show(4)

+-----------+
|part_of_day|
+-----------+
|        day|
|      night|
|    morning|
|    evening|
+-----------+



In [11]:
%%time
# Per-user aggregates of request_cnt (max, mean rounded to 3 decimals, count)
# restricted to rows where part_of_day = 'day'. Users with no rows for this
# part of the day do not appear in the result. Prints the first 20 rows.
day_stats_query = (
    "select user_id, max(request_cnt) as max_day_request_cnt, "
    "round(avg(request_cnt), 3) as avg_day_request_cnt, count(request_cnt)as count_day_request_cnt "
    "from mts where part_of_day = 'day' group by user_id"
)
spark.sql(day_stats_query).show(20)

+-------+-------------------+-------------------+---------------------+
|user_id|max_day_request_cnt|avg_day_request_cnt|count_day_request_cnt|
+-------+-------------------+-------------------+---------------------+
| 366198|                  6|               2.43|                  660|
| 166581|                 10|              2.233|                  339|
| 414978|                  4|              1.556|                   36|
| 158464|                  5|              1.621|                  232|
|  97699|                  6|              1.763|                  295|
|  79418|                  6|              1.909|                  309|
| 324572|                  5|              1.891|                  476|
|  38510|                  6|              1.737|                  167|
| 102613|                  4|              1.245|                   49|
| 266206|                  4|              1.321|                   78|
| 299953|                  6|              1.391|               

In [12]:
%%time
# Per-user aggregates of request_cnt (max, mean rounded to 3 decimals, count)
# restricted to rows where part_of_day = 'morning'. Users with no rows for
# this part of the day do not appear in the result. Prints the first 20 rows.
morning_stats_query = (
    "select user_id, max(request_cnt) as max_morning_request_cnt, "
    "round(avg(request_cnt), 3) as avg_morning_request_cnt, count(request_cnt)as count_morning_request_cnt "
    "from mts where part_of_day = 'morning' group by user_id"
)
spark.sql(morning_stats_query).show(20)

+-------+-----------------------+-----------------------+-------------------------+
|user_id|max_morning_request_cnt|avg_morning_request_cnt|count_morning_request_cnt|
+-------+-----------------------+-----------------------+-------------------------+
| 366198|                      6|                  2.252|                      330|
| 166581|                      8|                  1.845|                      277|
| 414978|                      1|                    1.0|                       20|
| 158464|                      6|                  1.742|                      476|
|  97699|                      6|                  1.528|                      373|
|  79418|                      5|                  1.429|                      326|
| 324572|                      6|                  1.872|                      516|
|  38510|                      6|                  1.646|                      161|
| 102613|                      4|                  1.526|                   

In [14]:
%%time
# Per-user aggregates of request_cnt (max, mean rounded to 3 decimals, count)
# restricted to rows where part_of_day = 'evening'. Users with no rows for
# this part of the day do not appear in the result. Prints the first 20 rows.
evening_stats_query = (
    "select user_id, max(request_cnt) as max_evening_request_cnt, "
    "round(avg(request_cnt), 3) as avg_evening_request_cnt, count(request_cnt)as count_evening_request_cnt "
    "from mts where part_of_day = 'evening' group by user_id"
)
spark.sql(evening_stats_query).show(20)

+-------+-----------------------+-----------------------+-------------------------+
|user_id|max_evening_request_cnt|avg_evening_request_cnt|count_evening_request_cnt|
+-------+-----------------------+-----------------------+-------------------------+
| 366198|                      6|                  2.346|                      778|
| 166581|                      8|                  1.502|                      645|
| 414978|                      1|                    1.0|                        1|
| 158464|                      4|                  1.327|                      159|
|  97699|                      5|                   1.83|                      247|
|  79418|                      3|                  1.182|                      143|
| 324572|                      4|                  1.428|                      355|
|  38510|                      6|                  1.728|                      173|
| 102613|                      2|                  1.038|                   

In [18]:
%%time
# Two-level aggregation by date: the inner query sums request_cnt per
# (user_id, date); the outer query takes, per user, the max, min and mean
# (rounded to 3 decimals) of those daily sums. Prints the first 20 rows.
daily_stats_query = (
    "select user_id, max(sum_date_request_cnt) as max_sum_date_request_cnt, min(sum_date_request_cnt) as min_sum_date_request_cnt, "
    "round(avg(sum_date_request_cnt), 3) as avg_sum_date_request_cnt"
    " from (select user_id, date, sum(request_cnt) as sum_date_request_cnt from mts group by user_id, date) as t1"
    " group by user_id"
)
spark.sql(daily_stats_query).show(20)

+-------+------------------------+------------------------+------------------------+
|user_id|max_sum_date_request_cnt|min_sum_date_request_cnt|avg_sum_date_request_cnt|
+-------+------------------------+------------------------+------------------------+
|     26|                     152|                      11|                    71.8|
|     29|                     163|                       4|                  39.491|
|    474|                      52|                       4|                  21.295|
|    964|                     218|                       8|                  94.711|
|   1677|                      78|                       2|                  23.143|
|   1697|                     104|                     104|                   104.0|
|   1806|                       4|                       4|                     4.0|
|   2214|                     178|                       2|                  48.324|
|   2250|                     265|                       4|      