In [1]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')
import time
from pyspark.sql import SparkSession, DataFrameWriter
import pyspark.sql.functions as F

In [2]:
# Create (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .master("local")
    .enableHiveSupport()
    .appName("extract-transform")
    .getOrCreate()
)
spark

In [3]:
# Read the source data from the parquet directory
df = spark.read.parquet('data_in/competition_data_final_pqt')

In [4]:
# Print the column names, types and nullability of the loaded DataFrame
df.printSchema()

root
 |-- region_name: string (nullable = true)
 |-- city_name: string (nullable = true)
 |-- cpe_manufacturer_name: string (nullable = true)
 |-- cpe_model_name: string (nullable = true)
 |-- url_host: string (nullable = true)
 |-- cpe_type_cd: string (nullable = true)
 |-- cpe_model_os_type: string (nullable = true)
 |-- price: double (nullable = true)
 |-- date: date (nullable = true)
 |-- part_of_day: string (nullable = true)
 |-- request_cnt: long (nullable = true)
 |-- user_id: long (nullable = true)



In [5]:
# Register the DataFrame as a temporary view so it can be queried through spark.sql as table `mts`
df.createOrReplaceTempView("mts")

In [15]:
# Inspect the price values stored for one device model
spark.sql(
    "select price, cpe_model_name"
    " from mts"
    " where cpe_model_name = 'Mi 8 Lite'"
).show(50)

+-----+--------------+
|price|cpe_model_name|
+-----+--------------+
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|     Mi 8 Lite|
| null|    

In [6]:
# One row per user with the largest price recorded for that user
data = spark.sql(
    "select user_id, max(price) as price"
    " from mts"
    " group by user_id"
)

In [7]:
# Number of rows in `data`, i.e. the number of user_id groups
data.count()

415317

In [8]:
# Per-user activity features computed from the `mts` view.
#
# Every df_sub_N frame is grouped by user_id and is left-joined onto `data`
# (user_id, price) in numeric order at the end of the cell, so the columns of
# `data_learn` keep the same names and the same order as before.


def _part_filter(part_of_day=None):
    """Return the WHERE clause limiting `mts` to one part_of_day value ('' = all rows)."""
    return f" where part_of_day = '{part_of_day}'" if part_of_day else ""


def _request_stats(part_of_day=None):
    """Per user: mean of request_cnt (rounded to 3 digits) and count of non-null request_cnt."""
    tag = f"{part_of_day}_" if part_of_day else ""
    return spark.sql(
        "select user_id,"
        f" round(avg(request_cnt), 3) as avg_{tag}request_cnt,"
        f" count(request_cnt) as count_{tag}request_cnt"
        f" from mts{_part_filter(part_of_day)}"
        " group by user_id"
    )


def _daily_total_stats(part_of_day=None):
    """Per user: mean over dates of the per-date sum of request_cnt (rounded to 3 digits)."""
    prefix = f"{part_of_day}_" if part_of_day else ""
    return spark.sql(
        "select user_id,"
        f" round(avg(sum_date_request_cnt), 3) as {prefix}avg_sum_date_request_cnt"
        " from (select user_id, date, sum(request_cnt) as sum_date_request_cnt"
        f" from mts{_part_filter(part_of_day)} group by user_id, date) as daily"
        " group by user_id"
    )


def _active_date_count(part_of_day=None):
    """Per user: number of distinct non-null dates that have at least one row."""
    tag = f"{part_of_day}_" if part_of_day else ""
    return spark.sql(
        f"select user_id, count(date) as count_{tag}date"
        f" from (select user_id, date from mts{_part_filter(part_of_day)}"
        " group by user_id, date) as active_dates"
        " group by user_id"
    )


def _distinct_value_count(column):
    """Per user: number of distinct non-null values of `column`, named count_<column>."""
    return spark.sql(
        f"select user_id, count({column}) as count_{column}"
        f" from (select user_id, {column} from mts group by user_id, {column}) as pairs"
        " group by user_id"
    )


# Request statistics over all rows, then per part of day.
df_sub_1 = _request_stats()
df_sub_2 = _request_stats("night")
df_sub_3 = _request_stats("day")
df_sub_4 = _request_stats("morning")
df_sub_5 = _request_stats("evening")

# Average daily request volume over all rows, then per part of day.
df_sub_6 = _daily_total_stats()
df_sub_7 = _daily_total_stats("day")
df_sub_8 = _daily_total_stats("night")
df_sub_9 = _daily_total_stats("morning")
df_sub_10 = _daily_total_stats("evening")

# Number of active dates over all rows, then per part of day.
df_sub_11 = _active_date_count()
df_sub_12 = _active_date_count("day")
df_sub_13 = _active_date_count("night")
df_sub_14 = _active_date_count("morning")
df_sub_15 = _active_date_count("evening")

# Average number of distinct parts of day seen per active date.
df_sub_16 = spark.sql(
    "select user_id, avg(count_part_of_day_date) as avg_count_part_of_day_date"
    " from (select user_id, date, count(part_of_day) as count_part_of_day_date"
    " from (select user_id, date, part_of_day from mts group by user_id, date, part_of_day) as t11"
    " group by user_id, date) as t12"
    " group by user_id"
)

# Gap in days between consecutive active dates of a user (avg / max / min).
# datediff() yields the gap as an integer number of days directly; the former
# int(date - lag(date)) depended on casting the result of a date subtraction,
# which not every Spark version accepts.
# The first date of a user has no predecessor, so lag() gives NULL for it.
# The ORDER BY clauses were dropped: the window defines its own ordering and the
# outer result is only joined, so those sorts did not affect the result.
df_sub_17 = spark.sql(
    "select user_id, avg(lag_date) as avg_lag_date,"
    " max(lag_date) as max_lag_date, min(lag_date) as min_lag_date"
    " from (select user_id,"
    " datediff(date, lag(date) over (partition by user_id order by date)) as lag_date"
    " from (select user_id, date from mts group by user_id, date) as t13) as t14"
    " group by user_id"
)

# Number of distinct regions / cities per user (result is joined, so no ORDER BY).
df_sub_18 = _distinct_value_count("region_name")
df_sub_19 = _distinct_value_count("city_name")

# Assemble the training frame: the target columns first, then each feature
# frame in order, keeping every user from `data`.
data_learn = data.select("user_id", "price")
for feature_frame in (
    df_sub_1, df_sub_2, df_sub_3, df_sub_4, df_sub_5,
    df_sub_6, df_sub_7, df_sub_8, df_sub_9, df_sub_10,
    df_sub_11, df_sub_12, df_sub_13, df_sub_14, df_sub_15,
    df_sub_16, df_sub_17, df_sub_18, df_sub_19,
):
    data_learn = data_learn.join(feature_frame, "user_id", "left")

In [9]:
%%time
data_learn.write.parquet(path="data_out/data_transform_last", mode="overwrite")

CPU times: user 146 ms, sys: 76.9 ms, total: 223 ms
Wall time: 18min 43s


In [10]:
# Reload the already exported features instead of rebuilding data_learn from the
# queries above; the new features are then added on top of this frame
data_last = spark.read.parquet('data_out/data_transform_last')

In [11]:
%%time
df_sub_20 = spark.sql("select user_id, url_host as top_1_url_sum_request_cnt"
          " from (select user_id, url_host, ROW_NUMBER() over (partition by user_id order by sum_request_cnt desc) as rank_sum_request_cnt "
          " from (select user_id, url_host, sum(request_cnt) as sum_request_cnt"
          " from mts group by user_id, url_host) as t1) as t2"
          " where rank_sum_request_cnt = 1")
data_learn = data_last.join(df_sub_20, "user_id", 'left')
df_sub_21 = spark.sql("select user_id, url_host as top_2_url_sum_request_cnt"
          " from (select user_id, url_host, ROW_NUMBER() over (partition by user_id order by sum_request_cnt desc) as rank_sum_request_cnt "
          " from (select user_id, url_host, sum(request_cnt) as sum_request_cnt"
          " from mts group by user_id, url_host) as t1) as t2"
          " where rank_sum_request_cnt = 2")
data_learn = data_learn.join(df_sub_21, "user_id", 'left')
df_sub_22 = spark.sql("select user_id, url_host as top_3_url_sum_request_cnt"
          " from (select user_id, url_host, ROW_NUMBER() over (partition by user_id order by sum_request_cnt desc) as rank_sum_request_cnt "
          " from (select user_id, url_host, sum(request_cnt) as sum_request_cnt"
          " from mts group by user_id, url_host) as t1) as t2"
          " where rank_sum_request_cnt = 3")
data_learn = data_learn.join(df_sub_22, "user_id", 'left')
df_sub_23 = spark.sql("select user_id, url_host as top_4_url_sum_request_cnt"
          " from (select user_id, url_host, ROW_NUMBER() over (partition by user_id order by sum_request_cnt desc) as rank_sum_request_cnt "
          " from (select user_id, url_host, sum(request_cnt) as sum_request_cnt"
          " from mts group by user_id, url_host) as t1) as t2"
          " where rank_sum_request_cnt = 4")
data_learn = data_learn.join(df_sub_23, "user_id", 'left')
df_sub_24 = spark.sql("select user_id, url_host as top_5_url_sum_request_cnt"
          " from (select user_id, url_host, ROW_NUMBER() over (partition by user_id order by sum_request_cnt desc) as rank_sum_request_cnt "
          " from (select user_id, url_host, sum(request_cnt) as sum_request_cnt"
          " from mts group by user_id, url_host) as t1) as t2"
          " where rank_sum_request_cnt = 5")
data_learn = data_learn.join(df_sub_24, "user_id", 'left')
df_sub_25 = spark.sql("select user_id, url_host as top_1_url_count_request_cnt"
          " from (select user_id, url_host, ROW_NUMBER() over (partition by user_id order by count_url_host desc) as rank_count_url_host "
          " from (select user_id, url_host, count(url_host) as count_url_host"
          " from mts group by user_id, url_host) as t1) as t2"
          " where rank_count_url_host = 1")
data_learn = data_learn.join(df_sub_25, "user_id", 'left')
df_sub_26 = spark.sql("select user_id, url_host as top_2_url_count_request_cnt"
          " from (select user_id, url_host, ROW_NUMBER() over (partition by user_id order by count_url_host desc) as rank_count_url_host "
          " from (select user_id, url_host, count(url_host) as count_url_host"
          " from mts group by user_id, url_host) as t1) as t2"
          " where rank_count_url_host = 2")
data_learn = data_learn.join(df_sub_26, "user_id", 'left')
df_sub_27 = spark.sql("select user_id, url_host as top_3_url_count_request_cnt"
          " from (select user_id, url_host, ROW_NUMBER() over (partition by user_id order by count_url_host desc) as rank_count_url_host "
          " from (select user_id, url_host, count(url_host) as count_url_host"
          " from mts group by user_id, url_host) as t1) as t2"
          " where rank_count_url_host = 3")
data_learn = data_learn.join(df_sub_27, "user_id", 'left')
df_sub_28 = spark.sql("select user_id, region_name as region_name"
          " from (select user_id, region_name, ROW_NUMBER() over (partition by user_id order by count_url_host desc) as rank_count_url_host "
          " from (select user_id, region_name, count(url_host) as count_url_host"
          " from mts group by user_id, region_name) as t1) as t2"
          " where rank_count_url_host = 1")
data_learn = data_learn.join(df_sub_28, "user_id", 'left')
df_sub_29 = spark.sql("select user_id, cpe_model_name from mts group by user_id, cpe_model_name")
data_learn = data_learn.join(df_sub_29, "user_id", 'left')

CPU times: user 18.7 ms, sys: 6.26 ms, total: 25 ms
Wall time: 438 ms


In [12]:
%%time
data_learn.write.parquet(path="data_out/data_transform_last_1", mode="overwrite")

CPU times: user 109 ms, sys: 49.3 ms, total: 158 ms
Wall time: 12min 52s


In [13]:
# Read an exported feature set back for a sanity check.
# NOTE(review): this loads data_out/data_transform_last, not the
# data_out/data_transform_last_1 export written by the previous cell -- confirm
# which file is meant to be checked.
data_test = spark.read.parquet('data_out/data_transform_last')

In [14]:
# Number of rows in the re-loaded export
data_test.count()

415317

In [8]:
%%time
df_sub_28 = spark.sql("select user_id, region_name, ROW_NUMBER() over (partition by user_id order by count_url_host desc) as rank_count_url_host "
          " from (select user_id, region_name, count(url_host) as count_url_host"
          " from mts group by user_id, region_name) as t1")
df_sub_28.show(30)

+-------+--------------------+-------------------+
|user_id|         region_name|rank_count_url_host|
+-------+--------------------+-------------------+
|      0|              Москва|                  1|
|      1|              Москва|                  1|
|      1|     Санкт-Петербург|                  2|
|      1|  Московская область|                  3|
|      2|     Республика Коми|                  1|
|      3| Воронежская область|                  1|
|      4|  Краснодарский край|                  1|
|      4|Республика Башкор...|                  2|
|      4|Волгоградская обл...|                  3|
|      4|   Самарская область|                  4|
|      4| Саратовская область|                  5|
|      5|Ленинградская обл...|                  1|
|      5|     Санкт-Петербург|                  2|
|      6|  Московская область|                  1|
|      7|              Москва|                  1|
|      7|  Московская область|                  2|
|      8|Нижегородская обл...| 

In [10]:
%%time
df_sub_28 = spark.sql("select user_id, cpe_model_name from mts group by user_id, cpe_model_name")
print(df_sub_28.count())
df_sub_28.show(30)

415317
+-------+--------------------+
|user_id|      cpe_model_name|
+-------+--------------------+
| 222203|   iPhone 12 Pro Max|
|  83980|       Honor 7S Dual|
| 106223|       iPhone 6 Plus|
| 250496|Galaxy A71 Dual R...|
|  33908|           iPhone XR|
| 323731|       iPhone 7 Plus|
| 131179|           iPhone 11|
| 106000|           iPhone SE|
|  94313| Galaxy J7 2017 Dual|
| 179669|  Galaxy J3 LTE Dual|
|  84503|            Redmi 4X|
| 303462|      Galaxy A32 LTE|
|   2732|            iPhone 7|
| 208843|            iPhone 7|
| 307944|     Galaxy A10 Dual|
| 355218|       iPhone 12 Pro|
| 336597| Galaxy J4 2018 Dual|
| 281195|     Galaxy A40 Dual|
| 120897|           iPhone XR|
| 267029|             Redmi 8|
| 343268|     Galaxy A50 Dual|
| 409254|             Y7 2019|
|  56820|        Redmi 5 Plus|
| 401639|        Redmi 5 Plus|
| 212281|            Redmi 9C|
| 143308|           iPhone 11|
| 303765|             Y9 2018|
| 271161|                MI 6|
|  46647|           iPhone 11|
|

415317


415317

In [None]:
# googleads.g.doubl... 38 mail.yandex.ru  144

In [40]:
%%time
spark.sql("select user_id, url_host, ROW_NUMBER() over (partition by user_id order by sum_request_cnt desc) as rank_sum_request_cnt, "
          " ROW_NUMBER() over (partition by user_id order by count_url_host desc) as rank_count_url_host"
          " from (select user_id, url_host, sum(request_cnt) as sum_request_cnt, count(url_host) as count_url_host"
          " from mts group by user_id, url_host) as t1").orderBy("user_id").show(30)

+-------+--------------------+--------------------+-------------------+
|user_id|            url_host|rank_sum_request_cnt|rank_count_url_host|
+-------+--------------------+--------------------+-------------------+
|      0|googleads.g.doubl...|                   1|                  1|
|      0|node1.online.sber...|                   3|                  2|
|      0|  online.sberbank.ru|                   5|                  3|
|      0|        yastatic.net|                   2|                  4|
|      0|avatars.mds.yande...|                   4|                  5|
|      0|              vk.com|                   6|                  6|
|      0|tpc.googlesyndica...|                   7|                  7|
|      0|           yandex.ru|                   9|                  8|
|      0|         s0.2mdn.net|                  11|                  9|
|      0|          ad.mail.ru|                  12|                 10|
|      0|        ads.adfox.ru|                  13|             

In [15]:
# data_learn = spark.sql("with sub_1 as"
#     " (select user_id, max(request_cnt) as max_request_cnt, round(avg(request_cnt), 3) as avg_request_cnt, count(request_cnt)as count_request_cnt "
#     " from mts group by user_id)"
#     ", sub_2 as (select user_id, max(request_cnt) as max_night_request_cnt, "
#     " round(avg(request_cnt), 3) as avg_night_request_cnt, count(request_cnt)as count_night_request_cnt "
#     " from mts where part_of_day = 'night' group by user_id)"
    # ", sub_3 as (select user_id, max(request_cnt) as max_day_request_cnt, "
    # " round(avg(request_cnt), 3) as avg_day_request_cnt, count(request_cnt)as count_day_request_cnt "
    # " from mts where part_of_day = 'day' group by user_id)"
    # ", sub_4 as (select user_id, max(request_cnt) as max_morning_request_cnt, "
    # " round(avg(request_cnt), 3) as avg_morning_request_cnt, count(request_cnt)as count_morning_request_cnt "
    # " from mts where part_of_day = 'morning' group by user_id)"
    # ", sub_5 as (select user_id, max(request_cnt) as max_evening_request_cnt, "
    # " round(avg(request_cnt), 3) as avg_evening_request_cnt, count(request_cnt)as count_evening_request_cnt "
    # " from mts where part_of_day = 'evening' group by user_id)"
    # ", sub_6 as (select user_id, max(sum_date_request_cnt) as max_sum_date_request_cnt, min(sum_date_request_cnt) as min_sum_date_request_cnt, "
    # " round(avg(sum_date_request_cnt), 3) as avg_sum_date_request_cnt"
    # " from (select user_id, date, sum(request_cnt) as sum_date_request_cnt from mts group by user_id, date) as t1"
    # " group by user_id)"
    # ", sub_7 as (select user_id, max(sum_date_request_cnt) as max_sum_date_day_request_cnt, min(sum_date_request_cnt) as min_sum_date_day_request_cnt, "
    # " round(avg(sum_date_request_cnt), 3) as avg_sum_date_day_request_cnt"
    # " from (select user_id, date, sum(request_cnt) as sum_date_request_cnt from mts where part_of_day = 'day' group by user_id, date) as t2"
    # " group by user_id)"
    # ", sub_8 as (select user_id, max(sum_date_request_cnt) as max_sum_date_night_request_cnt, min(sum_date_request_cnt) as min_sum_date_night_request_cnt, "
    # "round(avg(sum_date_request_cnt), 3) as avg_sum_date_night_request_cnt"
    # " from (select user_id, date, sum(request_cnt) as sum_date_request_cnt from mts where part_of_day = 'night' group by user_id, date) as t3"
    # " group by user_id)"
    # ", sub_9 as (select user_id, max(sum_date_request_cnt) as max_sum_date_morning_request_cnt, min(sum_date_request_cnt) as min_sum_date_morning_request_cnt, "
    # " round(avg(sum_date_request_cnt), 3) as avg_sum_date_morning_request_cnt"
    # " from (select user_id, date, sum(request_cnt) as sum_date_request_cnt from mts where part_of_day = 'morning' group by user_id, date) as t4"
    # " group by user_id)"
    # ", sub_10 as (select user_id, max(sum_date_request_cnt) as max_sum_date_evening_request_cnt, min(sum_date_request_cnt) as min_sum_date_evening_request_cnt, "
    # " round(avg(sum_date_request_cnt), 3) as avg_sum_date_evening_request_cnt"
    # " from (select user_id, date, sum(request_cnt) as sum_date_request_cnt from mts where part_of_day = 'evening' group by user_id, date) as t5"
    # " group by user_id)"
    # ", sub_11 as (select user_id, count(date) as count_date"
    # " from (select user_id, date from mts group by user_id, date) as t6 group by user_id)"
    # ", sub_12 as (select user_id, count(date) as count_day_date"
    # " from (select user_id, date from mts where part_of_day = 'day' group by user_id, date) as t7 group by user_id)"
    # ", sub_13 as (select user_id, count(date) as count_night_date"
    # " from (select user_id, date from mts where part_of_day = 'night' group by user_id, date) as t8 group by user_id)"
    # ", sub_14 as (select user_id, count(date) as count_morning_date"
    # " from (select user_id, date from mts where part_of_day = 'morning' group by user_id, date) as t9 group by user_id)"
    # ", sub_15 as (select user_id, count(date) as count_evening_date"
    # " from (select user_id, date from mts where part_of_day = 'evening' group by user_id, date) as t10 group by user_id)"
    # ", sub_16 as (select user_id, avg(count_part_of_day_date) as avg_count_part_of_day_date,"
    # " max(count_part_of_day_date) as max_count_part_of_day_date, min(count_part_of_day_date) as min_count_part_of_day_date"
    # " from (select user_id, date, count(part_of_day) as count_part_of_day_date"
    # " from (select user_id, date, part_of_day from mts group by user_id, date, part_of_day) as t11"
    # " group by user_id, date) as t12 group by user_id)"
    # ", sub_17 as (select user_id, avg(lag_date) as avg_lag_date, max(lag_date) as max_lag_date, min(lag_date) as min_lag_date"
    # " from (select user_id, int(date - lag(date) over (partition by user_id order by date)) as lag_date"
    # " from (select user_id, date from mts group by user_id, date order by user_id, date) as t13) as t14"
    # " group by user_id order by user_id)"
    # ", sub_18 as (select user_id, count(region_name) as count_region_name"
    # " from (select user_id, region_name from mts group by user_id, region_name) as t15"
    # " group by user_id order by user_id)"
    # ", sub_19 as (select user_id, count(city_name) as count_city_name"
    # " from (select user_id, city_name from mts group by user_id, city_name) as t16"
    # " group by user_id order by user_id)"                     
    # ).show(3, vertical=True)

-RECORD 0-------------------
 user_id           | 99002  
 age               | 41.0   
 is_male           | 0      
 max_request_cnt   | 8      
 avg_request_cnt   | 1.34   
 count_request_cnt | 639    
-RECORD 1-------------------
 user_id           | 155506 
 age               | 33.0   
 is_male           | 0      
 max_request_cnt   | 5      
 avg_request_cnt   | 1.727  
 count_request_cnt | 22     
-RECORD 2-------------------
 user_id           | 188276 
 age               | 35.0   
 is_male           | 1      
 max_request_cnt   | 4      
 avg_request_cnt   | 1.414  
 count_request_cnt | 111    
only showing top 3 rows



In [11]:
# Show the first 3 rows of the re-loaded export, one field per line
data_test.show(3, vertical=True)

-RECORD 0------------------
 user_id           | 0     
 age               | 35.0  
 is_male           | 0     
 max_request_cnt   | 5     
 avg_request_cnt   | 1.473 
 count_request_cnt | 131   
-RECORD 1------------------
 user_id           | 1     
 age               | 41.0  
 is_male           | 0     
 max_request_cnt   | 6     
 avg_request_cnt   | 1.496 
 count_request_cnt | 700   
-RECORD 2------------------
 user_id           | 2     
 age               | 25.0  
 is_male           | 0     
 max_request_cnt   | 4     
 avg_request_cnt   | 1.154 
 count_request_cnt | 356   
only showing top 3 rows

