In [9]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window

## Подготовка данных

In [10]:
val df = spark.read.format("csv").option("header", "true").load("../data/ua_reactions.csv")
val excluded = sc.textFile("../data/excluded.txt").collect()
df.describe().show()

val exlude: (String => Boolean) = (s: String) => !excluded.contains(s)
val udfFiltering = udf(exlude)
val filtered = df.filter(udfFiltering($"ua"))
filtered.describe().show()

+-------+--------------------+-------------------+
|summary|                  ua|           is_click|
+-------+--------------------+-------------------+
|  count|                 747|                747|
|   mean|                null|0.49531459170013387|
| stddev|                null| 0.5003130401706647|
|    min|Mozilla/5.0 (Linu...|                  0|
|    max|Mozilla/5.0 (X11;...|                  1|
+-------+--------------------+-------------------+

+-------+--------------------+------------------+
|summary|                  ua|          is_click|
+-------+--------------------+------------------+
|  count|                 543|               543|
|   mean|                null|0.5174953959484346|
| stddev|                null|0.5001545770994442|
|    min|Mozilla/5.0 (Linu...|                 0|
|    max|Mozilla/5.0 (X11;...|                 1|
+-------+--------------------+------------------+



df = [ua: string, is_click: string]
excluded = Array(Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36, Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36, Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36, Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36, Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36, Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36, Mozilla/5.0 ...


Array(Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36, Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36, Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36, Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36, Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36, Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36, Mozilla/5.0 ...

## Задача №1
Взять все UA для которых было больше 5 показов рекламы посчитать CTR (clicks / shows) для каждого UA и вывести топ 5.

In [11]:
filtered
    .groupBy("ua")
    .agg(sum($"is_click"), count($"*"))
    .filter($"count(1)" > 5)
    .sort($"sum(is_click)".desc)
    .limit(5)
    .show()

+--------------------+-------------+--------+
|                  ua|sum(is_click)|count(1)|
+--------------------+-------------+--------+
|Mozilla/5.0 (Linu...|         17.0|      28|
|Mozilla/5.0 (Linu...|         16.0|      22|
|Mozilla/5.0 (Wind...|         16.0|      25|
|Mozilla/5.0 (Wind...|         14.0|      21|
|Mozilla/5.0 (Wind...|         13.0|      30|
+--------------------+-------------+--------+



## Задача №2
Вывести все UA на которых приходится 50% рекламных показов.

In [13]:
val w = Window.orderBy($"fraction".desc)  
filtered
    .groupBy("ua")
    .count()
    .withColumn(
        "fraction", 
        round(($"count" / sum($"count").over()) * 100, 2))
    .sort($"fraction".desc)
    .withColumn(
        "cum_sum_frac", 
        round(sum($"fraction").over(w), 2))
    .filter($"cum_sum_frac" <= 50)
    .show()

+--------------------+-----+--------+------------+
|                  ua|count|fraction|cum_sum_frac|
+--------------------+-----+--------+------------+
|Mozilla/5.0 (Wind...|   30|    5.52|        5.52|
|Mozilla/5.0 (Wind...|   29|    5.34|       10.86|
|Mozilla/5.0 (Linu...|   28|    5.16|       16.02|
|Mozilla/5.0 (Wind...|   27|    4.97|       25.96|
|Mozilla/5.0 (Wind...|   27|    4.97|       25.96|
|Mozilla/5.0 (Wind...|   26|    4.79|       30.75|
|Mozilla/5.0 (Wind...|   25|     4.6|       39.95|
|Mozilla/5.0 (Wind...|   25|     4.6|       39.95|
|Mozilla/5.0 (Wind...|   24|    4.42|       44.37|
|Mozilla/5.0 (Linu...|   23|    4.24|       48.61|
+--------------------+-----+--------+------------+



w = org.apache.spark.sql.expressions.WindowSpec@13c191ab


org.apache.spark.sql.expressions.WindowSpec@13c191ab