In [1]:
import os

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

# Build a local-mode Spark session (master "local[*]") step by step, requesting
# 22g for both the driver and the executor memory settings.
session_builder = SparkSession.builder.master("local[*]")
session_builder = session_builder.config("spark.driver.memory", "22g")
session_builder = session_builder.config("spark.executor.memory", "22g")
spark = session_builder.getOrCreate()

In [2]:
# Root folder of the dataset. Set the AVITO_DATA_DIR environment variable to
# point the notebook at another location; when it is unset (or empty) the
# original local path is used.
BASE_PATH = os.environ.get("AVITO_DATA_DIR") or "C:/BigData/avito-context-ad-clicks/"
# Later cells build file paths with plain string concatenation
# (BASE_PATH + "File.tsv/File.tsv"), so the root must end with a separator.
if not BASE_PATH.endswith(("/", "\\")):
    BASE_PATH += "/"

In [3]:
# Load the tab-separated AdsInfo file. The schema is declared explicitly (same
# column names and types that schema inference reported for this file) so Spark
# does not need an additional pass over the data just to guess the types.
ads_info = spark.read.csv(
    BASE_PATH + "AdsInfo.tsv/AdsInfo.tsv",
    schema=T.StructType([
        T.StructField("AdID", T.IntegerType()),
        T.StructField("LocationID", T.IntegerType()),
        T.StructField("CategoryID", T.IntegerType()),
        T.StructField("Params", T.StringType()),
        T.StructField("Price", T.DoubleType()),
        T.StructField("Title", T.StringType()),
        T.StructField("IsContext", T.IntegerType()),
    ]),
    header=True,  # the first line holds the column names, not data
    sep="\t",
)

In [4]:
# Print the column names, types and nullability of the ads_info DataFrame.
ads_info.printSchema()

root
 |-- AdID: integer (nullable = true)
 |-- LocationID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- Params: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Title: string (nullable = true)
 |-- IsContext: integer (nullable = true)



In [5]:
# Preview the first five rows; cell text is cut off at 50 characters.
ads_info.show(n=5, truncate=50)

+----+----------+----------+--------------------------------------------------+--------+---------------------------------------+---------+
|AdID|LocationID|CategoryID|                                            Params|   Price|                                  Title|IsContext|
+----+----------+----------+--------------------------------------------------+--------+---------------------------------------+---------+
|   1|       343|        43|{1283:'С пробегом', 633:'Синий', 1159:0, 210:'T...|160000.0|                    Toyota Estima, 1993|        0|
|   2|       992|        34|{817:'Кузов', 5:'Запчасти', 598:'Для автомобилей'}|   750.0|Передние брызговики Форд Фокус 2 родные|        0|
|   3|      3771|        53|                              {181:'Промышленное'}| 18000.0|                               Дровокол|        0|
|   4|      4294|        57|                      {130:'Приборы и аксессуары'}|  1500.0|                 Продам ходули складные|        0|
|   5|      1344|        34

In [6]:
# Load the tab-separated Category file. The schema is declared explicitly (same
# column names and types that schema inference reported for this file) so Spark
# does not need an additional pass over the data just to guess the types.
category = spark.read.csv(
    BASE_PATH + "Category.tsv/Category.tsv",
    schema=T.StructType([
        T.StructField("CategoryID", T.IntegerType()),
        T.StructField("Level", T.IntegerType()),
        T.StructField("ParentCategoryID", T.IntegerType()),
        T.StructField("SubcategoryID", T.IntegerType()),
    ]),
    header=True,  # the first line holds the column names, not data
    sep="\t",
)
category.printSchema()

root
 |-- CategoryID: integer (nullable = true)
 |-- Level: integer (nullable = true)
 |-- ParentCategoryID: integer (nullable = true)
 |-- SubcategoryID: integer (nullable = true)



In [7]:
# Preview the first five rows; cell text is cut off at 50 characters.
category.show(n=5, truncate=50)

+----------+-----+----------------+-------------+
|CategoryID|Level|ParentCategoryID|SubcategoryID|
+----------+-----+----------------+-------------+
|         0|    1|              10|           45|
|         1|    2|               9|           45|
|         2|    3|              12|            5|
|         3|    3|               9|           25|
|         4|    3|               2|           39|
+----------+-----+----------------+-------------+
only showing top 5 rows



In [8]:
# Load the tab-separated Location file. The schema is declared explicitly (same
# column names and types that schema inference reported for this file) so Spark
# does not need an additional pass over the data just to guess the types.
location = spark.read.csv(
    BASE_PATH + "Location.tsv/Location.tsv",
    schema=T.StructType([
        T.StructField("LocationID", T.IntegerType()),
        T.StructField("Level", T.IntegerType()),
        T.StructField("RegionID", T.IntegerType()),
        T.StructField("CityID", T.IntegerType()),
    ]),
    header=True,  # the first line holds the column names, not data
    sep="\t",
)
location.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Level: integer (nullable = true)
 |-- RegionID: integer (nullable = true)
 |-- CityID: integer (nullable = true)



In [9]:
# Preview the first five rows; cell text is cut off at 50 characters.
location.show(n=5, truncate=50)

+----------+-----+--------+------+
|LocationID|Level|RegionID|CityID|
+----------+-----+--------+------+
|         7|    3|      83|  2386|
|        23|    3|      28|  3224|
|        26|    3|      41|  1316|
|        30|    3|      63|  2565|
|        32|    3|      28|  2819|
+----------+-----+--------+------+
only showing top 5 rows



In [10]:
# Load the tab-separated PhoneRequestsStream file. The schema is declared
# explicitly (same column names and types that schema inference reported for
# this file) so Spark does not need an additional pass over the data just to
# guess the types.
phone_request_stream = spark.read.csv(
    BASE_PATH + "PhoneRequestsStream.tsv/PhoneRequestsStream.tsv",
    schema=T.StructType([
        T.StructField("UserID", T.IntegerType()),
        T.StructField("IPID", T.IntegerType()),
        T.StructField("AdID", T.IntegerType()),
        T.StructField("PhoneRequestDate", T.TimestampType()),
    ]),
    header=True,  # the first line holds the column names, not data
    sep="\t",
)
phone_request_stream.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- IPID: integer (nullable = true)
 |-- AdID: integer (nullable = true)
 |-- PhoneRequestDate: timestamp (nullable = true)



In [11]:
# Preview the first five rows; cell text is cut off at 50 characters.
phone_request_stream.show(n=5, truncate=50)

+-------+-------+--------+-------------------+
| UserID|   IPID|    AdID|   PhoneRequestDate|
+-------+-------+--------+-------------------+
| 352278|2135799|11720717|2015-04-25 00:00:00|
| 392193| 298552|32569552|2015-04-25 00:00:00|
| 670687|1426242|22443326|2015-04-25 00:00:00|
|1504499| 437051|24774519|2015-04-25 00:00:00|
|2648778| 818978|16455042|2015-04-25 00:00:00|
+-------+-------+--------+-------------------+
only showing top 5 rows



In [12]:
# Load the tab-separated SearchInfo file. The schema is declared explicitly
# (same column names and types that schema inference reported for this file) so
# Spark does not need an additional pass over the data just to guess the types.
search_info = spark.read.csv(
    BASE_PATH + "SearchInfo.tsv/SearchInfo.tsv",
    schema=T.StructType([
        T.StructField("SearchID", T.IntegerType()),
        T.StructField("SearchDate", T.TimestampType()),
        T.StructField("IPID", T.IntegerType()),
        T.StructField("UserID", T.IntegerType()),
        T.StructField("IsUserLoggedOn", T.IntegerType()),
        T.StructField("SearchQuery", T.StringType()),
        T.StructField("LocationID", T.IntegerType()),
        T.StructField("CategoryID", T.IntegerType()),
        T.StructField("SearchParams", T.StringType()),
    ]),
    header=True,  # the first line holds the column names, not data
    sep="\t",
)
search_info.printSchema()

root
 |-- SearchID: integer (nullable = true)
 |-- SearchDate: timestamp (nullable = true)
 |-- IPID: integer (nullable = true)
 |-- UserID: integer (nullable = true)
 |-- IsUserLoggedOn: integer (nullable = true)
 |-- SearchQuery: string (nullable = true)
 |-- LocationID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- SearchParams: string (nullable = true)



In [13]:
# Preview the first five rows; cell text is cut off at 50 characters.
search_info.show(n=5, truncate=50)

+--------+-------------------+-------+-------+--------------+-----------+----------+----------+-------------------------------------------+
|SearchID|         SearchDate|   IPID| UserID|IsUserLoggedOn|SearchQuery|LocationID|CategoryID|                               SearchParams|
+--------+-------------------+-------+-------+--------------+-----------+----------+----------+-------------------------------------------+
|       1|2015-05-18 19:54:32|1717090|3640266|             0|       NULL|      1729|         5|                                       NULL|
|       2|2015-05-12 14:21:28|1731568| 769304|             0|       NULL|       697|        50|                                       NULL|
|       3|2015-05-12 07:09:42| 793143| 640089|             0|       NULL|      1261|        12|                                       NULL|
|       4|2015-05-10 18:11:01| 898705|3573776|             0|       NULL|      3960|        22|{83:'Обувь', 175:'Женская одежда', 88:'38'}|
|       5|2015-04-25

In [14]:
# Load the tab-separated trainSearchStream file. The schema is declared
# explicitly (same column names and types that schema inference reported for
# this file) so Spark does not need an additional pass over the data just to
# guess the types.
train_search_stream = spark.read.csv(
    BASE_PATH + "trainSearchStream.tsv/trainSearchStream.tsv",
    schema=T.StructType([
        T.StructField("SearchID", T.IntegerType()),
        T.StructField("AdID", T.IntegerType()),
        T.StructField("Position", T.IntegerType()),
        T.StructField("ObjectType", T.IntegerType()),
        T.StructField("HistCTR", T.DoubleType()),
        T.StructField("IsClick", T.IntegerType()),
    ]),
    header=True,  # the first line holds the column names, not data
    sep="\t",
)
train_search_stream.printSchema()

root
 |-- SearchID: integer (nullable = true)
 |-- AdID: integer (nullable = true)
 |-- Position: integer (nullable = true)
 |-- ObjectType: integer (nullable = true)
 |-- HistCTR: double (nullable = true)
 |-- IsClick: integer (nullable = true)



In [16]:
# Preview the first five rows; cell text is cut off at 50 characters.
train_search_stream.show(n=5, truncate=50)

+--------+--------+--------+----------+--------+-------+
|SearchID|    AdID|Position|ObjectType| HistCTR|IsClick|
+--------+--------+--------+----------+--------+-------+
|       2|11441863|       1|         3|0.001804|      0|
|       2|22968355|       7|         3|0.004723|      0|
|       3|  212187|       7|         3|0.029701|      0|
|       3|34084553|       1|         3|  0.0043|      0|
|       3|36256251|       2|         2|    NULL|   NULL|
+--------+--------+--------+----------+--------+-------+
only showing top 5 rows



In [17]:
# Load the tab-separated UserInfo file. The schema is declared explicitly (same
# column names and types that schema inference reported for this file) so Spark
# does not need an additional pass over the data just to guess the types.
user_info = spark.read.csv(
    BASE_PATH + "UserInfo.tsv/UserInfo.tsv",
    schema=T.StructType([
        T.StructField("UserID", T.IntegerType()),
        T.StructField("UserAgentID", T.IntegerType()),
        T.StructField("UserAgentOSID", T.IntegerType()),
        T.StructField("UserDeviceID", T.IntegerType()),
        T.StructField("UserAgentFamilyID", T.IntegerType()),
    ]),
    header=True,  # the first line holds the column names, not data
    sep="\t",
)
user_info.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- UserAgentID: integer (nullable = true)
 |-- UserAgentOSID: integer (nullable = true)
 |-- UserDeviceID: integer (nullable = true)
 |-- UserAgentFamilyID: integer (nullable = true)



In [18]:
# Preview the first five rows; cell text is cut off at 50 characters.
user_info.show(n=5, truncate=50)

+------+-----------+-------------+------------+-----------------+
|UserID|UserAgentID|UserAgentOSID|UserDeviceID|UserAgentFamilyID|
+------+-----------+-------------+------------+-----------------+
|     1|      44073|           30|        2019|                9|
|     2|      12505|           20|        2014|               85|
|     3|      24256|           20|        2014|               64|
|     4|      57133|           20|        2014|               25|
|     5|      57133|           20|        2014|               25|
+------+-----------+-------------+------------+-----------------+
only showing top 5 rows



In [19]:
# Load the tab-separated VisitsStream file. The schema is declared explicitly
# (same column names and types that schema inference reported for this file) so
# Spark does not need an additional pass over the data just to guess the types.
visits_stream = spark.read.csv(
    BASE_PATH + "VisitsStream.tsv/VisitsStream.tsv",
    schema=T.StructType([
        T.StructField("UserID", T.IntegerType()),
        T.StructField("IPID", T.IntegerType()),
        T.StructField("AdID", T.IntegerType()),
        T.StructField("ViewDate", T.TimestampType()),
    ]),
    header=True,  # the first line holds the column names, not data
    sep="\t",
)
visits_stream.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- IPID: integer (nullable = true)
 |-- AdID: integer (nullable = true)
 |-- ViewDate: timestamp (nullable = true)



In [20]:
# Preview the first five rows; cell text is cut off at 50 characters.
visits_stream.show(n=5, truncate=50)

+------+-------+--------+-------------------+
|UserID|   IPID|    AdID|           ViewDate|
+------+-------+--------+-------------------+
| 59703|1259356|  469877|2015-04-25 00:00:00|
|154389|1846749|27252551|2015-04-25 00:00:00|
|218628|2108380|31685325|2015-04-25 00:00:00|
|231535| 837110|18827716|2015-04-25 00:00:00|
|282306|1654210|29363673|2015-04-25 00:00:00|
+------+-------+--------+-------------------+
only showing top 5 rows

