In [1]:
from functools import reduce
from glob import glob

In [2]:
# import pyspark.pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Start a Spark session (or reuse the one already running in this kernel),
# requesting 22 GB of memory for the driver and for each executor.
spark = SparkSession.builder.config(
    "spark.driver.memory", "22g"
).config(
    "spark.executor.memory", "22g"
).getOrCreate()

In [3]:
# Load the target table (app_id, product, flag) and keep a 10k-row sample.
# The schema is declared up front instead of using inferSchema=True, which
# makes Spark read the CSV an extra time just to guess the column types.
# NOTE: limit() is applied without an explicit ordering, so which 10 000 rows
# are kept is not guaranteed to be the same on every run.
train = spark.read.csv(
    "./alphabattle2.0/alfabattle2_train_target.csv",
    header=True,
    schema="app_id INT, product INT, flag INT",
).limit(10_000)
train.show(3, truncate=50)

+------+-------+----+
|app_id|product|flag|
+------+-------+----+
|     0|      3|   0|
|     1|      1|   0|
|     2|      1|   0|
+------+-------+----+
only showing top 3 rows



In [4]:
# Number of rows in the `train` sample (expected to match the limit applied on load).
train.count()

10000

In [5]:
# Smallest application id present in the sample.
train.agg(F.min("app_id")).show()

+-----------+
|min(app_id)|
+-----------+
|          0|
+-----------+



In [6]:
# Largest application id present in the sample.
train.agg(F.max("app_id")).show()

+-----------+
|max(app_id)|
+-----------+
|      10372|
+-----------+



In [7]:
# List the distinct values taken by the target column.
train.select(F.col("flag")).dropDuplicates().show()

+----+
|flag|
+----+
|   0|
|   1|
+----+



In [8]:
# Collect every transaction part file. glob() returns matches in arbitrary,
# filesystem-dependent order, so sort them to get the same list on every run.
transaction_data_list = sorted(
    glob("./alphabattle2.0/alfabattle2_train_transactions_contest/train_transactions_contest/*.parquet")
)
transaction_data_list

['./alphabattle2.0/alfabattle2_train_transactions_contest/train_transactions_contest\\part_000_0_to_23646.parquet',
 './alphabattle2.0/alfabattle2_train_transactions_contest/train_transactions_contest\\part_001_23647_to_47415.parquet',
 './alphabattle2.0/alfabattle2_train_transactions_contest/train_transactions_contest\\part_002_47416_to_70092.parquet',
 './alphabattle2.0/alfabattle2_train_transactions_contest/train_transactions_contest\\part_003_70093_to_92989.parquet',
 './alphabattle2.0/alfabattle2_train_transactions_contest/train_transactions_contest\\part_004_92990_to_115175.parquet',
 './alphabattle2.0/alfabattle2_train_transactions_contest/train_transactions_contest\\part_005_115176_to_138067.parquet',
 './alphabattle2.0/alfabattle2_train_transactions_contest/train_transactions_contest\\part_006_138068_to_159724.parquet',
 './alphabattle2.0/alfabattle2_train_transactions_contest/train_transactions_contest\\part_007_159725_to_180735.parquet',
 './alphabattle2.0/alfabattle2_train_

In [14]:
# Read all transaction part files into a single DataFrame and peek at it.
transaction_data = spark.read.format("parquet").load(transaction_data_list)
transaction_data.show(n=3, truncate=50)

+------+------------------+--------+--------------+---------+--------------+--------------------+--------------+--------------+-----------+---+-------+----+------------+-----------+----+-----------+----------+---------+------------------+-----------------+
|app_id|              amnt|currency|operation_kind|card_type|operation_type|operation_type_group|ecommerce_flag|payment_system|income_flag|mcc|country|city|mcc_category|day_of_week|hour|days_before|weekofyear|hour_diff|transaction_number|__index_level_0__|
+------+------------------+--------+--------------+---------+--------------+--------------------+--------------+--------------+-----------+---+-------+----+------------+-----------+----+-----------+----------+---------+------------------+-----------------+
|823300|0.2975890792412704|       1|             4|        2|             4|                   2|             1|             3|          2|  2|      1|  93|           2|          2|  10|        240|        48|       -1|          

In [15]:
# Print the column names, types and nullability Spark read from the parquet files.
transaction_data.printSchema()

root
 |-- app_id: integer (nullable = true)
 |-- amnt: double (nullable = true)
 |-- currency: integer (nullable = true)
 |-- operation_kind: integer (nullable = true)
 |-- card_type: integer (nullable = true)
 |-- operation_type: integer (nullable = true)
 |-- operation_type_group: integer (nullable = true)
 |-- ecommerce_flag: integer (nullable = true)
 |-- payment_system: integer (nullable = true)
 |-- income_flag: integer (nullable = true)
 |-- mcc: integer (nullable = true)
 |-- country: integer (nullable = true)
 |-- city: integer (nullable = true)
 |-- mcc_category: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- days_before: integer (nullable = true)
 |-- weekofyear: integer (nullable = true)
 |-- hour_diff: long (nullable = true)
 |-- transaction_number: integer (nullable = true)
 |-- __index_level_0__: long (nullable = true)



In [16]:
# Preview a few transactions as a pandas frame (nicer rendering than show()).
# Only the rows that are displayed are collected: limit(500) followed by
# head() would ship 500 rows to the driver just to show 5 of them.
transaction_data.limit(5).toPandas()

Unnamed: 0,app_id,amnt,currency,operation_kind,card_type,operation_type,operation_type_group,ecommerce_flag,payment_system,income_flag,...,country,city,mcc_category,day_of_week,hour,days_before,weekofyear,hour_diff,transaction_number,__index_level_0__
0,823300,0.297589,1,4,2,4,2,1,3,2,...,1,93,2,2,10,240,48,-1,1,4280508
1,823300,0.313568,1,1,2,1,1,1,1,1,...,1,93,1,2,10,240,48,0,2,4280509
2,823300,0.305395,1,1,2,1,1,1,1,1,...,1,2,3,6,15,238,46,53,3,4280510
3,823300,0.302912,1,1,2,1,1,1,1,1,...,1,93,1,4,15,236,46,48,4,4280511
4,823300,0.17345,1,1,2,1,1,1,1,1,...,1,93,7,3,15,235,46,24,5,4280512


In [17]:
# Keep only the transactions whose app_id appears in the `train` sample.
# The inner join on app_id also appends train's `product` and `flag` columns.
sample_transaction_data = transaction_data.join(train, on="app_id", how="inner")
sample_transaction_data.columns

['app_id',
 'amnt',
 'currency',
 'operation_kind',
 'card_type',
 'operation_type',
 'operation_type_group',
 'ecommerce_flag',
 'payment_system',
 'income_flag',
 'mcc',
 'country',
 'city',
 'mcc_category',
 'day_of_week',
 'hour',
 'days_before',
 'weekofyear',
 'hour_diff',
 'transaction_number',
 '__index_level_0__',
 'product',
 'flag']

In [18]:
# Preview a few joined rows as a pandas frame. Only the rows that are
# displayed are collected: limit(500) followed by head() would pull 500
# joined rows to the driver just to show 5 of them.
sample_transaction_data.limit(5).toPandas()

Unnamed: 0,app_id,amnt,currency,operation_kind,card_type,operation_type,operation_type_group,ecommerce_flag,payment_system,income_flag,...,mcc_category,day_of_week,hour,days_before,weekofyear,hour_diff,transaction_number,__index_level_0__,product,flag
0,0,0.465425,1,4,98,4,2,3,7,3,...,2,4,19,351,34,-1,1,0,3,0
1,0,0.0,1,2,98,7,1,3,7,3,...,2,4,20,351,34,0,2,1,3,0
2,0,0.521152,1,2,98,3,1,3,7,3,...,2,4,20,351,34,0,3,2,3,0
3,0,0.356078,1,1,5,2,1,3,7,3,...,7,2,0,348,34,52,4,3,3,0
4,0,0.0,1,2,98,7,1,3,7,3,...,2,4,16,337,53,280,5,4,3,0


In [32]:
# Distinct target values that survive the join with the transactions.
# NOTE(review): the saved output of this cell shows a single value, 0.5, while
# `flag` is an integer 0/1 column everywhere else in this notebook, and the
# execution counter jumps ahead here. The output is presumably stale or relies
# on state from cells that are no longer present; re-run from a fresh kernel
# (Restart & Run All) before trusting it.
sample_transaction_data.select("flag").dropDuplicates().show(n=5)

+----+
|flag|
+----+
| 0.5|
+----+

