In [2]:
# Glob pattern for the CSV input files; it is handed to the reader's load() call below.
filepath = 'bydata/by-day/*.csv'

In [3]:
# Read every CSV matched by `filepath` (header row present, column types inferred)
# and reduce the result to 5 partitions.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load(filepath)
    .coalesce(5)
)
# Cache the data so the repeated queries below run faster.
df.cache()
# Register the DataFrame under the name 'dfTable' for SQL access.
df.createOrReplaceTempView('dfTable')

                                                                                

In [5]:
# Display the schema Spark inferred for the loaded CSV data (inferSchema was enabled on read).
df.schema

StructType([StructField('InvoiceNo', StringType(), True), StructField('StockCode', StringType(), True), StructField('Description', StringType(), True), StructField('Quantity', IntegerType(), True), StructField('InvoiceDate', TimestampType(), True), StructField('UnitPrice', DoubleType(), True), StructField('CustomerID', DoubleType(), True), StructField('Country', StringType(), True)])

In [6]:
from pyspark.sql.functions import *

In [9]:
# Count of StockCode values over the whole DataFrame, reported as 'cnt'.
df.agg(count('StockCode').alias('cnt')).show()

+------+
|   cnt|
+------+
|541909|
+------+



In [12]:
# Approximate number of distinct StockCode values.
# rsd is the maximum relative standard deviation allowed for the estimate.
df.select(approx_count_distinct('StockCode', rsd=0.2)).show()

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            2944|
+--------------------------------+



In [13]:
# First and last StockCode values as encountered in the DataFrame.
# NOTE(review): no ordering is applied here, so the result presumably depends on
# the physical row order — confirm before relying on these values.
df.agg(first('StockCode'), last('StockCode')).show()

+----------------+---------------+
|first(StockCode)|last(StockCode)|
+----------------+---------------+
|           23084|          22168|
+----------------+---------------+



In [17]:
# Smallest and largest Quantity values.
# min/max here are the Spark column functions brought in by the wildcard import.
df.agg(min('Quantity'), max('Quantity')).show()

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



In [18]:
# Total of the Quantity column (sum is the Spark column function, not the builtin).
df.agg(sum('Quantity')).show()

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



In [19]:
# Sum of the distinct Quantity values.
# Uses sum_distinct: the camelCase sumDistinct alias is deprecated since Spark 3.2
# and emits a warning when called.
df.select(sum_distinct('Quantity')).show()



+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



In [20]:
# Sum of the distinct Quantity values.
df.agg(sum_distinct('Quantity')).show()

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



In [21]:
# Quantity summary in a single pass: count, sum, avg, mean, and median.
# avg and mean are both requested, under separate aliases.
df.agg(
    count('Quantity').alias('quantity_total'),
    sum('Quantity').alias('quantity_sum'),
    avg('Quantity').alias('quantity_avg'),
    mean('Quantity').alias('quantity_mean'),
    median('Quantity').alias('quantity_median'),
).show()

+--------------+------------+----------------+----------------+---------------+
|quantity_total|quantity_sum|    quantity_avg|   quantity_mean|quantity_median|
+--------------+------------+----------------+----------------+---------------+
|        541909|     5176450|9.55224954743324|9.55224954743324|            3.0|
+--------------+------------+----------------+----------------+---------------+



In [24]:
# corr: correlation between two variables, ranging from -1 to 1;
# the closer to -1 or 1, the stronger the correlation.
# Caution: correlation does not imply causation.
# NOTE(review): InvoiceNo is a string column in the inferred schema — presumably it is
# cast implicitly for this computation; confirm how non-numeric values are treated.
df.agg(corr('InvoiceNo', 'Quantity')).show()

+-------------------------+
|corr(InvoiceNo, Quantity)|
+-------------------------+
|     4.912186085617365E-4|
+-------------------------+



In [25]:
# covar_pop, covar_samp
# Covariance: a positive value means the two variables increase or decrease together;
# a negative value means one decreases as the other increases.


In [31]:
# collect_set, collect_list: gather the Country values into a single array column each.
df.agg(collect_set('Country'), collect_list('Country')).show()

+--------------------+---------------------+
|collect_set(Country)|collect_list(Country)|
+--------------------+---------------------+
|[Portugal, Italy,...| [United Kingdom, ...|
+--------------------+---------------------+



In [32]:
# List the DataFrame's column names.
df.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

In [37]:
# Group by InvoiceNo and count Quantity per invoice, written two ways:
# with the count() column function and with a SQL expression string.
(
    df.groupBy('InvoiceNo')
    .agg(count('Quantity'), expr('count(Quantity)'))
    .show(3)
)

+---------+---------------+---------------+
|InvoiceNo|count(Quantity)|count(Quantity)|
+---------+---------------+---------------+
|   574966|              8|              8|
|   575091|             38|             38|
|   578057|             28|             28|
+---------+---------------+---------------+
only showing top 3 rows



In [57]:
# Peek at the raw InvoiceDate values.
df.select('InvoiceDate').show(2)
# Goal: keep only the year-month-day part.
# to_date(value, formatstring)
# NOTE(review): InvoiceDate is already a timestamp column per the inferred schema, so the
# format string may have no effect here — TODO confirm (the author flagged this call to be checked).
df.select(to_date(col('InvoiceDate'), "yyyy-MM-dd H:mm")).show(2)  # TODO: verify this call

+-------------------+
|        InvoiceDate|
+-------------------+
|2011-12-05 08:38:00|
|2011-12-05 08:38:00|
+-------------------+
only showing top 2 rows

+-------------------------------------+
|to_date(InvoiceDate, yyyy-MM-dd H:mm)|
+-------------------------------------+
|                           2011-12-05|
|                           2011-12-05|
+-------------------------------------+
only showing top 2 rows



In [64]:
# Derive a 'date' column from the InvoiceDate timestamp and preview the result.
dfWithDate = df.withColumn('date', to_date(col('InvoiceDate')))
dfWithDate.show(2)

+---------+---------+-------------------+--------+-------------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|        Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|      date|
+---------+---------+-------------------+--------+-------------------+---------+----------+--------------+----------+
|   580538|    23084| RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|2011-12-05|
|   580538|    23077|DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|2011-12-05|
+---------+---------+-------------------+--------+-------------------+---------+----------+--------------+----------+
only showing top 2 rows



In [58]:
# Sorting time-series-like data within each partition

In [60]:
# List the DataFrame's column names.
df.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

In [65]:
from pyspark.sql.window import Window
# Window over each (CustomerID, date) group, ordered by Quantity descending,
# with a row frame running from the start of the partition to the current row.
windowSpec = (
    Window
    .partitionBy('CustomerID', 'date')
    .orderBy(desc('Quantity'))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [66]:
# Ranking expressions evaluated over windowSpec; they are attached to a DataFrame later.
rank_data = rank().over(windowSpec)
dense_rank_data = dense_rank().over(windowSpec)

In [69]:
# List dfWithDate's column names: the original columns plus the derived 'date'.
dfWithDate.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country',
 'date']

In [71]:
# Show each customer's purchases per day with their rank and dense rank by Quantity.
# The sort is applied AFTER the window projection: a sort placed before it is not
# guaranteed to survive the repartitioning the window functions require.
# Explicit sort keys (customer, day, quantity descending) make the displayed order deterministic.
(
    dfWithDate
    .where('CustomerID IS NOT NULL')
    .select(
        'CustomerID', 'date', 'Quantity',
        rank_data.alias('rank_data'),
        dense_rank_data.alias('dense_rank_data'),
    )
    .orderBy('CustomerID', 'date', desc('Quantity'))
    .show()
)



+----------+----------+--------+---------+---------------+
|CustomerID|      date|Quantity|rank_data|dense_rank_data|
+----------+----------+--------+---------+---------------+
|   12346.0|2011-01-18|   74215|        1|              1|
|   12346.0|2011-01-18|  -74215|        2|              2|
|   12347.0|2010-12-07|      36|        1|              1|
|   12347.0|2010-12-07|      30|        2|              2|
|   12347.0|2010-12-07|      24|        3|              3|
|   12347.0|2010-12-07|      12|        4|              4|
|   12347.0|2010-12-07|      12|        4|              4|
|   12347.0|2010-12-07|      12|        4|              4|
|   12347.0|2010-12-07|      12|        4|              4|
|   12347.0|2010-12-07|      12|        4|              4|
|   12347.0|2010-12-07|      12|        4|              4|
|   12347.0|2010-12-07|      12|        4|              4|
|   12347.0|2010-12-07|      12|        4|              4|
|   12347.0|2010-12-07|      12|        4|              