In [1]:
# Build a one-column DataFrame holding the integers 0..999 in a column
# called "number", then echo it so the notebook displays its schema.
myrange = spark.range(0, 1000).toDF("number")
myrange

DataFrame[number: bigint]

In [2]:
# Print the first 20 rows (the default row count of show()).
myrange.show(n=20)

                                                                                

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
|    10|
|    11|
|    12|
|    13|
|    14|
|    15|
|    16|
|    17|
|    18|
|    19|
+------+
only showing top 20 rows



In [2]:
# Read every per-day CSV file as one static DataFrame. The first line of each
# file is treated as a header and column types are inferred from the data.
staticDataFrame = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("./bydata/by-day/*.csv")
)

                                                                                

# 임시테이블 생성

In [4]:
# Register the static DataFrame as the temporary view "retail_data",
# replacing any existing view of that name.
staticDataFrame.createOrReplaceTempView(name="retail_data")

# 데이터프레임의 스키마 구조를 복사

In [5]:
# Keep the schema of the static DataFrame (inferred while reading the CSV
# files) so it can be reused; the streaming reader below is given this schema.
staticSchema = staticDataFrame.schema

# 정적인 데이터를 쿼리

In [6]:
from pyspark.sql.functions import window, column, desc, col

# Batch query over the static data: compute the line total for each row,
# group by customer and by one-day window of InvoiceDate, sum the totals,
# and print five rows of the result.
(
    staticDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
    .show(5)
)



+----------+--------------------+-----------------+
|CustomerId|              window|  sum(total_cost)|
+----------+--------------------+-----------------+
|   16057.0|{2011-12-05 09:00...|            -37.6|
|   14126.0|{2011-11-29 09:00...|643.6300000000001|
|   13500.0|{2011-11-16 09:00...|497.9700000000001|
|   17160.0|{2011-11-08 09:00...|516.8499999999999|
|   15608.0|{2011-11-11 09:00...|            122.4|
+----------+--------------------+-----------------+
only showing top 5 rows



                                                                                

# 실시간 처리를 위해서 스트리밍 기술을 사용
# 특정폴더의 데이터를 수집

In [7]:
# Streaming counterpart of the static read: same files and same schema, but
# consumed incrementally. maxFilesPerTrigger=1 caps each trigger at one file.
streamingDataFrame = (
    spark.readStream
    .format("csv")
    .schema(staticSchema)
    .option("header", "true")
    .option("maxFilesPerTrigger", 1)
    .load("./bydata/by-day/*.csv")
)

                                                                                

# 수집한 데이터를 고객별·1일 단위로 집계 - 테스트 (실제 출력은 이후 셀에서 수행)

In [8]:
# Define the same customer/window aggregation on the streaming DataFrame.
# Nothing runs here; this only builds the query that a later writeStream starts.
# NOTE: despite the "PerHour" name, the window below is one day wide.
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
)

# 실시간 처리를 위해서 읽고 있는 데이터를 메모리에 실시간으로 적재

In [12]:
# Start the streaming aggregation and keep its results in an in-memory table
# named "customer_purchases" that spark.sql can query.
#
# A query name must be unique among the active queries, so re-running this
# cell while an earlier run is still active would raise. Stop any active
# query with this name first so the cell can be re-executed safely.
for activeQuery in spark.streams.active:
    if activeQuery.name == "customer_purchases":
        activeQuery.stop()

# Keep the StreamingQuery handle: without it the stream keeps running in the
# background with no convenient way to inspect or stop it
# (call customerPurchasesQuery.stop() when finished).
customerPurchasesQuery = (
    purchaseByCustomerPerHour.writeStream
    .format("memory")                 # sink: in-memory table
    .queryName("customer_purchases")  # name of that table
    .outputMode("complete")           # write the full aggregate on each trigger
    .start()
)
customerPurchasesQuery

24/03/20 14:25:17 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-b63e6ae0-3dfe-4697-b83e-fa0cd7ce139b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/03/20 14:25:17 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x7f8dec9c25b0>

                                                                                

# 메모리에 적재된 데이터를 읽어옴

In [14]:
# Query the in-memory table fed by the streaming query and print the five
# rows with the largest summed total cost.
(
    spark.sql("""
  SELECT *
  FROM customer_purchases
  ORDER BY `sum(total_cost)` DESC
  """)
    .show(n=5)
)



+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   18102.0|{2010-12-07 09:00...|          25920.37|
|      NULL|{2010-12-06 09:00...|23395.099999999904|
|      NULL|{2010-12-03 09:00...| 23021.99999999999|
|      NULL|{2010-12-01 09:00...|12584.299999999988|
|   15061.0|{2010-12-02 09:00...| 9407.339999999998|
+----------+--------------------+------------------+
only showing top 5 rows





# 스파크를 이용한 전처리 방법
### 결측치(NA)를 0으로 채움
### 피처 발굴: InvoiceDate를 기준으로 요일을 full name으로 추출해서 day_of_week 컬럼에 저장
### 날짜와 같은 연속형 데이터에서 범주형 데이터를 새롭게 추출

In [15]:
from pyspark.sql.functions import date_format, col

# Preprocessing:
#   1. replace nulls with 0 (a numeric fill value applies to numeric columns),
#   2. derive "day_of_week" as the full weekday name ("EEEE") of InvoiceDate,
#   3. reduce the result to at most 5 partitions.
preppedDataFrame = (
    staticDataFrame
    .fillna(0)
    .withColumn("day_of_week", date_format(col("InvoiceDate"), "EEEE"))
    .coalesce(5)
)



In [17]:
# Print the first 20 rows (the default) of the preprocessed data.
preppedDataFrame.show(n=20)



+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|day_of_week|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|     Monday|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|     Monday|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|     Monday|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|     Monday|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom|     Monday|
|   580538|    21544|SKULLS  WATER TRA..



## 스파크에서 지원하는 머신러닝을 위한 훈련 데이터와 검증 데이터 만들기

In [18]:
# Split by time: rows before 2011-07-01 are used for training,
# rows on or after that date for testing.
trainDataFrame = preppedDataFrame.filter("InvoiceDate < '2011-07-01'")
testDataFrame = preppedDataFrame.filter("InvoiceDate >= '2011-07-01'")



In [20]:
# Print the first three training rows.
trainDataFrame.show(n=3)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|day_of_week|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|   537226|    22811|SET OF 6 T-LIGHTS...|       6|2010-12-06 08:34:00|     2.95|   15987.0|United Kingdom|     Monday|
|   537226|    21713|CITRONELLA CANDLE...|       8|2010-12-06 08:34:00|      2.1|   15987.0|United Kingdom|     Monday|
|   537226|    22927|GREEN GIANT GARDE...|       2|2010-12-06 08:34:00|     5.95|   15987.0|United Kingdom|     Monday|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
only showing top 3 rows





In [23]:
!pip install numpy



Collecting numpy




  Downloading numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[K     |▋                               | 358 kB 885 kB/s eta 0:00:21



[K     |█▉                              | 1.0 MB 885 kB/s eta 0:00:20



[K     |███▎                            | 1.9 MB 885 kB/s eta 0:00:19



[K     |████▎                           | 2.5 MB 1.1 MB/s eta 0:00:15



[K     |█████▎                          | 3.0 MB 1.1 MB/s eta 0:00:14



[K     |███████▌                        | 4.3 MB 6.1 MB/s eta 0:00:03



[K     |████████▊                       | 5.0 MB 6.1 MB/s eta 0:00:03



[K     |███████████▍                    | 6.5 MB 6.1 MB/s eta 0:00:02

                                                                                

[K     |██████████████▉                 | 8.5 MB 333 kB/s eta 0:00:30



[K     |█████████████████▋              | 10.1 MB 333 kB/s eta 0:00:25



[K     |██████████████████▏             | 10.3 MB 1.0 MB/s eta 0:00:08



[K     |██████████████████▎             | 10.4 MB 1.0 MB/s eta 0:00:08



[K     |██████████████████▋             | 10.6 MB 1.0 MB/s eta 0:00:08



[K     |██████████████████▊             | 10.7 MB 238 kB/s eta 0:00:32



[K     |███████████████████             | 10.8 MB 238 kB/s eta 0:00:32



[K     |███████████████████▎            | 11.0 MB 126 kB/s eta 0:00:58



[K     |███████████████████▍            | 11.0 MB 126 kB/s eta 0:00:57



[K     |███████████████████▉            | 11.3 MB 683 kB/s eta 0:00:11



[K     |████████████████████            | 11.4 MB 683 kB/s eta 0:00:11

                                                                                

[K     |█████████████████████           | 12.0 MB 118 kB/s eta 0:00:53



[K     |██████████████████████          | 12.5 MB 1.8 MB/s eta 0:00:04



[K     |██████████████████████▍         | 12.8 MB 1.8 MB/s eta 0:00:04



[K     |██████████████████████▊         | 13.0 MB 1.8 MB/s eta 0:00:03



[K     |█████████████████████████▌      | 14.5 MB 5.1 MB/s eta 0:00:01



[K     |████████████████████████████▌   | 16.3 MB 5.1 MB/s eta 0:00:01



[K     |████████████████████████████████| 18.2 MB 5.1 MB/s eta 0:00:01



[K     |████████████████████████████████| 18.2 MB 5.1 MB/s 




[?25hInstalling collected packages: numpy




Successfully installed numpy-1.26.4


                                                                                

In [26]:
# 범주형 변수를 숫자형으로 바꾸는 작업 - 높낮이 또는 크기가 존재하는 범주형 데이터
# label encoding
# 2XL,XL,L,M,S  ---> 1,2,3,4,5
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer().setInputCol("day_of_week").setOutputCol("day_of_week_index")



In [27]:
# One-hot encoding
# A way to remove any implied relationship between the category values
# e.g. apple, strawberry, banana
#   -> 001, 010, 100



In [28]:
# One-hot encode the numeric index produced by the StringIndexer stage.
from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder(inputCol='day_of_week_index', outputCol="day_of_week_encoded")



In [35]:
# Combine the unit price and the one-hot encoded weekday into a single
# "features" vector column.
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(
    inputCols=["UnitPrice","day_of_week_encoded"],
    outputCol="features",
)

                                                                                

In [36]:
# Build a pipeline so the stages configured above are applied in order:
# index the weekday -> one-hot encode it -> assemble the feature vector.
from pyspark.ml import Pipeline
transformationPipeLine = Pipeline(stages=[indexer, encoder, vectorAssembler])



In [37]:
# Run the pipeline's fit step on the training data; the fitted pipeline is
# what gets used for transforming data afterwards.
fittedPipeLine = transformationPipeLine.fit(dataset=trainDataFrame)

                                                                                

In [38]:
# Transform: apply the fitted pipeline to the training data, adding the
# index, encoded and features columns.
trainsformedTraning = fittedPipeLine.transform(dataset=trainDataFrame)

24/03/20 15:10:25 WARN FileStreamSource: Listed 305 file(s) in 2440 ms          
24/03/20 15:10:27 WARN FileStreamSource: Listed 305 file(s) in 2772 ms          
24/03/20 15:10:30 WARN FileStreamSource: Listed 305 file(s) in 2236 ms          
                                                                                

In [39]:
# Print the first 20 rows (the default) of the transformed training data.
trainsformedTraning.show(n=20)

                                                                                

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+-----------------+-------------------+--------------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|day_of_week|day_of_week_index|day_of_week_encoded|            features|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+-----------------+-------------------+--------------------+
|   537226|    22811|SET OF 6 T-LIGHTS...|       6|2010-12-06 08:34:00|     2.95|   15987.0|United Kingdom|     Monday|              2.0|      (5,[2],[1.0])|(6,[0,3],[2.95,1.0])|
|   537226|    21713|CITRONELLA CANDLE...|       8|2010-12-06 08:34:00|      2.1|   15987.0|United Kingdom|     Monday|              2.0|      (5,[2],[1.0])| (6,[0,3],[2.1,1.0])|
|   537226|    22927|GREEN GIANT GARDE...|       2|2010-12-06 08:34:00|     5.95|   15987.0|United Kingdo

                                                                                