In [1]:
# Load everything

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import os 
import findspark 
findspark.init()

# for sql
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col
from pyspark.sql.functions import sum,avg,max,count

# for time 
import time 
import datetime as dt

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType,TimestampType,LongType
import time
# Initialize Spark Session
spark = SparkSession \
    .builder \
    .appName("eCommerce") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# Define the schema, if you know it (optional but recommended for better performance)
schema = StructType([
    StructField("event_time", TimestampType(), True),
    StructField("event_type", StringType(), True),
    StructField("product_id", IntegerType(), True),
    StructField("category_id", LongType(), True),
    StructField("category_code", StringType(), True),
    StructField("brand", StringType(), True),
    StructField("price", DoubleType(), True),
    StructField("user_id", IntegerType(), True),
    StructField("user_session", StringType(), True)

    # Add other fields as necessary
])

# Read CSV files
# Read 3 files in same variable
spark_df = spark.read.csv(["../../../2019-Oct.csv","../../../2019-Nov.csv","../../../2019-Dec.csv"],schema=schema)

In [3]:
spark_df.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



### 2019's file have 177493624 筆資料

In [4]:
spark_df.count()

177493624

### use RDD

In [4]:
spark_rdd = spark.sparkContext.textFile("../../../2019-Oct.csv,../../../2019-Nov.csv,../../../2019-Dec.csv")

In [5]:
print("partition count:"+str(spark_rdd.getNumPartitions()))

partition count:717


In [7]:
#spark_rdd.cache()

../../../2019-Oct.csv,../../../2019-Nov.csv,../../../2019-Dec.csv MapPartitionsRDD[8] at textFile at <unknown>:0

In [None]:
spark_rdd.count()

177493624

In [6]:
from pyspark import StorageLevel

In [7]:
spark_rdd.persist(StorageLevel.MEMORY_AND_DISK)

../../../2019-Oct.csv,../../../2019-Nov.csv,../../../2019-Dec.csv MapPartitionsRDD[1] at textFile at <unknown>:0

In [8]:
spark_rdd.filter(lambda x: x.split(",")[1] == "purchase").count()

2821836

In [10]:
# spark_rdd selected only event_type = purchase
spark_rdd_purchase = spark_rdd.filter(lambda x: x.split(",")[1] == "purchase")

In [15]:
spark_rdd_purchase.count()

2821836

### re-partition

In [11]:
reparRdd = spark_rdd.repartition(16)
print("re-partition count:"+str(reparRdd.getNumPartitions()))

re-partition count:16


In [12]:
reparRdd.count()

177493624

### re-partition for purchase

In [34]:
reparRdd_purchase = spark_rdd_purchase.repartition(16)

print("re-partition count:"+str(reparRdd_purchase.getNumPartitions()))

re-partition count:16


In [28]:
reparRdd_purchase.count()

2821836

### not use repartition => coalesce

In [19]:
spark_rdd_purchase.cache()

PythonRDD[22] at RDD at PythonRDD.scala:53

In [11]:

coalesceRdd_purchase = spark_rdd_purchase.coalesce(200, shuffle = True)
print("coalesce count:"+str(coalesceRdd_purchase.getNumPartitions()))

coalesce count:200


In [12]:
coalesceRdd_purchase.count()

2821836