In [7]:
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.sql import functions as F
from pyspark.ml import Pipeline


# Create (or reuse) a Spark session running against the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("ecommercehw")
    .getOrCreate()
)

# Load the raw event log, treating the first CSV row as the header.
# No schema is supplied or inferred here.
eCommerce = spark.read.option('header', 'true').csv('eCommerce.csv')

## Step 1. Get needed columns and split data into train and test

In [63]:
# Derive a `category` column from `category_code` by splitting on '.' and
# keeping the first piece.
# F.split takes a regex pattern, so the dot must be escaped. A raw string keeps
# the backslash literal; the non-raw '\.' is an invalid Python escape sequence
# that only works through a deprecated fallback and triggers a warning.
split_col = F.split(eCommerce['category_code'], r'\.')
eCommerce = eCommerce.withColumn('category', split_col.getItem(0))

# Keep only the columns used downstream; `event_type` is exposed as `label`.
data = eCommerce.selectExpr('category', 'brand', 'price', 'event_type as label')

In [81]:
# Class-balance check: count the rows for each distinct label value.
label_counts = data.groupBy('label').count()
label_counts.show()

+--------+-------+
|   label|  count|
+--------+-------+
|purchase| 181846|
|    view|8316245|
|    cart| 266914|
+--------+-------+



## One-hot encode the categorical columns

In [70]:
# Categorical feature columns to encode, plus the derived output column names.
cat_cols = ['category', 'brand']
index_cols = [name + '_index' for name in cat_cols]
onehot_cols = [name + '_onehot' for name in cat_cols]

# Stage 1: write a numeric index for each input column into `<col>_index`.
# NOTE(review): handleInvalid is left at its default; confirm the input
# columns contain no nulls before transforming the full dataset.
string_indexer = StringIndexer(inputCols=cat_cols, outputCols=index_cols)

# Stage 2: encode each `<col>_index` into a `<col>_onehot` vector column.
onehot_encoder = OneHotEncoder(inputCols=index_cols, outputCols=onehot_cols)


In [73]:
# Chain indexing and one-hot encoding so they are fitted and applied together.
encoding_stages = [string_indexer, onehot_encoder]
pipeline = Pipeline(stages=encoding_stages)

# Fit the pipeline stages on `data`.
pipeline_model = pipeline.fit(data)

In [75]:
# Apply the fitted pipeline and preview the first five encoded rows.
encoded_preview = pipeline_model.transform(data)
encoded_preview.show(5)

+-----------+-------+------+-----+--------------+-----------+---------------+------------------+
|   category|  brand| price|label|category_index|brand_index|category_onehot|      brand_onehot|
+-----------+-------+------+-----+--------------+-----------+---------------+------------------+
|electronics| huawei|270.42| view|           0.0|        3.0| (12,[0],[1.0])|  (1675,[3],[1.0])|
|electronics|  yasin|359.08| view|           0.0|       65.0| (12,[0],[1.0])| (1675,[65],[1.0])|
| appliances|almacom|180.16| view|           1.0|      116.0| (12,[1],[1.0])|(1675,[116],[1.0])|
|    apparel|respect| 44.79| view|           3.0|       11.0| (12,[3],[1.0])| (1675,[11],[1.0])|
|electronics|samsung|150.95| view|           0.0|        0.0| (12,[0],[1.0])|  (1675,[0],[1.0])|
+-----------+-------+------+-----+--------------+-----------+---------------+------------------+
only showing top 5 rows

