In [1]:
from xml.etree.ElementTree import fromstring
from pyspark.sql import SparkSession
# from Transformers import data_aggregation, impute_mean, lagtransformer, logtransformer, negative_sales, test_train_split,antilogtransformer
from Preprocessing.data_manipulation import DataPreparation
from Transformers.data_aggregation import AggregateData
from Transformers.impute_mean import ImputePrice
from Transformers.negative_sales import NegativeSales
from Transformers.logtransformer import Log
from pyspark.ml import Pipeline
from Transformers.scalar_na_filler import ScalarNAFiller
from Transformers.lagtransformer import Lags
from Transformers.test_train_split import Split

In [2]:
if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("project_spark") \
        .master("local[*]") \
        .config("spark.driver.memory", "8g")\
        .getOrCreate()

In [3]:
data = DataPreparation()
df = data.get_data()

In [None]:
df = data.filter_store(df, "WI_1")
df.show(5)

+--------+-----------+--------+--------------------+-------+------+--------+-----+----------+---------+----+-----+----+-------------+------------+------------+------------+-------+-------+-------+----------+
|store_id|    item_id|wm_yr_wk|                  id|dept_id|cat_id|state_id|sales|      date|  weekday|wday|month|year| event_name_1|event_type_1|event_name_2|event_type_2|snap_CA|snap_TX|snap_WI|sell_price|
+--------+-----------+--------+--------------------+-------+------+--------+-----+----------+---------+----+-----+----+-------------+------------+------------+------------+-------+-------+-------+----------+
|    WI_1|FOODS_1_001|   11507|FOODS_1_001_WI_1_...|FOODS_1| FOODS|      WI|    0|2015-03-14| Saturday|   1|    3|2015|         null|        null|        null|        null|      0|      0|      1|      2.24|
|    WI_1|FOODS_1_001|   11507|FOODS_1_001_WI_1_...|FOODS_1| FOODS|      WI|    0|2015-03-15|   Sunday|   2|    3|2015|         null|        null|        null|        n

In [None]:
df.columns

['store_id',
 'item_id',
 'wm_yr_wk',
 'id',
 'dept_id',
 'cat_id',
 'state_id',
 'sales',
 'date',
 'weekday',
 'wday',
 'month',
 'year',
 'event_name_1',
 'event_type_1',
 'event_name_2',
 'event_type_2',
 'snap_CA',
 'snap_TX',
 'snap_WI',
 'sell_price']

In [None]:
imputeNegativePrice = ImputePrice()
negativeSales = NegativeSales(column="sales")
aggregate = AggregateData(columns=["store_id", "dept_id", "year", "month","date"],
                                expressions={"sales": "sum",
                                "sell_price": "avg",
                                "event_name_1": "count",
                                "event_name_2": "count",
                                "snap_WI": "sum"})

In [None]:
log_transform = Log(inputCols=["sales","sell_price","event_name_1","snap_WI"])

In [None]:
null_values_transofrm = NegativeSales(column="event_name_1")

In [None]:
lag_feature_transform = Lags(lags=[1,2], target="sales", partitionBy=["dept_id","month"], orderBy=["store_id", "year"])

In [None]:
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
assembler = VectorAssembler(
    inputCols=["sell_price"],
    outputCol="features")

In [None]:
assembler.explainParams

<bound method Params.explainParams of VectorAssembler_04043151abe1>

In [None]:
test_train_transform = Split()

In [None]:
transformed = Pipeline(stages=[imputeNegativePrice, negativeSales, aggregate, log_transform, null_values_transofrm, lag_feature_transform]).fit(df).transform(df)

In [None]:
transformed.show()

+--------+-----------+----+-----+----------+------------------+------------------+------------------+------------------+------------+------------------+------------------+
|store_id|    dept_id|year|month|      date|           snap_WI|      event_name_1|             sales|        sell_price|event_name_2|             lag_1|             lag_2|
+--------+-----------+----+-----+----------+------------------+------------------+------------------+------------------+------------+------------------+------------------+
|    WI_1|HOUSEHOLD_1|2011|    7|2011-07-28|              null|              null| 5.484796933490655|1.6448471166443395|           0|              null|              null|
|    WI_1|HOUSEHOLD_1|2011|    7|2011-07-02|6.2766434893416445|              null|  5.71042701737487|1.6448471166443395|           0| 5.484796933490655|              null|
|    WI_1|HOUSEHOLD_1|2011|    7|2011-07-23|              null|              null| 5.968707559985366|1.6448471166443395|           0|  5.710

In [None]:
transformed.columns

['store_id',
 'dept_id',
 'year',
 'month',
 'date',
 'snap_WI',
 'event_name_1',
 'sales',
 'sell_price',
 'event_name_2',
 'lag_1',
 'lag_2']