In [43]:
from xml.etree.ElementTree import fromstring
from pyspark.sql import SparkSession
from Preprocessing.data_manipulation import DataPreparation
from Transformers.data_aggregation import AggregateData
from Transformers.impute_mean import ImputePrice
from Transformers.negative_sales import NegativeSales
from Transformers.logtransformer import Log
from pyspark.ml import Pipeline
from Transformers.scalar_na_filler import ScallerNAFiller
from Transformers.lagtransformer import Lags
from Transformers.test_train_split import Split
from pyspark.ml.feature import VectorAssembler, StringIndexer
from Evaluator.Mape import MAPE
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from Estimator.random_forest import RandomForest
from Evaluator.Mape import MAPE
from Estimator.XGBoost import XGBoost

In [44]:
# Create (or reuse) the Spark session for this notebook.
# The former `if __name__ == "__main__":` guard was removed: inside a notebook
# kernel it is always true, so it only made `spark` conditionally defined.
spark = (
    SparkSession.builder
    .appName("project_spark")
    .master("local[*]")  # local mode, one worker thread per available core
    # NOTE(review): driver memory is presumably only honoured when this call
    # actually starts the JVM, not when an existing session is reused - confirm.
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [45]:
# Load the raw data through the project's DataPreparation helper.
# Naming convention: the `df_` prefix marks a DataFrame.
data = DataPreparation()
df_m5 = data.get_data()

In [46]:
# Restrict the data to store "WI_1" (the preview below shows only that store_id)
# and print the first five rows.
# NOTE: df_m5 is rebound here, so the unfiltered frame is only recoverable by
# re-running the load cell.
df_m5 = data.filter_store(df_m5, "WI_1")
df_m5.show(5)

+--------+-----------+--------+--------------------+-------+------+--------+-----+----------+---------+----+-----+----+-------------+------------+------------+------------+-------+-------+-------+----------+
|store_id|    item_id|wm_yr_wk|                  id|dept_id|cat_id|state_id|sales|      date|  weekday|wday|month|year| event_name_1|event_type_1|event_name_2|event_type_2|snap_CA|snap_TX|snap_WI|sell_price|
+--------+-----------+--------+--------------------+-------+------+--------+-----+----------+---------+----+-----+----+-------------+------------+------------+------------+-------+-------+-------+----------+
|    WI_1|FOODS_1_001|   11507|FOODS_1_001_WI_1_...|FOODS_1| FOODS|      WI|    0|2015-03-14| Saturday|   1|    3|2015|         null|        null|        null|        null|      0|      0|      1|      2.24|
|    WI_1|FOODS_1_001|   11507|FOODS_1_001_WI_1_...|FOODS_1| FOODS|      WI|    0|2015-03-15|   Sunday|   2|    3|2015|         null|        null|        null|        n

In [47]:
# List the column names available after the store filter.
df_m5.columns

['store_id',
 'item_id',
 'wm_yr_wk',
 'id',
 'dept_id',
 'cat_id',
 'state_id',
 'sales',
 'date',
 'weekday',
 'wday',
 'month',
 'year',
 'event_name_1',
 'event_type_1',
 'event_name_2',
 'event_type_2',
 'snap_CA',
 'snap_TX',
 'snap_WI',
 'sell_price']

# Instantiating the Transformers #

In [48]:
# Project transformers used as the first pipeline stages.
# NOTE(review): judging by the names, these presumably clean up prices and
# negative sales values - confirm in the Transformers package.
imputeNegativePrice = ImputePrice()
negativeSales = NegativeSales(column="sales")

# Aggregation stage: `columns` lists the key columns, `expressions` maps each
# measure column to the name of the aggregate function applied to it.
aggregate = AggregateData(
    columns=["store_id", "dept_id", "year", "month"],
    expressions={
        "sales": "sum",
        "sell_price": "avg",
        "snap_WI": "sum",
    },
)

In [49]:
# Project Log transformer configured on the "sales" and "sell_price" columns.
# NOTE(review): presumably replaces both columns with their logarithm (the sales
# values printed further down are in the 8-10 range) - confirm base and
# zero handling in Transformers.logtransformer.
log_transform = Log(inputCols=["sales","sell_price"])

In [50]:
# Lag features of the target: the schema printed later contains lag_1, lag_2
# and lag_3, one per entry in `lags`. Lags are configured per
# (store_id, dept_id) partition, ordered by (year, month).
lag_feature_transform = Lags(
    lags=[1, 2, 3],
    target="sales",
    partitionBy=["store_id", "dept_id"],
    orderBy=["year", "month"],
)

In [51]:
# Project transformer created with its default settings.
# NOTE(review): the name suggests it fills nulls with a scalar value (e.g. the
# empty leading lag rows) - confirm in Transformers.scalar_na_filler.
na_filler = ScallerNAFiller()

In [52]:
# Encode store_id and year as numeric index columns for the feature vector.
# StringIndexer orders labels by descending frequency by default, so year_index
# is not guaranteed to follow chronological order.
# NOTE(review): the data was filtered to the single store "WI_1" earlier, so
# store_id_index is constant here - confirm it is still wanted as a feature.
storeIndexer = StringIndexer(inputCol="store_id", outputCol="store_id_index")
yearIndexer = StringIndexer(inputCol="year", outputCol="year_index")

In [53]:
# Feature columns, in the order they are packed into the "features" vector.
# The same list is passed by name to the XGBoost cell further down.
inputColumns = [
    # calendar / price / promotion inputs
    'month',
    'sell_price',
    'snap_WI',
    # lagged target values
    'lag_1',
    'lag_2',
    'lag_3',
    # indexed categorical inputs
    'store_id_index',
    'year_index',
]

# Combine the listed columns into one vector column named "features".
assembler = VectorAssembler(inputCols=inputColumns, outputCol="features")

# Complete Preprocessing / Feature-Engineering Pipeline #

In [54]:
# Chain all preprocessing / feature-engineering stages, fit the pipeline on the
# filtered frame and apply it to that same frame.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so
# `transformed` was not produced in the recorded run.
transformed = (
    Pipeline(
        stages=[
            imputeNegativePrice,
            negativeSales,
            aggregate,
            log_transform,
            lag_feature_transform,
            storeIndexer,
            yearIndexer,
            na_filler,
            assembler,
        ]
    )
    .fit(df_m5)
    .transform(df_m5)
)

KeyboardInterrupt: 

# Random Forest Training #

In [None]:
# Second DataPreparation instance; in the visible cells it is used only for its
# train_test_split method.
# NOTE(review): `data` from the load cell is an instance of the same class, so
# this extra instance may be unnecessary - confirm.
spliting = DataPreparation()

In [None]:
# Split the transformed frame into a training and a test frame, passing 2016 as
# the split argument.
# NOTE(review): presumably a year cutoff - check DataPreparation.train_test_split
# for which side the year 2016 itself falls on.
train_df, test_df = spliting.train_test_split(transformed, 2016)

In [None]:
# Fit the project's RandomForest wrapper on the training frame, using the
# assembled "features" vector and "sales" as the label.
# NOTE(review): the following cells call .columns / .select on rfModel and pass
# it straight to the evaluator, so fit() appears to return a DataFrame of
# predictions rather than a model. The recorded rows are from year 2015 -
# confirm whether these are training-set predictions, since the XGBoost section
# is scored on test_df.
rfModel = RandomForest(featuresCol="features", labelCol="sales").fit(train_df)

In [None]:
# Column names of the Random Forest result; the recorded output includes the
# added "prediction" column.
rfModel.columns

['store_id',
 'dept_id',
 'year',
 'month',
 'sell_price',
 'snap_WI',
 'sales',
 'lag_1',
 'lag_2',
 'lag_3',
 'store_id_index',
 'year_index',
 'features',
 'prediction']

In [None]:
# Preview actual sales next to the predicted value for ten rows.
rfModel.select(["store_id","year","sales","prediction"]).show(10)

+--------+----+-----------------+-----------------+
|store_id|year|            sales|       prediction|
+--------+----+-----------------+-----------------+
|    WI_1|2015|8.448485993406447|8.299308821829937|
|    WI_1|2015|8.445052513638554|8.486747116006214|
|    WI_1|2015|8.589699882202986|8.499727021409225|
|    WI_1|2015| 8.46695197497949|8.499727021409225|
|    WI_1|2015|8.495765244002618|8.499727021409225|
|    WI_1|2015|8.473868066677865|8.682446834072161|
|    WI_1|2015|8.536799721055155|8.499727021409225|
|    WI_1|2015|8.518991573357617|8.499727021409225|
|    WI_1|2015|8.377011160816375|8.499727021409225|
|    WI_1|2015|8.399085102935908| 8.50781892320288|
+--------+----+-----------------+-----------------+
only showing top 10 rows



In [None]:
# Project MAPE evaluator comparing the "prediction" column with the "sales" label.
# NOTE(review): "sales" passed through the Log stage of the preprocessing
# pipeline, so the score is presumably computed on the transformed scale, not on
# raw unit sales - confirm that this is intended.
Evaluator_mape = MAPE(predictionCol="prediction", labelCol="sales")

In [None]:
# Score the Random Forest result with the MAPE evaluator.
evaluation = Evaluator_mape.evaluate(rfModel)

In [None]:
# Report the Random Forest evaluation score.
print("Evaluation matrix Score for RFMODEL:", evaluation)

Evaluation matrix Score for RFMODEL: 0.01341233405185784


# XGBOOST ESTIMATOR IMPLEMENTATION #

In [None]:
# Print the column names and types of the training frame before fitting XGBoost.
train_df.printSchema()

root
 |-- store_id: string (nullable = true)
 |-- dept_id: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- sell_price: double (nullable = false)
 |-- snap_WI: long (nullable = true)
 |-- sales: double (nullable = false)
 |-- lag_1: double (nullable = false)
 |-- lag_2: double (nullable = false)
 |-- lag_3: double (nullable = false)
 |-- store_id_index: double (nullable = false)
 |-- year_index: double (nullable = false)
 |-- features: vector (nullable = true)



In [None]:
# Fit the project's XGBoost wrapper on the training frame with "sales" as label.
# Unlike the Random Forest cell, it receives the individual feature column names
# (inputColumns) rather than the assembled "features" vector.
XGBoostModel = XGBoost(inputCols=inputColumns, labelCol="sales").fit(train_df)

In [None]:
# Generate XGBoost predictions for the test frame.
pred = XGBoostModel.transform(test_df)

  for column, series in pdf.iteritems():


In [None]:
# Preview the predictions; the recorded output has the columns
# store, year, month, prediction and actual.
pred.show()

+-----+----+-----+------------------+------------------+
|store|year|month|        prediction|            actual|
+-----+----+-----+------------------+------------------+
|  0.0| 5.0|    1|  9.59622573852539| 9.607033787697222|
|  0.0| 5.0|    2| 9.203985214233398| 9.190647738630446|
|  0.0| 5.0|    3| 9.421991348266602| 9.763363044441961|
|  0.0| 5.0|    4| 9.182626724243164| 9.285726098882073|
|  0.0| 5.0|    1| 8.666375160217285| 9.727525729694754|
|  0.0| 5.0|    2| 7.343536853790283| 8.441607204459642|
|  0.0| 5.0|    3| 8.536872863769531| 9.754697478950966|
|  0.0| 5.0|    4| 8.854217529296875| 8.931155429778348|
|  0.0| 5.0|    1|  7.92768669128418| 8.516793111394898|
|  0.0| 5.0|    2| 8.364577293395996| 9.826822467600643|
|  0.0| 5.0|    3| 9.656170845031738| 9.353747835270912|
|  0.0| 5.0|    4| 7.531565189361572|6.7226297948554485|
|  0.0| 5.0|    1|10.656360626220703| 9.920393832602487|
|  0.0| 5.0|    2|11.167980194091797| 10.82017820443161|
|  0.0| 5.0|    3|10.1623306274

In [None]:
# MAPE evaluator for the XGBoost output, whose label column is named "actual"
# instead of "sales".
Evaluator_mape_xgBoost = MAPE(predictionCol="prediction", labelCol="actual")

In [None]:
# Score the XGBoost predictions on the test frame.
evaluation_xgb = Evaluator_mape_xgBoost.evaluate(pred)

In [None]:
# Report the XGBoost evaluation score.
print("Evaluation matrix Score for XGFMODEL:", evaluation_xgb)

Evaluation matrix Score for XGFMODEL: 0.13241363950334317
