In [1]:
import warnings

In [2]:
import pandas as pd

In [3]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/pyspark deprecation and copy warnings.
warnings.simplefilter('ignore')

In [5]:
# Store-level columns shared by the class A and class B views of the sheet.
_STORE_COLS = ['store_code', 'store_cn_name', 'store_en_name', 'dept']


def _class_schedule(frame, class_code):
    """Return the store rows of ``frame`` with one class's order/delivery columns.

    The ``<class_code>-order_day`` and ``<class_code>-delivery_day`` columns are
    renamed to ``order_type`` and ``deliver_type``, and a ``class`` column
    holding ``class_code`` is appended. A new frame is returned; ``frame`` is
    not modified.
    """
    order_col = class_code + '-order_day'
    delivery_col = class_code + '-delivery_day'
    return (
        frame[_STORE_COLS + [order_col, delivery_col, 'remarks']]
        .rename(columns={order_col: 'order_type', delivery_col: 'deliver_type'})
        # 'class' is a Python keyword, so it has to be passed through a dict.
        .assign(**{'class': class_code})
    )


# Read the 'ordinary' sheet with every cell as text; empty cells become "".
data_or = pd.read_excel('store.xlsx', 'ordinary', header=0, dtype=str).fillna("")
# Replace the sheet's own headers with fixed names (the sheet must have 9 columns).
data_or.columns = _STORE_COLS + ['A-order_day', 'A-delivery_day',
                                 'B-order_day', 'B-delivery_day', 'remarks']

# One frame per class, built without assigning into a slice of data_or.
data_or_A = _class_schedule(data_or, 'A')
data_or_B = _class_schedule(data_or, 'B')

# Stack the two classes into a single long frame with a fresh 0..n-1 index.
data_or_f = pd.concat([data_or_A, data_or_B], ignore_index=True)

mapping_or = pd.read_excel('order_deliver_mapping.xlsx', 'ordinary', header=0, dtype=str)

# Inner join: rows whose (class, order_type, deliver_type) combination has no
# entry in the mapping sheet are dropped from the result.
res_or = data_or_f.merge(mapping_or, how='inner',
                         on=['class', 'order_type', 'deliver_type'])

# The mapping sheet was read with dtype=str, so week_shift arrives as text.
res_or["week_shift"] = res_or["week_shift"].astype(int)

In [6]:
# Distinct (store_code, names, dept) combinations found in the joined result.
store_identity_cols = ['store_code', 'store_cn_name', 'store_en_name', 'dept']
stores = res_or.loc[:, store_identity_cols].drop_duplicates()

In [7]:
# Distinct (store_code, dept) pairs, with the dept string duplicated into a
# value column (dept1) so it survives being moved into the index.
store_dp = (
    res_or[['store_code', 'dept']]
    .drop_duplicates()
    .assign(dept1=lambda frame: frame['dept'])
    .set_index(['store_code', 'dept'])
)

# Split the comma-separated dept string into one row per piece:
#   stack()                    -> Series indexed by (store_code, dept, 'dept1')
#   str.split(',', expand=True) -> one column per comma-separated piece
#   stack()                    -> one row per piece (missing pieces dropped)
#   apply(pd.Series).stack()   -> adds one extra innermost index level
#   unstack(level=2)           -> moves the 'dept1' label back into the columns
#   reset_index(-1, drop=True) -> discards that extra innermost level
#   reset_index()              -> store_code, dept, piece number, piece value
store_dp = store_dp.stack().str.split(',', expand=True) \
    .stack().apply(pd.Series).stack() \
    .unstack(level=2).reset_index(-1, drop=True).reset_index()

store_dp.columns = ['store_code', 'dept', 'dummy', 'dept_code']

# Keep only the text before the first space of each piece. `n` is passed by
# keyword and the first token is selected explicitly, instead of assigning the
# multi-column result of expand=True to a single column.
store_dp['dept_code'] = store_dp['dept_code'].str.split(' ', n=1).str[0]

store_dp = store_dp[['store_code', 'dept', 'dept_code']].drop_duplicates()

In [8]:
import pyspark
import os
from os.path import expanduser, join, abspath
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import SparkSession

# Absolute path of ./spark-warehouse, used as the Spark SQL warehouse directory.
warehouse_location = abspath('spark-warehouse')
# Extra jar for the PySpark launcher; set before the session is created so the
# JVM is started with it.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--jars /data/jupyter/kudu-spark2_2.11-1.8.0.jar pyspark-shell'

# NOTE: despite its name, `sc` is a SparkSession (not a SparkContext); the name
# is kept because later cells refer to it.
sc = (
    SparkSession.builder
    .appName("process_store_file")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.exec.compress.output", 'false')
    .enableHiveSupport()
    .getOrCreate()
)

# HiveContext wraps a SparkContext, so pass the session's underlying context
# rather than the session object itself.
sqlc = HiveContext(sc.sparkContext)

In [9]:
# Convert the joined pandas frame to a Spark DataFrame and save it as a table,
# overwriting whatever the table held before.
onstock_df = sqlc.createDataFrame(res_or)
(
    onstock_df.write
    .mode("overwrite")
    .saveAsTable("vartefact.ordinary_onstock_order_deliver_mapping")
)

In [None]:
# Convert the distinct-store frame to a Spark DataFrame and save it as a table,
# overwriting whatever the table held before.
store_df = sqlc.createDataFrame(stores)
(
    store_df.write
    .mode("overwrite")
    .saveAsTable("vartefact.dm_stores")
)

In [None]:
# Give the Spark copy its own name instead of rebinding `store_dp`: the pandas
# frame stays available and re-running this cell does not feed a Spark
# DataFrame back into createDataFrame.
store_dp_df = sqlc.createDataFrame(store_dp)
store_dp_df.write.mode("overwrite").saveAsTable("vartefact.dm_stores_dept")

In [None]:
# Read the department table back as a quick check while the session is still
# alive, then shut the session down. Stopping first would make the read fail
# when the notebook is run top to bottom.
sc.table("vartefact.dm_stores_dept").show()
sc.stop()