In [1]:
# # Prepare
# prepare.py

# Time Series Data Acquisition Exercises
import pandas as pd
from acquire import acquire_data


# # Data Dictionary:
# 
# - date - Date of the sale data. There are no holiday effects or store closures.
#  
# - store_address, store_id, store_city, store_state, store_zipcode
#  
# - item_brand, item_id, item_name, item_price, item_upc12, item_upc14
#  
# - sales.item: item id in the transaction
#  
# - sale_amount: Number of items sold at a particular store on a particular date.
#  
# - sale_date: Date of the transaction
#  
# - sale_id: ID of the sale of that item of that transaction.
#  
# - sales.store: store where the sale took place
#  
# - /stores[/{store_id}]
#  
# - /items[/{item_id}]
#  
# - /sales[/{sale_id}]


# - Write a function to convert a date to a datetime data type.
# class datetime.datetime(year, month, day[, hour[, minute[, second[, microsecond[, tzinfo]]]]])
def parse_sales_date(df):
    """Return a copy of ``df`` with ``sale_date`` parsed into UTC datetimes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``sale_date`` column of strings laid out as
        ``'%a, %d %b %Y %H:%M:%S %Z'``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.

    Notes
    -----
    ``errors='raise'`` means a value that does not match the format aborts
    the conversion instead of being coerced to ``NaT``.
    """
    df = df.copy()
    # The ``box=True`` keyword that used to be passed here was removed from
    # ``pd.to_datetime`` in pandas 1.0 and makes the call raise ``TypeError``
    # on current versions, so it is no longer supplied.
    df['sale_date'] = pd.to_datetime(
        arg=df['sale_date'],
        dayfirst=True,
        utc=True,  # normalise every timestamp to UTC
        exact=True,
        format='%a, %d %b %Y %H:%M:%S %Z',
        cache=True,
        errors='raise',
    )
    return df


# - Write a function to change a datetime to UTC. 
#     - done by passing utc=True to pd.to_datetime in parse_sales_date

# - Write a function to parse a date column into 6 additional columns (while keeping the original date): year, quarter, month, day of month, day of week, weekend vs. weekday

def add_date_parts(df):
    """Derive calendar columns from ``sale_date`` and index the result by it.

    Added columns, in order: ``year``, ``quarter``, ``month``, ``day``,
    ``hour``, ``dayofweek``, ``weekday`` (first three letters of the day
    name) and ``is_weekend`` (True when ``dayofweek`` is 5 or 6).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``sale_date`` column is already datetime-typed (the
        ``.dt`` accessor is used on it).

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns and ``sale_date`` as its index;
        the input frame is not modified.
    """
    stamps = df.sale_date
    enriched = df.assign(
        year=stamps.dt.year,
        quarter=stamps.dt.quarter,
        month=stamps.dt.month,
        day=stamps.dt.day,
        hour=stamps.dt.hour,
        dayofweek=stamps.dt.dayofweek,
        weekday=stamps.dt.day_name().str[:3],
        # dayofweek counts 0..6, so 5 and 6 are the only values that
        # integer-divide by 5 to 1.
        is_weekend=stamps.dt.dayofweek >= 5,
    )
    return enriched.set_index('sale_date')


# - Add a column to your dataframe, sale_total, which is derived from sale_amount (total items) and item_price.
def improve_sales_data(df):
    """Rename ``sale_amount`` to ``quantity`` and append a ``sale_total`` column.

    ``sale_total`` is ``quantity * item_price``, computed row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``sale_amount`` and ``item_price`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    renamed = df.rename(columns={'sale_amount': 'quantity'})
    return renamed.assign(sale_total=renamed['quantity'] * renamed['item_price'])


# # - Create a new dataframe that aggregates the sales_total and sale_amount(item count) using sum and median by day of week.
# def aggregate_by_weekday(df):
#     df = df.copy()
#     by_dayofweek = pd.DataFrame()
#     by_dayofweek['quantity_sum'] = df.groupby(['weekday']).quantity.sum()
#     by_dayofweek['item_cnt_sum'] = df.groupby(['weekday']).sale_total.sum()
#     by_dayofweek['quantity_median'] = df.groupby(['weekday']).quantity.median()
#     by_dayofweek['item_cnt_median'] = df.groupby(['weekday']).sale_total.median()
#     return by_dayofweek



# - Explore the pandas DataFrame.diff() function. Create a new column that is the result of the current sales - the previous day's sales.
def add_sales_difference(df):
    """Append ``diff_from_last_day``: each ``sale_total`` minus the row above it.

    The difference is taken between consecutive rows in the frame's current
    order, with no grouping, so the first row is NaN.
    NOTE(review): if the frame interleaves several stores or items, this is
    not a per-store/per-item day-over-day change -- confirm that is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``sale_total`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    row_delta = df['sale_total'].diff()
    return df.assign(diff_from_last_day=row_delta)


# - Write a function to set the index to be the datetime variable.
#     - done in the return of add_date_parts
def prep_store_data(df):
    """Run the full preparation pipeline on a raw sales frame.

    Steps, in order: ``parse_sales_date``, ``add_date_parts``,
    ``improve_sales_data``, ``add_sales_difference``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame accepted by ``parse_sales_date``.

    Returns
    -------
    pandas.DataFrame
        The prepared frame; the input frame is not modified.
    """
    pipeline = (
        parse_sales_date,
        add_date_parts,
        improve_sales_data,
        add_sales_difference,
    )
    prepared = df.copy()
    for step in pipeline:
        prepared = step(prepared)
    return prepared

In [4]:
if __name__ == '__main__':
    # Acquire the data and pass it straight through the preparation pipeline.
    df = prep_store_data(acquire_data())

Reading  items  from local csv
Reading  stores  from local csv
Reading  sales  from local csv
items:  (50, 6)
stores:  (10, 5)
sales:  (913000, 5)


In [5]:
# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0_level_0,item_id,quantity,sale_id,store_id,item_brand,item_name,item_price,item_upc12,item_upc14,store_address,...,year,quarter,month,day,hour,dayofweek,weekday,is_weekend,sale_total,diff_from_last_day
sale_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01 00:00:00+00:00,1,13.0,1,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,...,2013,1,1,1,0,1,Tue,False,10.92,
2013-01-02 00:00:00+00:00,1,11.0,2,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,...,2013,1,1,2,0,2,Wed,False,9.24,-1.68
2013-01-03 00:00:00+00:00,1,14.0,3,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,...,2013,1,1,3,0,3,Thu,False,11.76,2.52
2013-01-04 00:00:00+00:00,1,13.0,4,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,...,2013,1,1,4,0,4,Fri,False,10.92,-0.84
2013-01-05 00:00:00+00:00,1,10.0,5,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,...,2013,1,1,5,0,5,Sat,True,8.4,-2.52
