Time Series Data Acquisition Exercises

In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import itertools

# JSON API
import requests
import json

# data visualization
import matplotlib
import seaborn as sns
import statsmodels.api as sm

%matplotlib inline

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

Data Dictionary:

- date - Date of the sale data. There are no holiday effects or store closures.

- store_address, store_id, store_city, store_state, store_zipcode

- item_brand, item_id, item_name, item_price, item_upc12, item_upc14

- sales.item: item id in the transaction

- sale_amount: Number of items sold at a particular store on a particular date.

- sale_date: Date of the transaction

- sale_id: ID of the sale of that item in that transaction.

- sales.store: store where the sale took place

- /stores[/{store_id}]

- /items[/{item_id}]

- /sales[/{sale_id}]

In [2]:
def get_data(category):
    """Return all records for one API category as a single DataFrame.

    If ``<category>.csv`` exists in the working directory it is read and
    returned without touching the network. Otherwise the paginated endpoint
    ``/api/v1/<category>`` is downloaded page by page, following the
    ``next_page`` link that each response payload carries.

    Parameters
    ----------
    category : str
        Endpoint name; it is also the payload key holding the records and the
        stem of the local cache file (e.g. ``'items'``, ``'stores'``, ``'sales'``).

    Returns
    -------
    pandas.DataFrame
        Every downloaded (or cached) row, with a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If any page request returns an error status.
    """
    cache_file = category + '.csv'
    if os.path.exists(cache_file):
        print('Reading ', category, ' from local csv')
        return pd.read_csv(cache_file)

    base_url = 'https://python.zach.lol'
    # this is the http request, the get function from requests
    response = requests.get(base_url + '/api/v1/' + category)
    # Fail loudly on an HTTP error instead of trying to parse an error body.
    response.raise_for_status()
    payload = response.json()['payload']

    print('Downloading data for ', category, '...')
    max_page = payload['max_page']
    print('max page = ', max_page)

    # Keep each page as its own frame and concatenate once at the end;
    # re-concatenating the growing frame on every page copies all prior rows.
    pages = [pd.DataFrame(payload[category])]
    print(len(pages), ' ', end='')

    # Stop at max_page, or earlier if the API stops advertising a next page.
    while len(pages) < max_page and payload.get('next_page'):
        response = requests.get(base_url + payload['next_page'])
        response.raise_for_status()
        payload = response.json()['payload']
        pages.append(pd.DataFrame(payload[category]))
        print(len(pages), ' ', end='')

    print()

    return pd.concat(pages, ignore_index=True)

In [3]:
def acquire_data():
    """Load items, stores and sales, cache them as CSV, and return them merged.

    Each table comes from ``get_data``. All three are written to
    ``<name>.csv`` in the working directory, then joined into one frame with
    one row per sale that matches an item and a store.

    Returns
    -------
    pandas.DataFrame
        Sales joined to items on ``item_id`` and to stores on ``store_id``.
    """
    items = get_data('items')
    stores = get_data('stores')
    sales = get_data('sales')

    # index=False keeps the row index out of the file, so reading the cache
    # back does not produce an extra "Unnamed: 0" column.
    for name, frame in (('items', items), ('stores', stores), ('sales', sales)):
        print(name + ': ', frame.shape)
        frame.to_csv(name + '.csv', index=False)

    # Align the sales foreign-key names with the key columns of the other tables.
    sales = sales.rename(columns={'store': 'store_id', 'item': 'item_id'})
    df = pd.merge(sales, items, on='item_id')
    df = pd.merge(df, stores, on='store_id')
    return df

In [4]:
# Build the merged sales/items/stores frame (also writes the three CSV caches).
df = acquire_data()
# Preview: drop every index label except the first and last. drop() returns a
# new frame, so df itself is left unchanged.
df.drop(df.index[1:-1])

Downloading data for  items ...
max page =  3
1  2  3  
Downloading data for  stores ...
max page =  1
1  
Downloading data for  sales ...
max page =  183
1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99  100  101  102  103  104  105  106  107  108  109  110  111  112  113  114  115  116  117  118  119  120  121  122  123  124  125  126  127  128  129  130  131  132  133  134  135  136  137  138  139  140  141  142  143  144  145  146  147  148  149  150  151  152  153  154  155  156  157  158  159  160  161  162  163  164  165  166  167  168  169  170  171  172  173  174  175  176  177  178  179  180  181  182  183  
items:  (50, 6)
stores:  (10, 5)
sale

Unnamed: 0,item,sale_amount,sale_date,sale_id,store_address,store_city,store_id,store_state,store_zipcode
0,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253
912999,50,82.0,"Sun, 31 Dec 2017 00:00:00 GMT",913000,8503 NW Military Hwy,San Antonio,10,TX,78231


In [4]:
# Re-run of the same cell; get_data reads the CSV files when they exist locally.
df = acquire_data()
# Preview: drop every index label except the first and last. drop() returns a
# new frame, so df itself is left unchanged.
df.drop(df.index[1:-1])

Reading  items  from local csv
Reading  stores  from local csv
Reading  sales  from local csv
items:  (50, 6)
stores:  (10, 5)
sales:  (913000, 5)


Unnamed: 0,item_id,sale_amount,sale_date,sale_id,store_id,item_brand,item_name,item_price,item_upc12,item_upc14,store_address,store_city,store_state,store_zipcode
0,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,1,Riceland,Riceland American Jazmine Rice,0.84,35200264013,35200264013,12125 Alamo Ranch Pkwy,San Antonio,TX,78253
912999,50,82.0,"Sun, 31 Dec 2017 00:00:00 GMT",913000,10,Choice,Choice Organic Teas Black Tea Classic Black - ...,5.2,47445919221,47445919221,8503 NW Military Hwy,San Antonio,TX,78231


Reading each csv back to confirm that the row index was not saved as an extra "Unnamed: 0" column...

In [5]:
# Read items.csv back from disk and show its first three rows.
read_df_items = pd.read_csv('items.csv')
read_df_items.head(3)

Unnamed: 0,item_brand,item_id,item_name,item_price,item_upc12,item_upc14
0,Riceland,1,Riceland American Jazmine Rice,0.84,35200264013,35200264013
1,Caress,2,Caress Velvet Bliss Ultra Silkening Beauty Bar...,6.44,11111065925,11111065925
2,Earths Best,3,Earths Best Organic Fruit Yogurt Smoothie Mixe...,2.43,23923330139,23923330139


In [6]:
# Read stores.csv back from disk and show its first three rows.
read_df_stores = pd.read_csv('stores.csv')
read_df_stores.head(3)

Unnamed: 0,store_address,store_city,store_id,store_state,store_zipcode
0,12125 Alamo Ranch Pkwy,San Antonio,1,TX,78253
1,9255 FM 471 West,San Antonio,2,TX,78251
2,2118 Fredericksburg Rdj,San Antonio,3,TX,78201


In [7]:
# Read sales.csv back from disk and show its first three rows.
read_df_sales = pd.read_csv('sales.csv')
read_df_sales.head(3)

Unnamed: 0,item,sale_amount,sale_date,sale_id,store
0,1,13.0,"Tue, 01 Jan 2013 00:00:00 GMT",1,1
1,1,11.0,"Wed, 02 Jan 2013 00:00:00 GMT",2,1
2,1,14.0,"Thu, 03 Jan 2013 00:00:00 GMT",3,1


The following are the original instructions and my work as I created it...

1. Using the code from the lesson as a guide, create a data frame named items that has all of the data for items.

In [None]:
# Request the first page of the items endpoint and decode the JSON body.
response = requests.get('https://python.zach.lol/api/v1/items')
data = response.json()
# The payload reports how many pages the endpoint has in total.
print('max page = ', data['payload']['max_page'])
# Only this first page is loaded here; later pages are not fetched in this cell.
items = pd.DataFrame(data['payload']['items'])
items

In [None]:
# Load all items through the reusable helper (local csv if present, otherwise the API).
items = get_data('items')
# items.drop(items.index[2:-2])
# items

2. Do the same thing, but for stores.

In [None]:
# Request the first page of the stores endpoint and decode the JSON body.
response = requests.get('https://python.zach.lol/api/v1/stores')
data = response.json()
# The payload reports how many pages the endpoint has in total.
print('max page = ', data['payload']['max_page'])
# Only this first page is loaded here; later pages are not fetched in this cell.
stores = pd.DataFrame(data['payload']['stores'])
stores

In [None]:
# Load all stores through the reusable helper (local csv if present, otherwise the API).
stores = get_data('stores')
# stores.drop(stores.index[2:-2])
# stores

3. Extract the data for sales.
    There are a lot of pages of data here, so your code will need to be a little more complex. Your code should continue fetching data from the next page until all of the data is extracted. There should be 913,000 rows.

In [None]:
# Fetch the first page of sales and decode it BEFORE reading max_page, so the
# value printed belongs to this response and not to an earlier cell's `data`.
base_url = 'https://python.zach.lol'
response = requests.get(base_url + '/api/v1/sales')
response.raise_for_status()
data = response.json()
print('max page = ', data['payload']['max_page'])

# Keep each page as its own frame and concatenate once at the end;
# re-concatenating the growing frame on every page copies all prior rows.
sales_pages = [pd.DataFrame(data['payload']['sales'])]

# Named remaining_pages rather than `max` so the builtin max() stays usable
# in the rest of the notebook.
remaining_pages = data['payload']['max_page'] - 1
print('max = ', remaining_pages)
for count in range(1, remaining_pages + 1):
    print(count, ' ', end='')
    # Each payload carries the relative URL of the page that follows it.
    response = requests.get(base_url + data['payload']['next_page'])
    response.raise_for_status()
    data = response.json()
    sales_pages.append(pd.DataFrame(data['payload']['sales']))
print()

sales = pd.concat(sales_pages, ignore_index=True)
# Show only the first three and last three rows.
print(sales.drop(sales.index[3:-3]))

In [None]:
# Load all sales through the reusable helper (local csv if present, otherwise the API).
sales = get_data('sales')
# Bare last expression: displays the frame.
sales
# sales.drop(sales.index[2:-2])

4. Save the data in your files to local csv files so that it will be faster to access in the future.

In [None]:
# Write each table to a local csv. index=False keeps the row index out of the
# file; without it, reading the file back yields an extra "Unnamed: 0" column.
print('items: ', items.shape)
items.to_csv('items.csv', index=False)
print('stores: ', stores.shape)
stores.to_csv('stores.csv', index=False)
print('sales: ', sales.shape)
sales.to_csv('sales.csv', index=False)

5. Combine the data from your three separate data frames into one large data frame.

In [None]:
# Attach item details to every sale. After the join, `item` duplicates
# `item_id`, so it is dropped. The drop is chained (not in-place) so re-running
# this cell always rebuilds df from `sales` and `items`.
df = (
    sales
    .merge(items, left_on='item', right_on='item_id')
    .drop(columns=['item'])
)

In [None]:
# Combine all three tables. Merging `sales` with `stores` alone would overwrite
# df and lose the item columns, so both joins are chained from `sales`. This
# also makes the cell safe to re-run regardless of what df held before.
df = (
    sales
    .merge(items, left_on='item', right_on='item_id')
    .merge(stores, left_on='store', right_on='store_id')
    .drop(columns=['item', 'store'])
)

In [None]:
# Preview: drop every index label except the first and last. drop() returns a
# new frame, so df itself is left unchanged.
df.drop(df.index[1:-1])

6. Make sure all the work that you have done above is reproducible. That is, you should put the code above into separate functions and be able to re-run the functions and get the same results.