In [1]:
# Common imports
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import os

# Plotting imports and settings
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (13,8)

In [2]:
# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = "."

imgpath = os.path.join(PROJECT_ROOT_DIR, "images")
if not os.path.exists(imgpath):
    os.makedirs(imgpath)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [3]:
# Volume Sold & Time
# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"
import random

# Set the random seed to ensure reproducibility
random.seed(42)
# IowaSales_Copy_Filled
np.random.seed(42)
tf.random.set_seed(42)

## Load

In [4]:
%%time
IowaSales = pd.read_csv("IowaSalesCleaned.csv", low_memory=False)
IowaSales.shape

CPU times: total: 36.6 s
Wall time: 48.4 s


(24592365, 21)

In [5]:
size = IowaSales.shape[0]
if(pd.options.display.max_info_rows <= size):
  pd.options.display.max_info_rows = size + 1
pd.options.display.max_rows = 500
print(pd.options.display.max_info_rows)

24592366


In [6]:
IowaSales['Date'] = pd.to_datetime(IowaSales['Date'])
IowaSales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24592365 entries, 0 to 24592364
Data columns (total 21 columns):
 #   Column                 Non-Null Count     Dtype         
---  ------                 --------------     -----         
 0   Date                   24592365 non-null  datetime64[ns]
 1   Store Number           24592365 non-null  int64         
 2   City                   24592365 non-null  object        
 3   Zip Code               24592365 non-null  int64         
 4   County Number          24592365 non-null  float64       
 5   Category               24592365 non-null  float64       
 6   Category Name          24592365 non-null  object        
 7   Vendor Number          24592365 non-null  float64       
 8   Item Number            24592365 non-null  int64         
 9   Pack                   24592365 non-null  int64         
 10  Bottle Volume (ml)     24592365 non-null  int64         
 11  State Bottle Cost      24592365 non-null  float64       
 12  State Bottle

In [7]:
IowaSales.head()

Unnamed: 0,Date,Store Number,City,Zip Code,County Number,Category,Category Name,Vendor Number,Item Number,Pack,...,State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons),Longtidute,Latitude,Month,Pack Volume (ml)
0,2017-10-16,2569,CEDAR RAPIDS,52402,57.0,1012200.0,WHISKY,55.0,8828,6,...,10.85,16.28,1,16.77,1.75,0.46,-91.630677,42.027805,10,10500
1,2017-10-10,4167,DUBUQUE,52001,31.0,1062500.0,RUM,65.0,44557,12,...,6.83,10.25,1,10.25,0.75,0.2,-90.668138,42.504959,10,9000
2,2017-10-13,4147,CEDAR FALLS,50613,7.0,1012100.0,WHISKY,260.0,10791,12,...,15.59,23.39,12,280.68,9.0,2.38,-92.474689,42.539076,10,9000
3,2017-10-11,2556,ESTHERVILLE,51334,32.0,1062400.0,RUM,259.0,43028,6,...,11.55,17.33,6,103.98,10.5,2.77,-94.829962,43.402096,10,10500
4,2017-10-13,3629,COUNCIL BLUFFS,51503,78.0,1062400.0,RUM,259.0,43028,6,...,11.55,17.33,12,207.96,21.0,5.55,-95.848696,41.226557,10,10500


In [8]:
# One Hot encoder - converting 
IowaSales_Copy = pd.get_dummies(IowaSales, columns=["Category Name"], prefix_sep='_', drop_first=True)
IowaSales_Copy = pd.get_dummies(IowaSales_Copy, columns=["County Number"], prefix_sep='_', drop_first=True)


In [9]:
IowaSales_Reduced = IowaSales_Copy.drop(
    ['City', 'Category', 'Zip Code', 'Vendor Number', 'Store Number', 'Item Number', 'Volume Sold (Gallons)', 'State Bottle Retail'], 
    axis=1)

In [11]:
from sklearn.model_selection import train_test_split
IowaSales_Main = IowaSales_Copy.copy()
# y_Sale = IowaSales_Main[['Sale (Dollars)']].values
# X_Sale = IowaSales_Main.drop(['Sale (Dollars)', 'City'], axis=1).values
# X_Sale_train, X_Sale_test, y_Sale_train, y_Sale_test = train_test_split(X_Sale, y_Sale)

## Model

In [12]:
IowaSales_Reduced.head()

Unnamed: 0,Date,Pack,Bottle Volume (ml),State Bottle Cost,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Longtidute,Latitude,Month,...,County Number_90.0,County Number_91.0,County Number_92.0,County Number_93.0,County Number_94.0,County Number_95.0,County Number_96.0,County Number_97.0,County Number_98.0,County Number_99.0
0,2017-10-16,6,1750,10.85,1,16.77,1.75,-91.630677,42.027805,10,...,0,0,0,0,0,0,0,0,0,0
1,2017-10-10,12,750,6.83,1,10.25,0.75,-90.668138,42.504959,10,...,0,0,0,0,0,0,0,0,0,0
2,2017-10-13,12,750,15.59,12,280.68,9.0,-92.474689,42.539076,10,...,0,0,0,0,0,0,0,0,0,0
3,2017-10-11,6,1750,11.55,6,103.98,10.5,-94.829962,43.402096,10,...,0,0,0,0,0,0,0,0,0,0
4,2017-10-13,6,1750,11.55,12,207.96,21.0,-95.848696,41.226557,10,...,0,0,0,0,0,0,0,0,0,0


In [13]:
import dask.dataframe as dd

# Convert your Pandas DataFrame to a Dask DataFrame
IowaSales_Reduced_dd = dd.from_pandas(IowaSales_Reduced, npartitions=50)


# Set 'Date' as the index
IowaSales_Reduced_dd = IowaSales_Reduced_dd.set_index('Date')

# Perform the resampling and aggregation using Dask
IowaSales_Resampled_dd = IowaSales_Reduced_dd.resample('W-Mon').sum().reset_index().compute().sort_values(by='Date')

# The result is already a Pandas DataFrame since we called compute()
IowaSales_Resampled = IowaSales_Resampled_dd



In [14]:


# Read data using Dask (adjust the blocksize according to your system's memory)
IowaSales_Reduced_dd = dd.from_pandas(IowaSales_Reduced, npartitions= 50)

# Set the index to 'Date' column
IowaSales_Reduced_dd = IowaSales_Reduced_dd.set_index('Date')

# Resample and sum the data
IowaSales_Resampled_ByDay = IowaSales_Reduced_dd.resample('1D').sum()

# Compute the result (this step will execute the computation)
IowaSales_Resampled_ByDay = IowaSales_Resampled_ByDay.compute()

# Reset the index and sort by 'Date'
IowaSales_Resampled_ByDay = IowaSales_Resampled_ByDay.reset_index().sort_values(by='Date')


In [15]:
# generate the rolling of total sale and smooth the curve.
window = 14
rolling_sum = IowaSales_Resampled_ByDay["Sale (Dollars)"].rolling(window).sum()
rolling_avg = (rolling_sum / window)[window-1:]
IowaSales_Resampled_WithRollAvg = IowaSales_Resampled_ByDay[window:]
IowaSales_Resampled_WithRollAvg["Sale Days (Dollars)"] = rolling_avg[:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  IowaSales_Resampled_WithRollAvg["Sale Days (Dollars)"] = rolling_avg[:]


In [16]:
IowaSales_Resampled.shape, IowaSales_Resampled_ByDay.shape, IowaSales_Resampled_WithRollAvg.shape


((587, 128), (4106, 128), (4092, 129))

In [17]:
IowaSales_Resampled_WithRollAvg.head(8)

Unnamed: 0,Date,Pack,Bottle Volume (ml),State Bottle Cost,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Longtidute,Latitude,Month,...,County Number_91.0,County Number_92.0,County Number_93.0,County Number_94.0,County Number_95.0,County Number_96.0,County Number_97.0,County Number_98.0,County Number_99.0,Sale Days (Dollars)
14,2012-01-17,101632,8061000,73525.4,68091,842912.95,66453.73,-775001.835562,349581.720541,8354,...,97,9,0,0,70,0,0,0,47,485706.536429
15,2012-01-18,86669,6937728,63562.57,64226,811725.39,64060.86,-671871.801508,301044.385524,7178,...,17,139,0,389,0,0,160,0,75,484431.14
16,2012-01-19,100719,7037878,67726.6,79211,947073.96,71123.01,-721799.339014,326499.45828,7815,...,52,0,0,0,0,0,0,0,0,490269.047857
17,2012-01-20,0,0,0.0,0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,490269.047857
18,2012-01-21,0,0,0.0,0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,490269.047857
19,2012-01-22,0,0,0.0,0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,490269.047857
20,2012-01-23,90501,7458477,70034.17,75757,941721.27,70113.54,-716596.718882,324572.582565,7670,...,0,0,59,29,29,156,744,32,29,493717.139286
21,2012-01-24,101899,8371852,76714.94,75534,949654.05,75030.12,-796900.850213,359587.97426,8589,...,73,8,0,0,81,0,0,0,46,501644.032857


In [18]:
# Remove dummy data - whole row with 0
IowaSales_Cleaned = IowaSales_Resampled_WithRollAvg.drop(['Sale (Dollars)'], axis=1)
IowaSales_Cleaned.shape

(4092, 128)

In [19]:
IowaSales_Cleaned.to_csv("DATAFINAL.csv")