In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import glob
import sys
import time
from copy import deepcopy
import ipywidgets as widgets
from ipywidgets import Layout
from IPython.display import display, clear_output

In [4]:
# Every indicator CSV shares this file-name prefix. Strip it by name rather than
# by a hard-coded character offset so the cell keeps working if the data
# directory is moved or renamed.
ind_prefix = "historical_country_United_States_indicator_"
ind_files = sorted(glob.glob("Data_Copy/Economic Indicators/*.csv"))
ind_names = []
for ind_path in ind_files:
    # Keep only the file name (handles both "/" and "\\" separators).
    ind_file_name = ind_path.replace("\\", "/").split("/")[-1]
    if ind_file_name.startswith(ind_prefix):
        ind_file_name = ind_file_name[len(ind_prefix):]
    ind_names.append(ind_file_name)
print("Choose one or multiple indicators.")
w = widgets.SelectMultiple(
    options=ind_names,
    description='Options:',
    disabled=False,  # the widget trait is `disabled`; `disable` was a typo
    layout=Layout(width='30%', height='300px')
)
display(w)

Choose one or multiple indicators.


SelectMultiple(description='Options:', layout=Layout(height='300px', width='30%'), options=('ADP_Employment_Ch…

In [5]:
# Rebuild the full CSV file names for whatever is currently selected in the widget.
start = "historical_country_United_States_indicator_"
ind_files_used = w.value
indicators = list(map(lambda chosen: start + str(chosen), ind_files_used))
indicators

['historical_country_United_States_indicator_Terms_of_Trade.csv',
 'historical_country_United_States_indicator_Terrorism_Index.csv',
 'historical_country_United_States_indicator_Total_Vehicle_Sales.csv',
 'historical_country_United_States_indicator_Tourism_Revenues.csv',
 'historical_country_United_States_indicator_Tourist_Arrivals.csv',
 'historical_country_United_States_indicator_Unemployed_Persons.csv',
 'historical_country_United_States_indicator_Unemployment_Rate.csv',
 'historical_country_United_States_indicator_Wage_Growth.csv']

# Exploratory Data Analysis (EDA)

The following cell lists every available economic-indicator file, prints its columns, and records the position of its `Value` column. These indicators are the candidate features for our models.

In [None]:
# For every indicator file, print its columns and record the positional index
# of its 'Value' column.
indicatorIdxs = []
for indicator_file in glob.glob("Data_Copy/Economic indicators/*"):
    print(indicator_file)
    # Only the header is needed, so skip parsing the data rows.
    indi_cols = list(pd.read_csv(indicator_file, nrows=0).columns)
    # Use a membership test: `.index('Value')` returns 0 (falsy) when 'Value'
    # is the first column, which previously shut the kernel down via sys.exit.
    if 'Value' not in indi_cols:
        raise ValueError(f"{indicator_file} has no 'Value' column; found {indi_cols}")
    indicatorIdxs.append(indi_cols.index('Value'))
    print(indi_cols, end="\n"+"-"*len(str(indi_cols))+"\n")
print("Num Indicators:", len(indicatorIdxs))

# Functions

In [None]:
# Three-letter upper-case month abbreviation -> 1-based month number.
months = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ("JAN", "FEB", "MAR", "APR", "MAY", "JUN",
         "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"),
        start=1,
    )
}

In [None]:
def add_dates(dataframe, startYear, endYear, startMonth, endMonth):
    """
    Add 'Year', 'Month' and placeholder 'Indicator' columns to a monthly dataframe.

    The rows are labelled with consecutive months, from (startYear, startMonth)
    to (endYear, endMonth) inclusive, in row order.

    inputs
    --
    dataframe (pd.DataFrame()) : one row per month; modified in place
    startYear (int) : starting year of the data (ex. 2009)
    endYear (int) : ending year of the data (ex. 2021)
    startMonth (int) : index of the month (1 is january, 12 is december)
    endMonth (int) :  index of the month (1 is january, 12 is december)


    outputs
    --
    the same dataframe with 'Year', 'Month' and 'Indicator' (all zeros) columns added

    raises
    --
    ValueError : if the end date precedes the start date, or the number of
                 months in the range differs from the number of rows
    """
    # Number every month on one axis (year * 12 + zero-based month) so that a
    # range starting and ending in the same year is handled like any other.
    firstPeriod = startYear * 12 + (startMonth - 1)
    lastPeriod = endYear * 12 + (endMonth - 1)
    if lastPeriod < firstPeriod:
        raise ValueError(
            f"End date {endYear}-{endMonth} is before start date {startYear}-{startMonth}"
        )
    periods = range(firstPeriod, lastPeriod + 1)
    if len(periods) != len(dataframe):
        raise ValueError(
            f"Date range covers {len(periods)} months but dataframe has {len(dataframe)} rows"
        )

    # Add the year, month, and empty indicator columns to the original dataset
    dataframe['Year'] = [period // 12 for period in periods]
    dataframe['Month'] = [period % 12 + 1 for period in periods]
    dataframe['Indicator'] = 0
    return dataframe

In [None]:
def create_model(numNodes, includeIndicator=True, activation='relu',
                 optimizer='adam', loss='mean_squared_error'):
    """
    Create Dense neural network model for evaluating each indicator's explanatory strength

    The network is Dense(5/3 * numNodes) -> Dropout(0.2) -> Dense(1/3 * numNodes)
    -> Dropout(0.2) -> Dense(1, linear). It takes 2 inputs, or 3 when
    includeIndicator is True.

    inputs
    --
    numNodes (int) : baseline number of nodes used for constructing the neural network layers
    includeIndicator (bool) : Used to mark whether a model is fitted with an indicator feature
    activation, optimizer, loss (string) : Tensorflow neural network parameters

    outputs
    --
    model (tf.keras.Model()) : compiled neural network model
    """
    # Layer widths must be integers. `numNodes*5/3` is a float, which only
    # worked through implicit truncation and is rejected by newer Keras.
    # Floor division gives the same widths for positive numNodes.
    firstWidth = int(numNodes * 5 // 3)
    secondWidth = int(numNodes // 3)
    numInputs = 2 + int(includeIndicator)

    model = Sequential()
    model.add(Dense(firstWidth, activation=activation, input_dim=numInputs))
    model.add(Dropout(0.2))
    model.add(Dense(secondWidth, activation=activation))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer=optimizer, loss=loss)
    return model

# TODO: add parameters for input size, num layers, etc.
def train_and_test(dataframe, trainYears, testYear, indexOfIndicator, 
                   indicatorData, numNodes, SalesColName, includeIndicator):
    """
    Train a neural network on the training years and score it on the held-out test year

    inputs
    --
    dataframe (pd.DataFrame()) : dataset used for inputs into model; must contain
                                 'Year', 'Month' and the SalesColName column (not modified)
    trainYears (list of int) : years whose rows are used to fit the model
    testYear (int) : year whose rows are held out and used to score the model
    indexOfIndicator (int) : positional column index of the indicator value in indicatorData
                             (ignored when includeIndicator is False)
    indicatorData : DataFrame of indicator values for each time period
                    (ignored when includeIndicator is False)
    numNodes (int) : baseline number of nodes used for constructing the neural network layers
    SalesColName (string) : the name of the sales column used by the dataframe parameter variable
    includeIndicator (bool) : Used to mark whether a model is fitted with an indicator feature

    outputs
    --
    trainStDev (float) : standard deviation of the training sales
    testRMSE (float) : root mean squared error of the predictions for testYear

    Both outputs are np.nan when there are no training or test rows, or when
    the RMSE cannot be computed (for example the predictions contain NaN).
    """
    # Work on a copy so repeated calls from a loop do not add or overwrite
    # columns on the caller's dataframe.
    frame = dataframe.copy()

    # not optimized for multiple indicators yet
    if includeIndicator:
        features = ['Sales', 'Year', 'Month', 'Indicator']
        # Assigned as a Series, so pandas aligns it with the sales rows by index
        # label. NOTE(review): confirm callers pass indicator rows whose index
        # lines up with the sales rows.
        frame['Indicator'] = indicatorData.iloc[:, indexOfIndicator]
    else:
        features = ['Sales', 'Year', 'Month']

    # Normalise the sales column name to 'Sales'
    frame['Sales'] = frame[SalesColName]
    if SalesColName != "Sales":
        frame = frame.drop(SalesColName, axis=1)

    # Partition rows into the training years and the held-out test year
    trainData = frame.loc[frame['Year'].isin(trainYears), features].reset_index(drop=True)
    testData = frame.loc[frame['Year'] == testYear, features].reset_index(drop=True)
    if trainData.empty or testData.empty:
        return np.nan, np.nan

    predictors = features[1:]
    XTrain, YTrain = trainData[predictors], trainData['Sales']
    XTest, YTest = testData[predictors], testData['Sales']

    # Create and Fit Model
    model = create_model(numNodes, includeIndicator)
    model.fit(XTrain, YTrain, epochs=500, batch_size=128, verbose=0) #change batch size to a variable

    # Score on the held-out year. The test-year rows used to be built but never
    # used: the RMSE came from a random 20% split of the training years.
    YPreds = model.predict(XTest)
    try:
        # np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn
        # releases no longer accept.
        testRMSE = float(np.sqrt(mean_squared_error(YTest, YPreds)))
    except ValueError:
        # e.g. NaN in the targets or predictions
        return np.nan, np.nan
    trainStDev = trainData['Sales'].std()
    return trainStDev, testRMSE

In [None]:
def run_pipeline(data, indicatorData, indexOfIndicator, salesColumn, trainYears, testYear, 
                 numNodes, startYear, endYear, startMonth, endMonth, includeIndicator):
    """
    Prepare a sales dataset (adding date columns if needed) and run train_and_test() on it

    inputs
    --
    data : sales data convertible to a pd.DataFrame(), containing salesColumn
    indicatorData, indexOfIndicator, trainYears, testYear, numNodes,
    includeIndicator : passed through to train_and_test()
    salesColumn (string) : name of the sales column in data
    startYear, endYear, startMonth, endMonth (int) : passed to add_dates() when
        data does not already have both 'Month' and 'Year' columns

    outputs
    --
    StDev, testRMSE : the two values returned by train_and_test()
    """
    dataNew = pd.DataFrame(data)
    # Generate the date columns if EITHER one is missing. The previous `and`
    # sent a frame with only one of them to the else branch, which raised a
    # KeyError.
    if "Month" not in dataNew or "Year" not in dataNew:
        # Copy so add_dates() writes to an independent frame, not a slice of `data`
        dataNew = dataNew[[salesColumn]].copy()
        # specify start year, end year, start month, end month
        dataNew = add_dates(dataNew, startYear, endYear, startMonth, endMonth)
    else:
        dataNew = dataNew[[salesColumn, "Month", "Year"]]
    # dataNew now holds the sales column plus Month and Year
    # (and an all-zero Indicator column when add_dates() was used)
    StDev, testRMSE = train_and_test(dataNew, trainYears, testYear,
                                     indexOfIndicator, indicatorData, 
                                     numNodes, salesColumn, includeIndicator)
    return StDev, testRMSE

### Feature engineering ideas
- Inverse years from the current year (years closer to the test year should be weighted more heavily)
- One-hot encoded month and possibly day

# Testing

## Costco Dataset

In [None]:
# Load the Costco monthly sales, replace the text 'Date' column with an integer
# 'Day' (the part before the first "-"), and drop incomplete rows.
Costco_Monthly = (
    pd.read_csv("Data_Copy/Costco_Monthly Sales from 2012 to 2021.csv", header=0)
    .assign(Day=lambda frame: frame.Date.apply(lambda text: int(text.split("-")[0])))
    .drop(["Date"], axis=1)
    .dropna()
)
Costco_Monthly

In [None]:
# --- Costco experiment settings ---
SalesColName = "Net Sales (billions)"   # sales column in Costco_Monthly
TestYears = [2018, 2019, 2020]          # each year is held out in turn
numNodes = 153                          # approx 2/3 of the number of rows; keep divisible by three
startYear, startMonth = 2002, 1         # first period of the data
endYear, endMonth = 2021, 4             # last period of the data
start = time.time()                     # reference point for "Time elapsed"

# With Indicator

In [None]:
# Empty results table for the with-indicator Costco runs.
resultsCostco = pd.DataFrame()
# Fit each model with the indicator as a third input feature.
includeIndicator = True

The next cells loop through all the economic indicators and, for each one, train a model and evaluate its explained variance ratio, computed as (train standard deviation − test RMSE) / train standard deviation.

In [None]:
# Display the Costco sales frame again before it is passed to train_and_test() below.
Costco_Monthly

In [None]:
# Time this sweep only, not the time since the configuration cell ran.
start = time.time()
# Collect one dict per run and build the table once. DataFrame.append was
# removed in pandas 2.0 and was quadratic when called in a loop.
costcoRecords = []
# Sorted so the run order is the same on every machine.
for i, indicator_file in enumerate(sorted(glob.glob("Data_Copy/Economic indicators/*"))):
    print(i+1, "_".join(indicator_file.split("/")[-1].split("_")[5:])[:-4])
    indicatorData = pd.read_csv(indicator_file)
    # Look the 'Value' column up in this file directly, so the cell no longer
    # depends on the EDA cell's glob returning files in the same order.
    indexOfIndicator = indicatorData.columns.get_loc('Value')
    for TestYear in TestYears:
        TrainYears = list(range(startYear,TestYear))
        trainStDev, testRMSE = train_and_test(Costco_Monthly, TrainYears, TestYear, indexOfIndicator, 
                                              indicatorData, numNodes, SalesColName, includeIndicator)
        costcoRecords.append({
            "Train Years": f"[{TrainYears[0]} ... {TrainYears[-1]}]",
            "Pred Year": int(TestYear),
            "Indicator": indicator_file.split("/")[-1],
            "Train StDev": round(trainStDev, 3), 
            "Test RMSE": round(testRMSE, 3),
            # Share of the training standard deviation not accounted for by the RMSE
            "Explained Variance Ratio": round((trainStDev - testRMSE) / trainStDev, 3)
        })
# Rebuilt from scratch, so re-running this cell does not duplicate rows.
resultsCostco = pd.DataFrame(costcoRecords)

print("Time elapsed:", time.time()-start)
resultsCostco

In [None]:
# Last 50 complete rows after sorting by explained variance ratio (ascending).
(
    resultsCostco
    .sort_values(by="Explained Variance Ratio")
    .dropna()
    .iloc[-50:]
)

In [None]:
# Save the results twice: in run order, and sorted by explained variance ratio.
resultsCostco.to_csv("Results/resultsCostco.csv")
resultsCostcoSorted = resultsCostco.sort_values(by="Explained Variance Ratio")
resultsCostcoSorted.to_csv("Results/resultsCostco_sorted.csv")

# Without Indicator

In [None]:
# Baseline run: same Costco sales column, but no indicator feature.
SalesColName = "Net Sales (billions)"
resultsCostcoNoInd = pd.DataFrame()
includeIndicator = False

In [None]:
start = time.time()
# Collect one dict per run and build the table once. DataFrame.append was
# removed in pandas 2.0.
costcoNoIndRecords = []
for TestYear in TestYears:
    TrainYears = list(range(startYear,TestYear))
    # No indicator is used, so the indicator arguments are placeholders.
    trainStDev, testRMSE = train_and_test(Costco_Monthly, TrainYears, TestYear, "", 
                                             "", numNodes, SalesColName, includeIndicator)
    costcoNoIndRecords.append({
        "Train Years": f"[{TrainYears[0]} ... {TrainYears[-1]}]",
        "Pred Year": int(TestYear),
        # This run has no indicator. It used to read `indicator_file`, a loop
        # variable left over from the with-indicator cell, which mislabelled
        # the rows.
        "Indicator": None,
        # Same column name as the with-indicator Costco table
        "Train StDev": round(trainStDev, 3), 
        "Test RMSE": round(testRMSE, 3),
        "Explained Variance Ratio": round((trainStDev - testRMSE) / trainStDev, 3)
    })
resultsCostcoNoInd = pd.DataFrame(costcoNoIndRecords)

print("Time elapsed:", time.time()-start)

In [None]:
# Display the Costco results obtained without an indicator feature.
resultsCostcoNoInd

## Maryland Vehicles Dataset

In [None]:
# Load the Maryland vehicle sales data from a pickle file and display it.
# NOTE(review): this reads from 'Data/' while the other datasets in this notebook
# are read from 'Data_Copy/' — confirm the path is intended.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a
# trusted source.
SalesDataSet = pd.read_pickle('Data/MarylandVehicleSales2002-2021')
SalesDataSet

In [None]:
# --- Maryland vehicles experiment settings ---
SalesColumnName = 'New'                 # sales column in SalesDataSet
TestYears = [2018, 2019, 2020]          # each year is held out in turn
NumNodes = 153                          # approx 2/3 of the number of rows; keep divisible by three
startYear, startMonth = 2002, 1         # first period of the data
endYear, endMonth = 2021, 4             # last period of the data

# With Indicators

In [None]:
# Empty results table for the with-indicator vehicle runs.
resultsVehicles = pd.DataFrame()
# Fit each model with the indicator as a third input feature.
includeIndicators = True

In [None]:
start = time.time()
# Collect one dict per run and build the table once. DataFrame.append was
# removed in pandas 2.0.
vehicleRecords = []
# The path was "Data_CopyData/...", which matches no files, so the loop never
# ran. It now uses the same indicator directory as the other cells.
for indicator_file in sorted(glob.glob("Data_Copy/Economic indicators/*")):
    print("\n----------\n\n"+indicator_file.split("/")[-1])

    IndicatorDataSet = pd.read_csv(indicator_file)
    # Keep only the first 10 characters of DateTime so label slicing works on date strings
    IndicatorDataSet['DateTime'] = IndicatorDataSet['DateTime'].apply(lambda x: x[0:10])

    # Restrict the indicator to the sales period, then go back to a 0..n-1 index
    IndicatorDataSet = IndicatorDataSet.set_index('DateTime')
    IndicatorDataSet = IndicatorDataSet.loc['2002-01-30':'2021-04-30']
    IndicatorDataSet = IndicatorDataSet.reset_index(drop=True)
    # Look 'Value' up after DateTime has been dropped, instead of assuming
    # "EDA index minus one" and a matching glob order.
    IndexColumnOfIndicator = IndicatorDataSet.columns.get_loc('Value')

    for TestYear in TestYears:
        TrainYears = list(range(startYear,TestYear))
        trainStDev, testRMSE = run_pipeline(SalesDataSet, IndicatorDataSet, IndexColumnOfIndicator, 
                                            SalesColumnName, TrainYears, TestYear,
                                            NumNodes, startYear, endYear, startMonth,
                                            endMonth, includeIndicators)
        vehicleRecords.append({
            "Train Years": f"[{TrainYears[0]} ... {TrainYears[-1]}]",
            "Pred Year": int(TestYear),
            "Indicator": indicator_file.split("/")[-1], 
            "Train StDev": round(trainStDev, 3), 
            "Test RMSE": round(testRMSE, 3),
            "Explained Variance Ratio": round((trainStDev - testRMSE) / trainStDev, 3)
        })
resultsVehicles = pd.DataFrame(vehicleRecords)

print("Time elapsed:", time.time()-start)

In [None]:
# Widen text columns so the long indicator file names are not truncated.
pd.set_option("display.max_colwidth", 100)
resultsVehicles

# Without Indicators

In [None]:
# Baseline vehicle run: no indicator feature, fresh results table.
resultsVehiclesNoInd = pd.DataFrame()
includeIndicators = False

In [None]:
start = time.time()
# Collect one dict per run and build the table once. DataFrame.append was
# removed in pandas 2.0.
vehicleNoIndRecords = []
for TestYear in TestYears:
    TrainYears = list(range(startYear,TestYear))
    # This argument list belongs to run_pipeline(); calling train_and_test()
    # with it raised a TypeError (12 arguments for 8 parameters). No indicator
    # is used, so the indicator arguments are placeholders.
    trainStDev, testRMSE = run_pipeline(SalesDataSet, [], "", 
                             SalesColumnName, TrainYears, TestYear,
                             NumNodes, startYear, endYear, startMonth, endMonth,
                             includeIndicators)
    vehicleNoIndRecords.append({
        "Train Years": f"[{TrainYears[0]} ... {TrainYears[-1]}]",
        "Pred Year": int(TestYear),
        "Train Std": round(trainStDev, 3), 
        "Test RMSE": round(testRMSE, 3),
        "Explained Variance Ratio": round((trainStDev - testRMSE) / trainStDev, 3)
    })
resultsVehiclesNoInd = pd.DataFrame(vehicleNoIndRecords)

print("Time elapsed:", time.time()-start)

In [None]:
# Display the vehicle results obtained without an indicator feature.
resultsVehiclesNoInd

# MRTS Sales Dataset

In [None]:
# Load the MRTS sales table and turn its first column into numeric Month / Year columns.
mrts_data = pd.read_csv("Data_Copy/mrtssales_92-present.csv")
# Copy the first column into 'Date' with every "." removed.
# NOTE(review): presumably values look like "Jan. 1992" — confirm against the CSV.
mrts_data['Date'] = mrts_data.iloc[:,0].apply(lambda x: x.replace(".", ""))
# First three characters, upper-cased, are used as the month abbreviation.
mrts_data["Month"] = mrts_data["Date"].apply(lambda x: x[:3].upper()).tolist()
# Everything from the fifth character on is used as the year text.
mrts_data["Year"] = mrts_data["Date"].apply(lambda x: x[4:].upper()).tolist()
# Drop the original first column and the temporary 'Date' column.
mrts_data = mrts_data.drop([mrts_data.columns[0],"Date"], axis=1)
mrts_data = mrts_data[:-1] # delete present month
mrts_data["Year"] = mrts_data["Year"].astype(int)
# Translate the abbreviation to a month number via the `months` dict
# (raises KeyError for any abbreviation not in it).
mrts_data["Month"] = mrts_data["Month"].apply(lambda x: months[x])
mrts_data

In [None]:
# --- MRTS experiment settings ---
includeIndicators = True                # fit models with the indicator feature
TestYears = [2018, 2019, 2020]          # each year is held out in turn
NumNodes = 153                          # approx 2/3 of the number of rows; keep divisible by three
startYear, startMonth = 2002, 1         # first period of the data
endYear, endMonth = 2021, 4             # last period of the data

In [None]:
# Sweep over (sales column, indicator, test year) for the MRTS dataset.
#
# These caps keep the limits of the original exploratory run: only the first
# sales column and the first five indicator files. They used to be unexplained
# `break` statements. Raise them (or set to None) to run the full grid; every
# combination trains a 500-epoch model.
MAX_SALES_COLUMNS = 1
MAX_INDICATORS = 5

start = time.time()
# Collect one dict per run and build the table once. DataFrame.append was
# removed in pandas 2.0.
mrtsRecords = []
# 'Year' and 'Month' are date keys, not sales series
salesColumns = [col for col in mrts_data.columns if col not in ("Year", "Month")]
# Sorted so the selected indicators are the same on every machine
indicatorFiles = sorted(glob.glob("Data_Copy/Economic indicators/*"))

for SalesColName in salesColumns[:MAX_SALES_COLUMNS]:
    # One sales value per (Year, Month). Keeping the first match mirrors the
    # original row-by-row lookup.
    monthlySales = (
        mrts_data[["Year", "Month", SalesColName]]
        .drop_duplicates(subset=["Year", "Month"])
        .rename(columns={SalesColName: "Sales"})
    )
    for indicator_file in indicatorFiles[:MAX_INDICATORS]:
        ind_name = indicator_file.split("/")[-1]
        print("\n----------\n\n"+ind_name, SalesColName)

        indicator = pd.read_csv(indicator_file)
        indicator['Year'] = indicator['DateTime'].apply(lambda x: int(x.split("-")[0]))
        indicator['Month'] = indicator['DateTime'].apply(lambda x: int(x.split("-")[1]))
        indicator = indicator[["Year", "Month", "Value"]].rename(columns={"Value":"Indicator"})

        # Inner join on the date keys. Indicator rows with no matching sales
        # month are dropped; they used to receive the previous row's stale
        # `sales` value. Rows with missing values cannot be fitted or scored.
        data = (
            indicator
            .merge(monthlySales, on=["Year", "Month"], how="inner")
            .dropna(subset=["Indicator", "Sales"])
        )

        for testYear in TestYears:
            trainSet = data[data["Year"]<testYear]
            testSet = data[data["Year"]==testYear]
            if trainSet.empty or testSet.empty:
                print(f"Skipping {testYear}: no overlapping training/test rows for {ind_name}")
                continue

            X_train = trainSet[["Month", "Year", "Indicator"]].to_numpy()
            y_train = trainSet["Sales"].to_numpy()
            X_test = testSet[["Month", "Year", "Indicator"]].to_numpy()
            y_test = testSet["Sales"].to_numpy()

            model = create_model(NumNodes)
            model.fit(X_train, y_train, epochs=500, batch_size=128, verbose=0) #change batch size to a variable

            # Score the model on the held-out test year
            y_preds = model.predict(X_test)
            # np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn
            # releases no longer accept.
            testRMSE = float(np.sqrt(mean_squared_error(y_test, y_preds)))
            trainStDev = trainSet["Sales"].std()
            evr = (trainStDev - testRMSE) / trainStDev

            mrtsRecords.append({
                "Sales Column": SalesColName,
                "Indicator" : ind_name,
                "Train Year Start": trainSet["Year"].min(),
                "Pred Year": int(testYear),
                "Train Std": round(trainStDev, 3), 
                "Test RMSE": round(testRMSE, 3),
                "Explained Variance Ratio": round(evr, 3)
            })

resultsMRTS = pd.DataFrame(mrtsRecords)

print("Time elapsed:", time.time()-start)
resultsMRTS