In [13]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import glob
import sys
import time
from copy import deepcopy

# Exploratory Data Analysis (EDA)

The purpose of the following cell is to list all the available economic indicators that we will use as features in our modeling.

In [76]:
indicatorIdxs = []
for indicator_file in glob.glob("Data/Economic indicators/*"):
    print(indicator_file)
    indi_cols = list(pd.read_csv(indicator_file).columns)
    if indi_cols.index('Value'):
        indicatorIdxs.append(indi_cols.index('Value'))
    else:
        sys.exit(0)
    print(indi_cols, end="\n"+"-"*len(str(indi_cols))+"\n")
print("Num Indicators:", len(indicatorIdxs))

Data/Economic indicators/historical_country_United_States_indicator_API_Product_Imports.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Data/Economic indicators/historical_country_United_States_indicator_GDP_From_Manufacturing.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Data/Economic indicators/historical_country_United_States_indicator_Private_Debt_to_GDP.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Data/Economic indicators/historical_country_United_States_indicator_Pending_Home_Sales.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequenc

['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Data/Economic indicators/historical_country_United_States_indicator_Challenger_Job_Cuts.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Data/Economic indicators/historical_country_United_States_indicator_Loans_to_Private_Sector.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Data/Economic indicators/historical_country_United_States_indicator_Interbank_Rate.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------

['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Data/Economic indicators/historical_country_United_States_indicator_GDP_Deflator.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Data/Economic indicators/historical_country_United_States_indicator_Changes_in_Inventories.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Data/Economic indicators/historical_country_United_States_indicator_Personal_Savings.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------

['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Data/Economic indicators/historical_country_United_States_indicator_API_Cushing_Number.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Data/Economic indicators/historical_country_United_States_indicator_API_Crude_Runs.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Data/Economic indicators/historical_country_United_States_indicator_Terms_of_Trade.csv
['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
---------------------------------------------------------------

['Country', 'Category', 'DateTime', 'Value', 'Frequency', 'HistoricalDataSymbol', 'LastUpdate']
-----------------------------------------------------------------------------------------------
Num Indicators: 192


# Functions

In [105]:
months = {"JAN":1, "FEB":2, "MAR":3, "APR":4, "MAY":5, "JUN":6, 
          "JUL":7, "AUG":8, "SEP":9, "OCT":10, "NOV":11, "DEC":12}

In [85]:
def add_dates(dataframe, startYear, endYear, startMonth, endMonth):
    """
    Create Dense neural network model for evaluating each indicator's explanatory strength
    
    inputs
    --
    dataframe (pd.DataFrame())
    startYear (int) : starting year of the data (ex. 2009)
    endYear (int) : ending year of the data (ex. 2021)
    startMonth (int) : index of the month (1 is january, 12 is december)
    endMonth (int) :  index of the month (1 is january, 12 is december)
    
    
    outputs
    --
    modified dataframe
    """
    
#     if "Month" and "Year" in dataframe.columns:
#         dataframe["Indicator"] = 0
#         return dataframe
    
    # Initialize lists for new columns in DataFrame
    columnMonth = []
    columnYear = []
    columnIndicator = []
    # Append year, month, and empty indicator values for each remaining month in the first year
    for p in range(startMonth, 13):
        columnMonth.append(p)
        columnYear.append(startYear)
        columnIndicator.append(0)
    # Append year, month, and empty indicator value for each month in the remaining years (besides the last)
    for x in range(startYear + 1, endYear):
        for i in range(1, 13):
            columnMonth.append(i)
            columnYear.append(x)
            columnIndicator.append(0)
    # Append year, month, and empty indicator value for the remaining months in the last year
    for r in range(1, endMonth + 1):
        columnMonth.append(r)
        columnYear.append(endYear)
        columnIndicator.append(0)
    # Add the year, month, and empty indicator columns to the original dataset
    dataframe['Year'] = columnYear
    dataframe['Month'] = columnMonth
    dataframe['Indicator'] = columnIndicator
    return dataframe

In [111]:
def create_model(numNodes, includeIndicator, activation='relu',
                 optimizer='adam', loss='mean_squared_error'):
    """
    Create Dense neural network model for evaluating each indicator's explanatory strength
    
    inputs
    --
    numNodes (int) : baseline number of nodes used for constructing the neural network layers
    includeIndicator (bool) : Used to mark whether a model is fitted with an indicator feature
    activation, optimizer, loss (string) : Tensorflow neural network parameters
    
    outputs
    --
    model (tf.keras.Model()) : neural network model
    """
    
    
    model = Sequential()
    model.add(Dense(numNodes*5/3, activation = activation, input_dim=2+int(includeIndicator))) # increase input_dim
#     model.add(Dense(numNodes*2/3, activation = activation))
    model.add(Dropout(0.2))
    model.add(Dense(numNodes*1/3, activation = activation))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation = 'linear'))
    model.compile(optimizer=optimizer, loss=loss)
    return model

# TODO: add parameters for input size, num layers, etc.
def train_and_test(dataframe, trainYears, testYear, indexOfIndicator, 
                   indicatorData, numNodes, SalesColName, includeIndicator):
    """
    Train and test neural network on data
    
    inputs
    --
    dataframe (pd.DataFrame()) : dataset used for inputs into model
    trainYears : used for assigning training/testing partitions
    testYear : used for assigning training/testing partitions
    indexOfIndicator (int) : the index of the indicator value
    indicatorData : DataFrame/Series of indicator values for each time period
    numNodes (int) : baseline number of nodes used for constructing the neural network layers
    SalesColName (string) : the name of the sales column used by the dataframe parameter variable
    includeIndicator (bool) : Used to mark whether a model is fitted with an indicator feature
    
    outputs
    --
    trainStDev (float) : standard deviation of the training data
    testRMSE : root mean squared error of the model predictions
    """
    
    # dataframe with an index 0 to length - 1, sales as the first column, followed by year, and month
    # not optimized for multiple indicators yet, will continue updating for multiple indicator
    if includeIndicator:
        features = ['Sales', 'Year', 'Month', 'Indicator']
        IndicatorValues = indicatorData.iloc[:, indexOfIndicator]
        dataframe['Indicator'] = IndicatorValues
    else:
        features = ['Sales', 'Year', 'Month']
    
    # preprocess training data
    dataframe["Sales"] = dataframe[SalesColName]
    if SalesColName != "Sales":
        dataframe = dataframe.drop(SalesColName, axis=1)
    
    # assign testing data
    testData = dataframe.loc[dataframe['Year'] == testYear]
    testData = testData.reset_index()
    testData = testData[features] # add 'UnemploymentRateValues'
    
    # assign training data
    trainYearIdx = dataframe['Year'].apply(lambda x: any([x==ty for ty in trainYears]))
    trainData = dataframe.loc[trainYearIdx]
    trainData = trainData.reset_index()
    trainData = trainData[features] # add 'UnemploymentRateValues'
    
    # Isolate training and test sets
    X = trainData.iloc[:, 1:3+int(includeIndicator)] # increase index
    Y = trainData.iloc[:, 0]
    
    Xi_Test = testData.iloc[:, 1:3+int(includeIndicator)] # increase index
    Yi_Test = testData.iloc[:, 0]
    XTrain, XTest, YTrain, YTest = train_test_split(X, Y, test_size = 0.2, shuffle = True)

    # Create and Fit Model
    model = create_model(numNodes, includeIndicator)
    model.fit(XTrain, YTrain, epochs=100, batch_size=128, verbose=0) #change batch size to a variable
    
    # Test results from model using training data
    YPreds = model.predict(XTest)
    try:
        testRMSE = mean_squared_error(YTest, YPreds, squared=False)
        trainStDev = trainData['Sales'].std()
    except:
        return np.nan, np.nan
    return trainStDev, testRMSE

In [102]:
def run_pipeline(data, indicatorData, indexOfIndicator, salesColumn, trainYears, testYear, 
                 numNodes, startYear, endYear, startMonth, endMonth, includeIndicator):
    
    """
    parameters are a combination of the add_dates() and train_and_test()
    """
    dataNew = pd.DataFrame(data)
    # specify your sales column. New is my sales in this case #5
    if "Month" not in dataNew and "Year" not in dataNew:
        dataNew = dataNew[[salesColumn]]
        # specify start year, end year, start month, end month
        dataNew = add_dates(dataNew, startYear, endYear, startMonth, endMonth)
    else:
        dataNew = dataNew[[salesColumn, "Month", "Year"]]
    #the data now has 3 columns: Sales, Month, Year, indicator (all 0s won't make a difference)
    StDev, testRMSE = train_and_test(dataNew, trainYears, testYear,
                                     indexOfIndicator, indicatorData, 
                                     numNodes, salesColumn, includeIndicator)
    return StDev, testRMSE

### Feature engineering ideas
    - inverse years from current year (years closer to test year should be weighted more heavily)
    - one-hot encoded month and possibly day

# Testing

## Costco Dataset

In [67]:
Costco_Monthly = pd.read_csv("Data/Costco_Monthly Sales from 2012 to 2021.csv", header=0)
Costco_Monthly["Day"] = Costco_Monthly.Date.apply(lambda x: int(x.split("-")[0]))
Costco_Monthly = Costco_Monthly.drop(["Date"], axis=1)
Costco_Monthly = Costco_Monthly.dropna()
Costco_Monthly

Unnamed: 0,Month,Year,Net Sales (billions),Growth Rate,Day
0,1,2012,7.00,11%,12
2,3,2012,9.13,10%,12
3,4,2012,7.25,7%,12
4,5,2012,7.67,7%,12
5,6,2012,9.18,6%,12
...,...,...,...,...,...
109,2,2021,14.05,15%,21
110,3,2021,18.21,18%,21
111,4,2021,15.21,34%,21
112,5,2021,15.59,24%,21


In [68]:
numNodes = 153 # approx 2/3 of the number of your rows. Make this divisible by a three #6
start = time.time()
SalesColName = "Net Sales (billions)"
TestYears = [2018, 2019, 2020]
startYear = 2002 # 7
endYear = 2021 # 8
startMonth = 1 # 9
endMonth = 4 # 10

# With Indicator

In [69]:
includeIndicator = True
resultsCostco = pd.DataFrame()

Loops through all the economic indicators and for each one, trains a model and evaluates its explained variance ratio.

In [70]:
Costco_Monthly

Unnamed: 0,Month,Year,Net Sales (billions),Growth Rate,Day
0,1,2012,7.00,11%,12
2,3,2012,9.13,10%,12
3,4,2012,7.25,7%,12
4,5,2012,7.67,7%,12
5,6,2012,9.18,6%,12
...,...,...,...,...,...
109,2,2021,14.05,15%,21
110,3,2021,18.21,18%,21
111,4,2021,15.21,34%,21
112,5,2021,15.59,24%,21


In [63]:
for i, indicator_file in enumerate(glob.glob("Data/Economic indicators/*")):
    print(i+1, "_".join(indicator_file.split("/")[-1].split("_")[5:])[:-4])
    indexOfIndicator = indicatorIdxs[i]
    indicatorData = pd.read_csv(indicator_file)
    for TestYear in TestYears:
        TrainYears = list(range(startYear,TestYear))
        trainStDev, testRMSE = train_and_test(Costco_Monthly, TrainYears, TestYear, indexOfIndicator, 
                                              indicatorData, numNodes, SalesColName, includeIndicator)
        currResults = {
            "Train Years": f"[{TrainYears[0]} ... {TrainYears[-1]}]",
            "Pred Year": int(TestYear),
            "Indicator": indicator_file.split("/")[-1],
            "Train StDev": round(trainStDev, 3), 
            "Test RMSE": round(testRMSE, 3),
            "Explained Variance Ratio": round((trainStDev - testRMSE) / trainStDev, 3)
        }
        resultsCostco = resultsCostco.append(currResults, ignore_index=True)

print("Time elapsed:", time.time()-start)            
resultsCostco.head()

1 API_Product_Imports


2021-12-05 10:24:43.680036: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-05 10:24:44.051896: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


2 GDP_From_Manufacturing
3 Private_Debt_to_GDP
4 Pending_Home_Sales
5 Crude_Oil_Rigs
6 Crude_Oil_Imports
7 Wages_in_Manufacturing
8 Balance_of_Trade
9 Wage_Growth
10 Net_Long_Term_Tic_Flows


KeyboardInterrupt: 

In [18]:
resultsCostco.sort_values(by="Explained Variance Ratio").head(15)

Unnamed: 0,Train Years,Pred Year,Indicator,Train StDev,Test RMSE,Explained Variance Ratio
669,[2002 ... 2018],2019.0,historical_country_United_States_indicator_Cor...,1.811,64539.118,-35644.429
670,[2002 ... 2019],2020.0,historical_country_United_States_indicator_Cor...,2.062,60357.003,-29268.046
668,[2002 ... 2017],2018.0,historical_country_United_States_indicator_Cor...,1.555,13474.93,-8662.961
562,[2002 ... 2019],2020.0,historical_country_United_States_indicator_Tou...,2.062,12682.064,-6148.94
560,[2002 ... 2017],2018.0,historical_country_United_States_indicator_Tou...,1.555,5949.237,-3824.174
561,[2002 ... 2018],2019.0,historical_country_United_States_indicator_Tou...,1.811,2419.059,-1335.064
629,[2002 ... 2017],2018.0,historical_country_United_States_indicator_Ext...,1.555,1988.275,-1277.399
729,[2002 ... 2018],2019.0,historical_country_United_States_indicator_Gov...,1.811,689.15,-379.622
728,[2002 ... 2017],2018.0,historical_country_United_States_indicator_Gov...,1.555,577.127,-370.074
587,[2002 ... 2017],2018.0,historical_country_United_States_indicator_Mon...,1.555,376.572,-241.124


In [16]:
resultsCostco.to_csv("Results/resultsCostco.csv")
resultsCostco.sort_values(by="Explained Variance Ratio").to_csv("Results/resultsCostco_sorted.csv")

# Without Indicator

In [12]:
includeIndicator = False
SalesColName = "Net Sales (billions)"
resultsCostcoNoInd = pd.DataFrame()

In [13]:
start = time.time()
for TestYear in TestYears:
    TrainYears = list(range(startYear,TestYear))
    trainStDev, testRMSE = iChooseYouPikachu(Costco_Monthly, TrainYears, TestYear, "", 
                                             "", numNodes, SalesColName, includeIndicator)
    currResults = {
        "Train Years": f"[{TrainYears[0]} ... {TrainYears[-1]}]",
        "Pred Year": int(TestYear),
        "Indicator": indicator_file.split("/")[-1],
        "trainStDev": round(trainStDev, 3), 
        "Test RMSE": round(testRMSE, 3),
        "Explained Variance Ratio": round((trainStDev - testRMSE) / trainStDev, 3)
    }
    resultsCostcoNoInd = resultsCostcoNoInd.append(currResults, ignore_index=True)

print("Time elapsed:", time.time()-start)            

Time elapsed: 1.8591079711914062


In [14]:
resultsCostcoNoInd

Unnamed: 0,Train Years,Pred Year,Indicator,trainStDev,Test RMSE,Explained Variance Ratio
0,[2002 ... 2017],2018.0,historical_country_United_States_indicator_Gov...,1.555,3.822,-1.457
1,[2002 ... 2018],2019.0,historical_country_United_States_indicator_Gov...,1.811,4.236,-1.339
2,[2002 ... 2019],2020.0,historical_country_United_States_indicator_Gov...,2.062,3.728,-0.808


## Maryland Vehicles Dataset

In [97]:
SalesDataSet = pd.read_pickle('Data/MarylandVehicleSales2002-2021')
SalesDataSet

Unnamed: 0,Year,Month,New,Used,Total Sales New,Total Sales Used
0,2002,JAN,31106,49927,755015820,386481929
1,2002,FEB,27520,50982,664454223,361353242
2,2002,MAR,34225,58794,805666244,419385387
3,2002,APR,36452,59817,846368297,433061150
4,2002,MAY,37359,60577,855005784,442569410
...,...,...,...,...,...,...
227,2020,DEC,28061,44427,1256023978,620534357
228,2021,JAN,21674,47934,834312337,622526750
229,2021,FEB,20579,42306,797908033,556318165
230,2021,MAR,34794,66410,1267127282,848644184


In [98]:
#IndexColumnOfIndicator = 2 #which index is your indicator columns #3
SalesColumnName = 'New' # specify your sales column. New is my sales in this case #4
TestYears = [2018, 2019, 2020] #5
NumNodes = 153 # approx 2/3 of the number of your rows. Make this divisible by a three #6
startYear = 2002 # 7
endYear = 2021 # 8
startMonth = 1 # 9
endMonth = 4 # 10

# With Indicators

In [99]:
includeIndicators = True
resultsVehicles = pd.DataFrame()

In [103]:
start = time.time()
for i, indicator_file in enumerate(glob.glob("Data/Economic indicators/*")):
    print("\n----------\n\n"+indicator_file.split("/")[-1])
    dataframe = SalesDataSet.copy()
    
    IndicatorDataSet = pd.read_csv(indicator_file)
    IndexColumnOfIndicator = indicatorIdxs[i] - 1 # adjust for when DateTime is set as index
    IndicatorDataSetDates = IndicatorDataSet['DateTime']
    IndicatorDataSetDates = IndicatorDataSetDates.apply(lambda x: x[0:10])
    IndicatorDataSet['DateTime'] = IndicatorDataSetDates
    
    IndicatorDataSet = IndicatorDataSet.set_index('DateTime')
    IndicatorDataSet = IndicatorDataSet.loc['2002-01-30':'2021-04-30']
    IndicatorDataSet.reset_index(drop=True, inplace = True)
    
    for TestYear in TestYears:
        TrainYears = list(range(startYear,TestYear))
        trainStDev, testRMSE = run_pipeline(SalesDataSet, IndicatorDataSet, IndexColumnOfIndicator, 
                                            SalesColumnName, TrainYears, TestYear,
                                            NumNodes, startYear, endYear, startMonth,
                                            endMonth, includeIndicators)
        currResults = {
            "Train Years": f"[{TrainYears[0]} ... {TrainYears[-1]}]",
            "Pred Year": int(TestYear),
            "Indicator": indicator_file.split("/")[-1], 
            "Train StDev": round(trainStDev, 3), 
            "Test RMSE": round(testRMSE, 3),
            "Explained Variance Ratio": round((trainStDev - testRMSE) / trainStDev, 3)
        }
        resultsVehicles = resultsVehicles.append(currResults, ignore_index=True)

print("Time elapsed:", time.time()-start)


----------

historical_country_United_States_indicator_API_Product_Imports.csv
     New
0  31106
1  27520
2  34225
3  36452
4  37359
     Sales  Year  Month  Indicator
0    31106  2002      1     -0.129
1    27520  2002      2      0.088
2    34225  2002      3     -0.030
3    36452  2002      4     -0.217
4    37359  2002      5     -0.170
..     ...   ...    ...        ...
187  29644  2017      8      0.110
188  28175  2017      9     -0.524
189  27886  2017     10      0.424
190  24970  2017     11     -0.171
191  24944  2017     12      0.167

[192 rows x 4 columns]
    Sales  Year  Month  Indicator
0   28607  2018      1     -0.070
1   21448  2018      2      0.315
2   26914  2018      3     -0.087
3   26729  2018      4     -0.018
4   30967  2018      5     -0.200
5   31779  2018      6     -0.201
6   30342  2018      7      0.620
7   30188  2018      8     -0.291
8   27180  2018      9     -0.013
9   28879  2018     10     -0.054
10  24846  2018     11      0.256
11  26350  201

KeyboardInterrupt: 

In [None]:
pd.options.display.max_colwidth = 100
resultsVehicles

# Without Indicators

In [None]:
includeIndicators = False
resultsVehiclesNoInd = pd.DataFrame()

In [None]:
start = time.time()
for TestYear in TestYears:
    TrainYears = list(range(startYear,TestYear))
    trainStDev, testRMSE = train_and_test(SalesDataSet, [], "", 
                             SalesColumnName, TrainYears, TestYear,
                             NumNodes, startYear, endYear, startMonth, endMonth,
                             includeIndicators)
    currResults = {
        "Train Years": f"[{TrainYears[0]} ... {TrainYears[-1]}]",
        "Pred Year": int(TestYear),
        "Train Std": round(trainStDev, 3), 
        "Test RMSE": round(testRMSE, 3),
        "Explained Variance Ratio": round((trainStDev - testRMSE) / trainStDev, 3)
    }
    resultsVehiclesNoInd = resultsVehiclesNoInd.append(currResults, ignore_index=True)

print("Time elapsed:", time.time()-start)

In [None]:
resultsVehiclesNoInd

# MRTS Sales Dataset

In [115]:
mrts_data = pd.read_csv("Data/mrtssales_92-present.csv")
mrts_data['Date'] = mrts_data.iloc[:,0].apply(lambda x: x.replace(".", ""))
mrts_data["Month"] = mrts_data["Date"].apply(lambda x: x[:3].upper()).tolist()
mrts_data["Year"] = mrts_data["Date"].apply(lambda x: x[4:].upper()).tolist()
mrts_data = mrts_data.drop([mrts_data.columns[0],"Date"], axis=1)
mrts_data = mrts_data[:-1] # delete present month
mrts_data["Year"] = mrts_data["Year"].astype(int)
mrts_data["Month"] = mrts_data["Month"].apply(lambda x: months[x])
mrts_data_new = mrts_data[mrts_data["Year"]>=2002]
mrts_data_new

Unnamed: 0,"Retail and food services sales, total",Retail sales and food services excl motor vehicle and parts,Retail sales and food services excl gasoline stations,Retail sales and food services excl motor vehicle and parts and gasoline stations,"Retail sales, total","Retail sales, total (excl. motor vehicle and parts dealers)",GAFO(1),Motor vehicle and parts dealers,Automobile and other motor vehicle dealers,Automobile dealers,...,Nonstore retailers,Electronic shopping and mail-order houses,Fuel dealers,Food services and drinking places,Drinking places,Restaurants and other eating places,Full service restaurants,Limited service eating places,Month,Year
120,255598,195033,238094,177529,230546,169981,63242,60565,55799,52948,...,15901,9912,2826,25052,1289.0,21704,11462.0,10242,1,2002
121,253086,190149,236382,173445,228084,165147,64043,62937,58319,54817,...,14323,8925,2303,25002,1274.0,21637,11355.0,10282,2,2002
122,285261,215490,265428,195657,257133,187362,73116,69771,64656,60116,...,15690,9670,2197,28128,1430.0,24344,12630.0,11714,3,2002
123,284747,213955,263641,192849,257357,186565,69137,70792,65420,60242,...,15105,9684,1746,27390,1343.0,23605,12076.0,11529,4,2002
124,300402,228248,278174,206020,271682,199528,74799,72154,66622,61020,...,15145,9713,1571,28720,1405.0,24811,12790.0,12021,5,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350,633078,486171,586718,439811,567933,421026,126263,146907,136709,125144,...,87571,77955,3757,65145,,59882,,32616,3,2021
351,625527,481396,578253,434122,558181,414050,119664,144131,134233,122215,...,85066,76206,2614,67346,,61717,,33008,4,2021
352,643099,500389,592919,450209,570641,427931,126686,142710,133232,121429,...,84422,76293,2209,72458,,66471,,34025,5,2021
353,635437,498169,584134,446866,562929,425661,124474,137268,127542,115655,...,85052,77056,2146,72508,,66346,,33892,6,2021


In [107]:
includeIndicators = True
resultsMRTS = pd.DataFrame()

TestYears = [2018, 2019, 2020] #5
NumNodes = 153 # approx 2/3 of the number of your rows. Make this divisible by a three #6
startYear = 2002 # 7
endYear = 2021 # 8
startMonth = 1 # 9
endMonth = 4 # 10

In [118]:
# Every sales column in the the mrts dataset
for SalesColName in mrts_data.columns:
    # Every indicator
    for i, indicator_file in enumerate(glob.glob("Data/Economic indicators/*")):
        print("\n----------\n\n"+indicator_file.split("/")[-1], SalesColName)

        IndicatorDataSet = pd.read_csv(indicator_file)
        IndexColumnOfIndicator = indicatorIdxs[i] - 1 # adjust for when DateTime is set as index
        IndicatorDataSetDates = IndicatorDataSet['DateTime']
        IndicatorDataSetDates = IndicatorDataSetDates.apply(lambda x: x[0:10])
        IndicatorDataSet['DateTime'] = IndicatorDataSetDates
        IndicatorDataSet = IndicatorDataSet.set_index('DateTime')
        IndicatorDataSet = IndicatorDataSet.loc['2002-01-30':'2021-04-30']
        IndicatorDataSet.reset_index(drop=True, inplace = True)
        start = time.time()
        for testYear in TestYears:
            trainYears = list(range(startYear,TestYear))
            trainStDev, testRMSE = run_pipeline(mrts_data_new, IndicatorDataSet, IndexColumnOfIndicator,
                                                SalesColName, trainYears, testYear, numNodes,
                                                startYear, endYear, startMonth,
                                                endMonth, includeIndicator)
            currResults = {
                "Sales Column": SalesColName,
                "Train Years": f"[{TrainYears[0]} ... {TrainYears[-1]}]",
                "Pred Year": int(TestYear),
                "Train Std": round(trainStDev, 3), 
                "Test RMSE": round(testRMSE, 3),
                "Explained Variance Ratio": round((trainStDev - testRMSE) / trainStDev, 3)
            }
            resultsMRTS = resultsMRTS.append(currResults, ignore_index=True)

    print("Time elapsed:", time.time()-start)


----------

historical_country_United_States_indicator_API_Product_Imports.csv Retail and food services sales, total
     Month  Year  Indicator   Sales
120      1  2002      0.013  255598
121      2  2002     -0.141  253086
122      3  2002      0.235  285261
123      4  2002     -0.099  284747
124      5  2002     -0.053  300402
..     ...   ...        ...     ...
350      3  2021        NaN  633078
351      4  2021        NaN  625527
352      5  2021        NaN  643099
353      6  2021        NaN  635437
354      7  2021        NaN  634486

[235 rows x 4 columns]
      Sales  Year  Month  Indicator
0    255598  2002      1      0.013
1    253086  2002      2     -0.141
2    285261  2002      3      0.235
3    284747  2002      4     -0.099
4    300402  2002      5     -0.053
..      ...   ...    ...        ...
199  520544  2018      8        NaN
200  477137  2018      9        NaN
201  503602  2018     10        NaN
202  518457  2018     11        NaN
203  556614  2018     12      


----------

historical_country_United_States_indicator_Private_Debt_to_GDP.csv Retail and food services sales, total
     Month  Year  Indicator   Sales
120      1  2002        NaN  255598
121      2  2002        NaN  253086
122      3  2002        NaN  285261
123      4  2002        NaN  284747
124      5  2002        NaN  300402
..     ...   ...        ...     ...
350      3  2021        NaN  633078
351      4  2021        NaN  625527
352      5  2021        NaN  643099
353      6  2021        NaN  635437
354      7  2021        NaN  634486

[235 rows x 4 columns]
      Sales  Year  Month  Indicator
0    255598  2002      1        NaN
1    253086  2002      2        NaN
2    285261  2002      3        NaN
3    284747  2002      4        NaN
4    300402  2002      5        NaN
..      ...   ...    ...        ...
199  520544  2018      8        NaN
200  477137  2018      9        NaN
201  503602  2018     10        NaN
202  518457  2018     11        NaN
203  556614  2018     12      


----------

historical_country_United_States_indicator_Crude_Oil_Rigs.csv Retail and food services sales, total
     Month  Year  Indicator   Sales
120      1  2002        153  255598
121      2  2002        156  253086
122      3  2002        159  285261
123      4  2002        162  284747
124      5  2002        163  300402
..     ...   ...        ...     ...
350      3  2021        428  633078
351      4  2021        423  625527
352      5  2021        408  643099
353      6  2021        442  635437
354      7  2021        429  634486

[235 rows x 4 columns]
      Sales  Year  Month  Indicator
0    255598  2002      1        153
1    253086  2002      2        156
2    285261  2002      3        159
3    284747  2002      4        162
4    300402  2002      5        163
..      ...   ...    ...        ...
199  520544  2018      8        343
200  477137  2018      9        341
201  503602  2018     10        350
202  518457  2018     11        362
203  556614  2018     12        355


----------

historical_country_United_States_indicator_Wages_in_Manufacturing.csv Retail and food services sales, total
     Month  Year  Indicator   Sales
120      1  2002      19.10  255598
121      2  2002      19.03  253086
122      3  2002      19.02  285261
123      4  2002      19.14  284747
124      5  2002      19.01  300402
..     ...   ...        ...     ...
350      3  2021        NaN  633078
351      4  2021        NaN  625527
352      5  2021        NaN  643099
353      6  2021        NaN  635437
354      7  2021        NaN  634486

[235 rows x 4 columns]
      Sales  Year  Month  Indicator
0    255598  2002      1      19.10
1    253086  2002      2      19.03
2    285261  2002      3      19.02
3    284747  2002      4      19.14
4    300402  2002      5      19.01
..      ...   ...    ...        ...
199  520544  2018      8        NaN
200  477137  2018      9        NaN
201  503602  2018     10        NaN
202  518457  2018     11        NaN
203  556614  2018     12   


----------

historical_country_United_States_indicator_Wage_Growth.csv Retail and food services sales, total
     Month  Year  Indicator   Sales
120      1  2002       3.47  255598
121      2  2002       3.97  253086
122      3  2002       4.35  285261
123      4  2002       4.48  284747
124      5  2002       3.98  300402
..     ...   ...        ...     ...
350      3  2021        NaN  633078
351      4  2021        NaN  625527
352      5  2021        NaN  643099
353      6  2021        NaN  635437
354      7  2021        NaN  634486

[235 rows x 4 columns]
      Sales  Year  Month  Indicator
0    255598  2002      1       3.47
1    253086  2002      2       3.97
2    285261  2002      3       4.35
3    284747  2002      4       4.48
4    300402  2002      5       3.98
..      ...   ...    ...        ...
199  520544  2018      8        NaN
200  477137  2018      9        NaN
201  503602  2018     10        NaN
202  518457  2018     11        NaN
203  556614  2018     12        NaN

[


----------

historical_country_United_States_indicator_GDP.csv Retail and food services sales, total
     Month  Year  Indicator   Sales
120      1  2002        NaN  255598
121      2  2002        NaN  253086
122      3  2002        NaN  285261
123      4  2002        NaN  284747
124      5  2002        NaN  300402
..     ...   ...        ...     ...
350      3  2021        NaN  633078
351      4  2021        NaN  625527
352      5  2021        NaN  643099
353      6  2021        NaN  635437
354      7  2021        NaN  634486

[235 rows x 4 columns]
      Sales  Year  Month  Indicator
0    255598  2002      1        NaN
1    253086  2002      2        NaN
2    285261  2002      3        NaN
3    284747  2002      4        NaN
4    300402  2002      5        NaN
..      ...   ...    ...        ...
199  520544  2018      8        NaN
200  477137  2018      9        NaN
201  503602  2018     10        NaN
202  518457  2018     11        NaN
203  556614  2018     12        NaN

[204 rows


----------

historical_country_United_States_indicator_Foreign_Direct_Investment.csv Retail and food services sales, total
     Month  Year  Indicator   Sales
120      1  2002        NaN  255598
121      2  2002        NaN  253086
122      3  2002        NaN  285261
123      4  2002        NaN  284747
124      5  2002        NaN  300402
..     ...   ...        ...     ...
350      3  2021        NaN  633078
351      4  2021        NaN  625527
352      5  2021        NaN  643099
353      6  2021        NaN  635437
354      7  2021        NaN  634486

[235 rows x 4 columns]
      Sales  Year  Month  Indicator
0    255598  2002      1        NaN
1    253086  2002      2        NaN
2    285261  2002      3        NaN
3    284747  2002      4        NaN
4    300402  2002      5        NaN
..      ...   ...    ...        ...
199  520544  2018      8        NaN
200  477137  2018      9        NaN
201  503602  2018     10        NaN
202  518457  2018     11        NaN
203  556614  2018     12


----------

historical_country_United_States_indicator_Core_Inflation_Rate.csv Retail and food services sales, total
     Month  Year  Indicator   Sales
120      1  2002        2.3  255598
121      2  2002        2.2  253086
122      3  2002        2.3  285261
123      4  2002        2.3  284747
124      5  2002        2.3  300402
..     ...   ...        ...     ...
350      3  2021        NaN  633078
351      4  2021        NaN  625527
352      5  2021        NaN  643099
353      6  2021        NaN  635437
354      7  2021        NaN  634486

[235 rows x 4 columns]
      Sales  Year  Month  Indicator
0    255598  2002      1        2.3
1    253086  2002      2        2.2
2    285261  2002      3        2.3
3    284747  2002      4        2.3
4    300402  2002      5        2.3
..      ...   ...    ...        ...
199  520544  2018      8        NaN
200  477137  2018      9        NaN
201  503602  2018     10        NaN
202  518457  2018     11        NaN
203  556614  2018     12      


----------

historical_country_United_States_indicator_Average_Hourly_Earnings.csv Retail and food services sales, total
     Month  Year  Indicator   Sales
120      1  2002        0.3  255598
121      2  2002        0.2  253086
122      3  2002        0.2  285261
123      4  2002        0.2  284747
124      5  2002        0.1  300402
..     ...   ...        ...     ...
350      3  2021        NaN  633078
351      4  2021        NaN  625527
352      5  2021        NaN  643099
353      6  2021        NaN  635437
354      7  2021        NaN  634486

[235 rows x 4 columns]
      Sales  Year  Month  Indicator
0    255598  2002      1        0.3
1    253086  2002      2        0.2
2    285261  2002      3        0.2
3    284747  2002      4        0.2
4    300402  2002      5        0.1
..      ...   ...    ...        ...
199  520544  2018      8        NaN
200  477137  2018      9        NaN
201  503602  2018     10        NaN
202  518457  2018     11        NaN
203  556614  2018     12  


----------

historical_country_United_States_indicator_Redbook_Index.csv Retail and food services sales, total
     Month  Year  Indicator   Sales
120      1  2002        2.4  255598
121      2  2002        1.8  253086
122      3  2002        1.7  285261
123      4  2002        2.0  284747
124      5  2002        1.4  300402
..     ...   ...        ...     ...
350      3  2021        4.1  633078
351      4  2021        5.2  625527
352      5  2021        3.1  643099
353      6  2021        3.3  635437
354      7  2021        3.7  634486

[235 rows x 4 columns]
      Sales  Year  Month  Indicator
0    255598  2002      1        2.4
1    253086  2002      2        1.8
2    285261  2002      3        1.7
3    284747  2002      4        2.0
4    300402  2002      5        1.4
..      ...   ...    ...        ...
199  520544  2018      8        2.4
200  477137  2018      9        2.6
201  503602  2018     10        0.5
202  518457  2018     11        4.7
203  556614  2018     12        5.1


KeyboardInterrupt: 