In [1]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
import json
from src.metric_participants import ComputeMetrics
from pandasql import sqldf
from typing import Tuple, Dict, List

makeQuery = lambda q: sqldf(q, globals())

In [2]:
with open('./data/data_files/train_validation_split_10.json', 'r') as f: 
    cvSplit = json.load(f)

In [3]:
# load cleaned sales data
salesTrain = pd.read_csv('./data/data_files/sales_train_splitted.csv', index_col=0)
salesTrain.head()

Unnamed: 0,month,region,brand,sales
0,2020-01,region_0,brand_1,0.0
1,2020-01,region_0,brand_2,0.0
2,2020-01,region_0,brand_3,65007.49
3,2020-01,region_0,brand_12_market,509023.69
4,2020-01,region_0,brand_3_market,940469.05


In [4]:
# split by brand and transform time data to columns
brands = ['brand_1', 'brand_2']
months = set(salesTrain['month'].values.tolist())
regions = set(salesTrain['region'].values.tolist())

transformedData = {}
for brand in brands: 
    brandData = {}
    for month in months: 
        query = """
        SELECT region, sales
        FROM salesTrain
        WHERE brand = '{brand}'
        AND month = '{month}'
        """.format(brand=brand, month=month)
        df = makeQuery(query)
        df.set_index('region', drop=True, inplace=True)
        brandData[month] = df['sales'].to_dict()
    transformedData[brand] = pd.DataFrame.from_dict(brandData, orient='index').sort_index(ascending=True).T


In [5]:
transformedData['brand_1'].head()

Unnamed: 0,2020-01,2020-02,2020-03,2020-04,2020-05,2020-06,2020-07,2020-08,2020-09,2020-10,2020-11,2020-12,2021-01,2021-02,2021-03,2021-04,2021-05,2021-06,2021-07,2021-08
region_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,612.44,1653.59,1898.56,2939.71,4440.19,3888.99,2664.11,5603.83,2572.25,4807.65,3674.64,4225.84,2927.8
region_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,765.55,1286.12,1653.59,1408.61,2021.05,1500.48,1837.32,1377.99,2021.05,2572.25,2204.78,3009.96
region_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,91.87,367.46,183.73,704.31,826.79,275.6,734.93,1561.72,1561.72,1469.86,2071.56
region_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,275.6,643.06,2021.05,275.6,2786.6,1224.88,2021.05,1010.53,2572.25,3858.37,2939.71,2374.68
region_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,91.87,1500.48,459.33,1102.39,2939.71,1194.26,826.79,2572.25,2112.92,2112.92,1653.59,2847.85,2651.6


In [6]:
# fit a linear model on each region -> parameters are target values for prediction model
linearModels = {}
for brand in brands: 
    df = transformedData[brand].applymap(lambda x: np.log10(x) if x > 0 else 0)
    regionParameters = {}
    x = np.arange(6, df.shape[1]) # start at july
    for i, row in df.iterrows(): 
        k, d = np.polyfit(x, row.values[6:], deg=1)
        regionParameters[i] = {'k': k, 'd': d}
    linearModels[brand] = pd.DataFrame.from_dict(regionParameters, orient='index')

In [7]:
linearModels['brand_1']

Unnamed: 0,k,d
region_0,0.132059,1.557358
region_1,0.193254,0.354583
region_2,0.258045,-1.030203
region_3,0.213841,0.018298
region_4,0.147271,1.049335
...,...,...
region_146,0.166820,0.497426
region_147,0.228716,-0.401228
region_148,0.218698,0.608605
region_149,0.194706,0.244412


In [8]:
def predictSales(x, k, d): 
    return x*k + d

def reverseScaleSales(values): 
    return 10**values

In [9]:
def getSubmissionDataFromSales(predictions: Dict[str, pd.DataFrame], 
                               bounds: Dict[str, Tuple[pd.DataFrame, pd.DataFrame]], 
                              ) -> pd.DataFrame: 
    """
    Takse a dict for predictions and bounds. In both cases the keys are the brands. 
    The values for predictions are dataframes containing the predictions per region (index) and month (column). 
    The values for the bounds are tuples of two dataframes of lower and upper bounds with same index and 
    columns as the predictions df. 
    Transforms the data into a dataframe with columns, month, region, brand, sales, lower, upper, as required
    by the submission platform.
    """
    submissionData = []
    brands = list(predictions.keys())
    regions = predictions[brands[0]].index.values
    months = predictions[brands[0]].columns.values
    for brand in brands: 
        preds = predictions[brand]
        lower, upper = bounds[brand]
        for region in regions: 
            for month in months: 
                row = [month, region, brand, preds.at[region, month], lower.at[region, month], upper.at[region, month]]
                submissionData.append(row)
    submissionData = pd.DataFrame(submissionData, columns=['month', 'region', 'brand', 'sales', 'lower', 'upper'])
    submissionData.sort_values(by=['month', 'region', 'brand'], ascending=True, inplace=True)
    submissionData.reset_index(drop=True, inplace=True)
    return submissionData
    
    
def getCiFromCv(predictions: Dict[int, pd.DataFrame]) -> Tuple[pd.DataFrame, pd.DataFrame]: 
    """
    Calculate ci bounds, such that 80% of the yTrue points fall within the margin of yPred +- bounds. 
    CI is calculated for each data point
    """
    index = predictions[0].index.values
    stackedPredictions = np.stack([preds.values for preds in predictions.values()], axis=0)  # 3D -> fold, region, month
    # mean = stackedPredictions.mean(axis=0)  # average over folds
    lowerBound = np.percentile(stackedPredictions, 20, axis=0)
    upperBound = np.percentile(stackedPredictions, 80, axis=0)
    return (pd.DataFrame(lowerBound, index=index), pd.DataFrame(upperBound, index=index))

In [15]:
acc_metrics = {}
ci_metrics = {}

#modelType = LinearRegression
modelType = RandomForestRegressor
modelParams = {'min_samples_split': 32, 'min_samples_leaf': 16}
features = {brand: pd.read_csv('./data/data_files/region_features/training_{}.csv'.format(brand), index_col=0) for brand in brands}
monthsToPredict = np.arange(6, len(months))
monthColumns = transformedData['brand_1'].columns.values[6:]

models, scalers = {}, {}
for fold, splits in cvSplit.items():
    trainingSplit, testSplit = splits['training'], splits['test']
    trainingSplit = ['region_{}'.format(i) for i in trainingSplit]
    testSplit = ['region_{}'.format(i) for i in testSplit]
    print('CV fold:', fold)
    
    brandModels, brandScalers = {}, {}
    # salesForecasts = {}  # dict of brand and dataframe with index region and values sales forecasts, lower, and upper ci
    for brand in brands: 
        print(brand)
        
        # define models and parameters, yTrue and make cross-validation
        modelyTrue = linearModels[brand]
        currentFeatures = features[brand]
    
        xTrain, xTest = currentFeatures.loc[trainingSplit].values, currentFeatures.loc[testSplit].values
        yTrain, yTest = modelyTrue.loc[trainingSplit].values, modelyTrue.loc[testSplit].values 

        # scale x data
        scaler = StandardScaler()
        xTrainScaled = scaler.fit_transform(xTrain)
        xTestScaled = scaler.transform(xTest)

        # train model
        model = modelType(**modelParams)
        model.fit(xTrainScaled, yTrain)
        
        brandModels[brand] = model
        brandScalers[brand] = scaler
    
    models[fold] = brandModels
    scalers[fold] = brandScalers

        # predict sales with predicted parameters starting in july
        # parameters = model.predict(xTestScaled)
        # collectedSalesFiguresyPred = {}
        # collectedSalesFiguresyTrue = {}
        # for i in range(len(testSplit)): 
        #     region = testSplit[i]
        #     salesFigures = predictSales(monthsToPredict, *parameters[i, :])
        #     salesFiguresReverseScaled = reverseScaleSales(salesFigures)
            # collectedSalesFigures[region] = {
            #    'yPred': salesFiguresReverseScaled, 
            #    'yTrue': transformedData[brand].loc[region, :].values[6: ]  # only july onwards
            #}
        #     collectedSalesFiguresyPred[region] = salesFiguresReverseScaled
        #     collectedSalesFiguresyTrue[region] = transformedData[brand].loc[region, :].values[6: ]  # only july onwards
        
        # collectedSalesFiguresyPred = pd.DataFrame(collectedSalesFiguresyPred).T
        # collectedSalesFiguresyTrue = pd.DataFrame(collectedSalesFiguresyTrue).T
        
        # salesForecasts[brand] = collectedSalesFiguresyPred
        # ciBounds = getCiFromData(pd.DataFrame(collectedSalesFiguresyTrue).T, 
        #                         pd.DataFrame(collectedSalesFiguresyPred).T)
        
        #allPredictions = pd.concat([collectedSalesFigures.yPred, 
        #                            ciBounds[0], 
        #                            ciBounds[1]], axis=1)
        #allPredictions.reset_index(drop=False, inplace=True)  # get regions as value
        #allPredictions['month'] = transformedData[brand].columns.values[6:]
        
        #salesForecasts[brand] = allPredictions

    # yPred[fold] = salesForecasts
    # fill sales data
    #originalData = salesTrain[salesTrain['region'].isin(testSplit)]
    #submissionData = getSubmissionDataFromSales(originalData, 
    #                                           salesForecasts['brand_1'], 
    #                                           salesForecasts['brand_2'])

    
    #acc_metric, ci_metric = ComputeMetrics(submissionData, 
    #                                       originalData, 
    #                                       originalData[originalData.apply(lambda x: x in ['brand_1', 'brand_2'])])
    #acc_metrics[fold] = acc_metric
    #ci_metrics[fold] = ci_metric

CV fold: 0
brand_1
brand_2
CV fold: 1
brand_1
brand_2
CV fold: 2
brand_1
brand_2
CV fold: 3
brand_1
brand_2
CV fold: 4
brand_1
brand_2
CV fold: 5
brand_1
brand_2
CV fold: 6
brand_1
brand_2
CV fold: 7
brand_1
brand_2
CV fold: 8
brand_1
brand_2
CV fold: 9
brand_1
brand_2


In [16]:
# load test data
salesTest = pd.read_csv('./data/data_files/sales_test_splitted.csv', index_col=0)
testFeatures = {brand: pd.read_csv('./data/data_files/region_features/test_{}.csv'.format(brand), index_col=0) for brand in brands}
salesTest.shape, testFeatures['brand_1'].shape, testFeatures['brand_2'].shape

((3000, 4), (50, 25), (50, 18))

In [19]:
# predict entire test data with all models
predictions = {}
for brand in brands: 
    brandPredictions = {}
    for fold in range(len(cvSplit)): 
        model, scaler = models[str(fold)][brand], scalers[str(fold)][brand]
        currentFeatures = testFeatures[brand]
        xScaled = scaler.transform(currentFeatures.values)
        parameters = model.predict(xScaled)
        predictedSales = []
        for params in parameters: 
            salesForecasts = predictSales(monthsToPredict, *params)
            predictedSales.append(reverseScaleSales(salesForecasts))
        predictedSales = pd.DataFrame(predictedSales, index=currentFeatures.index)
        brandPredictions[fold] = predictedSales
    predictions[brand] = brandPredictions

In [20]:
ciBounds = {brand: getCiFromCv(predictions[brand]) for brand in brands}

In [21]:
# train model on entire trainings data and predict test data
predictions = {}
models, scalers = {}, {}
for brand in brands: 
    modelyTrue = linearModels[brand]
    currentFeatures = features[brand]
    
    scaler = StandardScaler()
    xTrainScaled = scaler.fit_transform(currentFeatures.values)
    xTestScaled = scaler.transform(testFeatures[brand].values)

    model = modelType(**modelParams)
    model.fit(xTrainScaled, modelyTrue)

    parameters = model.predict(xTestScaled)
    predictedSales = []
    for params in parameters: 
        salesForecasts = predictSales(monthsToPredict, *params)
        predictedSales.append(reverseScaleSales(salesForecasts))
    predictedSales = pd.DataFrame(predictedSales, index=testFeatures[brand].index)
    
    models[brand] = model
    scalers[brand] = scaler
    predictions[brand] = predictedSales

In [22]:
for brand, df in predictions.items(): 
    df.rename(columns={c: monthColumns[c] for c in df.columns.values}, inplace=True)
    
for brand, bounds in ciBounds.items(): 
    lower, upper = bounds
    lower.rename(columns={c: monthColumns[c] for c in lower.columns.values}, inplace=True)
    upper.rename(columns={c: monthColumns[c] for c in upper.columns.values}, inplace=True)

In [23]:
submissionData = getSubmissionDataFromSales(predictions, ciBounds)

In [24]:
submissionData.shape

(1400, 6)

In [25]:
submissionData.head(n=25)

Unnamed: 0,month,region,brand,sales,lower,upper
0,2020-07,region_151,brand_1,17.398143,13.422892,18.150842
1,2020-07,region_151,brand_2,47.299334,43.944888,57.306223
2,2020-07,region_152,brand_1,33.9196,26.121056,36.696694
3,2020-07,region_152,brand_2,37.82352,33.447604,41.530091
4,2020-07,region_153,brand_1,33.53053,34.146551,43.982492
5,2020-07,region_153,brand_2,167.342512,158.889838,182.897974
6,2020-07,region_154,brand_1,133.089737,128.425489,142.356486
7,2020-07,region_154,brand_2,448.65919,396.56533,438.48066
8,2020-07,region_155,brand_1,15.798213,10.142301,13.911361
9,2020-07,region_155,brand_2,30.850963,26.598303,35.38274


In [26]:
submissionData.to_csv('./data/data_files/submissions/log_regression_rf_split_32_leaf_16_rte_activity_features.csv', 
                     index=False)