# Homework 3 (main)

#### Kaggle team: HEHEDA
#### Group: A0148008J, A0141132B


## Importing, Constants, and data loading functions

In [133]:
# import statement and extra libraries used
%config IPCompleter.greedy=True
import numpy as np
import pandas as pd
import os
import csv
import math
import matplotlib.pyplot as plt
import sklearn
import statistics as st
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from datetime import datetime, date
from matplotlib import cm
from datetime import timedelta
from sklearn import linear_model
from sklearn import ensemble

#### Constants used

In [64]:
# Constant used in this homework
DATE_TIME_FORMAT_DEV = "%d/%m/%Y"
DATE_TIME_FORMAT_REAL = "%Y-%m-%d"
DATE_TIME_FORMAT_WEEK = "%Y-W%W-%w"
MIN_BOOLEAN_INDEX_TRAIN = 5
MAX_BOOLEAN_INDEX_TRAIN = 8
RAW_FEATURE_NUMBER_TRAIN = 9
MIN_BOOLEAN_INDEX_TEST = 4
MAX_BOOLEAN_INDEX_TEST = 7
RAW_FEATURE_NUMBER_TEST = 8
STORE_COMPETITION_SINCE_DEFAULT_TIME = date(2009, 3, 9)
STORE_NO_PROMOTION_SINCE_CONSTANT_TIME = date(2999, 1, 1) # we assume this datetime is big enough
STORE_NO_COMPETITION_SINCE_CONSTANT_TIME = date(2999, 1, 1) # we assume this datetime is big enough
STORE_NO_PROMO_INTERVAL_STRING = "No Promotion"


#### File path conversion

In [65]:
# File path processing
directory_path = current_pwd = os.getcwd()
directory_path = os.path.join(directory_path, "inpublic/homework3")
train_file_path = os.path.join(directory_path, "train_v2.csv")
test_file_path = os.path.join(directory_path, "test_v2.csv")
store_info_path = os.path.join(directory_path, "store.csv")
cheat_path = os.path.join(directory_path, "Cheat.csv")
cheat_total_path = os.path.join(directory_path, "CheatTotal.csv")

#### Write bcak to csv function

In [66]:
def writeToFile(numpyArray, filePath):
    with open(filePath, 'w') as csvFile:
        prediction_writer = csv.writer(csvFile, delimiter=',',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        prediction_writer.writerow(["\"Id\"", "\"Sales\""])
        for i in range(numpyArray.shape[0]):
            prediction_writer.writerow([i+1, numpyArray[i]])

#### Data loading methods

In [67]:
# data loading and extraction function, include sales = zeros for restoring entries later
def dataLoadExtract(filePath, logBoolean):
    rawDataMatrix = []
    firstRow = True
    header = []
    
    with open(filePath, newline='') as csvFile:
        train_raw = csv.reader(csvFile, delimiter=',')
        for row in train_raw:
            if (firstRow):
                firstRow = False
                header = row                
            else:
                currentRow = []
                for i in range(len(row)):
                    
                    ## if doing log transform on sales and customers
                    if (header[i] in ['Customers', 'Sales']) and (logBoolean is True):
                        if (int(row[i]) == 0):
                            currentRow.append(int(row[i]))
                        else:
                            currentRow.append(math.log(int(row[i]))) 

                    # Convert 0, 1 to boolean
                    elif (header[i] in ['Open', 'Promo', 'SchoolHoliday']):
                        if (row[i] == '0'):
                            currentRow.append(False)
                        else:
                            currentRow.append(True)
                    # Convert Date to Date object in python
                    elif (header[i] == 'Date'):
                        currentRow.append(
                            datetime.strptime(row[i], DATE_TIME_FORMAT_REAL).date())
                    
                    elif (header[i] == 'StateHoliday'):
                        currentRow.append(row[i])  
                        
                    else:
                        currentRow.append(int(row[i]))
                rawDataMatrix.append(currentRow)
    
    dataRaw = np.array(rawDataMatrix[0:]) # a numpy array with raw data
    return header, dataRaw

def storeLoadExtract(filePath):
    rawDataMatrix = []
    firstRow = True
    
    with open(filePath, newline='') as csvFile:
        train_raw = csv.reader(csvFile, delimiter=',')
        for row in train_raw:
            if (firstRow):
                header = ["Store Index", "Store Type", "Assortment", "Competition distance reciprocal", "Competition Since",
                          "Promotion Since", "Promotion Interval"]
                rawDataMatrix.append(header)
                firstRow = False
            else:
                currentRow = []
                currentRow.append(int(row[0])) # store index
                currentRow.append(row[1]) # store type
                currentRow.append(row[2]) # assortment
                
                if (row[3] == ""): # competition distance reciprocal
                    currentRow.append(0)
                    currentRow.append(STORE_NO_COMPETITION_SINCE_CONSTANT_TIME)
                else:
                    currentRow.append(1.0/int(row[3])) 
                
                    if (row[4] != ""): 
                        date_str = "1/"+row[4]+"/"+row[5]
                        date_object = datetime.strptime(date_str, DATE_TIME_FORMAT_DEV).date()
                        currentRow.append(date_object) # competition since time
                    else:
                        currentRow.append(STORE_COMPETITION_SINCE_DEFAULT_TIME)
                
                if (row[6] == "0"): # promotion specs
                    currentRow.append(STORE_NO_PROMOTION_SINCE_CONSTANT_TIME)
                    currentRow.append(STORE_NO_PROMO_INTERVAL_STRING)
                else:
                    date_str = row[8]+"-W"+row[7]+"-0"
                    date_object = datetime.strptime(date_str, DATE_TIME_FORMAT_WEEK).date()
                    currentRow.append(date_object)
                    currentRow.append(row[9])

                rawDataMatrix.append(currentRow)
    
    headerRaw = rawDataMatrix[0] # a list containing all the headers as string
    dataRaw = np.array(rawDataMatrix[1:]) # a numpy array with raw data
    return headerRaw, dataRaw

In [68]:
# Data loading and extraction
headerRawTrainZero, dataRawTrainZero = dataLoadExtract(train_file_path, False)
headerRawTestZero, dataRawTestZero = dataLoadExtract(test_file_path, False)
headerRawTrainZeroLog, dataRawTrainZeroLog = dataLoadExtract(train_file_path, True)
headerRawTestZeroLog, dataRawTestZeroLog = dataLoadExtract(test_file_path, True)
headerRawStore, dataRawStore = storeLoadExtract(store_info_path)

#### Data Loading and Zero (not open) Record Removing

In [69]:
def removeZeros(header, data):
    rawList = data.tolist()
    resultant_list = []
    i = header.index('Open')
    for row in rawList:
        if (row[i] is True):
            resultant_list.append(row)
    return header, np.array(resultant_list)

In [70]:
# remove zero (not open sales record) from the data raw read from the file
headerRawTrain, dataRawTrain = removeZeros(headerRawTrainZero, dataRawTrainZero)
headerRawTest, dataRawTest = removeZeros(headerRawTestZero, dataRawTestZero)
headerRawTrainLog, dataRawTrainLog = removeZeros(headerRawTrainZeroLog, dataRawTrainZeroLog)
headerRawTestLog, dataRawTestLog = removeZeros(headerRawTestZeroLog, dataRawTestZeroLog)

--------

## Data Manipulation and features conversion

These are the steps in this section:
1. We gather store information for each store in the store list using the training data
2. We split the training data into features and labels
3. We combine the store information and the training features to get the full information

In [71]:
def storeInfoConverter (headerStore, dataStore, dataTrain):
    newHeaderStore = headerStore.copy() # header processing
    newHeaderStore += ["Average Sales Without Promotion",
                       "Average Sales With Promotion",
                       "Average Sales",
                       "Variance Sales Without Promotion",
                       "Variance Sales With Promotion",
                       "Variance Sales",
                       "Average SC Ratio Without Promotion",
                       "Average SC Ratio With Promotion",
                       "Average SC Ratio",
                       "Variance SC Ratio Without Promotion",
                       "Variance SC Ratio With Promotion",
                       "Variance SC Ratio",
                       "Median Sales Without Promotion",
                       "Median Sales With Promotion",
                       "Median Sales",
                       "Average Open Ratio"]
    
    resultant_list = []
    for row in dataStore:
        currentRow = list(row.copy())
        
        # TO DO : maybe we should process the original raw categorical data here
        
        currentStore = int(row[0])
        sales_list_without_promotion = []
        sales_list_with_promotion = []
        sc_ratio_list_without_promotion = []
        sc_ratio_list_with_promotion = []
        openDayCount = 0
        entryCount = 0
        
        for salesRow in dataTrain:
            if (int(salesRow[0]) == currentStore): # It's the store we want to analyze in this round
                
                # sales centric
                if (salesRow[3] > 0):
                    sc_ratio = salesRow[3]/salesRow[4]
                    if (salesRow[6] is True):
                        sales_list_with_promotion.append(salesRow[3])
                        sc_ratio_list_with_promotion.append(sc_ratio)
                    else:
                        sales_list_without_promotion.append(salesRow[3])
                        sc_ratio_list_without_promotion.append(sc_ratio)
                
                # open centric
                if (salesRow[5] is True):
                    openDayCount += 1
            entryCount += 1
        
        # data processing and adding
        currentRow.append(st.mean(sales_list_without_promotion))
        currentRow.append(st.mean(sales_list_with_promotion))
        currentRow.append(st.mean(sales_list_with_promotion+sales_list_without_promotion))
        currentRow.append(st.variance(sales_list_without_promotion))
        currentRow.append(st.variance(sales_list_with_promotion))
        currentRow.append(st.variance(sales_list_with_promotion+sales_list_without_promotion))
        currentRow.append(st.mean(sc_ratio_list_without_promotion))
        currentRow.append(st.mean(sc_ratio_list_with_promotion))
        currentRow.append(st.mean(sc_ratio_list_with_promotion+sc_ratio_list_without_promotion))
        currentRow.append(st.variance(sc_ratio_list_without_promotion))
        currentRow.append(st.variance(sc_ratio_list_with_promotion))
        currentRow.append(st.variance(sc_ratio_list_with_promotion+sc_ratio_list_without_promotion))
        currentRow.append(st.median(sales_list_without_promotion))
        currentRow.append(st.median(sales_list_with_promotion))
        currentRow.append(st.median(sales_list_with_promotion+sales_list_without_promotion))
        currentRow.append(float(openDayCount/entryCount))
        
        # Recording of data
        resultant_list.append(currentRow)
    return newHeaderStore, np.array(resultant_list)

#### Store information getter

In [72]:
# header information processing
headerStore, dataStore = storeInfoConverter(headerRawStore, dataRawStore, dataRawTrain)
# logged header information processing
headerStoreLog, dataStoreLog = storeInfoConverter(headerRawStore, dataRawStore, dataRawTrainLog)

#### Split the label and training features

In [73]:
def trainDataLabelSplit(dataTrain, headerTrain):
    # This function makes the training data exactly in the same format of testing data read by our functions
    dataTrainNew = np.hstack((dataTrain[:,:3], dataTrain[:,4:])) # remove the sales column
    return dataTrainNew, headerTrain[:3]+headerTrain[4:], dataTrain[:,3] # return data, header and label

# train data label split
dataTrain, headerTrain, labelTrain = trainDataLabelSplit(dataRawTrain, headerRawTrain)
dataTrainLog, headerTrainLog, labelTrainLog = trainDataLabelSplit(dataRawTrainLog, headerRawTrainLog)

#### Store info joining functions

In [74]:
def storeInfoPromotionIntervalConverter(string):
    # this method is used to help us determine if the current month is inside the promotion month
    if string == 'Jan,Apr,Jul,Oct':
        return [1,4,7,10]
    elif string == 'Feb,May,Aug,Nov':
        return [2,5,8,11]
    elif string == 'Mar,Jun,Sep,Dec':
        return [3,6,9,12]
    else:
        return []

In [75]:
def storeNaturalJoin(dataStore, data, headerStore, header):
    # In this function, we modify it so that it will join up the promotion `True` store with the info with promotion
    # And the non promotion store with the info with non promotion (info being the average, variance and etc)
    
    # Header processing
    headerProcessed = ["Day",
                       "Month",
                       "Year",
                       "Month In Promotion"]
    newHeader =  [header[1]] + header[3:] + headerProcessed + headerStore[1:4] +\
                 ["Competition Since Day Count", "Promotion Since Day Count", headerStore[6]] # no repetitive store index in header    
    newHeader += [headerStore[9], headerStore[12], headerStore[15], 
                      headerStore[18], headerStore[21], headerStore[22]]
    
    
    resultant_list = []
    
    for row in data:
        currentIndex = row[0]
        currentStoreInfo = dataStore[currentIndex-1,:] # get corresponding store entry with store index to be removed later
        currentDate = row[2] # we will get the current datetime object
        currentDay = currentDate.day # day value (integer)
        currentMonth = currentDate.month # month value (integer)
        currentYear = currentDate.year # year value (integer)
        monthInPromotion = currentMonth in storeInfoPromotionIntervalConverter(currentStoreInfo[6]) # boolean
        
        listRow = row.copy().tolist()
        listRow = [listRow[1],] + listRow[3:]
        currentRow = listRow + [currentDay, currentMonth, currentYear, monthInPromotion] # new entries 
        
        competitionSinceDate = currentStoreInfo[4] # competition since date
        promotionSinceDate = currentStoreInfo[5] # promotion since date
        competitionPastDayCount = (currentDate - competitionSinceDate).days
        competitionPastDayCount = 0 if competitionPastDayCount < 0 else competitionPastDayCount
        promotionPastDayCount = (currentDate - promotionSinceDate).days
        promotionPastDayCount = 0 if promotionPastDayCount < 0 else promotionPastDayCount
        
        #currentRow += list(currentStoreInfo[1:])
        # concatenate the relevant info (distinguishing promotion and non-promotion)
        promotionBoolean = row[5]
        constantInfo = [currentStoreInfo[1], currentStoreInfo[2], currentStoreInfo[3],
                            competitionPastDayCount, promotionPastDayCount, currentStoreInfo[6]]
        noPromotionList = [currentStoreInfo[7], currentStoreInfo[10], currentStoreInfo[13],
                           currentStoreInfo[16], currentStoreInfo[19], currentStoreInfo[22]]
        promotionList = [currentStoreInfo[8], currentStoreInfo[11], currentStoreInfo[14],
                         currentStoreInfo[17], currentStoreInfo[20], currentStoreInfo[22]]
        #totalList = [currentStoreInfo[9], currentStoreInfo[12], currentStoreInfo[15],
        #             currentStoreInfo[18], currentStoreInfo[21], currentStoreInfo[22]]
        
        # Check and append differently
        if promotionBoolean:
            currentRow += constantInfo + promotionList
        else:
            currentRow += constantInfo + noPromotionList
        
        #currentRow += constantInfo + totalList
        
        # Problem here: we have to append one object to make the numpy array conversion correct
        currentRow.append(object()) # random python object
        
        resultant_list.append(currentRow)
        
    # manually remove this object to keep the correctness of the data
    resultNumpyArray = np.array(resultant_list)
    resultNumpyArray = resultNumpyArray[:,0: resultNumpyArray.shape[1]-1]
    
    return newHeader, resultNumpyArray


#### Combine with store information to get full information

In [76]:
# clean the logged data and make it ready to use
headerTrainCleanLog, dataTrainCleanLog = storeNaturalJoin(dataStoreLog, dataTrainLog, headerStoreLog, headerTrainLog)
headerTrainClean, dataTrainClean = storeNaturalJoin(dataStore, dataTrain, headerStore, headerTrain)
headerTestClean, dataTestClean = storeNaturalJoin(dataStore, dataRawTest, headerStore, headerRawTest)
headerTestCleanLog, dataTestCleanLog = storeNaturalJoin(dataStoreLog, dataRawTestLog, headerStoreLog, headerRawTestLog)

#### One hot key encoding of categorical variables

In [77]:
def singleFeatureOneHotKeyEncoder(feature_column_vector_train, feature_column_vector_test, *dayOfWeek):
    # Problem here is that, the testing and training data may not coincide
    # We should write separate functions to deal with data (day, month, year)
    
    # numerical encoding
    enc = LabelEncoder()
    featureListTrain = (feature_column_vector_train).tolist()
    featureListTest = (feature_column_vector_test).tolist()
    
    if (len(dayOfWeek) > 0):
        dayOfWeek = dayOfWeek[0]
        featureListTrain = [dayOfWeek if x == dayOfWeek else 0 for x in featureListTrain]
        featureListTest = [dayOfWeek if x == dayOfWeek else 0 for x in featureListTest]
    # we should fit the one has larger value set

    if (len(set(featureListTrain)) > len(set(featureListTest))):
        
        
        enc.fit(featureListTrain) 
    else:
        enc.fit(featureListTest)
    
    labelEncodedFeatureTrain = enc.transform(featureListTrain).reshape(-1, 1)
    labelEncodedFeatureTest = enc.transform(featureListTest).reshape(-1, 1)
    
    # oneHot encoding
    enc = OneHotEncoder()
    if (len(set(featureListTrain)) > len(set(featureListTest))):
        enc.fit(labelEncodedFeatureTrain) # use train to fit the data
    else:
        enc.fit(labelEncodedFeatureTest)
    
    # return order: train, test
    return enc.transform(labelEncodedFeatureTrain).toarray(), enc.transform(labelEncodedFeatureTest).toarray()
    

In [78]:
def dayMonthYearFeatureOneHotKeyEncoder(feature_column_vector_train, feature_column_vector_test, featureName):
    if (featureName == 'Day'):
        featureList = list(range(1, 32))
    elif (featureName == 'Month'):
        featureList = list(range(1, 13))
    else:
        featureList = list(range(2013, 2017))
        
    enc = LabelEncoder()
    enc.fit(featureList)
    
    featureListTrain = (feature_column_vector_train).tolist()
    featureListTest = (feature_column_vector_test).tolist()
    
    labelEncodedFeatureTrain = enc.transform(featureListTrain).reshape(-1, 1)
    labelEncodedFeatureTest = enc.transform(featureListTest).reshape(-1, 1)
    
    enc = OneHotEncoder()
    enc.fit(labelEncodedFeatureTrain)
    
    return enc.transform(labelEncodedFeatureTrain).toarray(), enc.transform(labelEncodedFeatureTest).toarray()
    
    

In [79]:
def numericalTrainTransformation(headerTrain, dataTrain, headerTest, dataTest):
    # This function prepares from the clean data to all numerical numpy array ready to be feed into DMatrix
    # We will finally safely remove the store index column
    
    headerNew = [ "DayOfWeek1", "DayOfWeek2", "DayOfWeek3", "DayOfWeek4", "DayOfWeek5", "DayOfWeek6", 
                  "DayOfWeek7",
                  "Number of Customers", 
                  "Open -dummy -True by default",
                  "PromoBoolean","PromoBoolean2",
                  "StateHoliday1", "StateHoliday2" ,"StateHoliday3",
                  "SchoolHolidayBoolean", "SchoolHolidayBoolean",
                  "Month In Promotion 1", "Month In Promotion 2",
                  "Store type 1", "store type 2", "store type 3", "store type 4",
                  "assortmentType1", "assortmentType2", "assortmentType3",
                  "Competition distance reciprocal", "competitionSinceDayCount",
                  "promotionSinceDayCount", "promotionInterval1", "promotionInterval2",
                  "promotionInterval3", "promotionInterval4",
                  'Average Sales',
                  'Variance Sales',
                  'Average SC Ratio',
                  'Variance SC Ratio',
                  'Median Sales',
                 'Average Open Ratio'
                ]
        
    dayOfWeekColumnTrain, dayOfWeekColumnTest = singleFeatureOneHotKeyEncoder(dataTrain[:,0], dataTest[:,0])
    customersNumberColumnTrain, customersNumberColumnTest = dataTrain[:,1], dataTest[:,1]
    
    openBooleanTrain, openBooleanTest = singleFeatureOneHotKeyEncoder(dataTrain[:,2], dataTest[:,2]) 
    promoBooleanTrain, promoBooleanTest = singleFeatureOneHotKeyEncoder(dataTrain[:,3], dataTest[:,3]) 
    stateHolidayBooleanTrain, stateHolidayBooleanTest = singleFeatureOneHotKeyEncoder(dataTrain[:,4], dataTest[:,4])
    schoolHolidayBooleanTrain, schoolHolidayBooleanTest = singleFeatureOneHotKeyEncoder(dataTrain[:,5], dataTest[:,5])
    
    monthInPromotionBooleanTrain, monthInPromotionBooleanTest = singleFeatureOneHotKeyEncoder(dataTrain[:,9], dataTest[:,9])
    storeTypeColumnTrain, storeTypeColumnTest = singleFeatureOneHotKeyEncoder(dataTrain[:,10], dataTest[:,10])
    assortmentTypeColumnTrain, assortmentTypeColumnTest = singleFeatureOneHotKeyEncoder(dataTrain[:,11], dataTest[:,11])
    
    competitionDistanceReciprocalTrain, competitionDistanceReciprocalTest = dataTrain[:,12], dataTest[:,12]
    competitionSinceDayCountTrain, competitionSinceDayCountTest = dataTrain[:,13], dataTest[:,13]
    promotionSinceDayCountTrain, promotionSinceDayCountTest = dataTrain[:,14], dataTest[:, 14]
    
    promotionIntervalColumnTrain, promotionIntervalColumnTest = singleFeatureOneHotKeyEncoder(dataTrain[:,15], dataTest[:,15])
    
    calculatedStatisticsColumnTrain, calculatedStatisticsColumnTest = dataTrain[:,16:], dataTest[:,16:]
    
    # use numpy.column_stack to accomplish column and matrix side by side stacking
    resultantArrayTrain = np.column_stack((dayOfWeekColumnTrain,                                      
                                      customersNumberColumnTrain,
                                      openBooleanTrain,
                                      promoBooleanTrain,
                                      stateHolidayBooleanTrain,
                                      schoolHolidayBooleanTrain,
                                      monthInPromotionBooleanTrain,
                                      storeTypeColumnTrain,
                                      assortmentTypeColumnTrain,
                                      competitionDistanceReciprocalTrain,
                                      competitionSinceDayCountTrain,
                                      promotionSinceDayCountTrain,
                                      promotionIntervalColumnTrain,
                                      calculatedStatisticsColumnTrain))
    
    resultantArrayTest = np.column_stack((dayOfWeekColumnTest,                                      
                                      customersNumberColumnTest,
                                      openBooleanTest,
                                      promoBooleanTest,
                                      stateHolidayBooleanTest,
                                      schoolHolidayBooleanTest,
                                      monthInPromotionBooleanTest,
                                      storeTypeColumnTest,
                                      assortmentTypeColumnTest,
                                      competitionDistanceReciprocalTest,
                                      competitionSinceDayCountTest,
                                      promotionSinceDayCountTest,
                                      promotionIntervalColumnTest,
                                      calculatedStatisticsColumnTest))
    
    return resultantArrayTrain, resultantArrayTest, headerNew # order: train, test, headerNew

#### Convert the full infomation (contains categorical information) we have into numerical numpy array that have each entry radily converted to floating numbers using one hot key encoding

In [80]:
dataTrainNumerical, dataTestNumerical, headerWhole = \
    numericalTrainTransformation(headerTrainClean, dataTrainClean, 
                                 headerTestClean, dataTestClean)
dataTrainNumericalLog, dataTestNumericalLog, headerWholeLog= \
    numericalTrainTransformation(headerTrainCleanLog, dataTrainCleanLog, 
                                 headerTestCleanLog, dataTestCleanLog)

#### Helper functions for restore real prediction for test data

In [81]:
def restoreExponential(predictionWithZero):
    # restore the logged prediction values
    oriList = predictionWithZero.tolist()
    resultant_list = []
    for i in oriList:
        resultant_list.append(math.exp(i))
    return np.array(resultant_list)

def restoreSalesFromRatio(data, prediction, customerIndex):
    resultant_list = []
    N = len(prediction.tolist())
    for i in range(N):
        resultant_list.append(prediction[i] * data[i][customerIndex])
    return np.array(resultant_list)

def restoreZeroEntryInPrediction(prediction, dataTestCleanWithZero, openBooleanIndex):
    # restore the zero entries in the prediction
    resultantList= []
    predictionIndex = 0
    for i in range(dataTestCleanWithZero.shape[0]):
        if (dataTestCleanWithZero[i][openBooleanIndex] is True): # the store is open, put our prediction inside
            resultantList.append(prediction[predictionIndex])
            predictionIndex += 1 # update prediction index to the next prediction point
        else:
            resultantList.append(0) # the store is closed, append zero and do not update the index
    return np.array(resultantList).reshape(len(resultantList),)

-----------------------

## XGBoost Model - Initial Attempt

In [82]:
def XGBoostTrain(dataTrain, dataTest, labelTrain, params, ratioBoolean, logBoolean, customerIndex, 
                 dataTestRawZero, openBooleanIndex, adjustment):
    dTrain = xgb.DMatrix(dataTrain, label = labelTrain)
    dTest = xgb.DMatrix(dataTest)
    bst = xgb.train(params=params, dtrain=dTrain)
    prediction = bst.predict(dTest) 
    
    if (ratioBoolean):
        prediction = restoreSalesFromRatio(dataTest, prediction, customerIndex)
    
    if (logBoolean):
        prediction = restoreExponential(prediction)
        
    prediction = restoreZeroEntryInPrediction(prediction, dataTestRawZero, openBooleanIndex)
    
    return bst, prediction * adjustment # order: booster, prediction numpy array

#### XGBoost self-defined objective functions and helper functions

In [83]:
def evalError(prediction, label, data, ratio, customerIndex, log):    
    N = label.shape[0]
    labelShould = label
    error = 0
    if ratio is True:
        prediction = restoreSalesFromRatio(data, prediction, customerIndex)
        labelShould = restoreSalesFromRatio(data, label, customerIndex)
        
    if log is True:
        prediction = restoreExponential(prediction)
        labelShould = restoreExponential(label)
        
    for i in range (N):
        t = labelShould[i]
        p = prediction[i] 
        error += ((p-t)/t)**2
    return float((error/N)**(1/2))

#### Cross validation time functions for XGBoost

In [84]:
def xgbcv_helper(trainingSet, trainingLabel, validationSet, validationLabel, testingData, 
                 para, ratioLabelBoolean, customerIndex, logBoolean):
    dTrain = xgb.DMatrix(trainingSet, label=trainingLabel)
    dTest = xgb.DMatrix(testingData)
    dValidation = xgb.DMatrix(validationSet)
    dTrainWithoutLabel = xgb.DMatrix(trainingSet)
    bst = xgb.train(params=para, dtrain=dTrain)
    
    prediction = bst.predict(dTest)

    validationPrediction = bst.predict(dValidation)
    trainPrediction = bst.predict(dTrainWithoutLabel)
    
    error = evalError(validationPrediction, validationLabel, validationSet, ratioLabelBoolean, customerIndex, logBoolean) 
    trainError = evalError(trainPrediction, trainingLabel, trainingSet, ratioLabelBoolean, customerIndex, logBoolean)
    
    #print("validation error" + str(error))
    #print("training error" + str(trainError))
    return prediction, bst, error, trainError

In [85]:
def crossValidationTimeSeries(dataTrainNumerical, label, kFold, dataTestNumerical, para, ratioLabelBoolean, customerIndex, logBoolean):
    N = dataTrainNumerical.shape[0]
    resultant_validation_error_list = []
    resultant_training_error_list = []
    k = int(math.floor(N/(kFold+1)))
    # print(k)
    
    for multiplier in range(kFold, 0, -1): # the training data we have is in the reverse of time
        
        trainingSet = dataTrainNumerical[k*multiplier:,:]
        validationSet = dataTrainNumerical[k*(multiplier-1):(k*multiplier), :]
        trainingLabel = label[k*multiplier:]
        validationLabel = label[k*(multiplier-1):(k*multiplier)]
        
        prediction , bst, validationError, trainingError= \
            xgbcv_helper(trainingSet, trainingLabel, validationSet, 
                         validationLabel, dataTestNumerical, para, ratioLabelBoolean, customerIndex, logBoolean)
        
        resultant_validation_error_list.append(validationError)
        resultant_training_error_list.append(trainingError)
    
        print(validationError)
                
        #print(prediction)
    print(float(sum(resultant_validation_error_list)/len(resultant_validation_error_list))) #average validation error
    return resultant_validation_error_list, resultant_training_error_list

In [86]:
# Full Model
params = {      'eta'             : 0.4,
                'nround'          : 5000,
                'colsample_bytree': 0.9}


validationError, trainingError = \
    crossValidationTimeSeries(dataTrainNumerical, labelTrain, 5, dataTestNumerical, params, False, 7, False)

BST, prediction = XGBoostTrain(dataTrainNumerical, dataTestNumerical, labelTrain, params, False, False, 7,
                                     dataRawTestZero, 4, 1)

print(prediction)

0.10855846798451357


0.15771944832984305


0.16308717778126036


0.1251613819228103


0.08876697957766948
0.12865869111921932


[  4784.61474609   6162.81689453   8055.07080078 ...,   7417.85302734
  27684.65039062   6875.97412109]


In [87]:
# LogModel
params = { 
                'eta'             : 0.4,
                'nround'          : 3000,
                'colsample_bytree': 0.9}
    
validationErrorLog, trainingErrorLog = \
    crossValidationTimeSeries(dataTrainNumericalLog, labelTrainLog, 5, dataTestNumericalLog, params, False, 7, True)

BSTLog, predictionLog = XGBoostTrain(dataTrainNumericalLog, dataTestNumericalLog, labelTrainLog, params, False, True, 7,
                                     dataRawTestZero, 4, 1)
print(predictionLog)

0.13160641786265506


0.15618770090693332


0.08652256477668971


0.08181715049681244


0.07297608981729073
0.10582198477207624


[  4230.90454317   5365.7039596    7764.18167981 ...,   7495.17108217
  24243.16607153   6937.60091122]


## Improvement1 -- Process sales/customers ratio label

In [88]:
def salesCustomerRatioLabelConverter(dataTrain, labelTrain):
    resultant_list = []
    listing = labelTrain.tolist()
    for i in range(len(listing)):
        resultant_list.append( listing[i]/dataTrain[i][3] ) # sales / customers number
    return np.array(resultant_list)

In [89]:
labelTrainRatio = salesCustomerRatioLabelConverter(dataTrain, labelTrain)

In [91]:
# Ratio Full Model
params = {      'eta'             : 0.4,
                'nround'          : 5000,
                'colsample_bytree': 0.9}


validationError, trainingError = \
    crossValidationTimeSeries(dataTrainNumerical, labelTrainRatio, 5, dataTestNumerical, params, True, 7, False)

BSTRatio, predictionRatio = XGBoostTrain(dataTrainNumerical, dataTestNumerical, labelTrainRatio, params, True, False, 7,
                                     dataRawTestZero, 4, 1)

print(predictionRatio)

0.07204626009894179


0.07713680322858207


0.08107674117795739


0.09116375863022896


0.06277184694499297
0.07683908201614062


[  4904.64185715   5806.85913563   8158.87932682 ...,   7919.91522312
  26627.46015549   7072.7364006 ]


## Improvement2 -- Feature Selection

In [139]:
def get_importance(model, threshold): 
    impt = model.get_fscore()
    index = []
    for (key,value) in impt.items():
        if impt[key] >= threshold: index.append(int(str(key)[1:]))
    return index

def select_features(index, dataTrain, dataTest):
    partialTrain = np.column_stack((dataTrain[:,index[0]], dataTrain[:,index[1]]))
    partialTest = np.column_stack((dataTest[:,index[0]], dataTest[:,index[1]]))
    for j in range(2,len(index)):
        i = index[j]
        partialTrain = np.column_stack((partialTrain, dataTrain[:,i]))
        partialTest = np.column_stack((partialTest, dataTest[:,i]))
    return partialTrain, partialTest

def getFeatureNames(headerWhole, indexList):
    resultant_list = []
    for i in indexList:
        resultant_list.append(headerWhole[i])
    return resultant_list

In [140]:
# Partial Full Model 1 (impt weight) 
index = get_importance(BST, 5)
print(getFeatureNames(headerWhole, index))
data1, test1 = select_features(index, dataTrainNumerical, dataTestNumerical)
customerIndex = index.index(7)
params1 = {     'eta'             : 0.3,
                'nround'          : 3000,
                'colsample_bytree': 0.8}
validationError, trainingError = \
              crossValidationTimeSeries(data1, labelTrain, 5, test1, params1, False, customerIndex, False)
          
BST1, prediction1 = XGBoostTrain(data1, test1, labelTrain, params1, False, False, customerIndex,
                                     dataRawTestZero, 4, 1)

print(prediction1)

['Average Sales', 'DayOfWeek6', 'DayOfWeek3', 'SchoolHolidayBoolean', 'DayOfWeek1', 'DayOfWeek2', 'Median Sales', 'Variance Sales', 'PromoBoolean', 'competitionSinceDayCount', 'Number of Customers', 'Average SC Ratio', 'promotionSinceDayCount', 'Variance SC Ratio', 'store type 4', 'Competition distance reciprocal', 'DayOfWeek7']


0.10919332586448115


0.17408069056165895


0.09480235859400928


0.0920912319723816


0.0687301272716593
0.10777954685283805


[  4653.01611328   6029.45654297   8022.06884766 ...,   8037.51611328
  25276.25976562   7168.25537109]


In [142]:
# Partial Log Model 1 (impt weight) 
index = get_importance(BSTLog, 5)
print(getFeatureNames(headerWhole, index))
dataLog1, testLog1 = select_features(index, dataTrainNumericalLog, dataTestNumericalLog)
customerIndex = index.index(7)
params1 = {     'eta'             : 0.4,
                'nround'          : 3000,
                'colsample_bytree': 0.9}
validationError, trainingError = \
              crossValidationTimeSeries(dataLog1, labelTrainLog, 5, testLog1, params1, False, customerIndex, True)
          
BSTLog1, predictionLog1 = XGBoostTrain(dataLog1, testLog1, labelTrainLog, params1, False, True, customerIndex,
                                     dataRawTestZero, 4, 1)

print(predictionLog1)

['Average Sales', 'Number of Customers', 'Average SC Ratio', 'assortmentType2', 'store type 4', 'DayOfWeek1', 'Average Open Ratio', 'SchoolHolidayBoolean', 'Variance SC Ratio', 'DayOfWeek6', 'Variance Sales', 'competitionSinceDayCount']


0.12313394032851228
0.15322024877058923


0.07814800511560134


0.07791241413590734


0.07779984057187943
0.10204288978449791


[  4290.07480247   5749.9156275    7803.39092388 ...,   7558.62608151
  24335.77661076   6791.68845051]


In [143]:
# Partial Ratio Model 1 (impt weight) 
index = get_importance(BSTRatio, 15)
print(getFeatureNames(headerWhole, index))
customerIndex = index.index(7)
dataRatio1, testRatio1 = select_features(index, dataTrainNumerical, dataTestNumerical)
params1 = {     'eta'             : 0.4,
                'nround'          : 3000,
                'colsample_bytree': 0.9}
validationError, trainingError = \
                  crossValidationTimeSeries(dataRatio1, labelTrainRatio, 5, testRatio1, params1, True, customerIndex, False)  # kFold, customerIndex
          
BSTRatio1, predictionRatio1 = XGBoostTrain(dataRatio1, testRatio1, labelTrainRatio, params1, True, False, customerIndex,
                                     dataRawTestZero, 4, 1) #customerIndex, openBooleanIndex, adjustment

print(predictionRatio1)

['Average SC Ratio', 'DayOfWeek6', 'DayOfWeek1', 'Number of Customers', 'SchoolHolidayBoolean', 'Month In Promotion 1', 'promotionInterval2', 'Average Sales', 'competitionSinceDayCount', 'Variance SC Ratio', 'promotionSinceDayCount', 'Variance Sales']


0.0740969813432794


0.0706297606771439


0.07583876065814896


0.09269642302531311


0.06234364383207458
0.07512111390719198


[  4869.31973934   5945.98412514   8035.98097229 ...,   7924.94967079
  26753.74673843   7059.88475418]


-----------

# Ensemble on many models from sklearn ML framework

#### Floating number converted training and testing data for SKLearn Regressors
1. First Set: **full model** with trainData, testData, and label
2. Second Set: **Log model** with trainDataLog, testDataLog, and labelLog
3. Third Set: **Ratio model** with tranData, testData, and labelRatio

In [96]:
trainData = dataTrainNumerical.astype(float)
testData = dataTestNumerical.astype(float)
trainDataLog = dataTrainNumericalLog.astype(float)
testDataLog = dataTestNumericalLog.astype(float)
labelLog = labelTrainLog.astype(float)
label = labelTrain.astype(float)
labelRatio = labelTrainRatio.astype(float)

#### Cross validation for sklearn models

In [97]:
def evalErrorSKLearn(prediction, realLabel, data, customerIndex, logBoolean, ratioBoolean):
    # Root Mean Square Percentage Error (RMSPE), return a floating number
    N = realLabel.shape[0]
    realLabel = realLabel.copy()
    prediction = prediction.copy()
    
    if (ratioBoolean):
        prediction = restoreSalesFromRatio(data, prediction, customerIndex)
        realLabel = restoreSalesFromRatio(data, realLabel, customerIndex)
    
    if (logBoolean):
        prediction = restoreExponential(prediction)
        realLabel = restoreExponential(realLabel)
    
    error = 0
    for i in range (N):
        t = realLabel[i]
        p = prediction[i]
        error += ((p-t)/t)**2
    return float((error/N)**(1/2))

In [98]:
def crossValidationSKLearnLinearModel(trainData, label, testData, kFold, customerIndex, logBoolean, ratioBoolean, SKLearn, **para):
    # para is the optional parameters we want to pass in to respective SKLearn linear models
    # SKLearn is the linear model we want to test out
    N = trainData.shape[0]
    resultant_validation_error_list = []
    resultant_training_error_list = []
    k = int(math.floor(N/(kFold+1)))
    # print(k)
    
    for multiplier in range(kFold, 0, -1): # the training data we have is in the reverse of time
        
        trainingSet = trainData[k*multiplier:,:]
        validationSet = trainData[k*(multiplier-1):(k*multiplier), :]
        trainingLabel = label[k*multiplier:]
        validationLabel = label[k*(multiplier-1):(k*multiplier)]
        
        # modelling and prediction
        model = SKLearn(**para)
        model.fit(X = trainingSet, y = trainingLabel)
        validationPrediction = model.predict(X = validationSet)
        trainingPrediction = model.predict(X = trainingSet)
        
        # error calculation
        validationError = evalErrorSKLearn(validationPrediction,validationLabel, validationSet, customerIndex, logBoolean, ratioBoolean)
        trainingError = evalErrorSKLearn(trainingPrediction, trainingLabel, trainingSet, customerIndex, logBoolean, ratioBoolean)
        
        resultant_validation_error_list.append(validationError)
        resultant_training_error_list.append(trainingError)
        print(validationError)
    # final validation error display
    print(float(sum(resultant_validation_error_list)/len(resultant_validation_error_list))) #average validation error
    return resultant_validation_error_list, resultant_training_error_list

In [99]:
def SKLearnTrain(trainData, label, testData, customerIndex, logBoolean, ratioBoolean, SKLearn, **para):
    model = SKLearn(**para)
    model.fit(X = trainData, y = label)
    prediction = model.predict(X = testData)
    
    if (ratioBoolean):
        prediction = restoreSalesFromRatio(testData, prediction, customerIndex)
    if (logBoolean):
        prediction = restoreExponential(prediction)
        
    res = restoreZeroEntryInPrediction(prediction, dataRawTestZero, 4)
    print (res)
    return res

In [100]:
# Adaboost: full model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainData, label, testData, 5, customerIndex, False, False, ensemble.AdaBoostRegressor, 
                                      n_estimators=5, loss='exponential', learning_rate=0.3)
prediction_ada = SKLearnTrain(trainData, label, testData, customerIndex, False, False, ensemble.AdaBoostRegressor,
             n_estimators=5, loss='exponential', learning_rate=0.3)

0.20611918432699905


0.4532808464048451


0.29678114996041693


0.2834751023539924


0.25729666832274384
0.2993905902737995


[  5894.69837353   6281.20327028   7339.84432824 ...,   8119.60751809
  19950.53459821   6281.20327028]


In [102]:
# Adaboost: Log model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainDataLog, labelLog, testDataLog, 5, customerIndex, True, False, ensemble.AdaBoostRegressor, 
                                      n_estimators=15, loss='exponential', learning_rate=0.5)
prediction_ada_log = SKLearnTrain(trainDataLog, labelLog, testDataLog, customerIndex, True, False, ensemble.AdaBoostRegressor,
             n_estimators=15, loss='exponential', learning_rate=0.5)

0.16651832076980483


0.34956501454106426


0.1892649418431169


0.20576049539423064


0.17230345030287278
0.21668244457021793


[  5191.31293646   6467.59714424   9122.38335082 ...,   9122.38335082
  17866.74677533   6104.04507367]


In [103]:
# Adaboost: ratio model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainData, labelRatio, testData, 5, customerIndex, False, True, ensemble.AdaBoostRegressor, 
                                      n_estimators=5, loss='exponential', learning_rate=0.3)
prediction_ada_ratio = SKLearnTrain(trainData, labelRatio, testData, customerIndex, False, True, ensemble.AdaBoostRegressor,
             n_estimators=5, loss='exponential', learning_rate=0.3)

0.0863121353548269


0.0862713397418665


0.09523145361947849


0.12130637026418474


0.08769544887093352
0.09536334957025802


[  4916.03482099   6174.07354052   8110.26300283 ...,   7872.33389494
  29259.72222825   6832.3496712 ]


In [104]:
# Bagging: full model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainData, label, testData, 5, customerIndex, False, False, ensemble.BaggingRegressor,
                                      n_estimators = 10, max_samples = 1.0, max_features = 1.0)
prediction_bag = SKLearnTrain(trainData, label, testData, customerIndex, False, False, ensemble.BaggingRegressor,
                                      n_estimators = 10, max_samples = 1.0, max_features = 1.0)

0.09925332502374767


0.16035569906097322


0.08763447486268866


0.09740379019587135


0.06695210733045313
0.1023198792947468


[  4685.5   5855.7   8374.6 ...,   7860.   26129.    7242.7]


In [105]:
# Bagging: log model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainDataLog, labelLog, testDataLog, 5, customerIndex, True, False, ensemble.BaggingRegressor,
                                      n_estimators = 10, max_samples = 1.0, max_features = 1.0)
prediction_bag_log = SKLearnTrain(trainDataLog, labelLog, testDataLog, customerIndex, True, False, ensemble.BaggingRegressor,
                                      n_estimators = 10, max_samples = 1.0, max_features = 1.0)

0.09547363398243891


0.1722354370652777


0.08087526460904677


0.09354097093117539


0.06634260258057953
0.10169358183370365


[  4653.39818695   5910.96483541   8237.4369925  ...,   7771.22625527
  26808.70533924   7200.69869278]


In [106]:
# Bagging: ratio model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainData, labelRatio, testData, 5, customerIndex, False, True, ensemble.BaggingRegressor,
                                      n_estimators = 10, max_samples = 1.0, max_features = 1.0)
prediction_bag_ratio = SKLearnTrain(trainData, labelRatio, testData, customerIndex, False, True, ensemble.BaggingRegressor,
                                      n_estimators = 10, max_samples = 1.0, max_features = 1.0)

0.07631364694503094


0.08447573929968384


0.08826459510922612


0.08765554835097811


0.06168818229851206
0.07967954240068623


[  4472.52458497   5859.47516433   8239.5790591  ...,   7818.85315801
  27524.19437055   7040.5601102 ]


In [112]:
# Extra tree: full model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainData, label, testData, 5, customerIndex, False, False, ensemble.ExtraTreesRegressor)
prediction_extra_tree_full = SKLearnTrain(trainData, label, testData, customerIndex, False, False, ensemble.ExtraTreesRegressor)

0.13349437749426282


0.19566867859546405


0.1314135103438069


0.10623794487677782


0.0716605164697768
0.12769500555601768


[  4792.4   5871.6   7956.5 ...,   7918.2  27182.1   7112. ]


In [113]:
# Extra tree: log model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainDataLog, labelLog, testDataLog, 5, customerIndex, True, False, ensemble.ExtraTreesRegressor)
prediction_extra_tree_log = SKLearnTrain(trainDataLog, labelLog, testDataLog, customerIndex, True, False, ensemble.ExtraTreesRegressor)

0.12935552264770772


0.20672107808992174


0.12860946871351578


0.0957784375197803


0.07096606032431153
0.12628611345904742


[  4635.45132475   5680.58931641   8146.80996433 ...,   7836.08303549
  26184.50795031   6899.00608788]


In [115]:
# Extra tree: ratio model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainData, labelRatio, testData, 5, customerIndex, False, True, ensemble.ExtraTreesRegressor)
prediction_extra_tree_ratio = SKLearnTrain(trainData, labelRatio, testData, customerIndex, False, True, ensemble.ExtraTreesRegressor)

0.07744912596749193


0.09046815125190233


0.09066996333251862


0.0885399315267407


0.07273995627516087
0.08397342567076288


[  4476.1899007    5807.30360493   8209.41653534 ...,   7640.24730873
  26896.23625812   7079.46070281]


In [116]:
# Gradient Boosting: full model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainData, label, testData, 5, customerIndex, False, False, ensemble.GradientBoostingRegressor)
prediction_gradient_full = SKLearnTrain(trainData, label, testData, customerIndex, False, False, ensemble.GradientBoostingRegressor)

0.12007281254546166


0.17447584286555293


0.12233600676928329


0.12679875057625592


0.08152628388929017
0.1250419393291688


[  5108.47284209   6044.74513063   8299.64649567 ...,   7820.79791977
  26800.79594919   7227.27025411]


In [118]:
# Gradient Boosting: log model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainDataLog, labelLog, testDataLog, 5, customerIndex, True, False, ensemble.GradientBoostingRegressor)
prediction_gradient_log = SKLearnTrain(trainDataLog, labelLog, testDataLog, customerIndex, True, False, ensemble.GradientBoostingRegressor)

0.11106235529463163


0.15167920677424634


0.10309751390413727


0.10587190679066066


0.07030452984772423
0.10840310252228003


[  4914.90174413   6049.41015053   8125.60681771 ...,   7943.69310421
  24081.22219837   7211.87158352]


In [119]:
# Gradient Boosting: ratio model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainData, labelRatio, testData, 5, customerIndex, False, True, ensemble.GradientBoostingRegressor)
prediction_gradient_ratio = SKLearnTrain(trainData, labelRatio, testData, customerIndex, False, True, ensemble.GradientBoostingRegressor)

0.07337479796785006


0.07313464689581245


0.08338624789595053


0.09786193687383962


0.0618487180114386
0.07792126952897825


[  4867.64871466   5896.18750165   8099.15648636 ...,   7970.13976856
  26813.06436348   7129.65296716]


In [120]:
# Random forest: full model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainData, label, testData, 5, customerIndex, False, False, ensemble.RandomForestRegressor,
                                      n_estimators=15, max_features = 'auto')
prediction_random_full = SKLearnTrain(trainData, label, testData, customerIndex, False, False, ensemble.RandomForestRegressor,
                                      n_estimators=15, max_features = 'auto')

0.09600000266666228


0.16141633114043108


0.08529004484610525


0.09545167044074772


0.0658315576977503
0.10079792135833934


[  4850.33333333   5798.13333333   8052.46666667 ...,   7884.6
  27874.06666667   7099.33333333]


In [121]:
# Random forest: log model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainDataLog, labelLog, testDataLog, 5, customerIndex, True, False, ensemble.RandomForestRegressor,
                                      n_estimators=15, max_features = 'auto')
prediction_random_log = SKLearnTrain(trainDataLog, labelLog, testDataLog, customerIndex, True, False, ensemble.RandomForestRegressor,
                                      n_estimators=15, max_features = 'auto')

0.09637982186592346


0.1711013617282746


0.08091349031448862


0.09306220412898998


0.0654055684462339
0.10137248929678211


[  4626.82496999   5857.55391777   8474.39576136 ...,   7681.57798245
  27380.25590932   7210.7843548 ]


In [122]:
# Random forest: ratio model
customerIndex = 7
validationList, trainingList = \
    crossValidationSKLearnLinearModel(trainData, labelRatio, testData, 5, customerIndex, False, True, ensemble.RandomForestRegressor,
                                      n_estimators=15, max_features = 'auto')
prediction_random_ratio = SKLearnTrain(trainData, labelRatio, testData, customerIndex, False, True, ensemble.RandomForestRegressor,
                                      n_estimators=15, max_features = 'auto')

0.07491790625380754


0.083945260512553


0.0869746182275697


0.08611672662939417


0.060674665279342226
0.07852583538053333


[  4650.45532302   5879.84487318   8187.80560839 ...,   7753.4398963
  27392.52799173   7113.16373764]


## Final Average Prediction

In [127]:
def arithmeticAverage(* predictions):
    N = predictions[0].shape[0]
    numberOfPrediction = len(predictions)
    resultantList = []
    for i in range(N):
        summation = 0
        for prediction in predictions:
            summation += prediction[i]
        resultantList.append(summation/numberOfPrediction)
    return np.array(resultantList)    

In [128]:
prediction_final = arithmeticAverage(predictionRatio, predictionRatio1, predictionLog, predictionLog1, prediction1,
                                     prediction_ada_ratio, prediction_bag, prediction_bag_log, prediction_bag_ratio,
                                     prediction_extra_tree_ratio, prediction_gradient_log, prediction_gradient_ratio,
                                     prediction_random_full, prediction_random_log, prediction_random_ratio)

In [131]:
filePath = os.path.join(directory_path, 'FinalPrediction.csv')
writeToFile(prediction_final, filePath)