In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
import math
import warnings 
warnings.filterwarnings("ignore")

In [2]:
def DT_Fitting(x, y_residual, model_list, max_depth=6):
    """Fit one regression tree to the current residuals and return the new residuals.

    This is a single boosting stage: a ``DecisionTreeRegressor`` is trained on
    ``(x, y_residual)``, appended to ``model_list`` (which is mutated in place),
    and its in-sample predictions are subtracted from the residuals.

    Parameters
    ----------
    x : array-like or DataFrame
        Training features, one row per sample.
    y_residual : array-like of shape (n_samples,) or (n_samples, 1)
        Part of the single target not yet explained by earlier stages.
    model_list : list
        Receives the newly fitted tree as its last element.
    max_depth : int, default 6
        Depth limit handed to ``DecisionTreeRegressor``.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 1)
        ``y_residual`` minus the new tree's predictions on ``x``.
    """
    # Force a column vector up front: subtracting the (n, 1) prediction below
    # from a 1-D residual would silently broadcast into an (n, n) matrix.
    y_residual = np.asarray(y_residual).reshape(-1, 1)

    clf = tree.DecisionTreeRegressor(max_depth=max_depth)
    clf = clf.fit(x, y_residual)
    model_list.append(clf)

    yp = clf.predict(x).reshape(-1, 1)
    return y_residual - yp

In [3]:
def Boosting(dataset, target, no_boosting_runs):
    """Train a sequence of regression trees, each fitted to the previous residuals.

    The features are every column of ``dataset`` except ``NumberOfSales`` and
    ``NumberOfCustomers``. The first stage is fitted to ``dataset[target]``;
    every following stage is fitted to whatever ``DT_Fitting`` returned for the
    stage before it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Training table; must contain ``NumberOfSales``, ``NumberOfCustomers``
        and the ``target`` column.
    target : str
        Name of the column to predict.
    no_boosting_runs : int
        Number of stages (trees) to fit.

    Returns
    -------
    list
        The fitted stages, in training order.
    """
    fitted_stages = []
    features = dataset.drop(columns=['NumberOfSales', 'NumberOfCustomers'])
    # Column vector of shape (n_samples, 1); updated after every stage.
    remaining = dataset[target].values.reshape(-1, 1)

    for _ in range(no_boosting_runs):
        remaining = DT_Fitting(features, remaining, fitted_stages)

    return fitted_stages

In [4]:
def RandomForest(dataset, target, no_trees):
    """Train a random-forest regressor on ``dataset`` to predict ``target``.

    The features are every column of ``dataset`` except ``NumberOfSales`` and
    ``NumberOfCustomers``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Training table; must contain ``NumberOfSales``, ``NumberOfCustomers``
        and the ``target`` column.
    target : str
        Name of the column to predict.
    no_trees : int
        Number of trees in the forest (``n_estimators``).

    Returns
    -------
    RandomForestRegressor
        The fitted model (``min_samples_leaf=5``, ``random_state=12345``).
    """
    x = dataset.drop(columns=['NumberOfSales', 'NumberOfCustomers'])
    # Single-output regression takes a 1-D target; an (n, 1) column vector
    # makes scikit-learn raise a DataConversionWarning on every fit.
    y = dataset[target].values.ravel()

    model1 = RandomForestRegressor(
        n_estimators=no_trees,
        min_samples_leaf=5,
        random_state=12345,  # fixed seed so the forest is reproducible
    )
    model1.fit(x, y)

    return model1

In [5]:
def RandomForest_eval(model, data_to_predict):
    """Return ``model.predict(data_to_predict)`` unchanged.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict``.
    data_to_predict : array-like or DataFrame
        Rows to score, in the layout the model was trained on.
    """
    predictions = model.predict(data_to_predict)
    return predictions

In [6]:
def GradientBoosting_eval(models, data_to_predict):
    """Sum the predictions of every boosting stage.

    Parameters
    ----------
    models : iterable
        Fitted stages, each exposing ``predict``.
    data_to_predict : array-like or DataFrame
        Rows to score; ``len()`` of it fixes the number of output rows.

    Returns
    -------
    numpy.ndarray of shape (n_rows, 1)
        Element-wise sum of all stage predictions (all zeros when ``models``
        is empty).
    """
    n_rows = len(data_to_predict)
    total = np.zeros((n_rows, 1))
    for stage in models:
        stage_output = stage.predict(data_to_predict).reshape(-1, 1)
        # Accumulate in place into the float buffer.
        np.add(total, stage_output, out=total)
    return total.reshape(-1, 1)

In [12]:
def Stacking_eval(ypB, ypF):
    """Average two prediction sequences position by position.

    Parameters
    ----------
    ypB, ypF : indexable sequences
        Predictions of the two models. ``ypF`` must have at least
        ``len(ypB)`` entries.

    Returns
    -------
    list
        ``(ypB[i] + ypF[i]) / 2`` for every ``i`` in ``range(len(ypB))``.
        Entries are combined one index at a time, so ``ypB[i]`` and ``ypF[i]``
        may differ in shape (e.g. a length-1 array and a scalar).
    """
    return [(ypB[i] + ypF[i]) / 2 for i in range(len(ypB))]

## Reading the CSV files

In [8]:
# Load the cleaned train/test tables. The first CSV column is read as the
# index and then thrown away in favour of a fresh 0..n-1 index.
train = pd.read_csv('TrainClean.csv', index_col=0)
train = train.reset_index(drop=True)

test = pd.read_csv('TestClean.csv', index_col=0)
test = test.reset_index(drop=True)

### Train the models (boosted trees and random forest)

In [9]:
# Fit both base learners on the full training table with NumberOfSales as the
# target: 130 residual-fitting rounds for the boosted trees, 150 trees for the forest.
model_list = Boosting(dataset=train, target='NumberOfSales', no_boosting_runs=130)
model_forest = RandomForest(dataset=train, target='NumberOfSales', no_trees=150)

### Predict Sales

In [10]:
# Distinct store identifiers present in the test table.
stores = test.loc[:, 'StoreID'].unique()

# Names of the test-table columns used to select the rows of each month.
months = ['March', 'April']

# Empty result table, populated by the prediction loop.
to_return = pd.DataFrame(
    columns=[
        'StoreID',
        'Month',
        'NumberOfSales',
    ]
)

In [13]:
# Predict the total sales of every (store, month) pair.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row is quadratic. Rebuilding the table from scratch also makes this cell
# safe to re-run without duplicating rows.
records = []
for store in stores:
    current_store_df = test.loc[test['StoreID'] == store]
    for month in months:
        # Rows of this store whose `month` column equals 1.
        X = current_store_df.loc[current_store_df[month] == 1]
        X = X.drop(labels='StoreID', axis=1)

        resB = GradientBoosting_eval(model_list, X)
        resF = RandomForest_eval(model_forest, X)
        averaged = Stacking_eval(resB, resF)

        # expm1 is the inverse of log1p -- presumably the training target was
        # log1p-transformed during cleaning; confirm against the cleaning step.
        per_row_sales = np.round(np.expm1(averaged))

        records.append({
            'StoreID': store,
            # Same mapping as before: 'March' -> 3, anything else -> 4.
            'Month': 3 if month == 'March' else 4,
            'NumberOfSales': np.sum(per_row_sales),
        })

to_return = pd.DataFrame(records, columns=['StoreID', 'Month', 'NumberOfSales'])

### CSV Creation

In [14]:
# Write the per-store, per-month predictions to disk with pandas' default CSV options.
to_return.to_csv(path_or_buf="PredictedSales.csv")