In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
import math
import warnings 
warnings.filterwarnings("ignore")

In [3]:
def DT_Fitting(x, y_residual, model_list, max_depth=6):
    """Fit one boosting stage on the current residuals.

    A ``DecisionTreeRegressor`` is trained to predict ``y_residual`` from
    ``x``, appended to ``model_list`` (which is mutated in place), and its
    in-sample predictions are subtracted from the residuals.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Training features.
    y_residual : array-like of shape (n_samples,) or (n_samples, 1)
        Residuals the new tree should explain.
    model_list : list
        Receives the newly fitted tree as its last element.
    max_depth : int, default 6
        Depth limit passed to the tree.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 1)
        Residuals left after removing this tree's predictions.
    """
    # Force a column vector: subtracting the (n, 1) prediction from a 1-D
    # residual would silently broadcast into an (n, n) matrix.
    y_residual = np.asarray(y_residual).reshape(-1, 1)

    stage = tree.DecisionTreeRegressor(max_depth=max_depth)
    stage.fit(x, y_residual)
    model_list.append(stage)

    stage_pred = stage.predict(x).reshape(-1, 1)
    return y_residual - stage_pred

In [4]:
def Boosting(dataset, target, no_boosting_runs):
    """Train a chain of residual-fitting trees and return them in fit order.

    The feature matrix is ``dataset`` without the 'NumberOfSales' and
    'NumberOfCustomers' columns. The ``target`` column is the initial
    residual; each of the ``no_boosting_runs`` calls to ``DT_Fitting`` fits
    one tree and hands back the residual used by the next call.
    """
    features = dataset.drop(columns=['NumberOfSales','NumberOfCustomers'])
    residual = dataset[target].values.reshape(-1,1)

    stages = []
    for _ in range(no_boosting_runs):
        # DT_Fitting appends the fitted tree to `stages` as a side effect.
        residual = DT_Fitting(features, residual, stages)

    return stages

In [5]:
def RandomForest(dataset, target, no_trees):
    """Fit a random-forest regressor on ``dataset`` and return it.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Training data; the 'NumberOfSales' and 'NumberOfCustomers' columns
        are removed before fitting.
    target : str
        Name of the column of ``dataset`` to regress on.
    no_trees : int
        Number of trees in the forest (``n_estimators``).

    Returns
    -------
    RandomForestRegressor
        The fitted model (``min_samples_leaf=5``, ``random_state=12345``).
    """
    x = dataset.drop(columns=['NumberOfSales','NumberOfCustomers'])
    # Pass a 1-D target: a column vector makes fit() emit a
    # DataConversionWarning, which the notebook's global warnings filter hides.
    y = dataset[target].values.ravel()

    model1 = RandomForestRegressor(
        n_estimators=no_trees,
        min_samples_leaf=5,
        random_state=12345,
    )
    model1.fit(x, y)

    return model1

In [6]:
def RandomForest_eval(model,data_to_predict):
    """Return whatever ``model.predict`` yields for ``data_to_predict``."""
    predictions = model.predict(data_to_predict)
    return predictions

In [7]:
def GradientBoosting_eval(models,data_to_predict):
    """Sum the predictions of every model in ``models``.

    Each model's ``predict`` output is reshaped to a column and accumulated
    into a zero-initialised ``(len(data_to_predict), 1)`` array, which is
    returned. An empty ``models`` sequence therefore yields all zeros.
    """
    n_rows = len(data_to_predict)
    total = np.zeros((n_rows, 1))
    for stage in models:
        stage_pred = stage.predict(data_to_predict)
        total += stage_pred.reshape(-1, 1)
    return total.reshape(-1, 1)

In [8]:
def Stacking_eval(ypB,ypF):
    """Blend two predictions by taking their unweighted mean."""
    combined = ypB + ypF
    return combined / 2

## Reading CSV

In [13]:
# Load the cleaned train/test tables. The first CSV column is read as the
# index and then discarded in favour of a fresh 0..n-1 index.
train = pd.read_csv('TrainClean.csv', index_col=0)
train = train.reset_index(drop=True)
test = pd.read_csv('TestClean.csv', index_col=0)
test = test.reset_index(drop=True)

In [14]:
# Display the column labels of the training frame.
train.columns

Index(['IsHoliday', 'HasPromotions', 'NearestCompetitor', 'Region_AreaKM2',
       'Region_GDP', 'Region_PopulationK', 'Mean_Dew_PointC', 'Mean_Humidity',
       'Mean_Sea_Level_PressurehPa', 'Mean_TemperatureC', 'Mean_VisibilityKm',
       'Mean_Wind_SpeedKm_h', 'Precipitationmm', 'StandardMarket',
       'HyperMarket', 'SuperMarket', 'ShoppingCenter', 'General',
       'WithFishDepartment', 'WithNonFoodDepartment', 'Region0', 'Region1',
       'Region2', 'Region3', 'Region4', 'Region5', 'Region6', 'Region7',
       'Region8', 'Region9', 'January', 'February', 'March', 'April', 'May',
       'June', 'July', 'August', 'September', 'October', 'November',
       'December', 'Tuesday', 'Wednesday', 'Friday', 'Saturday', 'Monday',
       'Thursday', 'Sunday', 'Thunderstorm', 'Snow', 'Fog', 'Hail', 'Nothing',
       'Rain', 'ZeroClouds', 'H', 'WCI', 'NumberOfSales', 'NumberOfCustomers'],
      dtype='object')

### Create list of models

In [15]:
# Fit both learners on the same target column: a 130-stage boosted tree
# sequence and a 150-tree random forest.
model_list = Boosting(dataset=train, target='NumberOfSales', no_boosting_runs=130)
model_forest = RandomForest(dataset=train, target='NumberOfSales', no_trees=150)

### Predict Sales

In [16]:
# Distinct store identifiers present in the test set.
stores = test['StoreID'].unique()
# Month indicator columns of `test` for which a forecast is produced.
months = ['March','April']
# `to_return` is not initialised here: the prediction cell creates it itself,
# so a second identical initialisation in this cell was dead code.

In [17]:
# Calendar number reported for each month indicator column. An unknown month
# raises KeyError instead of being silently labelled as April.
month_numbers = {'March': 3, 'April': 4}

# Collect one record per (store, month) and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
records = []
for store in stores:
    current_store_df = test.loc[test['StoreID'] == store]
    for month in months:
        # Rows of this store whose indicator column for `month` equals 1.
        X = current_store_df.loc[current_store_df[month] == 1]
        X = X.drop(labels='StoreID', axis=1)

        resB = GradientBoosting_eval(model_list, X)
        resF = RandomForest_eval(model_forest, X).reshape(-1, 1)
        res = Stacking_eval(resB, resF)

        # expm1 presumably undoes a log1p transform applied to the target
        # during cleaning -- TODO confirm against the cleaning notebook.
        res = np.round(np.expm1(res))

        records.append({
            'StoreID': store,
            'Month': month_numbers[month],
            'NumberOfSales': np.sum(res),
        })

to_return = pd.DataFrame(records, columns=['StoreID', 'Month', 'NumberOfSales'])

In [18]:
# Display the per-store, per-month sales totals built by the prediction cell.
to_return

Unnamed: 0,Month,NumberOfSales,StoreID
0,3,195973.0,1000
1,4,176368.0,1000
2,3,57703.0,1001
3,4,70200.0,1001
4,3,126673.0,1002
5,4,113236.0,1002
6,3,134706.0,1003
7,4,117996.0,1003
8,3,104488.0,1004
9,4,90894.0,1004


### CSV Creation

In [27]:
# Write the predictions to disk. No `index=` argument is given, so pandas'
# default applies and the row index is written as the first, unnamed column.
to_return.to_csv("PredictedSales.csv")