In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [4]:
# Initial training history; mypredict() rebinds this global as folds arrive.
train = pd.read_csv(filepath_or_buffer="../data/X_train_ini.csv")
# Companion test file, loaded into a module-level frame next to `train`.
test = pd.read_csv(filepath_or_buffer="../data/test.csv")

In [5]:
def _build_features(frame):
    """Turn a raw sales frame into the feature matrix fed to the model.

    Works on a copy, so ``frame`` itself is left untouched.  Adds
    ``IsHoliday_oh`` (``IsHoliday`` cast to int) and ``day``, ``week``,
    ``month``, ``year`` derived from ``Date``, then drops ``IsHoliday``,
    ``Date`` and ``Weekly_Sales``.  All other columns pass through unchanged.
    """
    features = frame.copy()
    # Parse the dates once instead of once per derived column.
    dates = pd.to_datetime(features['Date'])

    # New columns are appended in a fixed order so the feature matrix layout
    # (and therefore the seeded forest) is reproducible.
    features['IsHoliday_oh'] = features['IsHoliday'].astype(int)
    features['day'] = dates.dt.day
    if hasattr(dates.dt, 'isocalendar'):
        # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0.
        # isocalendar().week is the same ISO-8601 week number but comes back
        # as nullable UInt32, so cast it to a plain integer column.
        features['week'] = dates.dt.isocalendar().week.astype('int64')
    else:
        # pandas < 1.1 has no isocalendar(); fall back to the old accessor.
        features['week'] = dates.dt.week
    features['month'] = dates.dt.month
    features['year'] = dates.dt.year

    return features.drop(['IsHoliday', 'Date', 'Weekly_Sales'], axis=1)


def mypredict(new_train):
    """Predict ``Weekly_Sales`` for one fold, then absorb that fold into ``train``.

    A random forest is refit from scratch on the module-level ``train`` frame
    at every call and used to score the rows of ``new_train``.

    Parameters
    ----------
    new_train : pandas.DataFrame
        The fold to score.  Must contain ``Date``, ``IsHoliday`` and
        ``Weekly_Sales`` plus every other column present in ``train``.

    Returns
    -------
    pandas.DataFrame
        A single column ``Weekly_Pred`` with one prediction per row of
        ``new_train``, in the same row order, on a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``new_train`` lacks a required column.

    Side effects
    ------------
    Rebinds the global ``train`` to the old ``train`` followed by
    ``new_train``, so the next call trains on more history.
    """
    global train

    #----------------------------------------------------------
    # Feature Engineering
    #----------------------------------------------------------
    y_train = train['Weekly_Sales']
    X_train = _build_features(train)
    # Select the fold's columns by name in the training order, so a fold whose
    # columns are ordered differently is never scored positionally against
    # the wrong features.
    X_test = _build_features(new_train)[X_train.columns]

    #----------------------------------------------------------
    # Model Building
    #----------------------------------------------------------
    rf_mod = RandomForestRegressor(n_estimators=150,
                                   random_state=125247)
    rf_mod.fit(X_train, y_train)

    #Random Forest (Prediction)
    y_preds = rf_mod.predict(X_test)
    # Replace any NaN prediction with 0 so the caller's metric stays finite.
    y_preds[np.isnan(y_preds)] = 0

    df = pd.DataFrame({'Weekly_Pred': y_preds})

    # ignore_index avoids duplicate row labels: each fold read from CSV
    # restarts its index at 0, which would otherwise collide with `train`.
    train = pd.concat([train, new_train], ignore_index=True)

    return df
        

In [6]:
#Below code shouldn't be part of the delivery, it's only to test your code for each fold
#
# For every fold file: predict it with mypredict(), then score the predictions
# with the weighted absolute error (WAE), where holiday rows weigh more.
# NOTE: mypredict() appends each fold to the global `train`, so re-run the
# data-loading cell before re-running this loop to start from a clean state.

N_FOLDS = 10        # fold files are named fold_1.csv ... fold_<N_FOLDS>.csv
HOLIDAY_WEIGHT = 5  # weight of a holiday row in the WAE; other rows weigh 1

fold_waes = []

for t in range(1, N_FOLDS + 1):
    # np.str was only an alias of the builtin str (deprecated in NumPy 1.20,
    # removed in 1.24), so use the builtin directly.
    fold_file = "fold_" + str(t) + ".csv"
    new_train = pd.read_csv("../data/fold/" + fold_file)
    df_return = mypredict(new_train)
    actuals = new_train['Weekly_Sales']
    preds = df_return['Weekly_Pred']

    #Assign the Weights based on the IsHoliday
    weights = np.where(new_train['IsHoliday'].astype(int) == 1, HOLIDAY_WEIGHT, 1)

    #Calculate the WAE
    # Compare positionally on the raw arrays so pandas index alignment can
    # never introduce NaNs into the error.
    abs_err = np.abs(actuals.values - preds.values)
    wae = (weights * abs_err).sum() / weights.sum()
    fold_waes.append(wae)

    print ("RF - Fold:{} - WAE:{}".format(t,wae))

avg_wae = sum(fold_waes) / N_FOLDS

print ("RF -Average WAE:{}".format(avg_wae))
    
    


RF - Fold:1 - WAE:1731.7923736134726
RF - Fold:2 - WAE:1424.243267669094
RF - Fold:3 - WAE:1347.834076027682
RF - Fold:4 - WAE:1393.652038929543
RF - Fold:5 - WAE:2633.4714950173634
RF - Fold:6 - WAE:1648.0231413370645
RF - Fold:7 - WAE:1642.7074726304932
RF - Fold:8 - WAE:1342.9034368842938
RF - Fold:9 - WAE:1254.4201269070277
RF - Fold:10 - WAE:1227.092273014353
RF -Average WAE:1564.6139702030384
