In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

stores = pd.read_csv('../data/store.csv')
train = pd.read_csv('../data/train.csv', parse_dates = ['Date'])
test = pd.read_csv('../data/test.csv', parse_dates = ['Date'])

  interactivity=interactivity, compiler=compiler, result=result)


## Error Metrics

In [2]:
from sklearn.metrics import make_scorer

def rmspe(y_true, y_pred):
    """Root Mean Square Percentage Error
    Details about this function can be found on kaggle 
    https://www.kaggle.com/c/rossmann-store-sales/details/evaluation"""
    idx = y_true != 0
    return np.sqrt(np.mean(((y_true[idx] - y_pred[idx]) / y_true[idx]) ** 2))

# Create a Scorer
rmspe_scorer = make_scorer(rmspe, greater_is_better=False)

## Data Pre-Processing and Feature Extraction

In [7]:
def process(input_data, store_data, sort_by=None, convertToArray=True):
    
    # Create a copy of the data
    data = input_data.copy()
    
    if sort_by:
        data.sort_values(by=sort_by, inplace=True)
    
    # Merge the Store information to the data
    data = data.merge(store_data, on='Store')
    data.drop(['Store'], axis=1, inplace=True)
    
    # Process the Date field
    data['year'] = data.Date.apply(lambda x: x.year)
    data['month'] = data.Date.apply(lambda x: x.month)
    data['day'] = data.Date.apply(lambda x: x.day)
    data['woy'] = data.Date.apply(lambda x: x.weekofyear)
    data.drop(['Date'], axis = 1, inplace=True)
    
    # Process the Competition Open fields
    data['CompetitionOpen'] = 12 * (data.year - data.CompetitionOpenSinceYear) + (data.month - data.CompetitionOpenSinceMonth)
    data['CompetitionOpen'] = data.CompetitionOpen.apply(lambda x: x if x > 0 else 0)
    data.drop(['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'], axis=1, inplace=True)
    
    # Process the Promo Open field
    data['PromoOpen'] = 12 * (data.year - data.Promo2SinceYear) + (data.woy - data.Promo2SinceWeek) / float(4)
    data['PromoOpen'] = data.CompetitionOpen.apply(lambda x: x if x > 0 else 0)
    data.drop(['Promo2SinceYear', 'Promo2SinceWeek'], axis=1, inplace=True)
    
    # Normalize State Holiday field
    data['StateHoliday'] = data.StateHoliday.apply(lambda x: x if x in ['a', 'b', 'c'] else 0)
    
    # Dummy Coding
    for dummy in ['StateHoliday', 'StoreType', 'Assortment']:
        # Create dummy columns
        data = pd.get_dummies(data, columns=[dummy])
        
        # Remove original column
        if dummy in data.columns:
            data.drop([dummy], axis=1, inplace=True)
    
    # Fix State Holiday columns, some values are not present in the testing data
    for col in ['StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c']:
        if col not in data.columns:
            data[col] = np.zeros(len(data.index))
    
    # Drop unused Columns
    data.drop(['PromoInterval'], axis=1, inplace=True)
    
    # Remove NaN values
    data.fillna(0, inplace=True)
    
    # Make sure columns are sorted
    data = data.reindex_axis(sorted(data.columns), axis=1)
    
    # training data
    if 'Sales' in data.columns:
    
        # Consider only open stores for training. Closed stores wont count into the score
        # data = data[data.Open != 0]
    
        # Use only Sales bigger then zero
        # data = data[data.Sales > 0]

        X_train = data.drop(['Sales', 'Customers'], axis=1)
        y_train = data.Sales

        if convertToArray:
            return X_train.values, y_train.values

        return X_train, y_train
    
    # testing data
    else:
        X_test = data.drop(['Id'], axis=1)

        if convertToArray:
            return X_test.values,

        return X_test,

In [None]:
X_train, y_train = process(train, stores, sort_by='Date')

## Regression - Cross Validation

In [None]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestRegressor

# Random Forest Classifier
clf = RandomForestRegressor(n_estimators=30)

scores = cross_validation.cross_val_score(clf, X_train, y_train, scoring=rmspe_scorer, cv=4)
scores.mean()

In [6]:
from sklearn import cross_validation
from sklearn.neighbors import KNeighborsRegressor

# KNN Classifier
clf = KNeighborsRegressor(n_neighbors=5)

scores = cross_validation.cross_val_score(clf, X_train, y_train, scoring=rmspe_scorer, cv=4)
scores.mean()

-0.72983324033974628

## Regression - Full Training

In [72]:
X_test, = process(test, stores, sort_by='Id')

In [109]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest Classifier
clf = RandomForestRegressor(n_estimators=30)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [138]:
import time, datetime

sales = pd.Series(y_pred)
result = pd.DataFrame({'Id': test['Id'], 'Sales':sales})

st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
result.to_csv('submission_%s.csv' % st, header=True, index=False)

  result = result.union(other)
