In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

# Import all utility functions
import utils
import skutils

# Per-store attributes; joined onto the sales rows by process() below.
stores = pd.read_csv('../data/store.csv')

# NOTE(review): the stale output under this cell is the tail of a warning that
# looks like pandas' mixed-type DtypeWarning. StateHoliday is presumably the
# affected column (process() treats anything other than 'a'/'b'/'c' as "no
# holiday"), so it is read as text to give the column one consistent dtype.
train = pd.read_csv('../data/train.csv', parse_dates=['Date'],
                    dtype={'StateHoliday': str})
test = pd.read_csv('../data/test.csv', parse_dates=['Date'],
                   dtype={'StateHoliday': str})

  interactivity=interactivity, compiler=compiler, result=result)


## Data Pre-Processing and Feature Extraction

In [2]:
def process(input_data, store_data, max_comp_distance=100000, sort_by=None):
    """Build a model-ready feature frame from raw sales rows and store info.

    The rows of ``input_data`` are joined with ``store_data`` on ``Store``,
    calendar features (``year``, ``month``, ``woy``) are derived from ``Date``,
    the categorical columns are one-hot encoded and the resulting columns are
    put in alphabetical order. ``input_data`` itself is never modified.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Raw rows. Must contain ``Store``, ``Date`` (datetime dtype, e.g. read
        with ``parse_dates``), ``StateHoliday``, ``DayOfWeek`` and ``Open``.
        Training data additionally carries ``Sales`` and ``Customers``;
        test data carries ``Id``.
    store_data : pandas.DataFrame
        One row per store with ``Store``, ``StoreType``, ``Assortment``,
        ``CompetitionDistance``, ``CompetitionOpenSinceMonth``,
        ``CompetitionOpenSinceYear``, ``Promo2SinceYear``,
        ``Promo2SinceWeek`` and ``PromoInterval``.
    max_comp_distance : number, optional
        Value substituted for a missing ``CompetitionDistance``.
    sort_by : label or list of labels, optional
        When given, the copied input is sorted by these columns before the
        merge; when falsy the incoming row order is kept.

    Returns
    -------
    tuple
        ``(features, sales)`` when ``Sales`` is present (training data), where
        only rows with ``Open != 0`` and ``Sales > 0`` are kept; otherwise the
        1-tuple ``(features,)`` (test data, unpack with ``X, = process(...)``).

    Raises
    ------
    KeyError
        If one of the columns listed above is missing.
    """
    # Work on a copy so the caller's frame is left untouched
    data = input_data.copy()

    if sort_by:
        data = data.sort_values(by=sort_by)

    # Merge the store information into the rows; the store identifier itself
    # is not used as a feature
    data = data.merge(store_data, on='Store')
    data = data.drop(['Store'], axis=1)

    # Calendar features derived from the Date field
    data['year'] = data['Date'].dt.year
    data['month'] = data['Date'].dt.month
    # The week is read per Timestamp: the Series-level ``.dt.weekofyear``
    # accessor no longer exists in recent pandas releases
    data['woy'] = data['Date'].apply(lambda stamp: stamp.weekofyear)
    data = data.drop(['Date'], axis=1)

    # Stores without a known competitor distance get the fallback value
    data['CompetitionDistance'] = data['CompetitionDistance'].fillna(max_comp_distance)

    # Competition-open, promo-since and promo-interval fields are currently
    # not turned into features, so they are simply discarded
    data = data.drop(
        ['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
         'Promo2SinceYear', 'Promo2SinceWeek', 'PromoInterval'],
        axis=1,
    )

    # Normalize State Holiday: anything that is not 'a', 'b' or 'c' (the
    # integer 0, the string '0', missing values, ...) collapses to '0'.
    # Going through text keeps the categories of one single type.
    holiday = data['StateHoliday'].astype(str)
    data['StateHoliday'] = holiday.where(holiday.isin(['a', 'b', 'c']), '0')

    # Dummy coding; get_dummies already removes the encoded source columns
    data = pd.get_dummies(
        data, columns=['StateHoliday', 'StoreType', 'Assortment', 'DayOfWeek'])

    # Some holiday values may be absent from a given frame (e.g. the testing
    # data); add the missing indicator columns filled with zeros
    for col in ['StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c']:
        if col not in data.columns:
            data[col] = 0

    # Put the columns in alphabetical order (plain column selection replaces
    # DataFrame.reindex_axis, which was removed in pandas 1.0)
    data = data[sorted(data.columns)]

    # Training data
    if 'Sales' in data.columns:
        # Remove NaN values
        data = data.fillna(0)

        # Consider only open stores with sales greater than zero for training
        data = data[(data['Open'] != 0) & (data['Sales'] > 0)]

        return data.drop(['Sales', 'Customers'], axis=1), data['Sales']

    # Testing data: remaining NaN values are expected only in the Open column
    # and are filled with 1
    data = data.fillna(1)

    # Explicit 1-tuple keeps the ``X_test, = process(...)`` calling convention
    return (data.drop(['Id'], axis=1),)

In [3]:
# Training features and the matching sales target
X_train, y_train = process(input_data=train, store_data=stores)

## Regression - Cross Validation

In [4]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest hyper-parameters (also picked up by the full-training cell)
clf_params = dict(n_estimators=20)

# Random forest regressor with a fixed seed
clf = RandomForestRegressor(random_state=42, **clf_params)

# Cross-validate through the project helper, scored with utils.rmspe
scores = skutils.cross_val(
    clf,
    X_train,
    y_train,
    scoring=utils.rmspe,
)

# Report mean and spread of the per-fold scores
print("Cross validation score %.8f (+- %.8f)" % (scores.mean(), scores.std()))

Cross validation score -0.28802898 (+- 0.03760398)


In [5]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor using the 5 closest training rows
clf = KNeighborsRegressor(n_neighbors=5)

# Cross-validate through the project helper, scored with utils.rmspe
scores = skutils.cross_val(
    clf,
    X_train,
    y_train,
    scoring=utils.rmspe,
)

# Report mean and spread of the per-fold scores
print("Cross validation score %.8f (+- %.8f)" % (scores.mean(), scores.std()))

Cross validation score -0.32068736 (+- 0.04321527)


## Regression - Full Training

In [4]:
# For test data process() returns a 1-tuple, hence the tuple unpacking
(X_test,) = process(test, stores)

In [5]:
from sklearn.ensemble import RandomForestRegressor

# The model is fed bare arrays (.values), so feature names are not checked by
# the estimator: make sure both frames expose the same columns in the same
# order instead of silently predicting on misaligned features.
if list(X_train.columns) != list(X_test.columns):
    raise ValueError(
        "Train/test feature columns differ: %s"
        % sorted(set(X_train.columns) ^ set(X_test.columns))
    )

# Random forest regressor, seeded like the cross-validated model so that
# re-running the notebook reproduces the same predictions
clf = RandomForestRegressor(random_state=42, **clf_params)

# Fit on the complete training set, then predict the test rows
clf.fit(X_train.values, y_train.values)
y_pred = clf.predict(X_test.values)

In [6]:
# Combine the test Ids with the predictions and order the rows by Id.
# NOTE(review): assumes y_pred is in the same row order as `test` - confirm
# that process() keeps the incoming order.
result = (
    pd.DataFrame({'Id': test['Id'], 'Sales': y_pred})
    .sort_values('Id')
)

# Write the submission to a timestamped CSV, leaving out the index column
result.to_csv('submission_%s.csv' % utils.timestamp(), index=False)