In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

# Import all utility functions
import utils
import skutils

# Store-level table; joined onto the daily records by the 'Store' key later on.
stores = pd.read_csv('../data/store.csv')

# Both daily tables are read the same way: 'Date' is parsed into datetimes so
# that year / month / week features can be derived from it.
train, test = [
    pd.read_csv(csv_path, parse_dates=['Date'])
    for csv_path in ('../data/train.csv', '../data/test.csv')
]

  interactivity=interactivity, compiler=compiler, result=result)


## Data Pre-Processing and Feature Extraction

In [2]:
def transform(input_data, store_data):
    
    # Create a copy of the data
    data = input_data.copy()

    # Merge the Store information to the data
    data = data.merge(store_data, on='Store')
    data.drop(['Store'], axis=1, inplace=True)
    
    if 'Sales' not in data.columns:
        # Merge creates new Ids, so we need to reset the Ids
        # on the Id column for the test set
        data.set_index('Id', inplace=True)    
    
    # Process the Date field
    data['year'] = data.Date.apply(lambda x: x.year)
    data['month'] = data.Date.apply(lambda x: x.month)
    # data['day'] = data.Date.apply(lambda x: x.day)
    data['woy'] = data.Date.apply(lambda x: x.weekofyear)
    data.drop(['Date'], axis = 1, inplace=True)
    
    # Process the Competition Open fields
    data['CompetitionOpen'] = 12 * (data.year - data.CompetitionOpenSinceYear) + (data.month - data.CompetitionOpenSinceMonth)
    data['CompetitionOpen'] = data.CompetitionOpen.apply(lambda x: x if x > 0 else 0)
    data.drop(['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'], axis=1, inplace=True)
    
    # Process the Promo Open field
    data['PromoOpen'] = 12 * (data.year - data.Promo2SinceYear) + (data.woy - data.Promo2SinceWeek) / float(4)
    data['PromoOpen'] = data.CompetitionOpen.apply(lambda x: x if x > 0 else 0)
    data.drop(['Promo2SinceYear', 'Promo2SinceWeek'], axis=1, inplace=True)
    
    # Get promo months
    data['p_1'] = data.PromoInterval.apply(lambda x: x[:3] if type(x) == str else 0)
    data['p_2'] = data.PromoInterval.apply(lambda x: x[4:7] if type(x) == str else 0)
    data['p_3'] = data.PromoInterval.apply(lambda x: x[8:11] if type(x) == str else 0)
    data['p_4'] = data.PromoInterval.apply(lambda x: x[12:15] if type(x) == str else 0)

    # Get dummies for promo months
    data = pd.get_dummies(data, columns = ['p_1', 'p_2', 'p_3', 'p_4'])
    data.drop(['p_1_0', 'p_2_0', 'p_3_0', 'p_4_0'], axis=1, inplace=True)
    
    # Normalize State Holiday field
    data['StateHoliday'] = data.StateHoliday.apply(lambda x: x if x in ['a', 'b', 'c'] else 0)
    
    # Dummy Coding
    for dummy in ['StateHoliday', 'StoreType', 'Assortment', 'DayOfWeek']:
        # Create dummy columns
        data = pd.get_dummies(data, columns=[dummy])
        
        # Remove original column
        if dummy in data.columns:
            data.drop([dummy], axis=1, inplace=True)
    
    # Fix State Holiday columns, some values are not present in the testing data
    for col in ['StateHoliday_0', 'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c']:
        if col not in data.columns:
            data[col] = np.zeros(len(data.index))
    
    # Drop unused Columns
    data.drop(['PromoInterval'], axis=1, inplace=True)
    
    # Make sure columns are sorted
    data = data.reindex_axis(sorted(data.columns), axis=1)
    
    # training data
    if 'Sales' in data.columns:

        # Consider only open stores for training. Closed stores wont count into the score
        data = data[data.Open != 0]
    
        # Use only Sales bigger then zero
        data = data[data.Sales > 0]

        # Outlier Removal, exclude values bigger than 3 x std
        data = data[((data.Sales - data.Sales.mean()) / data.Sales.std()).abs() < 3]
        
        return data.drop(['Sales', 'Customers'], axis=1), data.Sales
    
    # testing data
    else:
        return data,

def zeroWhenClosed(X, y):
    """Return a copy of `y` with predictions forced to 0 where the store is closed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame with an 'Open' column, row-aligned with `y`.
    y : array-like
        Predicted values, one per row of `X`.

    Returns
    -------
    numpy.ndarray
        Copy of `y` in which every row with ``Open == 0`` is set to 0.

    Only rows explicitly marked closed are zeroed. A missing 'Open' value is
    left alone, matching the estimator pipeline, which imputes missing 'Open'
    as 1 (open); comparing with ``!= 1`` would zero those rows because
    ``NaN != 1`` is True.
    """
    y = np.array(y)  # np.array copies, so the caller's predictions are untouched
    closed = np.asarray(X['Open'] == 0)  # positional mask, independent of X's index
    y[closed] = 0
    return y

In [3]:
# Build the training feature matrix and the raw Sales target from the
# training records joined with the store table.
X_train, y_train = transform(train, stores)

# Create a simple log1p transform
# NOTE(review): yProcessor is kept at module level because the prediction cell
# calls its inverse_transform to map predictions back to the sales scale.
yProcessor = skutils.Log1pTransform()

# Transform the training values
# (y_train is rebound to the transformed target; re-running this cell is safe
# because the first statement recomputes the raw target.)
y_train = yProcessor.transform(y_train)

## Estimator Pipeline

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameters forwarded to the random forest regressor
rfr_params = dict(n_estimators=20)

# Pipeline stages, applied in the order listed
estimators = [
    # NOTE(review): presumably fills missing values per column with the given
    # defaults and everything else with `nan` - confirm in skutils
    ('nan', skutils.NanPreProcessor(
        [('CompetitionDistance', 100000), ('Open', 1)],
        nan=0,
    )),
    # NOTE(review): presumably converts the DataFrame to a plain array - confirm
    ('values', skutils.PandasTransform()),
    # Fixed random_state keeps the fitted forest reproducible between runs
    ('forest', RandomForestRegressor(random_state=42, **rfr_params)),
]

# Chain the stages into a single estimator
clf = Pipeline(estimators)

## Cross Validation

In [5]:
# Score the pipeline on the training data with the project's cross-validation
# helper, using the project's RMSPE scorer for log1p-transformed targets.
# NOTE(review): the fold count and split strategy are decided inside
# skutils.cross_val - confirm there. The recorded score is negative, which
# suggests the scorer is negated so that greater is better - confirm in utils.
scores = skutils.cross_val(clf, X_train, y_train, scoring=utils.rmspe_log1p)

# Report the mean and spread of the per-fold scores
print("Cross validation score %.8f (+- %.8f)" % (scores.mean(), scores.std()))

Cross validation score -0.20870272 (+- 0.07662819)


## Full Training and Prediction

In [6]:
# For frames without a 'Sales' column transform() returns a one-element tuple,
# hence the trailing comma to unpack the single feature frame (indexed by 'Id').
X_test, = transform(test, stores)

In [7]:
# Fit the pipeline on the full training set (target is on the log1p scale)
# and predict for the test rows.
# NOTE(review): this assumes X_train and X_test end up with the same columns in
# the same order; transform() sorts columns but does not align the dummy
# columns between the two frames - confirm before trusting the predictions.
clf.fit(X_train, y_train.values)
y_pred = clf.predict(X_test)

# Inverse transform for the predictions
y_pred = yProcessor.inverse_transform(y_pred)
# Overwrite predictions with 0 for rows that zeroWhenClosed treats as closed
y_pred = zeroWhenClosed(X_test, y_pred)



In [8]:
# Assemble the submission: X_test is indexed by 'Id', so its index supplies the
# Id column that pairs with each prediction.
result = pd.DataFrame({'Id': X_test.index.values, 'Sales': y_pred})
# Written to the current working directory; the name embeds utils.timestamp()
# NOTE(review): presumably so repeated runs do not overwrite each other - confirm.
result.to_csv('submission_%s.csv' % utils.timestamp(), index=False)