In [1]:
import os
import sys
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Directory that holds the CSV files, relative to the notebook's working
# directory.  Input paths and the output path below are all joined onto it.
data_dir = '../data/'

# Positive infinity.  Used further down as the fill value for a missing
# CompetitionDistance and as the threshold when testing for a competitor.
inf = float('inf')

In [2]:
def strtodate(s):
    """Parse a date string written as ``dd/mm/YYYY`` or ``YYYY-mm-dd``.

    The day-first layout is tried first, then the ISO layout.

    Parameters
    ----------
    s : str
        Date text in one of the two accepted layouts.

    Returns
    -------
    datetime.datetime
        The parsed date; the time fields are zero.

    Raises
    ------
    ValueError
        If ``s`` matches neither layout.  The message quotes the offending
        text and names both accepted layouts.
    """
    for fmt in ('%d/%m/%Y', '%Y-%m-%d'):
        try:
            return datetime.strptime(s, fmt)
        except ValueError:
            continue  # not this layout; try the next one

    # Raised outside the loop so the message covers every layout that was
    # tried, not just the last one.
    raise ValueError(
        'date %r matches neither %%d/%%m/%%Y nor %%Y-%%m-%%d' % (s,))

In [3]:
# Read the training set.  The listed columns get explicit dtypes; 'Date' is
# converted to a datetime by strtodate.
train_csvpath = os.path.join(data_dir, 'train.csv')
train_data = pd.read_csv(
    train_csvpath,
    parse_dates=['Date'],
    date_parser=strtodate,
    dtype=dict(
        Store=int,
        DayOfWeek=int,
        Sales=float,
        Open=int,
        Customers=int,
        Promo=int,
        StateHoliday=str,
        SchoolHoliday=int
    )
)

print('\'train.csv\' loaded')

'train.csv' loaded


In [4]:
# One boolean column per StateHoliday code.  Plain column assignment appends
# a new column at the end, exactly like insert(len(columns), ...), but the
# cell can be re-run: DataFrame.insert raises ValueError once the column
# already exists.
for code in ('0', 'a', 'b', 'c'):
    train_data['StateHoliday_' + code] = train_data['StateHoliday'] == code

# DayOfWeek is 1-based with Monday == 1: the rows dated 2015-07-31 (a Friday)
# carry DayOfWeek == 5.  Saturday and Sunday are therefore 6 and 7, so the
# weekend threshold is 6 (a threshold of 5 would count Fridays as weekend).
train_data['Weekends'] = train_data['DayOfWeek'] >= 6
train_data['Weekdays'] = train_data['DayOfWeek'] < 6

print(train_data.iloc[:10])

   Store  DayOfWeek       Date    Sales  Customers  Open  Promo StateHoliday  \
0      1          5 2015-07-31   5263.0        555     1      1            0   
1      2          5 2015-07-31   6064.0        625     1      1            0   
2      3          5 2015-07-31   8314.0        821     1      1            0   
3      4          5 2015-07-31  13995.0       1498     1      1            0   
4      5          5 2015-07-31   4822.0        559     1      1            0   
5      6          5 2015-07-31   5651.0        589     1      1            0   
6      7          5 2015-07-31  15344.0       1414     1      1            0   
7      8          5 2015-07-31   8492.0        833     1      1            0   
8      9          5 2015-07-31   8565.0        687     1      1            0   
9     10          5 2015-07-31   7185.0        681     1      1            0   

   SchoolHoliday StateHoliday_0 StateHoliday_a StateHoliday_b StateHoliday_c  \
0              1           True        

In [5]:
# Show the column dtypes after the derived boolean columns were added.
print(train_data.dtypes)

Store                      int32
DayOfWeek                  int32
Date              datetime64[ns]
Sales                    float64
Customers                  int32
Open                       int32
Promo                      int32
StateHoliday              object
SchoolHoliday              int32
StateHoliday_0              bool
StateHoliday_a              bool
StateHoliday_b              bool
StateHoliday_c              bool
Weekends                    bool
Weekdays                    bool
dtype: object


In [6]:
# Read the per-store attribute table.  The "since" columns are read as float
# here; they are converted to int in a later cell once missing values have
# been filled.
store_csvpath = os.path.join(data_dir, 'store.csv')
store_data = pd.read_csv(
    store_csvpath,
    dtype=dict(
        Store=int,
        StoreType=str,
        Assortment=str,
        CompetitionDistance=float,
        CompetitionOpenSinceMonth=float,
        CompetitionOpenSinceYear=float,
        Promo2=int,
        Promo2SinceWeek=float,
        Promo2SinceYear=float,
        PromoInterval=str
    )
)

print('\'store.csv\' loaded')

'store.csv' loaded


In [7]:
# Fill missing store attributes with sentinels, modifying store_data in place:
# a missing distance becomes inf, a missing "since" value becomes -1 and a
# missing promo interval becomes the empty string.
store_data.fillna(
    value={
        'PromoInterval': '',
        'Promo2SinceYear': -1.,
        'Promo2SinceWeek': -1.,
        'CompetitionOpenSinceYear': -1.,
        'CompetitionOpenSinceMonth': -1.,
        'CompetitionDistance': inf,
    },
    inplace=True)

print(store_data.iloc[:10])

   Store StoreType Assortment  CompetitionDistance  CompetitionOpenSinceMonth  \
0      1         c          a               1270.0                        9.0   
1      2         a          a                570.0                       11.0   
2      3         a          a              14130.0                       12.0   
3      4         c          c                620.0                        9.0   
4      5         a          a              29910.0                        4.0   
5      6         a          a                310.0                       12.0   
6      7         a          c              24000.0                        4.0   
7      8         a          a               7520.0                       10.0   
8      9         a          c               2030.0                        8.0   
9     10         a          a               3160.0                        9.0   

   CompetitionOpenSinceYear  Promo2  Promo2SinceWeek  Promo2SinceYear  \
0                    2008.0       0

In [8]:
# Store the four "since" columns as integers (their missing values were
# replaced by -1 in the previous cell, so no NaN is left to block the cast).
store_data = store_data.astype(
    dict.fromkeys(
        ['CompetitionOpenSinceMonth',
         'CompetitionOpenSinceYear',
         'Promo2SinceWeek',
         'Promo2SinceYear'],
        int),
    copy=False)

print(store_data.dtypes)

Store                          int32
StoreType                     object
Assortment                    object
CompetitionDistance          float64
CompetitionOpenSinceMonth      int32
CompetitionOpenSinceYear       int32
Promo2                         int32
Promo2SinceWeek                int32
Promo2SinceYear                int32
PromoInterval                 object
dtype: object


In [9]:
# Placeholder columns for the per-store attributes; the defaults set here are
# overwritten store by store in the following cell.
#
# Plain assignment is used instead of DataFrame.insert: new columns are still
# appended in this order with the same dtypes (bool / float64), but the cell
# can be re-run, whereas insert raises ValueError once a column exists.
for column, default in [
        ('StoreType_a', False),
        ('StoreType_b', False),
        ('StoreType_c', False),
        ('StoreType_d', False),
        ('Assortment_a', False),
        ('Assortment_b', False),
        ('Assortment_c', False),
        ('HasCompetition', False),
        ('CompetitionDistance', 0.),
        ('IsDoingPromo2', False)]:
    train_data[column] = default

print(train_data.dtypes)

Store                           int32
DayOfWeek                       int32
Date                   datetime64[ns]
Sales                         float64
Customers                       int32
Open                            int32
Promo                           int32
StateHoliday                   object
SchoolHoliday                   int32
StateHoliday_0                   bool
StateHoliday_a                   bool
StateHoliday_b                   bool
StateHoliday_c                   bool
Weekends                         bool
Weekdays                         bool
StoreType_a                      bool
StoreType_b                      bool
StoreType_c                      bool
StoreType_d                      bool
Assortment_a                     bool
Assortment_b                     bool
Assortment_c                     bool
HasCompetition                   bool
CompetitionDistance           float64
IsDoingPromo2                    bool
dtype: object


In [10]:
# Copy each store's static attributes from store_data onto that store's rows
# in train_data.  The target columns (StoreType_*, Assortment_*,
# HasCompetition, CompetitionDistance, IsDoingPromo2) must already exist.
#
# Store ids come from store_data instead of an assumed 1..1115 range, so a
# gap in the ids can no longer hit `.values[0]` on an empty selection
# (IndexError) or set every one-hot flag to True via `.all()` on nothing, and
# ids above 1115 are no longer skipped.
store_ids = store_data['Store'].unique()
num_stores = len(store_ids)  # kept in case a later cell reads it

for i in store_ids:
    store_details = store_data.loc[store_data['Store'] == i]
    # Row mask for this store, computed once instead of once per assignment.
    store_rows = train_data['Store'] == i

    # One-hot encode the store type and the assortment.
    for kind in 'abcd':
        train_data.loc[store_rows, 'StoreType_' + kind] = \
            (store_details['StoreType'] == kind).all()
    for level in 'abc':
        train_data.loc[store_rows, 'Assortment_' + level] = \
            (store_details['Assortment'] == level).all()

    # Missing distances were filled with inf, so a finite distance means a
    # competitor is known.
    if (store_details['CompetitionDistance'] < inf).all():
        cp_open_since_mo = store_details['CompetitionOpenSinceMonth'].values[0]
        cp_open_since_yr = store_details['CompetitionOpenSinceYear'].values[0]

        if cp_open_since_yr == -1:
            # Opening year unknown (-1 sentinel): treat the competitor as
            # present from January 2010 on.
            cp_open_date = datetime(2010, 1, 1)
        else:
            # Year known; use January when only the month is missing rather
            # than failing on month -1.
            cp_open_mo = 1 if cp_open_since_mo == -1 else int(cp_open_since_mo)
            cp_open_date = datetime(int(cp_open_since_yr), cp_open_mo, 1)

        # Two single-column assignments keep each column's dtype; the flag
        # and the distance are only set from the opening date onwards.
        competed_rows = store_rows & (train_data['Date'] >= cp_open_date)
        train_data.loc[competed_rows, 'HasCompetition'] = True
        train_data.loc[competed_rows, 'CompetitionDistance'] = \
            store_details['CompetitionDistance'].values[0]

    if (store_details['Promo2'] == 1).all():
        p2_since_yr = store_details['Promo2SinceYear'].values[0]
        p2_since_wk = store_details['Promo2SinceWeek'].values[0]
        p2_interval = store_details['PromoInterval'].values[0]

        # Monday of the given week of the given year (%W numbers weeks with
        # Monday as the first day).
        p2_start = datetime.strptime(
            'Mon, %d/%d' % (p2_since_yr, p2_since_wk), '%a, %Y/%W')

        # PromoInterval is a comma-separated list of month names.  Only the
        # first three letters are parsed -- presumably because some entries
        # are longer than the three-letter form %b expects (e.g. 'Sept');
        # verify against store.csv.  Empty tokens are skipped.
        promo_months = [datetime.strptime(mo_str[:3], '%b').month
                        for mo_str in p2_interval.split(',') if mo_str]

        train_data.loc[store_rows &
                       train_data['Date'].dt.month.isin(promo_months) &
                       (train_data['Date'] >= p2_start), 'IsDoingPromo2'] = True

In [11]:
# Print a heading followed by the dtypes of the merged frame for a visual check.
print('Check integrity\n')

print(train_data.dtypes)

Check integrity

Store                           int32
DayOfWeek                       int32
Date                   datetime64[ns]
Sales                         float64
Customers                       int32
Open                            int32
Promo                           int32
StateHoliday                   object
SchoolHoliday                   int32
StateHoliday_0                   bool
StateHoliday_a                   bool
StateHoliday_b                   bool
StateHoliday_c                   bool
Weekends                         bool
Weekdays                         bool
StoreType_a                      bool
StoreType_b                      bool
StoreType_c                      bool
StoreType_d                      bool
Assortment_a                     bool
Assortment_b                     bool
Assortment_c                     bool
HasCompetition                   bool
CompetitionDistance           float64
IsDoingPromo2                    bool
dtype: object


In [12]:
# Write the merged training frame into the data directory.  Dates are written
# day-first (dd/mm/YYYY) and the row index is left out of the file.
train_data.to_csv(
    os.path.join(data_dir, 'train_merged.csv'),
    index=False,
    date_format='%d/%m/%Y')