In [2]:
import pandas as pd
import datetime
import csv
import numpy as np
import xgboost as xgb
import itertools
import os
import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.base import TransformerMixin
from sklearn import cross_validation
from matplotlib import pylab as plt
plot = True
goal = 'Sales'
myid = 'Id'


In [4]:
def ToWeight(y):
    w = np.zeros(y.shape,dtype=float)
    ind = y!=0
    w[ind] = 1./(y[ind]**2)
    return w
    
def rmspe(yhat,y):
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w*(y-yhat)**2))
    return rmspe

def rmspe_xg(yhat,y):
    y = y.get_label() 
    y = np.exp(y) -1
    yhat = np.exp(yhat)-1
    w = ToWeight(y)
    rmspe = np.sqrt(np.mean(w*(y - yhat)**2))
    return 'rmspe',rmspe

In [5]:
store = pd.read_csv('./store.csv')
store.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
Store                        1115 non-null int64
StoreType                    1115 non-null object
Assortment                   1115 non-null object
CompetitionDistance          1112 non-null float64
CompetitionOpenSinceMonth    761 non-null float64
CompetitionOpenSinceYear     761 non-null float64
Promo2                       1115 non-null int64
Promo2SinceWeek              571 non-null float64
Promo2SinceYear              571 non-null float64
PromoInterval                571 non-null object
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB


In [6]:
test_df = pd.read_csv('./test.csv')

test_df.head()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41088 entries, 0 to 41087
Data columns (total 8 columns):
Id               41088 non-null int64
Store            41088 non-null int64
DayOfWeek        41088 non-null int64
Date             41088 non-null object
Open             41077 non-null float64
Promo            41088 non-null int64
StateHoliday     41088 non-null object
SchoolHoliday    41088 non-null int64
dtypes: float64(1), int64(5), object(2)
memory usage: 2.5+ MB


In [7]:
def load_data():
    store = pd.read_csv('./store.csv')
    train_org = pd.read_csv('./train.csv',dtype={'StateHoliday':pd.np.string_})
    test_org = pd.read_csv('./test.csv',dtype={'StateHoliday':pd.np.string_})
    train = pd.merge(train_org,store,on='Store',how='left')
    test = pd.merge(test_org,store,on='Store',how='left')
    feature = test.columns.tolist()
    numerics = ['int16','int32','int64','float16','float32','float64']
    feature_numeric = test.select_dtypes(include = numerics).columns.tolist()
    feature_non_numeric = [f for f in feature if f not in feature_numeric]
    return (train,test,feature,feature_non_numeric)

In [105]:
def process_data(train,test,features,features_non_numeric):
    train = train[train['Sales']>0]

    for data in [train,test]:
        data['year'] = data.Date.apply(lambda x : x.split('-')[0])
        data['year'] = data['year'].astype(float)
        data['month']= data.Date.apply(lambda x : x.split('-')[1])
        data['month']= data['month'].astype(float)
        data['day']  = data.Date.apply(lambda x : x.split('-')[2])
        data['day']  = data['day'].astype(float)
        
        data['promojan'] = data.PromoInterval.apply(lambda x:0 if isinstance(x,float) else 1 if 'Jan' in x else 0)
        data['promofeb'] = data.PromoInterval.apply(lambda x:0 if isinstance(x,float) else 1 if 'Feb' in x else 0)
        data['promomar'] = data.PromoInterval.apply(lambda x:0 if isinstance(x,float) else 1 if 'Mar' in x else 0)
        data['promoapr'] = data.PromoInterval.apply(lambda x:0 if isinstance(x,float) else 1 if 'Apr' in x else 0)
        data['promomay'] = data.PromoInterval.apply(lambda x:0 if isinstance(x,float) else 1 if 'May' in x else 0)
        data['promojun'] = data.PromoInterval.apply(lambda x:0 if isinstance(x,float) else 1 if 'Jun' in x else 0)
        data['promojul'] = data.PromoInterval.apply(lambda x:0 if isinstance(x,float) else 1 if 'Jul' in x else 0)
        data['promoaug'] = data.PromoInterval.apply(lambda x:0 if isinstance(x,float) else 1 if 'Aug' in x else 0)
        data['promosep'] = data.PromoInterval.apply(lambda x:0 if isinstance(x,float) else 1 if 'Sep' in x else 0)
        data['promooct'] = data.PromoInterval.apply(lambda x:0 if isinstance(x,float) else 1 if 'Oct' in x else 0)
        data['promonov'] = data.PromoInterval.apply(lambda x:0 if isinstance(x,float) else 1 if 'Nov' in x else 0)
        data['promodec'] = data.PromoInterval.apply(lambda x:0 if isinstance(x,float) else 1 if 'Dec' in x else 0)
        
    noisy_features = [myid,'Date']
    features = [c for c in features if c not in noisy_features]
    features_non_numeric = [c for c in features_non_numeric if c not in noisy_features]
    features.extend(['year','month','day'])
    class DataFrameInputer(TransformerMixin):
        
        def __init__(self):
                """
                """
        def fit(self, X, y=None):
            self.fill = pd.Series([X[c].value_counts().index[0]
                                      if X[c].dtype==np.dtype('O') else X[c].mean() for c in X],
                                      index = X.columns)
            return self
        def transform(self,X,y=None):
            return X.fillna(self.fill)
        
    train = DataFrameInputer().fit_transform(train)
    test  = DataFrameInputer().fit_transform(test)
        
    le = LabelEncoder()
    for col in features_non_numeric:
        le.fit(list(train[col])+list(test[col]))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
        
    scaler = StandardScaler()
    for col in set(features) - set(features_non_numeric) - set([]):
        try:
            scaler.fit(list(train[col])+list(test[col]))
        except:
            print(col)
        train[col] = scaler.transform(train[col])
        test[col] = scaler.transform(test[col])
    return (train,test,features,features_non_numeric)

In [80]:
train,test,features,features_non_numeric = load_data()
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
Store                        1017209 non-null int64
DayOfWeek                    1017209 non-null int64
Date                         1017209 non-null object
Sales                        1017209 non-null int64
Customers                    1017209 non-null int64
Open                         1017209 non-null int64
Promo                        1017209 non-null int64
StateHoliday                 1017209 non-null object
SchoolHoliday                1017209 non-null int64
StoreType                    1017209 non-null object
Assortment                   1017209 non-null object
CompetitionDistance          1014567 non-null float64
CompetitionOpenSinceMonth    693861 non-null float64
CompetitionOpenSinceYear     693861 non-null float64
Promo2                       1017209 non-null int64
Promo2SinceWeek              509178 non-null float64
Promo2SinceYear              509178 non-null f

In [103]:
train.fillna(0)
train.groupby('CompetitionOpenSinceMonth').count()
train['CompetitionOpenSinceYear']

0          2008.0
1          2007.0
2          2006.0
3          2009.0
4          2015.0
5          2013.0
6          2013.0
7          2014.0
8          2000.0
9          2009.0
10         2011.0
11            NaN
12            NaN
13         2014.0
14         2010.0
15            NaN
16         2005.0
17         2010.0
18            NaN
19         2009.0
20         1999.0
21            NaN
22         2005.0
23         2000.0
24         2003.0
25            NaN
26         2005.0
27         2014.0
28            NaN
29         2014.0
            ...  
1017179    2013.0
1017180    2011.0
1017181    2009.0
1017182    2009.0
1017183       NaN
1017184       NaN
1017185    2000.0
1017186    2009.0
1017187    2013.0
1017188    2007.0
1017189       NaN
1017190    2002.0
1017191    2004.0
1017192    2013.0
1017193       NaN
1017194    2012.0
1017195    2012.0
1017196    2006.0
1017197    2012.0
1017198    2008.0
1017199    2011.0
1017200    2012.0
1017201    2004.0
1017202    2011.0
1017203   

In [106]:
train,test,features,features_non_numeric = process_data(train,test,features,features_non_numeric)
train['CompetitionOpenSinceYear']

0         -0.140119
1         -0.341682
2         -0.543245
3          0.061444
4          1.270821
5          0.867695
6          0.867695
7          1.069258
8         -1.752622
9          0.061444
10         0.464570
11         0.000522
12         0.000522
13         1.069258
14         0.263007
15         0.000522
16        -0.744808
17         0.263007
18         0.000522
19         0.061444
20        -1.954185
21         0.000522
22        -0.744808
23        -1.752622
24        -1.147933
25         0.000522
26        -0.744808
27         1.069258
28         0.000522
29         1.069258
             ...   
1016082   -0.543245
1016083    0.666133
1016084   -0.140119
1016085    0.464570
1016086    0.666133
1016087   -0.946371
1016088    0.464570
1016089    0.263007
1016090    1.069258
1016091   -0.543245
1016092    0.000522
1016093    0.000522
1016094    0.000522
1016179    0.464570
1016353    0.000522
1016356    0.867695
1016368    0.000522
1016429    0.000522
1016447    0.000522


In [259]:
def XGB_native(train,test,features,features_non_numeric):
    depth =13
    eta = 0.01
    ntrees = 3000
    mcw = 3
    params = {'objective':'reg:linear',
             'booster':'gbtree',
             'eta':eta,
              'mx_depth':depth,
              'min_child_weight':mcw,
              'subsample':0.9,
              'colsample_bytree':0.7,
              'silent':1
             }
    print "params"+str(params)
    print 'featres'+str(features)
    
    tsize = 0.05
    X_train, X_test = cross_validation.train_test_split(train, test_size=tsize)
    dtrain = xgb.DMatrix(X_train[features], np.log(X_train[goal] + 1))
    dvalid = xgb.DMatrix(X_test[features], np.log(X_test[goal] + 1))
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
#     gbm = xgb.train(params,dtrain,ntrees,evals=watchlist,early_stopping_rounds=100,feval=rmspe_xg,verbose_eval=True)
    gbm = xgb.train(params, dtrain, ntrees, evals=watchlist, early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)
   
    indices = train_probs<0
    train_probs[indices] = 0
    error = rmspe(np.exp(train_probs)-1,X_test[goal].values)
    print "````````",error
    
    test_probs = gbm.predict(xgb.DMatrix(test[features]))
    indices = test_probs<0
    test_probs[indices] = 0
    submission = pd.DataFrame({myid:test[myid],goal:np.exp(test_probs)-1})
    if os.path.exists('result/'):
        os.makedirs('result/')
    submission.to_csv('./result/data_xgb_d%s_eta%s_ntree%s_mcw%s_tsze%s.csv'%(str(depth),str(eta),str(ntrees),str(mcw),str(tsize)),index=False)
    
    if plot:
        outfile = open('xgb.fmap','w')
        i = 0
        for feat in features:
            outfile.write('{0}\t{1}\tq\n'.format(i,feat))
            i = i +1
        outfile.close()
        importance = gbm.get_fscore(fmap='xgb.fmap')
        importance = sorted(importance.items(),key = operator.itemgetter(1))
        df = pd.DataFrame(importance,columns=['feature','fscore'])
        df['fscore'] = df['fscore'] /df['fscore'].sum()
        
        plt.figure()
        df.plot()
        df.plot(kind = 'barh',x = 'feature',y = 'fscore',legend=False,figsize = (25,15))
        plt.title('XGBoost Feature Importance')
        plt.xlabel('relative importance')
        plt.gcf.savefig('feature_importance_xgb_d%s_eta%s_mvw%s_tsize%s.png'%(str(depth),str(eta),str(ntrees),str(mcw),str(tsize)))
        
        

In [260]:
# train,test,features,features_non_numeric = load_data()
# train,test,features,features_non_numeric = process_data(train,test,features,features_non_numeric)
# train.head()

In [261]:
XGB_native(train,test,features,features_non_numeric)

params{'subsample': 0.9, 'eta': 0.01, 'colsample_bytree': 0.7, 'silent': 1, 'objective': 'reg:linear', 'mx_depth': 13, 'min_child_weight': 3, 'booster': 'gbtree'}
featres['Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'year', 'month', 'day']
[0]	eval-rmspe:0.999864	train-rmspe:0.999864
Multiple eval metrics have been passed: 'train-rmspe' will be used for early stopping.

Will train until train-rmspe hasn't improved in 100 rounds.
[1]	eval-rmspe:0.999837	train-rmspe:0.999837
[2]	eval-rmspe:0.999809	train-rmspe:0.999809
[3]	eval-rmspe:0.999779	train-rmspe:0.999779
[4]	eval-rmspe:0.999747	train-rmspe:0.999747
[5]	eval-rmspe:0.999712	train-rmspe:0.999712
[6]	eval-rmspe:0.999675	train-rmspe:0.999675
[7]	eval-rmspe:0.999636	train-rmspe:0.999636
[8]	eval-rmspe:0.999593	train-rmspe:0.999594
[9]	eval

NameError: global name 'train_probs' is not defined