In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import time
from datetime import datetime
import datetime

types = {'CompetitionOpenSinceYear': np.dtype(int),
         'CompetitionOpenSinceMonth': np.dtype(int),
         'CompetitionDistance' : np.dtype(int),
         'StateHoliday': np.dtype(str),
         'Promo2SinceWeek': np.dtype(int),
         'Promo2SinceYear': np.dtype(int),
         'SchoolHoliday': np.dtype(float),
         'PromoInterval': np.dtype(str)}

In [2]:
train = pd.read_csv('D://kishore//train.csv',
                    parse_dates=['Date'], dtype=types)

test = pd.read_csv('D://kishore//test.csv',
                   parse_dates=['Date'],dtype=types)

store = pd.read_csv('D://kishore//store.csv')

In [3]:
def calcDates(df):
    df['Month'] = df.Date.dt.month
    df['Year'] = df.Date.dt.year
    df['Day'] = df.Date.dt.day
    df['WeekOfYear'] = df.Date.dt.weekofyear
    # Year-Month 2015-08 
    # will be used for monthly sale calculation:
    df['YearMonth'] = df['Date'].apply(lambda x:(str(x)[:7]))
    return df


train = pd.merge(train,store,on='Store')
test = pd.merge(test,store,on='Store')

train = calcDates(train)
test = calcDates(test)

In [4]:
def cleanPromoCompetition(df,drop=False):
    # ========== Fixing promo2 ============
    df.PromoInterval.fillna(0,inplace=True)
    monthAsString = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',
                     7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}

    # Using string format of month names to extract info from promo interval column                 
    df['SMonth'] = df.Month.map(monthAsString)
    # Fixing NaN values in promo interval when there is no promotion
    df.loc[df.PromoInterval==0,'PromoInterval'] = ''

    # New feature: 
    #     IsPromo2Month: 
    #     0 if month is not among PromoInterval
    #     1 if it is


    df['IsPromo2Month'] = 0
    for interval in df.PromoInterval.unique():
        if interval != '':
            for month in interval.split(','):
                condmatch = (df.SMonth == month) & (df.PromoInterval == interval)
                # If promo started this year, Week of Year must be > Promo2SinceWeek
                cond1 = (condmatch & (df.Year == df.Promo2SinceYear)
                         & (df.WeekOfYear >= df.Promo2SinceWeek) )
                # Or If promo started previous year, Week of Year doesn't matter
                cond2 = condmatch & (df.Year > df.Promo2SinceYear)
                fullcond = cond1 | cond2
                df.loc[fullcond, 'IsPromo2Month'] = 1

     # ======= Fixing Competition =============
    df.CompetitionOpenSinceYear.fillna(0,inplace=True)
    df.CompetitionOpenSinceMonth.fillna(0,inplace=True)

    # New feature: 
    #    Competition:
    #    1 if there exist a compettion at date = today
    #    0 otherwise

    df['Competition'] = 0
    cond1 = df.Year > df.CompetitionOpenSinceYear
    cond2 = ((df.Year == df.CompetitionOpenSinceYear)
             & (df.Month >= df.CompetitionOpenSinceMonth))
    fullcond = cond1 | cond2
    df.loc[fullcond, 'Competition'] = 1

    if (drop):
        df = df.drop(['SMonth','PromoInterval','Promo2SinceYear','Promo2SinceWeek'],1)
        df = df.drop(['CompetitionOpenSinceMonth','CompetitionOpenSinceYear'],1)

    return df

train = cleanPromoCompetition(train,drop=True)
test = cleanPromoCompetition(test,drop=True)

In [5]:
trainOpen = train[train.Open == 1][['Store','YearMonth','Sales']]
monthlySale  = trainOpen.groupby(['Store','YearMonth'],as_index=False).mean()


#====== Finding renovated stores ========

renovatedStores = []
for store in train.Store.unique():
    # Renovated stores are close before 2015 for more than 2 month
    if len(monthlySale[monthlySale.Store==store]) < 29:
        renovatedStores.append(store)


#print(renovatedStores)

def createRenovation(df,renovatedStores):

    # New features:
    # StoreRenovated: 1 if it is, 0 otherwise
    # DaysAfterRenovation: 0 if date is before renovation, 1 if it is after
    df['StoreRenovated'] = 0
    df['DaysAfterRenovation'] = 0
    for store in renovatedStores:
        df.loc[df.Store == store,'StoreRenovated'] = 1
        # Renovated stores are back to open state in 2015
        df.loc[(df.Store == store) & (df.Year == 2015), 'DaysAfterRenovation'] = 1

    return df


train = createRenovation(train,renovatedStores)
test  = createRenovation(test,renovatedStores)



monthlySale['MonthSale'] = monthlySale.Sales
monthlySale = monthlySale.drop(['Sales'],1)

# New feature: MonthSale:
# Average of monthly sale for each store
# Adding monthly sale to train set:
train = pd.merge(train,monthlySale,on=['Store','YearMonth'])


# Small NaN Fix on test, only 1 case which is in fact open
test.Open.fillna(1,inplace=True)


#train = train.sort_values(by = 'Date')
train.to_csv('D://kishore//trainCleaned.csv')
test.to_csv('D://kishore//testCleaned.csv')

In [119]:
train_r = pd.read_csv('D://kishore//trainCleaned_new.csv')
test_r = pd.read_csv('D://kishore//testCleaned_new.csv')

train_r.drop(['Unnamed: 0'], axis=1, inplace=True)
test_r.drop(['Unnamed: 0'], axis=1, inplace=True)


In [120]:
train_r['CompetitionDistance'].mean()


1270.0

In [121]:
test_r['CompetitionDistance'].mean()


1270.0

In [122]:
train_r.CompetitionDistance = train_r.CompetitionDistance.fillna(value=train_r['CompetitionDistance'].mean())
test_r.CompetitionDistance = test_r.CompetitionDistance.fillna(value=test_r['CompetitionDistance'].mean())

In [123]:
train_r.isnull().sum()
test_r.isnull().sum()

Id                     0
Store                  0
DayOfWeek              0
Date                   0
Open                   0
Promo                  0
StateHoliday           0
SchoolHoliday          0
StoreType              0
Assortment             0
CompetitionDistance    0
Promo2                 0
Month                  0
Year                   0
Day                    0
WeekOfYear             0
YearMonth              0
IsPromo2Month          0
Competition            0
StoreRenovated         0
DaysAfterRenovation    0
dtype: int64

In [124]:
train_r.columns


Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'Promo2', 'Month', 'Year', 'Day', 'WeekOfYear',
       'YearMonth', 'IsPromo2Month', 'Competition', 'StoreRenovated',
       'DaysAfterRenovation', 'MonthSale'],
      dtype='object')

In [125]:
train_r.describe()


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,SchoolHoliday,CompetitionDistance,Promo2,Month,Year,Day,WeekOfYear,IsPromo2Month,Competition,StoreRenovated,DaysAfterRenovation,MonthSale
count,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0
mean,1.0,3.997877,41745.5,3945.704883,467.646497,0.829087,0.382166,0.204883,1270.0,0.0,5.961783,2013.83758,15.707006,24.098726,0.0,1.0,0.0,0.0,4767.722074
std,0.0,1.998404,272.076276,2015.41784,228.93085,0.376633,0.486175,0.403831,0.0,0.0,3.350357,0.766009,8.794449,14.547508,0.0,0.0,0.0,0.0,514.93483
min,1.0,1.0,41275.0,0.0,0.0,0.0,0.0,0.0,1270.0,0.0,1.0,2013.0,1.0,1.0,0.0,1.0,0.0,0.0,4092.230769
25%,1.0,2.0,41510.25,3588.25,463.0,1.0,0.0,0.0,1270.0,0.0,3.0,2013.0,8.0,12.0,0.0,1.0,0.0,0.0,4443.125
50%,1.0,4.0,41745.5,4373.5,529.0,1.0,0.0,0.0,1270.0,0.0,6.0,2014.0,16.0,23.0,0.0,1.0,0.0,0.0,4600.076923
75%,1.0,6.0,41980.75,5220.0,598.75,1.0,1.0,0.0,1270.0,0.0,9.0,2014.0,23.0,36.0,0.0,1.0,0.0,0.0,4907.538462
max,1.0,7.0,42216.0,9528.0,1130.0,1.0,1.0,1.0,1270.0,0.0,12.0,2015.0,31.0,52.0,0.0,1.0,0.0,0.0,6441.04


In [126]:
train_r.drop(train_r[train_r.Open == 1].index, inplace=True)
train_r.drop(train_r[train_r.Sales != 0].index, inplace=True)

In [127]:
train_r.drop(['YearMonth'], axis=1, inplace=True)
test_r.drop(['YearMonth'], axis=1, inplace=True)

In [128]:
train_r['YearTrend']=0
test_r['YearTrend']=0
train_r['TimeInMonth'] = (train_r['Year'] - 2013)*12 + train_r['Month']
test_r['TimeInMonth'] = (test_r['Year'] - 2013)*12 + test_r['Month']

In [129]:
growthfitnopromo = None
growthfitwithpromo = None
growthfit0nopromo = None
growthfit0withpromo = None
growthfit1nopromo = None
growthfit1withpromo = None

In [130]:
#for i in train_r['Date']:
 #   date_format = '%Y-%m-%d'
  #  a = datetime.datetime.strptime('1899-12-30', date_format)
   # b = datetime.datetime.strptime(d, date_format)
   # delta = b - a
   # train_r[i, 'date']= delta
            

In [131]:
#x_values =pd.DataFrame()
#for i in train_r['Store'].unique():
    #print("fitting store i=",i,"\n")
Store =  train_r[(train['Store']==1) & (train['Month'] !=12)][['Store', 'DayOfWeek', 'Date', 'Sales', 
                                                                                'Customers', 'Open', 'Promo','StateHoliday', 
                                                                                'SchoolHoliday', 'StoreType', 'Assortment',
                                                                                'CompetitionDistance', 'Promo2', 'Month',
                                                                                'Year', 'Day', 'WeekOfYear','IsPromo2Month',
                                                                                'Competition', 'StoreRenovated',
                                                                                'DaysAfterRenovation', 'MonthSale']]
#if Store['StoreRenovated'].iloc[1] == 0:
x = Store['Date']
y = Store['Sales']
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(x, y)

# Make predictions using the testing set
Store_y_pred = regr.predict(test_r)

  after removing the cwd from sys.path.


ValueError: Expected 2D array, got 1D array instead:
array=[42211 42204 42197 42190 42183 42176 42169 42162 42159 42155 42149 42148
 42141 42138 42134 42127 42125 42120 42113 42106 42100 42099 42097 42092
 42085 42078 42071 42064 42057 42050 42043 42036 42029 42022 42015 42008
 42005 41973 41966 41959 41952 41945 41938 41931 41924 41917 41915 41910
 41903 41896 41889 41882 41875 41868 41861 41854 41847 41840 41833 41826
 41819 41812 41809 41805 41799 41798 41791 41788 41784 41777 41770 41763
 41760 41756 41750 41749 41747 41742 41735 41728 41721 41714 41707 41700
 41693 41686 41679 41672 41665 41658 41651 41644 41640 41602 41595 41588
 41581 41574 41567 41560 41553 41550 41546 41539 41532 41525 41518 41511
 41504 41497 41490 41483 41476 41469 41462 41455 41448 41441 41434 41427
 41424 41420 41414 41413 41406 41403 41399 41395 41392 41385 41378 41371
 41365 41364 41362 41357 41350 41343 41336 41329 41322 41315 41308 41301
 41294 41287 41280 41275].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [82]:
Store =  train_r[(train['Store']==1) & (train['Month'] !=12)][['Store', 'DayOfWeek', 'Date', 'Sales', 
                                                                                'Customers', 'Open', 'Promo','StateHoliday', 
                                                                                'SchoolHoliday', 'StoreType', 'Assortment',
                                                                                'CompetitionDistance', 'Promo2', 'Month',
                                                                                'Year', 'Day', 'WeekOfYear','IsPromo2Month',
                                                                                'Competition', 'StoreRenovated',
                                                                                'DaysAfterRenovation', 'MonthSale']]
if Store['StoreRenovated'].iloc[1] == 0:
    date = ((Store['Date']))
    for d in date:
        date_format = '%Y-%m-%d'
        a = datetime.datetime.strptime('1899-12-30', date_format)
        b = datetime.datetime.strptime(d, date_format)
        delta = b - a
        print (delta.days) # that's it


#    y = Store[Store['Promo']Promo==0,]Store['Sales']
    

42211
42204
42197
42190
42183
42176
42169
42162
42159
42155
42149
42148
42141
42138
42134
42127
42125
42120
42113
42106
42100
42099
42097
42092
42085
42078
42071
42064
42057
42050
42043
42036
42029
42022
42015
42008
42005
41973
41966
41959
41952
41945
41938
41931
41924
41917
41915
41910
41903
41896
41889
41882
41875
41868
41861
41854
41847
41840
41833
41826
41819
41812
41809
41805
41799
41798
41791
41788
41784
41777
41770
41763
41760
41756
41750
41749
41747
41742
41735
41728
41721
41714
41707
41700
41693
41686
41679
41672
41665
41658
41651
41644
41640
41602
41595
41588
41581
41574
41567
41560
41553
41550
41546
41539
41532
41525
41518
41511
41504
41497
41490
41483
41476
41469
41462
41455
41448
41441
41434
41427
41424
41420
41414
41413
41406
41403
41399
41395
41392
41385
41378
41371
41365
41364
41362
41357
41350
41343
41336
41329
41322
41315
41308
41301
41294
41287
41280
41275


  """Entry point for launching an IPython kernel.


In [83]:
def excel_date(date1):
    date_format = '%Y-%m-%d'
    a = datetime.datetime.strptime('1899-12-30', date_format)
    b = datetime.datetime.strptime(date1, date_format)
    delta = b - a
    return delta

In [61]:
 Store =  train_r[(train['Store']==1114) & (train['Month'] !=12)][['Store', 'DayOfWeek', 'Date', 'Sales', 
                                                                                'Customers', 'Open', 'Promo','StateHoliday', 
                                                                                'SchoolHoliday', 'StoreType', 'Assortment',
                                                                                'CompetitionDistance', 'Promo2', 'Month',
                                                                                'Year', 'Day', 'WeekOfYear','IsPromo2Month',
                                                                                'Competition', 'StoreRenovated',
                                                                                'DaysAfterRenovation', 'MonthSale']]
    

  """Entry point for launching an IPython kernel.


In [88]:
x

datetime.timedelta(41275)