In [12]:
import gc
import warnings

import numpy as np
import pandas as pd
import sklearn
import scipy.sparse
import lightgbm as lgb
import matplotlib.pyplot as plt
from pandas import HDFStore

warnings.filterwarnings('ignore')



pd.set_option('display.max_rows',600)
pd.set_option('display.max_columns',50)


from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from tqdm import tqdm_notebook

from itertools import product 

In [13]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe (in place):
                
                `float64` type to `float32`
                `int64`   type to `int32`

        An `int64` column is only downcast when all of its values fit in
        `int32`; otherwise it is left as `int64`, because the cast would
        silently wrap around and corrupt the values.

        Parameters
        ----------
        df : pandas.DataFrame whose columns are converted in place.

        Returns
        -------
        The same DataFrame object that was passed in.
    '''
    int32_limits = np.iinfo(np.int32)

    def _fits_int32(col):
        # An empty column has no value that could overflow.
        if col.empty:
            return True
        return col.min() >= int32_limits.min and col.max() <= int32_limits.max

    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64" and _fits_int32(df[c])]

    # Downcast column by column, so an empty selection is simply a no-op
    for c in float_cols:
        df[c] = df[c].astype(np.float32)
    for c in int_cols:
        df[c] = df[c].astype(np.int32)

    return df

In [3]:
# Folder holding the input CSV files. The trailing slash matters: file names
# are appended to this string directly.
data_path = 'C:/Code/Data Science Code Base/Kaggle Data/Predict_Future_Sales/'

# Read the four input tables in one pass; the order of the names on the left
# matches the order of the file names on the right.
sales, shops, items, item_cats = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('sales_train_v2.csv', 'shops.csv', 'items.csv', 'item_categories.csv')
)

### Creating item/shop month aggregates

In [6]:
# Columns that identify one row of the training grid.
index_cols = ['shop_id','item_id','date_block_num']

# For every month, build the full cartesian product of the shops and the items
# that appear in `sales` for that month, so that shop/item pairs with no sales
# record that month still get a row (filled with 0 after the merges below).
grid = []

for block_num in sales['date_block_num'].unique():
    in_block = sales['date_block_num'] == block_num  # computed once per month
    curr_shops = sales.loc[in_block,'shop_id'].unique()
    curr_items = sales.loc[in_block,'item_id'].unique()
    grid.append(np.array(list(product(curr_shops,curr_items,[block_num])),dtype='int32'))

grid = pd.DataFrame(np.vstack(grid),columns=index_cols,dtype=np.int32)

# Monthly sums of `item_cnt_day` at three levels, each merged onto the grid:
#   target       - per shop/item/month
#   target_shop  - per shop/month
#   target_item  - per item/month
# The sum is computed on the selected column and then renamed. The nested-dict
# form `agg({'item_cnt_day': {'target': 'sum'}})` is rejected by pandas >= 1.0.
aggregates = [
    (index_cols, 'target'),
    (['shop_id','date_block_num'], 'target_shop'),
    (['item_id','date_block_num'], 'target_item'),
]

all_data = grid
for group_cols, target_name in aggregates:
    gb = (sales.groupby(group_cols,as_index=False)['item_cnt_day']
               .sum()
               .rename(columns={'item_cnt_day': target_name}))
    # Left merge keeps every grid row; combinations without sales become 0.
    all_data = pd.merge(all_data,gb,how='left',on=group_cols).fillna(0)

# Downcasting to save memory: 64 to 32 bit
all_data = downcast_dtypes(all_data)
del grid,gb
gc.collect();

### Creating Lag Features 

In [7]:
# Every non-key column currently in `all_data` gets lagged copies.
col_to_rename = list(all_data.columns.difference(index_cols))

# Lags, in units of `date_block_num`.
shift_range = [1,2,3,4,5,12]

for month_shift in shift_range:
    train_shift = all_data[index_cols+col_to_rename].copy()
    # Moving each row forward by `month_shift` blocks means that, after the
    # merge on index_cols, a row receives the values from `month_shift`
    # blocks earlier.
    train_shift['date_block_num'] = train_shift['date_block_num']+month_shift
    lag_names = {col: '{}_lag_{}'.format(col,month_shift) for col in col_to_rename}
    train_shift = train_shift.rename(columns=lag_names)
    # Rows with no matching earlier block get 0 for the lag columns.
    all_data = pd.merge(all_data,train_shift,how='left',on=index_cols).fillna(0)

del train_shift

# Not using old data from 2013: blocks below 12 have no real 12-block lag
# (it would only ever be the 0 fill value).
all_data = all_data[all_data['date_block_num']>=12]

# Feature columns are exactly the lag columns generated above. They are built
# by name rather than matched on the last character, which only caught the
# 12-block lag because '2' also happens to be in shift_range.
fit_cols = ['{}_lag_{}'.format(col,month_shift) for month_shift in shift_range for col in col_to_rename]
# Everything that is neither a lag feature nor a key column, plus date_block_num.
to_drop_cols = list(set(list(all_data.columns))-(set(fit_cols)|set(index_cols))) + ['date_block_num']

# Map Category to items
# (merged after to_drop_cols is computed, so item_category_id is not dropped)
item_cat = items[['item_id','item_category_id']].drop_duplicates()

all_data  = pd.merge(all_data,item_cat,how='left',on='item_id')

all_data = downcast_dtypes(all_data)
gc.collect();

In [9]:
from pandas import HDFStore

# Cache the engineered feature table on disk. The `with` block closes the HDF5
# file handle as soon as the write finishes, instead of leaving it open for
# the lifetime of the kernel.
with HDFStore('all_data.h5') as store:
    store.put('all_data',all_data,format='table',data_columns=True)

In [23]:
# Reload the cached feature table. The `with` block closes the file handle
# after reading. The default open mode is kept so this stays compatible with
# any handle to the same file that is still open in append mode.
with HDFStore('all_data.h5') as store:
    all_data = store['all_data']

In [16]:
# Hold out the most recent block as the test set; everything before it is
# used for training.
dates = all_data['date_block_num']
last_block = dates.max()

# Boolean row masks, computed once and reused below.
is_train = dates < last_block
is_test = dates == last_block

dates_train = dates[is_train]
dates_test = dates[is_test]

# Features: every column except those listed in to_drop_cols.
X_train = all_data.loc[is_train].drop(to_drop_cols,axis=1)
X_test = all_data.loc[is_test].drop(to_drop_cols,axis=1)

# Labels: the `target` column as plain numpy arrays.
y_train = all_data.loc[is_train,'target'].values
y_test = all_data.loc[is_test,'target'].values

### Sampling part of the data because of limited laptop resources :(

In [21]:
# Keep only the rows from block 25 onwards to reduce the amount of data.
# NOTE(review): in file order the train/test frames are built from `all_data`
# before this filter, so it only affects them if that split is re-run
# afterwards - confirm the intended cell order.
recent_rows = all_data['date_block_num'] >= 25
all_data = all_data.loc[recent_rows]

(2178627, 25)

In [22]:
# LightGBM hyper-parameters for this run.
lgb_params = dict(
    feature_fraction=0.75,
    metric='rmse',
    min_data_in_leaf=100,
    bagging_fraction=0.75,
    learning_rate=0.25,
    objective='mse',
    bagging_seed=2**7,
    num_leaves=2**9,
    max_depth=20,
    bagging_freq=1,
    verbose=0,
)

# Fit on the training rows for a fixed 200 boosting rounds.
train_set = lgb.Dataset(X_train,label=y_train)
model = lgb.train(lgb_params,train_set,num_boost_round=200)

# Predictions for the held-out rows and for the training rows themselves.
pred_lgb_val = model.predict(X_test)
pred_lgb_tr = model.predict(X_train)

# Report R-squared on both sets.
r2_test = r2_score(y_test,pred_lgb_val)
r2_train = r2_score(y_train,pred_lgb_tr)

print('Test R-squared for LightGBM is %f'% r2_test)
print('Train R-squared for LightGBM is %f'% r2_train)
# Free-text note recording what was changed for this run.
print("increased num leaves and learning rate,decreased min_data_in_leaf")

Test R-squared for LightGBM is 0.243272
Train R-squared for LightGBM is 0.679944
increased num leaves and learning rate,decreased min_data_in_leaf
