In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from collections import OrderedDict
import math

# 数据理解

目标：

    1、对于每个店铺，预测这个店铺每个商品在某个月的销量。

表说明：

    1、test.csv：测试数据表，其中仅含有店铺id和item_id这两个特征。
    2、sales_train.csv:训练数据表。有每个店铺每个种商品34个月每天的销量数据。还有商品价格。
    3、items.csv：商品的名称，id和类别id信息
    4、item_categories.csv：类别名字
    5、shops.csv：店铺信息

问题：

    1、不是所有的店铺都需要预测。
    2、不是所有的商品都需要预测。
    3、训练特征和测试特征量不匹配。
    4、月份日期是从2013年1月开始（记为0），到2015年10月（记为33），预测2015年11月的一个整个月每个店铺每个商品的销量
    



In [2]:
shop_df=pd.read_csv('./data/shops.csv')
shop_df.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [3]:
item_category_df=pd.read_csv('./data/item_categories.csv')
item_category_df.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [4]:
df_test=pd.read_csv('./data/test.csv')
df_test.head() # 特征就给了两个…需要自己加特征

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [5]:
len(df_test['shop_id'].value_counts())

42

In [6]:
item_data=pd.read_csv('data/items.csv')
print(item_data.shape)
item_data.head()

(22170, 3)


Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [43]:
train_data_raw=pd.read_csv('data/sales_train_v2.csv')
train_data_raw.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [8]:
len(train_data_raw['shop_id'].value_counts())

60

In [9]:
sample_sub=pd.read_csv('data/sample_submission.csv')
sample_sub.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


# Baseline

    1、想法很简单，根据测试数据，从训练数据中筛选出目标店铺和商品。
    2、按月统计。
    3、对于店铺A中的商品a，将33个月的均值作为其预测值

In [47]:
'''
获取目标商店id和商品id
'''
# 获取测试的商店
shop_list=list(set(df_test['shop_id'].tolist()))
# 获取测试的商品
item_list=list(set(df_test['item_id'].tolist()))

In [54]:
'''
筛选出仅含目标商店id和商品id的记录
'''
# 筛选出仅含目标商店的记录
target_df=train_data_raw[train_data_raw['shop_id'].isin(shop_list)]
# 筛选出仅含目标商店和目标商品的记录
target_df=target_df[target_df['item_id'].isin(item_list)]
target_df.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
10,03.01.2013,0,25,2574,399.0,2.0
11,05.01.2013,0,25,2574,399.0,1.0
12,07.01.2013,0,25,2574,399.0,1.0
13,08.01.2013,0,25,2574,399.0,2.0


In [55]:
'''
根据商店id进行分组
'''
shop_dict={}
for key,value in target_df.groupby('shop_id'):
    shop_dict[key]=value

In [56]:
'''
对于每个商店，按照月份，对每种商品进行销量统计
'''
shop_dict.keys()
final_result_dict={}

# 结果list
result_dict={}
for ID,shop_id,item_id in zip(df_test['ID'].tolist(),df_test['shop_id'].tolist(),df_test['item_id'].tolist()):
    result_dict[str(shop_id)+"_"+str(item_id)]=ID

print(len(result_dict))
    
# 对每个商店
for shop_id in shop_dict.keys():
    first_shop=shop_dict[shop_id]
    
    # 对月id和item_id进行groupby，并对其余几项计算sum
    # 得到的是该店铺，每个商品，在每个月的总销量
    first_result=first_shop.groupby(['date_block_num','item_id']).sum()
    
    # 对上面结果，对item_id进行group_by，并对其余几项计算mean
    # 得到的是该店铺，每个商品，在33个月的月平均销量
    first_final_result=first_result.groupby('item_id').mean()

    # 结果
    first_item_id=first_final_result.index.tolist()
    first_item_cnt=first_final_result['item_cnt_day'].tolist()


    for item_id,item_cnt in zip(first_item_id,first_item_cnt):
        final_result_dict[result_dict[str(shop_id)+'_'+str(item_id)]]=item_cnt


# 算下来有近一半的数据是没有的…= -         
len(final_result_dict)
# first_shop


214200


111404

In [57]:
'''
构建提交结果
这个结果大概是1967名，总共2087名
嘛，看起来作为一个baseline足够了
'''
sub_result=[]
for i in range(df_test.shape[0]):
    sub_result.append(final_result_dict.get(i,0))


my_sub_df=pd.DataFrame({'ID':list(range(df_test.shape[0])),'item_cnt_month':sub_result})
len(sub_result)
my_sub_df.to_csv('./my_sub/20190110_mean_baseline.csv',index=False)
my_sub_df.head()

Unnamed: 0,ID,item_cnt_month
0,0,1.444444
1,1,0.0
2,2,2.0
3,3,1.0
4,4,0.0


# 机器学习方法

方法总结：

    1、常用的传统机器学习方法做回归：
        决策树回归
        随机森林回归
        SVR
        L1，L2回归
        GBDT
        LGBM
    2、深度学习方法：
        MLP
        RNN
        其他例如WaveNet之类奇怪的网络

步骤：

    1、由于训练集和测试集的信息格式不一样（训练集按天给，测试集需要月的预测）不对等，需要进行对应的处理。
    2、特征太少，我们需要做一下特征工程。
    

In [10]:
'''
总的目标是将数据处理成和测试数据一样[店铺id,商品id]这样的格式，然后在考虑添加信息
'''

'''
获取目标商店id和商品id
'''
# 获取测试的商店
shop_list=list(set(df_test['shop_id'].tolist()))
# 获取测试的商品
item_list=list(set(df_test['item_id'].tolist()))

'''
筛选出仅含目标商店id和商品id的记录
'''
# 筛选出仅含目标商店的记录
target_df=train_data_raw[train_data_raw['shop_id'].isin(shop_list)]
# 筛选出仅含目标商店和目标商品的记录
target_df=target_df[target_df['item_id'].isin(item_list)]
target_df.head()

'''
根据商店id进行分组
'''
shop_dict={}
for key,value in target_df.groupby('shop_id'):
    shop_dict[key]=value

item_id_list=[]
date_block_num_list=[]
shop_id_list=[]
item_cnt_list=[]


# 对每个商店
for shop_id in shop_dict.keys():
    # 拿到这个商店所有的信息
    first_shop=shop_dict[shop_id]
    # 拿到这个商店每个月的信息
    first_shop_month={}
    for key,value in first_shop.groupby('date_block_num'):
        first_shop_month[key]=value
        
    # 拿到这个商店，每个月，这个商品的全月销量
    for key in first_shop_month.keys():
        temp_sum=first_shop_month[key].groupby('item_id').sum()
        
        item_id_list+=temp_sum.index.tolist()
        date_block_num_list+=[key]*len(temp_sum)
        shop_id_list+=[shop_id]*len(temp_sum)
        item_cnt_list+=temp_sum['item_cnt_day'].tolist()
        
    
    # 对月id和item_id进行groupby，并对其余几项计算sum
    # 得到的是该店铺，每个商品，在每个月的总销量
#     first_result=first_shop.groupby(['date_block_num','item_id']).sum()
#     print(first_result)
#     break

In [80]:
'''
将月份转换成0~11月（共12个月）
并构建新的dataframe
'''
date_block_num_list_cate=[ (aa+1)%12 for aa in date_block_num_list]
my_new_df=pd.DataFrame({'item_id':item_id_list,'date_block_num':date_block_num_list,
                        'date_block_num_cate':date_block_num_list_cate,
                        'shop_id':shop_id_list,'item_cnt':item_cnt_list})

In [12]:
my_new_df

Unnamed: 0,data_block_num,data_block_num_cate,item_cnt,item_id,shop_id
0,0,1,1.0,33,2
1,0,1,1.0,482,2
2,0,1,1.0,491,2
3,0,1,1.0,839,2
4,0,1,3.0,1007,2
5,0,1,1.0,1010,2
6,0,1,2.0,1023,2
7,0,1,1.0,1204,2
8,0,1,1.0,1224,2
9,0,1,1.0,1247,2


## 尝试1：随机森林法

    1、划分训练集和测试集：
        1.1 所有店铺2015年10月份的数据作为测试集
        1.2 剩下的作为验证集

In [13]:
from sklearn.ensemble import RandomForestRegressor

  from numpy.core.umath_tests import inner1d


In [14]:

# 划分训练数据和测试数据
test_data=my_new_df[my_new_df.data_block_num==33]
train_data=my_new_df[my_new_df.data_block_num!=33]

In [15]:
# 定义label和feature
features=['data_block_num_cate','item_id','shop_id']
label=['item_cnt']

In [16]:
'''
拿到train和tset数据
'''
train_x=train_data[features]
train_y=train_data[label]

test_x=test_data[features]
test_y=test_data[label]

In [17]:
RandomForestRegressor??

In [18]:
forest = RandomForestRegressor(n_estimators = 500,criterion='mse',
                                      random_state=1,n_jobs=-1)

In [19]:
forest.fit(train_x,train_y)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [20]:
pred_y=forest.predict(test_x)

In [22]:
# 感觉有点高啊…
pred_y=np.asarray(pred_y,dtype=np.float32)
test_y=np.asarray(test_y,dtype=np.float32)
rmse(pred_y,test_y)

array([14.357409], dtype=float32)

In [154]:
# 全量训练
all_train_x=my_new_df[features]
all_train_y=my_new_df[label]

In [155]:
all_forest = RandomForestRegressor(n_estimators = 1000,criterion='mse',
                                      random_state=1,n_jobs=-1)

In [156]:
all_forest.fit(all_train_x,all_train_y)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [157]:
all_train_x.head()

Unnamed: 0,data_block_num_cate,item_id,shop_id
0,1,33,2
1,1,482,2
2,1,491,2
3,1,839,2
4,1,1007,2


In [162]:
df_test['data_block_num_cate']=[11]*df_test.shape[0]
pred_sub=all_forest.predict(df_test[features])

In [166]:
# 提交结果

# 妈呀，这个结果8.多的loss，效果超差
my_sub_forest_df=pd.DataFrame({'ID':list(range(df_test.shape[0])),'item_cnt_month':pred_sub.tolist()})
my_sub_forest_df.head()
my_sub_forest_df.to_csv('./my_sub/20190110_forest.csv',index=False)

## 尝试2：GBDT

    1、random-forest的效果很差，检查了数据，觉得不是数据问题，问题应该出在forest本身。
    2、因为特征太少了，（一共就3个特征），最深也就3层的数，但是需要预测范围又比较广，因此效果可能不是很好。
    3、我们现在看一下GBDT的效果

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

In [86]:
def train_rfr(rfr_raw_data,features=['item_id','shop_id'],label=['item_cnt_day']):
    
    print(rfr_raw_data.head())
    
    # 划分训练数据和测试数据
    test_data=rfr_raw_data[rfr_raw_data.date_block_num==33]
    train_data=rfr_raw_data[rfr_raw_data.date_block_num!=33]
    # 定义label和feature
#     features=['item_id','shop_id']
#     label=['item_cnt_day']
    '''
    拿到train和tset数据
    '''
    train_x=train_data[features]
    train_y=train_data[label]

    test_x=test_data[features]
    test_y=test_data[label]
    
    print(test_x.head())
    
    rfr = GradientBoostingRegressor()
    
    rfr.fit(train_x,train_y)
    
    predict_rfr=rfr.predict(test_x)
    
    print(rmse(predict_rfr,test_y))
    
    return rfr

In [76]:
# 用每天的数据来预测每月的数据…
# 最后得到的结果，在kagle上RMSE在1.48536
train_rfr(train_data_raw)

  y = column_or_1d(y, warn=True)


[9.7445135]


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [87]:
# 这里直接用每月的数据来测每月的数据…效果奇差
# 感觉是不是要再看一下数据…= - 
train_rfr(my_new_df,features=['item_id','shop_id'],label=['item_cnt'])

   date_block_num  date_block_num_cate  item_cnt  item_id  shop_id
0               0                    1       1.0       33        2
1               0                    1       1.0      482        2
2               0                    1       1.0      491        2
3               0                    1       1.0      839        2
4               0                    1       3.0     1007        2
      item_id  shop_id
7744       31        2
7745      486        2
7746      787        2
7747      794        2
7748      968        2


  y = column_or_1d(y, warn=True)


[14.423452]


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

array([9.7445135], dtype=float32)

In [69]:
pred_rfr_raw=rfr.predict(df_test[features])

In [70]:
my_result_tocsv(pred_rfr_raw,'20190110_rfr_raw.csv')

In [58]:
train_data_raw.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [98]:
aa=[1]
bb=[2]
aa+bb

[1, 2]

In [95]:
first_shop=shop_dict[shop_id]
first_shop_month={}
for key,value in first_shop.groupby('date_block_num'):
    first_shop_month[key]=value
first_shop_month[0]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
31962,16.01.2013,0,2,11391,899.00,1.0
31976,14.01.2013,0,2,10930,199.00,1.0
31986,22.01.2013,0,2,10039,299.00,1.0
31991,08.01.2013,0,2,10391,199.00,1.0
31994,22.01.2013,0,2,10423,1199.00,1.0
31999,19.01.2013,0,2,10669,750.00,1.0
32003,19.01.2013,0,2,11304,349.00,1.0
32005,09.01.2013,0,2,10904,149.00,1.0
32014,25.01.2013,0,2,12361,149.00,1.0
32037,17.01.2013,0,2,12286,100.00,1.0


In [102]:
first_shop_month[1].groupby('item_id').sum()

Unnamed: 0_level_0,date_block_num,shop_id,item_price,item_cnt_day
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
31,4,8,2796.00,4.0
482,1,2,3300.00,1.0
496,1,2,3680.00,1.0
835,2,4,9200.00,2.0
839,1,2,3300.00,1.0
1007,1,2,449.00,1.0
1829,2,4,2998.00,2.0
1916,1,2,249.00,2.0
2252,1,2,599.00,1.0
2308,2,4,1648.00,4.0


# support function

In [29]:
def rmse(y1,y2):
    y1=np.asarray(y1,dtype=np.float32)
    y2=np.asarray(y2,dtype=np.float32)
    result=[ (a-b)**2 for a,b in zip(y1,y2)]
    rmse=sum(result)/len(result)
    return rmse**0.5

In [66]:
def model_test(model,final_test):
    return model.predict(final_test)

In [68]:
def my_result_tocsv(pred_sub,name):
    my_sub_forest_df=pd.DataFrame({'ID':list(range(df_test.shape[0])),'item_cnt_month':pred_sub.tolist()})
    my_sub_forest_df.head()
    my_sub_forest_df.to_csv('./my_sub/'+name,index=False)