In [1]:
import pandas as pd
import numpy as np
import itertools
import xgboost as xgb

In [9]:
train_data = pd.read_csv('./data/sales_train.csv')
test_data = pd.read_csv('./data/test.csv')
item_data = pd.read_csv('./data/items.csv')

In [3]:
#将训练数据中的按天销量改为按月销量
train_data = train_data.drop(train_data[train_data.item_price < 0].index | train_data[train_data.item_price >= 100000].index)
train_data = train_data.drop(train_data[train_data.item_cnt_day < 0].index | train_data[train_data.item_cnt_day >= 1000].index)

train_data1 = pd.pivot_table(train_data, index=['shop_id','item_id','date_block_num'], values=['item_cnt_day'], aggfunc=[np.sum],fill_value=0).reset_index();
train_data1.columns = train_data1.columns.droplevel().map(str)
train_data1 = train_data1.reset_index(drop=True).rename_axis(None, axis=1)
train_data1.columns.values[0]="shop_id";
train_data1.columns.values[1]="item_id";
train_data1.columns.values[2]="month_id";
train_data1.columns.values[3]="item_cnt_month";

In [4]:
print(train_data1)

         shop_id  item_id  month_id  item_cnt_month
0              0       30         1              31
1              0       31         1              11
2              0       32         0               6
3              0       32         1              10
4              0       33         0               3
...          ...      ...       ...             ...
1608219       59    22164        27               2
1608220       59    22164        30               1
1608221       59    22167         9               1
1608222       59    22167        11               2
1608223       59    22167        17               1

[1608224 rows x 4 columns]


In [5]:
#数据增强
#但是如果商店该月没有销售该商品，则上面的数据中则没有该条数据。而实际上这时候该商店该月该商品的销售量应该等于0，这样更能体现商品、商店、销量的关系
matrix = []
cols = ['month_id','shop_id','item_id']
for i in range(34):
    sales = train_data[train_data.date_block_num==i]
    matrix.append(np.array(list(itertools.product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['month_id'] = matrix['month_id'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix.sort_values(cols, inplace=True)
print(matrix.info())

train_data1 = pd.pivot_table(train_data, index=['shop_id','item_id','date_block_num'], values=['item_cnt_day'], aggfunc=[np.sum],
fill_value=0).reset_index();
train_data1.columns = train_data1.columns.droplevel().map(str)
train_data1 = train_data1.reset_index(drop=True).rename_axis(None, axis=1)
train_data1.columns.values[0]="shop_id";
train_data1.columns.values[1]="item_id";
train_data1.columns.values[2]="month_id";
train_data1.columns.values[3]="item_cnt_month";

#合并matrix和train_data1数据
train_data1 = pd.merge(matrix, train_data1, on=["shop_id", "item_id", "month_id"], how='left')
train_data1['item_cnt_month'] = (train_data1['item_cnt_month'].fillna(0).astype(np.float16))
train_data1["month_id1"] = train_data1["month_id"] % 12;
print(train_data1.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10884508 entries, 139168 to 10741223
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   month_id  int8 
 1   shop_id   int8 
 2   item_id   int16
dtypes: int16(1), int8(2)
memory usage: 124.6 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10884508 entries, 0 to 10884507
Data columns (total 5 columns):
 #   Column          Dtype  
---  ------          -----  
 0   month_id        int8   
 1   shop_id         int8   
 2   item_id         int16  
 3   item_cnt_month  float16
 4   month_id1       int8   
dtypes: float16(1), int16(1), int8(3)
memory usage: 155.7 MB
None


In [6]:
#限定销量在20以内
train_data1['item_cnt_month'] = (train_data1['item_cnt_month'].fillna(0).clip(0,20).astype(np.float16))

In [10]:
#增加其他特征
train_data2 = pd.pivot_table(train_data, index=['shop_id','item_id','date_block_num'], values=['item_cnt_day'], aggfunc=[np.sum],fill_value=0).reset_index();
train_data2.columns = train_data2.columns.droplevel().map(str)
train_data2 = train_data2.reset_index(drop=True).rename_axis(None, axis=1)
train_data2.columns.values[0]="shop_id";
train_data2.columns.values[1]="item_id";
train_data2.columns.values[2]="month_id";
train_data2.columns.values[3]="item_cnt_month1";
train_data2["month_id"] = train_data2["month_id"] + 1;

item_price1 = pd.pivot_table(train_data, index=['item_id','date_block_num'], values=['item_price'], aggfunc=[np.mean], fill_value=0).reset_index();
item_price1.columns = item_price1.columns.droplevel().map(str)
item_price1 = item_price1.reset_index(drop=True).rename_axis(None, axis=1)
item_price1.columns.values[0]="item_id";
item_price1.columns.values[1]="month_id";
item_price1.columns.values[2]="item_mean_price1";
item_price1["month_id"] = item_price1["month_id"] + 1;

shop_cnt = pd.pivot_table(train_data1, index=['shop_id','month_id'], values=['item_cnt_month'], aggfunc=[np.mean], fill_value=0).reset_index();
shop_cnt.columns = shop_cnt.columns.droplevel().map(str)
shop_cnt = shop_cnt.reset_index(drop=True).rename_axis(None, axis=1)
shop_cnt.columns.values[0]="shop_id";
shop_cnt.columns.values[1]="month_id";
shop_cnt.columns.values[2]="shop_cnt_month1";
shop_cnt["month_id"] = shop_cnt["month_id"] + 1;

item_cnt = pd.pivot_table(train_data1, index=['item_id','month_id'], values=['item_cnt_month'], aggfunc=[np.mean], fill_value=0).reset_index();
item_cnt.columns = item_cnt.columns.droplevel().map(str)
item_cnt = item_cnt.reset_index(drop=True).rename_axis(None, axis=1)
item_cnt.columns.values[0]="item_id";
item_cnt.columns.values[1]="month_id";
item_cnt.columns.values[2]="item_total_month1";
item_cnt["month_id"] = item_cnt["month_id"] + 1;


combined_data = train_data1.append(test_data);
#合并上述的五个特征到训练集中
combined_data = pd.merge(combined_data, train_data2, on = ['shop_id', 'item_id','month_id'], how = 'left')
combined_data["item_cnt_month1"].fillna(0, inplace=True)

combined_data = pd.merge(combined_data, item_price1, on = ['item_id','month_id'], how = 'left')
combined_data["item_mean_price1"].fillna(0, inplace=True)

combined_data = pd.merge(combined_data, shop_cnt, on = ['shop_id','month_id'], how = 'left')
combined_data["shop_cnt_month1"].fillna(0, inplace=True)

combined_data = pd.merge(combined_data, item_cnt, on = ['item_id', 'month_id'], how = 'left')
combined_data["item_total_month1"].fillna(0, inplace=True)

combined_data = pd.merge(combined_data, item_data, on = ['item_id'], how = 'left')
combined_data["item_category_id"].fillna(0, inplace=True)

In [11]:
print(combined_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11098708 entries, 0 to 11098707
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   month_id           float64
 1   shop_id            int64  
 2   item_id            int64  
 3   item_cnt_month     float16
 4   month_id1          float64
 5   ID                 float64
 6   item_cnt_month1    float64
 7   item_mean_price1   float64
 8   shop_cnt_month1    float16
 9   item_total_month1  float16
 10  item_name          object 
 11  item_category_id   int64  
dtypes: float16(3), float64(5), int64(3), object(1)
memory usage: 910.3+ MB
None


In [11]:
#模型训练
model = xgb.XGBRegressor(max_depth=4, colsample_btree=0.1, learning_rate=0.1, n_estimators=32, min_child_weight=2);
model.fit(X_train, y_train)

NameError: name 'X_train' is not defined