In [3]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as df
import warnings
import csv
warnings.simplefilter(action='ignore', category=FutureWarning)
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [4]:
# Load the competition CSV files into DataFrames.
# The order of the paths below must match the order of the target names;
# note that `item` holds the item *categories* table, `items` the item table.
items, item, train, shops, test = map(pd.read_csv, (
    '../input/competitive-data-science-predict-future-sales/items.csv',
    '../input/competitive-data-science-predict-future-sales/item_categories.csv',
    '../input/competitive-data-science-predict-future-sales/sales_train.csv',
    '../input/competitive-data-science-predict-future-sales/shops.csv',
    '../input/competitive-data-science-predict-future-sales/test.csv',
))


In [8]:
def Hxw(data, n_rows=3):
    """Print a quick plain-text overview of a DataFrame.

    The overview consists of the first ``n_rows`` rows, the ``info()``
    report (including non-null counts) and the frame's shape.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.
    n_rows : int, default 3
        Number of leading rows to show; also used in the banner text so the
        banner always matches what is printed.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(f"----------Top-{n_rows}- Record----------")
    print(data.head(n_rows))
    print("-----------Information-----------")
    # info() writes its report itself and returns None, so wrapping it in
    # print() would emit a stray "None". `show_counts` replaces the
    # `null_counts` keyword that pandas removed in 2.0.
    data.info(show_counts=True)
    print("----------Shape of Data----------")
    print(data.shape)
    print("---------------------------------")

In [9]:
# Print the same overview for the training frame and then the test frame.
for frame in (train, test):
    Hxw(frame)

In [10]:
# Preview the item-category table (loaded from item_categories.csv).
item.head()

In [11]:
# Preview the item table (loaded from items.csv).
items.head()

In [12]:
# Preview the shop table (loaded from shops.csv).
shops.head()

In [13]:
# Preview the first rows of the sales training data (loaded from sales_train.csv).
train.head()

In [19]:
#每天的销量图
%matplotlib inline

# Distribution of log(item_cnt_day) over rows with a positive count
# (the log is undefined for zero or negative counts, so those rows are excluded).
plt.figure(figsize=(14, 4))
positive_counts = train.loc[train['item_cnt_day'] > 0, 'item_cnt_day']
# histplot(kde=True, stat="density") is the supported replacement for the
# deprecated sns.distplot and draws the same histogram + KDE on a density scale.
g = sns.histplot(np.log(positive_counts), kde=True, stat="density")
g.set_title("Item Sold Count Distribution", fontsize=18)
g.set_xlabel("log(item_cnt_day)", fontsize=12)
g.set_ylabel("Density", fontsize=12)

In [22]:
# Import the plotly tooling used for the interactive charts in later cells.
import plotly.offline as offline
import plotly.graph_objs as go
import plotly.offline as py  # same module as `offline`, bound to a second alias
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's notebook rendering.
init_notebook_mode(connected=True)
# NOTE(review): this calls the same plotly.offline.init_notebook_mode again,
# this time without arguments; it looks redundant with the call above —
# confirm which mode is wanted before removing either one.
offline.init_notebook_mode()

In [23]:
# Sum of the item_price column per shop, one bar per shop_id.
# Note: this adds up the prices of the sales rows; it is not multiplied by
# the quantity sold, so it is not revenue.
temp = train.groupby('shop_id')['item_price'].sum()
# Bar chart: shop_id on the x axis, summed price on the y axis.
trace = [go.Bar(x=temp.index, y=temp.values)]
# The title/labels describe what is actually plotted: every shop is shown
# (no top-25 cut) and bars are keyed by shop id rather than shop name.
layout = go.Layout(
    title="Sum of Item Prices by Shop",
    xaxis=dict(title='Shop ID'),
    yaxis=dict(title='Sum of Item Prices')
)
# Render the figure in the notebook.
fig = go.Figure(data=trace, layout=layout)
iplot(fig, filename='shopItemPriceSum')

In [24]:
# Total number of items sold per shop (sum of item_cnt_day), one bar per shop_id.
temp = train.groupby('shop_id')['item_cnt_day'].sum()
# Bar chart: shop_id on the x axis, total units on the y axis.
trace = [go.Bar(x=temp.index, y=temp.values)]
# The title/labels describe what is actually plotted: every shop is shown
# (no top-25 cut) and bars are keyed by shop id rather than shop name.
layout = go.Layout(
    title="Total Items Sold by Shop",
    xaxis=dict(title='Shop ID'),
    yaxis=dict(title='Total Items Sold')
)
# Render the figure in the notebook.
fig = go.Figure(data=trace, layout=layout)
iplot(fig, filename='shopItemsSold')

In [25]:
def date_process(df):
    """Parse the ``date`` column and add calendar-part columns in place.

    The ``date`` column is parsed with the ``%d.%m.%Y`` format and written
    back, then ``_weekday``, ``_day`` and ``_month`` are derived from it.
    The frame passed in is modified and the same object is returned.
    """
    parsed = pd.to_datetime(df["date"], format="%d.%m.%Y")
    df["date"] = parsed
    # (new column name, datetime accessor attribute)
    calendar_parts = (
        ("_weekday", "weekday"),
        ("_day", "day"),
        ("_month", "month"),
    )
    for column, attribute in calendar_parts:
        df[column] = getattr(parsed.dt, attribute)
    return df


# Add the parsed date and its calendar parts to the training data, then
# preview just those columns.
train = date_process(train)
train.loc[:, ["date", "_weekday", "_day", "_month"]].head()


In [None]:
# Number of sales rows per calendar day, in chronological order.
# groupby().size() returns the groups sorted by date and lets the count
# column be named directly; this avoids relying on the column names that
# value_counts().reset_index() produces, which changed in pandas 2.0
# ('index'/'date' became 'date'/'count') and made sort_values('index') fail.
dates_temp = (
    train.groupby('date')
    .size()
    .reset_index(name='Total_Bills')
)
dates_temp.head()

In [None]:
# Sum of the item_price column for each day, as a flat two-column frame.
dates_temp_sum = train.groupby('date', as_index=False)['item_price'].sum()
dates_temp_sum.head()

In [None]:
# Number of items sold per day, counting only rows with a positive quantity.
sold_rows = train.loc[train['item_cnt_day'] > 0]
dates_temp_count = sold_rows.groupby('date', as_index=False)['item_cnt_day'].sum()
dates_temp_count.head()

In [None]:
# Plot the three per-day series (row count, price sum, items sold) together.

# Range-selector buttons: look back 1, 3 or 6 months, or show everything.
range_buttons = [
    dict(count=months, label=label, step='month', stepmode='backward')
    for months, label in ((1, '1m'), (3, '3m'), (6, '6m'))
]
range_buttons.append(dict(step='all'))

# One line per series; dates are passed as strings on the x axis.
trace0 = go.Scatter(
    x=dates_temp.date.astype(str),
    y=dates_temp.Total_Bills,
    name='Total tickets',
    opacity=0.8,
)
trace1 = go.Scatter(
    x=dates_temp_sum.date.astype(str),
    y=dates_temp_sum['item_price'],
    name="Total Amount",
    opacity=0.8,
)
trace2 = go.Scatter(
    x=dates_temp_count.date.astype(str),
    y=dates_temp_count['item_cnt_day'],
    name="Total Items Sold",
    opacity=0.8,
)

# Title plus a date x axis with the range selector and a visible range slider.
layout = dict(
    title="Informations by Date",
    xaxis=dict(
        rangeselector=dict(buttons=range_buttons),
        rangeslider=dict(visible=True),
        type='date',
    ),
)

# Render the figure in the notebook.
fig = dict(data=[trace0, trace1, trace2], layout=layout)
iplot(fig)

In [None]:
# Total item_cnt_day per calendar month (the `_month` column, pooled over years).
temp = train.groupby(['_month'])['item_cnt_day'].sum()
# One bar per month.
trace = [go.Bar(y=temp.values, x=temp.index)]
# Chart title and axis captions.
layout = go.Layout(
    title="Total orders by Month",
    xaxis={'title': 'Months'},
    yaxis={'title': 'Total Orders'},
)
# Render the figure in the notebook.
fig = go.Figure(layout=layout, data=trace)
iplot(fig, filename='schoolStateNames')

In [None]:
# Scatter/histogram grid of price against daily count, to eyeball outliers.
price_and_count = train[['item_price', 'item_cnt_day']]
sns.pairplot(price_and_count)

In [None]:
# Drop outliers: keep rows whose daily count is below 1000 and whose price
# lies strictly between 0 and 300000.
is_plausible = (
    (train['item_cnt_day'] < 1000)
    & (train['item_price'] > 0)
    & (train['item_price'] < 300000)
)
train = train[is_plausible]
train.shape

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
train = train[~train.duplicated()]
train.shape

In [None]:
# Aggregate the training data into one row per (shop_id, item_id) with one
# column per date_block_num holding the summed item_cnt_day.
monthly_counts = train.pivot_table(
    values='item_cnt_day',
    index=['shop_id', 'item_id'],
    columns='date_block_num',
    aggfunc='sum',
)
# Blocks with no sales for a pair become 0.0; the index is flattened back into columns.
EDG = monthly_counts.fillna(0.0).reset_index()

In [None]:
# Add one feature: attach each item's item_category_id from the items table.
item_category_lookup = items.loc[:, ['item_id', 'item_category_id']]
train_cleaned_df = pd.merge(EDG, item_category_lookup, on='item_id', how='inner')
train_cleaned_df.head(3)

In [None]:
# Attach the monthly sales history to every test (shop_id, item_id) pair.
# Pairs that never appear in the training pivot get 0.0 for every month.
monthly_history = train_cleaned_df.drop(columns='item_category_id')
test = (
    # Start from the identifying columns only so that re-running this cell
    # does not merge on (or duplicate) previously added feature columns.
    # NOTE(review): assumes test.csv carries exactly ID, shop_id, item_id — confirm.
    test[['ID', 'shop_id', 'item_id']]
    .merge(monthly_history, on=['shop_id', 'item_id'], how='left')
    .fillna(0.0)
    # Take the category from the items table instead of from the training
    # pivot: a blanket fillna(0.0) would otherwise label every unseen pair
    # as category 0 rather than the item's own category.
    .merge(items[['item_id', 'item_category_id']], on='item_id', how='left')
)
# Show a preview rather than the whole frame.
test.head()

In [None]:
# Build the model matrices.
# Training: the column labelled 33 is the target, every other column is a feature.
is_feature = train_cleaned_df.columns != 33
Xtrain = train_cleaned_df.loc[:, is_feature].values
ytrain = train_cleaned_df.loc[:, ~is_feature].values

# Test: use every column except 'ID' and the column labelled 0.
is_excluded = (test.columns == 'ID') | (test.columns == 0)
X_test = test.loc[:, ~is_excluded].values

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBRegressor

# Fit an XGBoost regressor, constructed with no arguments, on the training matrices.
xgbrfr = XGBRegressor()
xgbrfr.fit(X=Xtrain, y=ytrain)

In [None]:
# Predict on the test matrix and clamp every prediction into [0, 20].
# np.clip does this in one vectorised call instead of a Python-level
# min/max per element.
y_pred = np.clip(xgbrfr.predict(X_test), 0, 20)
# Export the predictions to a CSV file, one row per test ID.
sub_df = pd.DataFrame({'ID': test.ID, 'item_cnt_month': y_pred})
sub_df.to_csv('predict sale_1.1.csv', index=False)