In [101]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy.stats import mode
import csv
import matplotlib.dates
from datetime import *
from sklearn import linear_model
from sklearn.preprocessing import *
from sklearn import ensemble
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.externals import joblib

In [110]:
# Split a frame into training and test parts
def trainTestSplit(df, splitN, trainLabel):
    """Cut ``df`` at row ``splitN`` and separate features from the target.

    Parameters
    ----------
    df : DataFrame containing the columns in ``trainLabel`` and a ``'cnt'`` column.
    splitN : slice boundary; rows ``[:splitN]`` are the training part and
        rows ``[splitN:]`` are the test part.
    trainLabel : column label(s) used as model features.

    Returns
    -------
    tuple ``(trainX, trainY, testX, testY)`` where the X parts hold the
    ``trainLabel`` columns and the Y parts hold the ``'cnt'`` column.
    """
    # Slice once per part, then pick features / target from each slice.
    headPart, tailPart = df[:splitN], df[splitN:]
    return (headPart[trainLabel], headPart['cnt'], tailPart[trainLabel], tailPart['cnt'])

# Train the linear model
def trainLinearModel(X, y):
    """Fit a cross-validated ridge regression and return the fitted estimator.

    The candidate regularisation strengths are 0.01, 0.02, ..., 1.99 and the
    cross-validation is scored with ``'neg_mean_squared_error'``. The fitted
    coefficients are printed as a side effect.

    Parameters
    ----------
    X : feature matrix passed straight to ``RidgeCV.fit``.
    y : target values passed straight to ``RidgeCV.fit``.

    Returns
    -------
    The fitted ``linear_model.RidgeCV`` instance.
    """
    alphaGrid = [0.01*x for x in range(1,200)]
    ridge = linear_model.RidgeCV(alphas=alphaGrid, scoring='neg_mean_squared_error')
    ridge.fit(X, y)
    print('Coefficients:', ridge.coef_)
    return ridge

# Train the tree-based model
def trainTreeModel(X, y):
    """Fit a gradient-boosting regressor and return the fitted estimator.

    Parameters
    ----------
    X : feature matrix passed straight to ``GradientBoostingRegressor.fit``.
    y : target values passed straight to ``GradientBoostingRegressor.fit``.

    Returns
    -------
    The fitted ``GradientBoostingRegressor`` instance.
    """
    # Alternative estimator kept for reference:
    #     ensemble.RandomForestRegressor(n_estimators=200)
    # NOTE(review): 'ls' is the loss name accepted by the scikit-learn release
    # this notebook targets; confirm the spelling if the library is upgraded.
    boosterParams = dict(n_estimators=100, learning_rate=0.1, random_state=0, loss='ls')
    booster = GradientBoostingRegressor(**boosterParams)
    booster.fit(X, y)
    return booster

# Validate the model
def validModel(trainX, trainY, testX, testY):
    """Fit the linear model on the training part and print its test error.

    The reported cost is the squared Euclidean norm of the residual vector
    divided by the number of predictions (i.e. the mean squared error).
    Nothing is returned; the value is only printed.

    Parameters
    ----------
    trainX, trainY : training features and targets, forwarded to ``trainLinearModel``.
    testX, testY : held-out features and the targets they are compared against.
    """
    fitted = trainLinearModel(trainX, trainY)
    predicted = fitted.predict(testX)
    residual = predicted - testY
    meanSquaredError = np.linalg.norm(residual)**2 / len(predicted)
    print("cost:", meanSquaredError)
    
# Add summary statistics of the i-th previous week
def statWeek(df, weeks):
    """Attach lagged weekly statistics of ``'cnt'`` to every row of ``df``.

    For each lag ``i`` in ``weeks`` the mean / std / max / min of ``'cnt'`` are
    computed per ``'week'``, the week index is shifted forward by ``i`` and the
    result is left-joined on ``'week'``. A row in week ``w`` therefore receives
    the statistics of week ``w - i`` (missing when that week has no data).

    Parameters
    ----------
    df : DataFrame with at least the columns ``'week'`` and ``'cnt'``.
    weeks : a single int lag, or an iterable of int lags.

    Returns
    -------
    (DataFrame, list) : the enriched frame and the names of the added columns
    (``mean<i>``, ``std<i>``, ``max<i>``, ``min<i>`` for each lag ``i``).
    """
    lagList = [weeks] if isinstance(weeks, int) else weeks
    addedCols = []
    for lag in lagList:
        weeklyStats = pd.pivot_table(
            df,
            index=['week'],
            values=['cnt'],
            aggfunc=[np.mean, np.std, np.max, np.min],
        )
        # Flatten the (aggfunc, 'cnt') column pairs into lag-suffixed names.
        weeklyStats.columns = ['mean%d'%lag, 'std%d'%lag, 'max%d'%lag, 'min%d'%lag]
        addedCols += list(weeklyStats.columns)
        # Shift the key so week w looks up the statistics of week w - lag.
        weeklyStats.index = weeklyStats.index + lag
        df = df.merge(weeklyStats, left_on='week', right_index=True, how='left')
    return df, addedCols

# Export the prediction results
def exportResult(df, fileName):
    """Write ``df`` to ``./<fileName>.txt`` as tab-separated text.

    The file is written without a header row and without the index column.

    Parameters
    ----------
    df : DataFrame to write.
    fileName : base name of the output file (``.txt`` is appended).
    """
    outPath = './%s.txt' % fileName
    df.to_csv(outPath, sep='\t', header=False, index=False)

In [111]:
# Load the training feature set from CSV
train_data = pd.read_csv('fusai_A_train_feature_set.csv')

# Preview the first 10 rows
print(train_data.head(10))

   date  day_of_week  brand  cnt  guess_date  date_year  date_month_day  \
0     1            2      1   31  2013-01-01       2013               1   
1     1            2      6    6  2013-01-01       2013               1   
2     1            2      9   15  2013-01-01       2013               1   
3     2            3      4   20  2013-01-02       2013               2   
4     2            3      6    6  2013-01-02       2013               2   
5     2            3      7   30  2013-01-02       2013               2   
6     2            3     10   48  2013-01-02       2013               2   
7     3            4      4   16  2013-01-03       2013               3   
8     3            4      6    4  2013-01-03       2013               3   
9     3            4      8   23  2013-01-03       2013               3   

   date_property guess_date_str  sale_quantity    ...     brand_1 brand_2  \
0              2     2013-01-01        28137.0    ...           1       0   
1              2    

In [127]:
# Split one brand's rows into a training part and a hold-out part.
# The split point must be derived from the SAME brand that is being split;
# computing it from another brand's row count gives a ratio other than 67%
# whenever the brands have a different number of rows.
VALID_BRAND = 2       # brand used for this validation run
TRAIN_FRACTION = 0.67  # share of the brand's rows (taken from the start) used for training

validation_rows = train_data[train_data.brand == VALID_BRAND]
splitN = int(len(validation_rows) * TRAIN_FRACTION)

# Feature columns fed to the models.
fea = ['date_month_day_scaled', 'sale_quantity_scaled', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
      'day_of_week_4', 'day_of_week_5', 'day_of_week_6', 'day_of_week_7', 'date_property_0', 'date_property_1', 'date_property_2',
       'date_month_1', 'date_month_2', 'date_month_3', 'date_month_4', 'date_month_5', 'date_month_6', 'date_month_7',
       'date_month_8', 'date_month_9', 'date_month_10', 'date_month_11', 'date_month_12',
      'after_restday_one', 'after_holiday_one', 'is_holi_restday', 'is_newYearDay', 'isHolidayWeekend', 'isPureWeekend',
       'is_NationalDay', 'is_ChineseNewYearDay']
# Alternative feature set kept for reference (not used):
# fea = ['date_month_day_scaled', 'week_scaled', 'date_year_scaled', 'sale_quantity_scaled', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
#       'day_of_week_4', 'day_of_week_5', 'day_of_week_6', 'day_of_week_7', 'date_property_0', 'date_property_1', 'date_property_2',
#       'date_month_1', 'date_month_2', 'date_month_3', 'date_month_4', 'date_month_5', 'date_month_6', 'date_month_7',
#       'date_month_8', 'date_month_9', 'date_month_10', 'date_month_11', 'date_month_12',
#        'dividedMonth_late', 'dividedMonth_early',
#       'after_restday_one', 'after_holiday_one', 'is_holi_restday', 'is_newYearDay', 'isHolidayWeekend', 'isPureWeekend',
#         'brand_1', 'brand_2', 'brand_3', 'brand_4', 'brand_5',
#        'brand_6', 'brand_7', 'brand_8', 'brand_9', 'brand_10']
trainX,trainY,testX,testY = trainTestSplit(validation_rows, splitN, fea)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
trainX.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 696 entries, 11 to 6509
Data columns (total 32 columns):
date_month_day_scaled    696 non-null float64
sale_quantity_scaled     696 non-null float64
day_of_week_1            696 non-null int64
day_of_week_2            696 non-null int64
day_of_week_3            696 non-null int64
day_of_week_4            696 non-null int64
day_of_week_5            696 non-null int64
day_of_week_6            696 non-null int64
day_of_week_7            696 non-null int64
date_property_0          696 non-null int64
date_property_1          696 non-null int64
date_property_2          696 non-null int64
date_month_1             696 non-null int64
date_month_2             696 non-null int64
date_month_3             696 non-null int64
date_month_4             696 non-null int64
date_month_5             696 non-null int64
date_month_6             696 non-null int64
date_month_7             696 non-null int64
date_month_8             696 non-null int64
date_mont

In [128]:
# Validate the linear model on the hold-out part (prints the coefficients and the cost)
validModel(trainX.values, trainY.values, testX.values, testY.values)

Coefficients: [ -12.3407972    56.93897138   28.87091046   28.39394979    3.85772303
  -65.65274556  -24.79443159   19.7791096     9.54548426  110.24463421
  -50.66164191  -59.5829923   127.33779584   31.51766942  -39.20051054
  -34.56342029    4.02236671  -65.97324836  -37.54099481  -21.0498203
   10.3498782    10.51467016  -27.29266751   41.8782815   -19.81342081
  164.58585155  -27.98901826    0.          -31.59397403  -50.66164191
  -19.76786649   -4.56855367]
cost: 17535.7257162


In [129]:
# Load the test feature set from CSV
test_data = pd.read_csv('fusai_A_test_feature_set.csv')

# Preview the first 10 rows
print(test_data.head(10))

   date  day_of_week  brand  week  guess_date  date_year  date_month  \
0  1107            4      7   174  2016-05-05       2016           5   
1  1107            4      8   174  2016-05-05       2016           5   
2  1107            4      9   174  2016-05-05       2016           5   
3  1107            4     10   174  2016-05-05       2016           5   
4  1108            5      1   174  2016-05-06       2016           5   
5  1108            5      2   174  2016-05-06       2016           5   
6  1108            5      3   174  2016-05-06       2016           5   
7  1108            5      4   174  2016-05-06       2016           5   
8  1108            5      5   174  2016-05-06       2016           5   
9  1108            5      6   174  2016-05-06       2016           5   

   date_month_day  date_property guess_date_str      ...        brand_7  \
0               5              0     2016-05-05      ...              1   
1               5              0     2016-05-05      ... 

In [132]:
# Shared implementation behind the two per-brand prediction functions
def _predictByBrand(trainFn, df, fea, brand, testDf=None):
    """Train on one brand's rows of ``df`` and predict that brand's test rows.

    Parameters
    ----------
    trainFn : callable ``(X, y) -> fitted estimator`` exposing ``predict``.
    df : training frame with a ``brand`` column, the ``fea`` columns and ``'cnt'``.
    fea : list of feature column names.
    brand : brand value to select in both the training and the test frame.
    testDf : test frame to predict on; when omitted the notebook-level
        ``test_data`` frame is used.

    Returns
    -------
    DataFrame with the columns ``date``, ``day_of_week``, ``guess_date``,
    ``brand`` of the selected test rows plus a ``predict`` column.
    """
    if testDf is None:
        testDf = test_data

    # Filter each frame once instead of once per column selection.
    trainRows = df[df.brand == brand]
    clf = trainFn(trainRows[fea].values, trainRows['cnt'].values)

    testRows = testDf[testDf.brand == brand]
    # Explicit copy: adding a column to a chained-indexing slice is what
    # triggers pandas' SettingWithCopyWarning.
    result = testRows[['date', 'day_of_week', 'guess_date', 'brand']].copy()
    if len(testRows) == 0:
        # Nothing to predict for this brand; keep the output schema intact.
        result['predict'] = np.array([], dtype=float)
    else:
        result['predict'] = clf.predict(testRows[fea].values)
    return result


# Per-brand prediction with the tree model
def predictTreeModelByBrand(df, fea, brand, testDf=None):
    """Predict one brand's test rows with the model from ``trainTreeModel``.

    See ``_predictByBrand`` for the parameters and the returned frame.
    """
    return _predictByBrand(trainTreeModel, df, fea, brand, testDf)


# Per-brand prediction with the linear model
def predictLinearModelByBrand(df, fea, brand, testDf=None):
    """Predict one brand's test rows with the model from ``trainLinearModel``.

    See ``_predictByBrand`` for the parameters and the returned frame.
    """
    return _predictByBrand(trainLinearModel, df, fea, brand, testDf)

# Fit one model per brand and collect the per-brand predictions.
# Brand 9 uses the tree model; every other brand uses the linear model.
TREE_BRANDS = [9]
LINEAR_BRANDS = [1, 2, 3, 4, 5, 6, 7, 8, 10]
brand_predictions = [predictTreeModelByBrand(train_data, fea, brand) for brand in TREE_BRANDS]
brand_predictions += [predictLinearModelByBrand(train_data, fea, brand) for brand in LINEAR_BRANDS]
# Single concat instead of re-concatenating the growing frame for every brand.
predict_data = pd.concat(brand_predictions)

# Lowest value allowed in the submission; smaller predictions are raised to it.
MIN_PREDICTION = 10

predict_df = pd.read_csv('fusai_test_A_20180227.txt', sep='\t')

# Match predictions to submission rows explicitly on (date, brand).
# The previous row-by-row chained assignment only produced values when the
# two frames happened to share the same row index, and was quadratic in the
# number of rows. reindex() raises if predict_data has duplicate keys.
prediction_by_key = predict_data.set_index(['date', 'brand'])['predict']
row_keys = pd.MultiIndex.from_arrays([predict_df['date'], predict_df['brand']])
matched = prediction_by_key.reindex(row_keys)

# Rows without a prediction previously kept the -1 placeholder and were then
# raised to the floor; filling with the floor gives the same final value.
predict_df['predict'] = matched.fillna(MIN_PREDICTION).clip(lower=MIN_PREDICTION).values
print(predict_df.head(100))

Coefficients: [ -35.39247171   44.47194222   89.64674053   30.71230204   -8.44979185
  -73.04808121  -36.09521518   13.51978333  -16.28573766  148.52783254
  -69.48955907  -79.03827347  155.9725058   189.57916319   46.33329688
  -12.03524075  -29.69039301  -47.86679365  -43.73191118  -75.92893842
  -48.3984538   -30.04585299  -42.36922603  -61.81815604  -51.10965873
  243.97517035  -34.28184733 -195.80642881  -44.75642614  -69.48955907
    0.            0.        ]
Coefficients: [ -1.38422787e+01   6.02371223e+01   6.83035897e+01   3.31870624e+01
   2.32525648e+00  -7.09986111e+01  -3.11036046e+01   2.16955460e-01
  -1.93064842e+00   1.17747027e+02  -4.14567936e+01  -7.62902337e+01
   1.65136458e+02   5.34860775e+01  -2.57759880e+01  -3.55845050e+01
   2.63127934e+00  -6.56453814e+01  -4.73990614e+01  -3.23452364e+01
  -7.43747909e+00   1.18192708e+01  -3.92018657e+01   2.03164317e+01
  -6.71081274e+01   2.22715191e+02  -3.86197351e+01  -1.64374329e+02
  -3.76704987e+01  -4.14567936e+0

In [100]:
# Save the prediction results (date, brand, predict) via exportResult
exportResult(predict_df[['date','brand', 'predict']], 'brand_A_fusai_3_2')