In [323]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy.stats import mode
import csv
import matplotlib.dates
from datetime import *
from sklearn import linear_model
from sklearn.preprocessing import *
from sklearn import ensemble
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

In [324]:
# Split the data set into a training part and a test part
def trainTestSplit(df, splitN, trainLabel, targetLabel='cnt'):
    """Split ``df`` by row position into training and test partitions.

    The first ``splitN`` rows become the training partition and the remaining
    rows become the test partition. Rows are not shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    splitN : int
        Number of leading rows assigned to the training partition.
    trainLabel : list of str
        Names of the feature columns.
    targetLabel : str, default 'cnt'
        Name of the target column.

    Returns
    -------
    tuple
        ``(trainX, trainY, testX, testY)``.
    """
    # iloc makes it explicit that the cut is positional, not label-based.
    trainPart = df.iloc[:splitN]
    testPart = df.iloc[splitN:]
    return (trainPart[trainLabel], trainPart[targetLabel],
            testPart[trainLabel], testPart[targetLabel])

# Train the model
def trainModel(X, y):
    """Fit a RidgeCV regressor on ``X``/``y`` and return the fitted estimator.

    The candidate regularisation strengths are 0.01 * k for k = 1..199 and the
    selection criterion is ``neg_mean_squared_error``. The fitted coefficients
    are printed as a side effect.
    """
    # Estimators tried earlier and left disabled: LinearRegression,
    # LogisticRegression, RandomForestRegressor(n_estimators=200).
    alphaGrid = [0.01 * step for step in range(1, 200)]
    model = linear_model.RidgeCV(alphas=alphaGrid, scoring='neg_mean_squared_error')
    model.fit(X, y)
    print('Coefficients:', model.coef_)
    return model

# Validate the model
def validModel(trainX, trainY, testX, testY):
    """Train on the training split and print the mean squared error on the test split.

    The model is built with ``trainModel``; the cost is the squared L2 norm of
    the residual divided by the number of predictions. Nothing is returned.
    """
    model = trainModel(trainX, trainY)
    predicted = model.predict(testX)
    residual = predicted - testY
    sampleCount = len(predicted)
    cost = np.linalg.norm(residual)**2 / sampleCount
    print("cost:", cost)
    
# Add the statistics of the i-th previous week as new columns
def statWeek(df, weeks):
    """Attach per-week statistics of ``cnt`` from earlier weeks to every row.

    For each lag ``i`` in ``weeks`` the mean, standard deviation, maximum and
    minimum of ``cnt`` are computed per value of the ``week`` column, shifted
    forward by ``i`` weeks, and left-joined back on ``week``. Rows whose lagged
    week has no data receive NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``week`` and ``cnt``.
    weeks : int or iterable of int
        Lag(s), in weeks, for which statistics are added.

    Returns
    -------
    tuple
        ``(df_with_stats, colName)`` where ``colName`` lists the added column
        names in order: ``mean<i>``, ``std<i>``, ``max<i>``, ``min<i>`` per lag.
    """
    if isinstance(weeks, int):
        weeks = [weeks]
    # String aggregator names pin the groupby implementations (sample std,
    # ddof=1). Passing np.mean/np.std/... is deprecated in recent pandas and
    # would eventually switch std to numpy's ddof=0.
    statNames = ['mean', 'std', 'max', 'min']
    # The merge below never alters 'week' or 'cnt', so the per-week table is
    # the same for every lag and only needs to be computed once.
    weeklyStats = pd.pivot_table(df, index=['week'], values=['cnt'], aggfunc=statNames)
    colName = []
    for i in weeks:
        weekDf = weeklyStats.copy()
        weekDf.columns = ['%s%d' % (stat, i) for stat in statNames]
        colName.extend(weekDf.columns)
        # Shift the week key so that week w picks up the stats of week w - i.
        weekDf.index = weekDf.index + i
        df = pd.merge(df, weekDf, left_on='week', right_index=True, how='left')
    return df, colName

# Export the prediction results
def exportResult(df, fileName):
    """Write ``df`` to ``./<fileName>.txt`` as tab-separated text without header or index."""
    outPath = './%s.txt' % fileName
    df.to_csv(outPath, sep='\t', index=False, header=False)

In [325]:
# Load the training feature set
train_data = pd.read_csv('fusai_A_train_feature_set.csv')

# Preview the first 10 rows
print(train_data.head(10))

   date  brand  cnt  day_of_week  week  guess_date  date_year  date_month  \
0     1      1   31            2     0  2013-01-01       2013           1   
1     1      6    6            2     0  2013-01-01       2013           1   
2     1      9   15            2     0  2013-01-01       2013           1   
3     2      9    0            3     0  2013-01-02       2013           1   
4     2      8    0            3     0  2013-01-02       2013           1   
5     2      3    0            3     0  2013-01-02       2013           1   
6     2      2    0            3     0  2013-01-02       2013           1   
7     2      7   30            3     0  2013-01-02       2013           1   
8     2      6    6            3     0  2013-01-02       2013           1   
9     2      4   20            3     0  2013-01-02       2013           1   

   date_property guess_date_str    ...     brand_1 brand_2  brand_3  brand_4  \
0              2     2013-01-01    ...           1       0        0     

In [326]:
# Split into training and hold-out sets (positional split, rows are not shuffled)
TRAIN_FRACTION = 0.67
# Derive the cut from the row count. The last index *label* is only len - 1
# for a default RangeIndex and is meaningless for any other index.
splitN = int(len(train_data) * TRAIN_FRACTION)
fea = ['sale_quantity_scaled', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
       'day_of_week_4', 'day_of_week_5', 'day_of_week_6', 'day_of_week_7',
       'date_property_0', 'date_property_1', 'date_property_2',
       'date_month_1', 'date_month_2', 'date_month_3', 'date_month_4', 'date_month_5', 'date_month_6',
       'date_month_7', 'date_month_8', 'date_month_9', 'date_month_10', 'date_month_11', 'date_month_12',
       'after_restday_one', 'after_holiday_one', 'is_holi_restday', 'is_newYearDay',
       'isHolidayWeekend', 'isPureWeekend',
       'is_NationalDay', 'is_ChineseNewYearDay']
# An earlier, now disabled, feature set additionally used 'dividedMonth_late',
# 'dividedMonth_early' and the one-hot columns 'brand_1' .. 'brand_10', and did
# not include 'is_NationalDay' / 'is_ChineseNewYearDay'.
trainX, trainY, testX, testY = trainTestSplit(train_data, splitN, fea)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line.
trainX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7806 entries, 0 to 7805
Data columns (total 41 columns):
sale_quantity_scaled    7806 non-null float64
day_of_week_1           7806 non-null int64
day_of_week_2           7806 non-null int64
day_of_week_3           7806 non-null int64
day_of_week_4           7806 non-null int64
day_of_week_5           7806 non-null int64
day_of_week_6           7806 non-null int64
day_of_week_7           7806 non-null int64
date_property_0         7806 non-null int64
date_property_1         7806 non-null int64
date_property_2         7806 non-null int64
date_month_1            7806 non-null int64
date_month_2            7806 non-null int64
date_month_3            7806 non-null int64
date_month_4            7806 non-null int64
date_month_5            7806 non-null int64
date_month_6            7806 non-null int64
date_month_7            7806 non-null int64
date_month_8            7806 non-null int64
date_month_9            7806 non-null int64
date_month_

In [327]:
# Validate the model: fit on the training split and print the cost on the hold-out split
validModel(trainX.values, trainY.values, testX.values, testY.values)

cost: 39218.8291952


In [328]:
# Final model: fit on every training row
modelName = "linear2"
fullX = train_data[fea].values
fullY = train_data['cnt'].values
clf = trainModel(fullX, fullY)
# Disabled alternative: train only on the last 750 rows
# clf = trainModel(train_data[-750:][fea].values, train_data[-750:]['cnt'].values)

In [329]:
# Load the test feature set
test_data = pd.read_csv('fusai_A_test_feature_set.csv')

# Preview the first 10 rows
print(test_data.head(10))

   date  day_of_week  brand  week  guess_date  date_year  date_month  \
0  1107            4      7   174  2016-05-05       2016           5   
1  1107            4      8   174  2016-05-05       2016           5   
2  1107            4      9   174  2016-05-05       2016           5   
3  1107            4     10   174  2016-05-05       2016           5   
4  1108            5      1   174  2016-05-06       2016           5   
5  1108            5      2   174  2016-05-06       2016           5   
6  1108            5      3   174  2016-05-06       2016           5   
7  1108            5      4   174  2016-05-06       2016           5   
8  1108            5      5   174  2016-05-06       2016           5   
9  1108            5      6   174  2016-05-06       2016           5   

   date_property guess_date_str  sale_quantity      ...       brand_7  \
0              0     2016-05-05        42158.0      ...             1   
1              0     2016-05-05        42158.0      ...      

In [330]:
# Run the prediction on the test features
testFeatures = test_data[fea].values
predict = clf.predict(testFeatures)

# Attach the predictions to the tab-separated submission template.
# NOTE(review): assumes its rows are in the same order as test_data - confirm.
predict_df = pd.read_csv('fusai_test_A_20180227.txt', sep='\t')
predict_df['predict'] = predict
# Negative predictions are replaced by 10
predict_df['predict'] = predict_df['predict'].map(lambda value: 10 if value < 0 else value)

print(predict_df.head(100))

    date  day_of_week  brand     predict
0   1107            4      7  252.351019
1   1107            4      8  433.951217
2   1107            4      9  602.834515
3   1107            4     10  296.612647
4   1108            5      1  314.063756
5   1108            5      2  224.604121
6   1108            5      3  312.538544
7   1108            5      4  547.354317
8   1108            5      5  384.672361
9   1108            5      6  348.312556
10  1108            5      7  364.559470
11  1108            5      8  611.713323
12  1108            5      9  778.462694
13  1108            5     10  348.317426
14  1109            6      1   54.013714
15  1109            6      2   73.298083
16  1109            6      3   69.780825
17  1109            6      4  104.838165
18  1109            6      5   83.226859
19  1109            6      6   56.282413
20  1109            6      7   70.103665
21  1109            6      8   91.318944
22  1109            6      9  213.648895
23  1109        

In [331]:
# Save the prediction results (columns: date, brand, predict)
# NOTE(review): the file name says "random_forest" but the active estimator in
# trainModel is RidgeCV and modelName ("linear2") is never used - confirm the intended name.
exportResult(predict_df[['date','brand', 'predict']], 'random_forest_A_fullfilling_fusai_3_1')