In [22]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy.stats import mode
import csv
import matplotlib.dates
from datetime import *
from sklearn import linear_model
from sklearn.preprocessing import *
from sklearn import ensemble
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

In [23]:
# Split a frame into a training part and a test part
def trainTestSplit(df, splitN, trainLabel):
    """Cut ``df`` at ``splitN`` and return features/target for both parts.

    Args:
        df: DataFrame containing the feature columns and a ``'cnt'`` column.
        splitN: Slice boundary; ``df[:splitN]`` is the training part and
            ``df[splitN:]`` is the test part.
        trainLabel: Column label(s) selected as the feature set.

    Returns:
        Tuple ``(trainX, trainY, testX, testY)`` where the ``Y`` entries are
        the ``'cnt'`` column of the corresponding part.
    """
    head, tail = df[:splitN], df[splitN:]
    return (head[trainLabel], head['cnt'], tail[trainLabel], tail['cnt'])

# Fit the regression model
def trainModel(X, y):
    """Fit a ``LinearRegression`` on ``X``/``y`` and return the fitted estimator.

    The fitted coefficients are printed as a side effect.
    """
    # Alternative estimator that was tried: ensemble.RandomForestRegressor()
    model = LinearRegression()
    model.fit(X, y)
    print('Coefficients:', model.coef_)
    return model

# Evaluate the model on held-out data
def validModel(trainX, trainY, testX, testY):
    """Train on the training split and print the mean squared test error.

    The cost is the squared L2 norm of ``prediction - testY`` divided by the
    number of predictions. Nothing is returned; the value is only printed.
    """
    fitted = trainModel(trainX, trainY)
    predicted = fitted.predict(testX)
    squared_error = np.linalg.norm(predicted - testY) ** 2
    cost = squared_error / len(predicted)
    print("cost:", cost)
    
# Add the 'cnt' statistics of the i-th previous week as new columns
def statWeek(df, weeks):
    """Attach lagged weekly statistics of ``'cnt'`` to every row of ``df``.

    For each lag ``i`` in ``weeks``, the mean, sample standard deviation,
    maximum and minimum of ``'cnt'`` are computed per ``'week'`` value and
    joined onto the rows whose ``'week'`` equals that week plus ``i``. Rows
    with no matching earlier week receive NaN.

    Args:
        df: DataFrame with a ``'week'`` column and a ``'cnt'`` column. It is
            not modified; a merged copy is returned.
        weeks: A single integer lag or an iterable of integer lags.

    Returns:
        Tuple ``(merged_df, colName)`` where ``colName`` lists the added
        columns in order: ``mean<i>``, ``std<i>``, ``max<i>``, ``min<i>`` for
        each lag ``i``.
    """
    if isinstance(weeks, (int, np.integer)):
        weeks = [weeks]

    # The weekly aggregate does not depend on the lag, so compute it once.
    # String aggregator names are used instead of numpy callables: they always
    # produce all four columns and keep the pandas semantics (std with ddof=1).
    weekly = df.groupby('week')['cnt'].agg(['mean', 'std', 'max', 'min'])

    colName = []
    for i in weeks:
        # Work on a copy so shifting the index for one lag does not leak
        # into the next one.
        lagged = weekly.copy()
        lagged.columns = ['mean%d' % i, 'std%d' % i, 'max%d' % i, 'min%d' % i]
        # Shifting the week label by i makes week w's statistics line up with
        # the rows of week w + i in the merge below.
        lagged.index = lagged.index + i
        colName.extend(lagged.columns)
        df = pd.merge(df, lagged, left_on='week', right_index=True, how='left')
    return df, colName

# Write the prediction result to disk
def exportResult(df, fileName):
    """Save ``df`` to ``./<fileName>.txt`` as tab-separated text.

    No header row and no index column are written.
    """
    target = './%s.txt' % fileName
    df.to_csv(target, sep='\t', header=False, index=False)

In [24]:
# Load the training data
train_data = pd.read_csv('B_train_feature_set.csv')

# Preview the first 10 rows
print(train_data.head(10))

    cnt  day  day_of_week  guess_date  week  date_year  date_month  \
0    68    3            3  2013-01-02     0       2013           1   
1    36    4            4  2013-01-03     0       2013           1   
2  5565    5            5  2013-01-04     0       2013           1   
3  4966    6            6  2013-01-05     0       2013           1   
4  3346    7            7  2013-01-06     0       2013           1   
5  3396    8            1  2013-01-07     1       2013           1   
6  4146    9            2  2013-01-08     1       2013           1   
7  3096   10            3  2013-01-09     1       2013           1   
8  2713   11            4  2013-01-10     1       2013           1   
9  2409   12            5  2013-01-11     1       2013           1   

   date_property  sale_quantity dividedMonth        ...          date_month_5  \
0              2        28137.0        early        ...                     0   
1              2        28137.0        early        ...            

In [55]:
# Split into training and test sets
# The boundary is 67% of the last index label of train_data.
splitN = int(train_data.index[-1] * 0.67)
fea = (
    # scaled numeric features
    ['week_scaled', 'date_year_scaled', 'sale_quantity_scaled']
    # one-hot weekday
    + ['day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6', 'day_of_week_7']
    # one-hot date property
    + ['date_property_0', 'date_property_1', 'date_property_2']
    # one-hot month
    + ['date_month_1', 'date_month_2', 'date_month_3', 'date_month_4',
       'date_month_5', 'date_month_6', 'date_month_7', 'date_month_8',
       'date_month_9', 'date_month_10', 'date_month_11', 'date_month_12']
    # part-of-month indicators
    + ['dividedMonth_late', 'dividedMonth_early']
    # holiday / rest-day flags
    + ['after_restday_one', 'after_holiday_one', 'is_holi_restday',
       'is_newYearDay', 'isHolidayWeekend', 'isPureWeekend']
)
trainX, trainY, testX, testY = trainTestSplit(train_data, splitN, fea)
print(trainX.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1307 entries, 0 to 1306
Data columns (total 33 columns):
week_scaled             1307 non-null float64
date_year_scaled        1307 non-null float64
sale_quantity_scaled    1307 non-null float64
day_of_week_1           1307 non-null int64
day_of_week_2           1307 non-null int64
day_of_week_3           1307 non-null int64
day_of_week_4           1307 non-null int64
day_of_week_5           1307 non-null int64
day_of_week_6           1307 non-null int64
day_of_week_7           1307 non-null int64
date_property_0         1307 non-null int64
date_property_1         1307 non-null int64
date_property_2         1307 non-null int64
date_month_1            1307 non-null int64
date_month_2            1307 non-null int64
date_month_3            1307 non-null int64
date_month_4            1307 non-null int64
date_month_5            1307 non-null int64
date_month_6            1307 non-null int64
date_month_7            1307 non-null int64
date_mo

In [56]:
# Check the model: fit on the training split (prints the coefficients)
trainModel(trainX.values, trainY.values)
# Held-out evaluation, currently disabled:
# validModel(trainX.values, trainY.values, testX.values, testY.values)

Coefficients: [ -2.47413212e+04   1.99341687e+04   1.36465940e+03   4.80343535e+16
   4.80343535e+16   4.80343535e+16   4.80343535e+16   4.80343535e+16
   4.80343535e+16   4.80343535e+16   7.81586322e+16   6.41069093e+16
   6.41069093e+16  -5.08659744e+15  -5.08659744e+15  -5.08659744e+15
  -5.08659744e+15  -5.08659744e+15  -5.08659744e+15  -5.08659744e+15
  -5.08659744e+15  -5.08659744e+15  -5.08659744e+15  -5.08659744e+15
  -5.08659744e+15   1.87625744e+16   1.87625744e+16  -5.68000000e+02
   1.70650000e+03   1.40517230e+16  -2.98000000e+02   1.40517230e+16
   1.40517230e+16]


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [63]:
# Final model: fit on every row whose guess_date is before 2017-02-16
modelName = "linear2"
before_cutoff = train_data.guess_date < '2017-02-16'
clf = trainModel(train_data.loc[before_cutoff, fea].values,
                 train_data.loc[before_cutoff, 'cnt'].values)
# Alternative kept for reference: fit on the last 750 rows only
# clf = trainModel(train_data[-750:][fea].values, train_data[-750:]['cnt'].values)

Coefficients: [ -2.50807956e+04   2.01711025e+04   1.43492264e+03   5.41025064e+02
   3.50212452e+02   5.03895553e+00  -4.98628648e+02  -2.38710144e+02
   1.07751517e+02  -2.66689197e+02   1.13074906e+03  -7.04714783e+02
  -4.26034275e+02  -1.55457809e+03  -1.48214663e+03  -1.53159095e+03
  -1.14046640e+03  -6.73294222e+02  -4.71384231e+02   1.77883677e+01
   4.93428831e+02   8.84548797e+02   1.58445249e+03   1.74512983e+03
   2.12811220e+03   2.13394145e+01  -2.13394145e+01  -3.02401990e+02
   1.73930719e+03  -4.96419520e+02  -1.05461111e+03  -5.09383016e+02
  -1.24946522e+02]


In [64]:
# Load the test data
test_data = pd.read_csv('B_test_feature_set.csv')

# Preview the first 10 rows
print(test_data.head(10))

   day_of_week  week   day  guess_date  date_year  date_month  date_property  \
0            5   215  1510  2017-02-17       2017           2              0   
1            6   215  1511  2017-02-18       2017           2              1   
2            7   215  1512  2017-02-19       2017           2              1   
3            1   216  1513  2017-02-20       2017           2              0   
4            2   216  1514  2017-02-21       2017           2              0   
5            3   216  1515  2017-02-22       2017           2              0   
6            4   216  1516  2017-02-23       2017           2              0   
7            5   216  1517  2017-02-24       2017           2              0   
8            6   216  1518  2017-02-25       2017           2              1   
9            1   217  1520  2017-02-27       2017           2              0   

   sale_quantity dividedMonth  pre_date_property      ...        date_month_6  \
0        44669.0         late         

In [65]:
# Run the prediction on the test features
predict = clf.predict(test_data[fea].values)

# Attach the predictions to the submission template, flooring negatives at 0
predict_df = pd.read_csv('test_B_20171225.txt', sep='\t')
predict_df['predict'] = [0 if value < 0 else value for value in predict]

print(predict_df.head(10))

   date  day_of_week      predict
0  1308            5  2406.014282
1  1309            6   792.065581
2  1310            7   417.624867
3  1311            1  2785.375643
4  1312            2  2896.965021
5  1313            3  2551.791524
6  1314            4  2048.123920
7  1315            5  2308.042424
8  1316            6   694.093723
9  1317            1  2687.403785


In [66]:
# Inspect the prediction result: store the predictions next to the test features
test_data['predict'] = predict

# Write a frame to a CSV file in the working directory
def exportPredictResult(df, fileName):
    """Save ``df`` to ``./<fileName>.csv`` with a header row and no index column."""
    target = './%s.csv' % fileName
    df.to_csv(target, header=True, index=False)
# Save the date, prediction and weekday columns to ./2_25_B.csv
exportPredictResult(test_data[['guess_date', 'predict', 'day_of_week']], '2_25_B')

# List the dates whose prediction is at most 10
print(test_data.guess_date[test_data.predict <= 10])

27     2017-03-19
59     2017-04-23
65     2017-04-29
66     2017-04-30
93     2017-05-28
105    2017-06-11
112    2017-06-18
119    2017-06-25
140    2017-07-16
147    2017-07-23
154    2017-07-30
175    2017-08-20
182    2017-08-27
217    2017-10-05
262    2017-11-19
269    2017-11-26
Name: guess_date, dtype: object


In [67]:
# Save the prediction result (date and prediction columns only)
exportResult(predict_df[['date','predict']], 'linear_model_B_keng_2_25')