In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from scipy.stats import mode
import csv
import matplotlib.dates
from datetime import *
from sklearn import linear_model
from sklearn.preprocessing import *
from sklearn import ensemble
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

In [2]:
# Split a frame into a leading (training) and a trailing (test) partition
def trainTestSplit(df, splitN, trainLabel, targetLabel='cnt'):
    """Split ``df`` by row position into train/test features and targets.

    The first ``splitN`` rows form the training partition and the remaining
    rows form the test partition. Rows are not shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    splitN : int
        Number of leading rows assigned to the training partition.
    trainLabel : list of str
        Names of the feature columns to select.
    targetLabel : str, default 'cnt'
        Name of the target column.

    Returns
    -------
    tuple
        ``(trainX, trainY, testX, testY)``.
    """
    # .iloc makes the split explicitly positional, whatever the index type.
    trainPart = df.iloc[:splitN]
    testPart = df.iloc[splitN:]
    return (trainPart[trainLabel], trainPart[targetLabel],
            testPart[trainLabel], testPart[targetLabel])

# Fit the regression model
def trainModel(X, y):
    """Fit a cross-validated ridge regressor on ``(X, y)`` and return it.

    Candidate alphas are 0.01, 0.02, ..., 1.99 and are scored with
    ``neg_mean_squared_error``. The fitted coefficients are printed.

    Other estimators considered for this slot: LinearRegression,
    LogisticRegression and ensemble.RandomForestRegressor.
    """
    alphaGrid = [0.01 * step for step in range(1, 200)]
    model = linear_model.RidgeCV(alphas=alphaGrid, scoring='neg_mean_squared_error')
    model.fit(X, y)
    print('Coefficients:', model.coef_)
    return model

# Evaluate the model on held-out data
def validModel(trainX, trainY, testX, testY):
    """Train on the training split and print the error on the test split.

    The printed cost is the squared L2 norm of ``prediction - testY`` divided
    by the number of predictions. Nothing is returned.
    """
    fitted = trainModel(trainX, trainY)
    predicted = fitted.predict(testX)
    squaredError = np.linalg.norm(predicted - testY) ** 2
    print("cost:", squaredError / len(predicted))
    
# Add summary statistics of the i-th previous week as lag features
def statWeek(df, weeks):
    """Attach per-week statistics of ``cnt`` from earlier weeks to each row.

    For every lag ``i`` in ``weeks``, the mean, standard deviation, maximum
    and minimum of ``cnt`` grouped by ``week`` are joined onto the rows whose
    ``week`` equals the source week plus ``i``. Rows without a matching
    source week receive NaN in the new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``week`` and ``cnt``. It is not modified.
    weeks : int or iterable of int
        Lag or lags, in weeks.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The frame with the added columns ``mean<i>``, ``std<i>``, ``max<i>``
        and ``min<i>`` for each lag, and the list of those column names.
    """
    if isinstance(weeks, (int, np.integer)):
        weeks = [weeks]
    # The per-week statistics do not depend on the lag, so compute them once.
    # String aggregator names pin pandas' own implementations (sample std,
    # ddof=1) instead of relying on the deprecated mapping of NumPy callables.
    # groupby().agg also keeps a statistic that is NaN for every week (e.g.
    # std with one row per week), which pivot_table's default dropna=True
    # would drop and thereby break the four-name column assignment.
    weekStats = df.groupby('week')['cnt'].agg(['mean', 'std', 'max', 'min'])
    colName = []
    for i in weeks:
        lagNames = ['mean%d' % i, 'std%d' % i, 'max%d' % i, 'min%d' % i]
        lagged = weekStats.copy()
        lagged.columns = lagNames
        # Shift the source week forward so it lines up with the target week.
        lagged.index = weekStats.index + i
        colName.extend(lagNames)
        df = pd.merge(df, lagged, left_on='week', right_index=True, how='left')
    return df, colName

# Export prediction results
def exportResult(df, fileName):
    """Write ``df`` to ``./<fileName>.txt`` as tab-separated text.

    No header row and no index column are written.
    """
    outPath = './%s.txt' % fileName
    df.to_csv(outPath, sep='\t', header=False, index=False)

In [3]:
# Load the training feature set
train_data = pd.read_csv(filepath_or_buffer='A_fulltraining_feature_set.csv')

# Preview the first ten rows
print(train_data.head(n=10))

    cnt  day_of_week  week  guess_date  date_year  date_month  date_property  \
0    68            3     0  2013-01-02       2013           1              2   
1    36            4     0  2013-01-03       2013           1              2   
2  5565            5     0  2013-01-04       2013           1              0   
3  4966            6     0  2013-01-05       2013           1              0   
4  3346            7     0  2013-01-06       2013           1              0   
5  3396            1     1  2013-01-07       2013           1              0   
6  4146            2     1  2013-01-08       2013           1              0   
7  3096            3     1  2013-01-09       2013           1              0   
8  2713            4     1  2013-01-10       2013           1              0   
9  2409            5     1  2013-01-11       2013           1              0   

  guess_date_str  sale_quantity dividedMonth        ...          date_month_5  \
0     2013-01-02        28137.0       

In [22]:
# Split into training and validation partitions (first ~67% of rows vs. the rest)
splitN = int(0.67 * train_data.index[-1])

# Active feature set: sale_quantity_scaled, the day_of_week_*, date_property_*
# and date_month_* columns, and four holiday-related flags.
# A disabled alternative additionally used 'week_scaled', 'date_year_scaled',
# 'dividedMonth_late', 'dividedMonth_early', 'is_holi_restday' and 'isPureWeekend'.
fea = (
    ['sale_quantity_scaled']
    + ['day_of_week_%d' % d for d in range(1, 8)]
    + ['date_property_%d' % p for p in range(3)]
    + ['date_month_%d' % m for m in range(1, 13)]
    + ['after_restday_one', 'after_holiday_one', 'is_newYearDay', 'isHolidayWeekend']
)
trainX, trainY, testX, testY = trainTestSplit(train_data, splitN, fea)
print(trainX.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 797 entries, 0 to 796
Data columns (total 27 columns):
sale_quantity_scaled    797 non-null float64
day_of_week_1           797 non-null int64
day_of_week_2           797 non-null int64
day_of_week_3           797 non-null int64
day_of_week_4           797 non-null int64
day_of_week_5           797 non-null int64
day_of_week_6           797 non-null int64
day_of_week_7           797 non-null int64
date_property_0         797 non-null int64
date_property_1         797 non-null int64
date_property_2         797 non-null int64
date_month_1            797 non-null int64
date_month_2            797 non-null int64
date_month_3            797 non-null int64
date_month_4            797 non-null int64
date_month_5            797 non-null int64
date_month_6            797 non-null int64
date_month_7            797 non-null int64
date_month_8            797 non-null int64
date_month_9            797 non-null int64
date_month_10           797 non-n

In [23]:
# Validate the model on the held-out partition
validModel(
    trainX.values,
    trainY.values,
    testX.values,
    testY.values,
)

Coefficients: [ 1293.26298164   366.60953297   265.15920698   -11.27698942  -390.65169631
  -166.49965812    61.42218802  -124.76258413  1341.63015461  -537.85346338
  -803.77669123   802.69042415   218.49541904   -19.81278918   -63.12964945
   -18.12394411  -202.92766132  -172.01569925  -194.95513731    -8.71301626
    -6.52198232  -151.84138315  -183.14458083  -150.64696655  1662.6066923
  -677.36080745    23.27886307]
cost: 327748.496453


In [24]:
# Final model: refit on the complete training set
modelName = "linear2"
clf = trainModel(
    train_data[fea].values,
    train_data['cnt'].values,
)
# Disabled variant restricted to the last 750 rows:
# clf = trainModel(train_data[-750:][fea].values, train_data[-750:]['cnt'].values)

Coefficients: [  955.93546984   317.90585364   350.35099959    45.41680449  -396.65241576
  -122.01381277    22.83365903  -217.84108822  1344.83149616  -475.77943004
  -869.05206612   869.97653231   375.7621051    -63.59795351   -89.35192432
   -39.90582665  -221.28182412  -204.66375774  -196.70888297   -44.42316739
    -6.64106752  -168.75542406  -210.40880914   -29.25844615  1508.64894437
  -754.26008835   134.67516332]


In [25]:
# Load the test feature set
test_data = pd.read_csv(filepath_or_buffer='A_fulltesting_feature_set.csv')

# Preview the first ten rows
print(test_data.head(n=10))

   day_of_week  week   day  guess_date  date_year  date_month  date_property  \
0            4   170  1194  2016-04-07       2016           4              0   
1            5   170  1195  2016-04-08       2016           4              0   
2            6   170  1196  2016-04-09       2016           4              1   
3            1   171  1198  2016-04-11       2016           4              0   
4            2   171  1199  2016-04-12       2016           4              0   
5            3   171  1200  2016-04-13       2016           4              0   
6            4   171  1201  2016-04-14       2016           4              0   
7            5   171  1202  2016-04-15       2016           4              0   
8            6   171  1203  2016-04-16       2016           4              1   
9            1   172  1205  2016-04-18       2016           4              0   

  guess_date_str  sale_quantity dividedMonth      ...       date_month_6  \
0     2016-04-07        41163.0        earl

In [26]:
# Run the prediction on the test features
predict = clf.predict(test_data[fea].values)

# Attach the predictions to the frame read from test_A_20171225.txt and
# replace negative predictions with zero
predict_df = pd.read_csv('test_A_20171225.txt', sep='\t')
predict_df['predict'] = predict
predict_df['predict'] = predict_df['predict'].map(lambda value: value if not value < 0 else 0)

print(predict_df.head(n=10))

   date  day_of_week      predict
0  1032            4  1707.144816
1  1033            5  1981.783419
2  1034            6   306.019964
3  1035            1  2392.444639
4  1036            2  2454.148231
5  1037            3  2149.214036
6  1038            4  1707.144816
7  1039            5  1981.783419
8  1040            6   306.019964
9  1041            1  2392.444639


In [27]:
# Score against the published leaderboard-A answers to guide feature tuning
answer_predict = pd.read_csv('answer_A_20180225.txt', sep='\t', header=None)
# NOTE(review): this scores the raw `predict` array, not the zero-floored
# values stored in predict_df['predict'] — confirm that is intended.
cost = np.linalg.norm(predict - answer_predict.loc[:, 1]) ** 2 / len(predict)
print(cost)

504174.972413


In [113]:
# Inspect the predictions next to the published answers
test_data['predict'] = predict
test_data['answer'] = answer_predict[1]

# Export a frame as CSV (with header row, without index column)
def exportPredictResult(df, fileName):
    """Write ``df`` to ``./<fileName>.csv`` with a header row and no index."""
    csvPath = './%s.csv' % fileName
    df.to_csv(csvPath, header=True, index=False)

exportPredictResult(
    test_data[['guess_date', 'predict', 'answer', 'day_of_week']],
    'A_compare',
)

In [27]:
#stacking
import numpy as np
from sklearn.model_selection import KFold

# Return out-of-fold predictions for the training set and fold-averaged
# predictions for the test set
def getOof(clf, trainX, trainY, testX, nFold=10, randomState=None):
    """Compute out-of-fold (stacking) predictions with shuffled K-fold.

    For each fold, a fresh copy of ``clf`` is fitted on the other folds and
    used to predict the held-out training rows and the full test set. The
    test predictions of all folds are averaged.

    Parameters
    ----------
    clf : estimator
        Scikit-learn style estimator. It is cloned for every fold, so the
        object passed in (and any fit it already carries) is left untouched.
    trainX, trainY : numpy.ndarray
        Training features and targets; indexed with integer index arrays.
    testX : numpy.ndarray
        Test features.
    nFold : int, default 10
        Number of folds.
    randomState : int or None, default None
        Seed forwarded to ``KFold`` for a reproducible shuffle. ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``oofTrain`` with one prediction per training row, and ``oofTest``
        with the per-fold test predictions averaged.
    """
    # Local import: sklearn is already a dependency of this notebook.
    from sklearn.base import clone

    oofTrain = np.zeros(trainX.shape[0])
    oofTestSkf = np.zeros((testX.shape[0], nFold))
    kf = KFold(n_splits=nFold, shuffle=True, random_state=randomState)
    for i, (trainIdx, testIdx) in enumerate(kf.split(trainX)):
        # Refitting the caller's estimator in place would leave it trained on
        # the last fold's subset only, so fit an unfitted clone instead.
        foldClf = clone(clf)
        foldClf.fit(trainX[trainIdx], trainY[trainIdx])
        oofTrain[testIdx] = foldClf.predict(trainX[testIdx])
        oofTestSkf[:, i] = foldClf.predict(testX)
    oofTest = oofTestSkf.mean(axis=1)
    return oofTrain, oofTest

# Produce the stacking features from the current estimator
oofTrain, oofTest = getOof(
    clf,
    train_data[fea].values,
    train_data['cnt'].values,
    test_data[fea].values,
)

# Training side: per-date table (summed cnt, max day_of_week) plus the
# out-of-fold prediction, written with the date index
df = pd.read_csv('train_20171215.txt', sep='\t')
dataDf = pd.pivot_table(
    df,
    index=["date"],
    values=["cnt", "day_of_week"],
    aggfunc={"cnt": np.sum, "day_of_week": np.max},
)
# assumes the per-date table has one row per row of train_data, in the same
# order — TODO confirm
dataDf['predict'] = oofTrain
oofTrain_df = dataDf['predict']
oofTrain_df.to_csv('stacking_keng1_train.csv', header=True, index=True)

# Test side: date plus the fold-averaged prediction, written without index
df_test_A = pd.read_csv('test_A_20171225.txt', sep='\t')
df_test_A['predict'] = oofTest
oofTest_df = df_test_A[['date', 'predict']]
oofTest_df.to_csv('stacking_keng1_test.csv', header=True, index=False)

In [9]:
# Inspect the prediction results
test_data['predict'] = predict

# Export a frame as CSV (with header row, without index column).
# Same body as the definition in the answer-comparison cell; redefined here so
# this cell does not depend on that one having been run.
def exportPredictResult(df, fileName):
    """Write ``df`` to ``./<fileName>.csv`` with a header row and no index."""
    csvPath = './%s.csv' % fileName
    df.to_csv(csvPath, header=True, index=False)

exportPredictResult(
    test_data[['guess_date', 'predict', 'day_of_week']],
    'predict_result_isPureWeekend',
)

# List the dates whose prediction is at most 10
print(test_data.loc[test_data['predict'] <= 10, 'guess_date'])

53     2016-06-09
54     2016-06-11
140    2016-09-15
155    2016-10-06
Name: guess_date, dtype: object


In [10]:
# Save the prediction results (date and prediction columns only)
exportResult(
    predict_df.loc[:, ['date', 'predict']],
    'linear_all_data_isPureWeekend_2_8',
)