In [1]:
# Importing libraries
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [3]:
# read data, combine train and test data,separate time information
dfs = {}
for name in ['train', 'test']:
#     df = pd.read_csv('C:\Users/Administrator/Desktop/Kaggle/data/%s.csv' % name)
    df = pd.read_csv('Data/%s.csv' %name)
    df['_data'] = name
    dfs[name] = df

df = dfs['train'].append(dfs['test'])

dt = pd.DatetimeIndex(df['dteday'])
df['day'] = dt.day
df['dow'] = dt.dayofweek
df['woy'] = dt.weekofyear

In [4]:
# logarithmic transformation to do error calculation
df['cnt_log'] = np.log(df['cnt'] + 1)

# add a by_season_all column which represents all renting bikes by season
by_season = df[df['_data'] == 'train'].groupby('season')[['cnt']].agg(sum)
by_season.columns = ['by_season_all']
df = df.join(by_season, on='season')

In [20]:
# add another feature discussed before, it shows peaks of bike-renting number
df['peak'] = df[['hr', 'workingday']].apply(lambda x: (0, 1)[(x['workingday'] == 1 and  ( x['hr'] == 8 or 17 <= x['hr'] <= 18 or 12 <= x['hr'] <= 12)) or (x['workingday'] == 0 and  10 <= x['hr'] <= 19)], axis = 1)

# add features to show special climate, this idea is from a blog 
df['ideal'] = df[['temp', 'windspeed']].apply(lambda x: (0, 1)[x['temp'] > 0.6 and x['windspeed'] < 0.44778], axis = 1)
df['wet'] = df[['hum', 'workingday']].apply(lambda x: (0, 1)[x['workingday'] == 1 and x['hum'] >= 0.6], axis = 1)

# test for mutual exclusiveness for weathersit
df['w1'] = df[['weathersit']].apply(lambda x: (0,1)[x['weathersit'] == 1],axis = 1)
df['w2'] = df[['weathersit']].apply(lambda x: (0,1)[x['weathersit'] == 2],axis = 1)
df['w3'] = df[['weathersit']].apply(lambda x: (0,1)[x['weathersit'] == 3],axis = 1)
df['w4'] = df[['weathersit']].apply(lambda x: (0,1)[x['weathersit'] == 4],axis = 1)

# test for mutual exclusiveness for seasons
df['s1'] = df[['season']].apply(lambda x: (0,1)[x['season'] == 1],axis = 1)
df['s2'] = df[['season']].apply(lambda x: (0,1)[x['season'] == 2],axis = 1)
df['s3'] = df[['season']].apply(lambda x: (0,1)[x['season'] == 3],axis = 1)
df['s4'] = df[['season']].apply(lambda x: (0,1)[x['season'] == 4],axis = 1)


print(df)
# df.to_csv('Data/dataframe.csv')

      _data   atemp    cnt      dteday  holiday  hr   hum  instant  mnth  \
0     train  0.2879   16.0  2011-01-01        0   0  0.81        1     1   
1     train  0.2727   40.0  2011-01-01        0   1  0.80        2     1   
2     train  0.2727   32.0  2011-01-01        0   2  0.80        3     1   
3     train  0.2879   13.0  2011-01-01        0   3  0.75        4     1   
4     train  0.2879    1.0  2011-01-01        0   4  0.75        5     1   
5     train  0.2576    1.0  2011-01-01        0   5  0.75        6     1   
6     train  0.2727    2.0  2011-01-01        0   6  0.80        7     1   
7     train  0.2576    3.0  2011-01-01        0   7  0.86        8     1   
8     train  0.2879    8.0  2011-01-01        0   8  0.75        9     1   
9     train  0.3485   14.0  2011-01-01        0   9  0.76       10     1   
10    train  0.3939   36.0  2011-01-01        0  10  0.76       11     1   
11    train  0.3333   56.0  2011-01-01        0  11  0.81       12     1   
12    train 

In [6]:
def get_data():
    data = df[df['_data'] == 'train'].copy()
    return data

def get_error(y_pred, y_actual):
    difference = np.log(y_pred + 1) - np.log(y_actual + 1)
    mean_error = np.square(difference).mean()
    return np.sqrt(mean_error)

def get_feature_and_result(data, input_cols):
    X = data[input_cols].as_matrix()
    y = data['cnt_log'].as_matrix()
    return X, y
def train_test_split(data):
    train = data[data['day'] <= 14]
    test = data[data['day'] > 14 ]
    return train, test

# similar functions have been given during the lab
def predict_on_validation_set(model, input_cols):
    data = get_data()
    train, test = train_test_split(data)
    X_train, y_train = get_feature_and_result(train, input_cols)
    X_test, y_test = get_feature_and_result(test, input_cols)
    my_model = model.fit(X_train, y_train)
    y_pred = np.exp(my_model.predict(X_test))-1
    y_pred_last = np.round(y_pred)
    y_pred_last[y_pred_last < 0] = 0
    y_test_last = np.exp(y_test)-1
    score = get_error(y_pred_last, y_test_last)
    return (y_pred_last, y_test_last, score)

df_test = df[df['_data'] == 'test'].copy()

def predict_on_test_set(model, input_cols):
    df_train = df[df['_data'] == 'train'].copy()
    X_train = df_train[input_cols].as_matrix()
    y_train = df_train['cnt_log'].as_matrix()
    X_test = df_test[input_cols].as_matrix()
    my_model = model.fit(X_train, y_train)
    y_pred = my_model.predict(X_test)
    y_pred = np.exp(y_pred)-1
    return y_pred


In [21]:
linreg_cols = [
    'weathersit', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday', 'season', 
    'hr', 'dow', 'yr','mnth','woy'
]
# Split weather into 4
linreg_cols1 = [
    'w1', 'w1','w3','w4', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday', 'season', 
    'hr', 'dow', 'yr','mnth','woy'
]
# Split seasons into 3
linreg_cols2 = [
    'weathersit', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday', 's1', 's2', 's3', 's4', 
    'hr', 'dow', 'yr','mnth','woy'
]
# Split both
linreg_cols3= [
    'w1', 'w1','w3','w4', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday','s1', 's2', 's3', 's4' , 
    'hr', 'dow', 'yr','mnth','woy'
]


linreg_model = linear_model.LinearRegression()
(linreg_p, linreg_t, linreg_score) = predict_on_validation_set(linreg_model, linreg_cols)
(linreg_p1, linreg_t1, linreg_score1) = predict_on_validation_set(linreg_model, linreg_cols1)
(linreg_p2, linreg_t2, linreg_score2) = predict_on_validation_set(linreg_model, linreg_cols2)
(linreg_p3, linreg_t3, linreg_score3) = predict_on_validation_set(linreg_model, linreg_cols3)
print(linreg_score)
print(linreg_score1)
print(linreg_score2)
print(linreg_score3)

1.02838362041
1.0263485157
1.02418553777
1.02102662703


In [9]:
linreg_pred = predict_on_test_set(linreg_model, linreg_cols)
y_pred = np.round(linreg_pred)
df_test['Prediction'] = y_pred
result = df_test[['dteday', 'Prediction']].copy()
result.to_csv('Data/linreg_submit_new.csv', index=False)