In [1]:
# Importing libraries
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [8]:
import pandas as pd
import numpy as np
from sklearn import svm

In [3]:
# read data, combine train and test data,separate time information
dfs = {}
for name in ['train', 'test']:
#     df = pd.read_csv('C:\Users/Administrator/Desktop/Kaggle/data/%s.csv' % name)
    df = pd.read_csv('Data/%s.csv' %name)
    df['_data'] = name
    dfs[name] = df

df = dfs['train'].append(dfs['test'])

dt = pd.DatetimeIndex(df['dteday'])
df['day'] = dt.day
df['dow'] = dt.dayofweek
df['woy'] = dt.weekofyear

In [4]:
# logarithmic transformation to do error calculation
df['cnt_log'] = np.log(df['cnt'] + 1)

# add a by_season_all column which represents all renting bikes by season
by_season = df[df['_data'] == 'train'].groupby('season')[['cnt']].agg(sum)
by_season.columns = ['by_season_all']
df = df.join(by_season, on='season')

In [5]:
# add another feature discussed before, it shows peaks of bike-renting number
df['peak'] = df[['hr', 'workingday']].apply(lambda x: (0, 1)[(x['workingday'] == 1 and  ( x['hr'] == 8 or 17 <= x['hr'] <= 18 or 12 <= x['hr'] <= 12)) or (x['workingday'] == 0 and  10 <= x['hr'] <= 19)], axis = 1)

# add features to show special climate, this idea is from a blog 
df['ideal'] = df[['temp', 'windspeed']].apply(lambda x: (0, 1)[x['temp'] > 0.6 and x['windspeed'] < 0.44778], axis = 1)
df['wet'] = df[['hum', 'workingday']].apply(lambda x: (0, 1)[x['workingday'] == 1 and x['hum'] >= 0.6], axis = 1)

## OKAY WEATHER DOES NOT WORK
# test for mutual exclusiveness for weathersit
df['w1'] = df[['weathersit']].apply(lambda x: (0,1)[x['weathersit'] == 1],axis = 1)
df['w2'] = df[['weathersit']].apply(lambda x: (0,1)[x['weathersit'] == 2],axis = 1)
df['w3'] = df[['weathersit']].apply(lambda x: (0,1)[x['weathersit'] == 3],axis = 1)
df['w4'] = df[['weathersit']].apply(lambda x: (0,1)[x['weathersit'] == 4],axis = 1)

## OKAY SEASONS DO NOT WORK EITHER
# test for mutual exclusiveness for seasons
df['s1'] = df[['season']].apply(lambda x: (0,1)[x['season'] == 1],axis = 1)
df['s2'] = df[['season']].apply(lambda x: (0,1)[x['season'] == 2],axis = 1)
df['s3'] = df[['season']].apply(lambda x: (0,1)[x['season'] == 3],axis = 1)
df['s4'] = df[['season']].apply(lambda x: (0,1)[x['season'] == 4],axis = 1)

In [7]:
def get_data():
    data = df[df['_data'] == 'train'].copy()
    return data

def get_error(y_pred, y_actual):
    difference = np.log(y_pred + 1) - np.log(y_actual + 1)
    mean_error = np.square(difference).mean()
    return np.sqrt(mean_error)

def get_feature_and_result(data, input_cols):
    X = data[input_cols].as_matrix()
    y = data['cnt_log'].as_matrix()
    return X, y
def train_test_split(data):
    train = data[data['day'] <= 14]
    test = data[data['day'] > 14 ]
    return train, test

# similar functions have been given during the lab
def predict_on_validation_set(model, input_cols):
    data = get_data()
    train, test = train_test_split(data)
    X_train, y_train = get_feature_and_result(train, input_cols)
    X_test, y_test = get_feature_and_result(test, input_cols)
    my_model = model.fit(X_train, y_train)
    y_pred = np.exp(my_model.predict(X_test))-1
    y_pred_last = np.round(y_pred)
    y_pred_last[y_pred_last < 0] = 0
    y_test_last = np.exp(y_test)-1
    score = get_error(y_pred_last, y_test_last)
    return (y_pred_last, y_test_last, score)

df_test = df[df['_data'] == 'test'].copy()

def predict_on_test_set(model, input_cols):
    df_train = df[df['_data'] == 'train'].copy()
    X_train = df_train[input_cols].as_matrix()
    y_train = df_train['cnt_log'].as_matrix()
    X_test = df_test[input_cols].as_matrix()
    my_model = model.fit(X_train, y_train)
    y_pred = my_model.predict(X_test)
    y_pred = np.exp(y_pred)-1
    return y_pred

def scoring_on_validation_set(model, input_cols):
    pred = []
    data = get_data()
    train, test = train_test_split(data)
    
    X_train, y_train = get_feature_and_result(train, input_cols)
    X_test, y_test = get_feature_and_result(test, input_cols)
#     my_model = model.fit(X_train, y_train)
    
    cv = cross_validation.ShuffleSplit(n=len(train),n_iter=10, random_state=0)
    scores_shuffle = np.abs(cross_validation.cross_val_score(model, X_train, y_train, cv=cv))
    scores_shuffle_MAE = np.abs(cross_validation.cross_val_score(model, X_train, y_train, cv=cv, scoring = 'mean_absolute_error'))
    scores_shuffle_MSE = np.abs(cross_validation.cross_val_score(model, X_train, y_train, cv=cv, scoring = 'mean_squared_error'))
    scores_shuffle_MedianAE = np.abs(cross_validation.cross_val_score(model, X_train, y_train, cv=cv, scoring = 'median_absolute_error'))

    print("R2 Accuracy: %0.2f (+/- %0.2f)" % (scores_shuffle.mean(), scores_shuffle.std() * 2))
    print("MAE Accuracy: %0.2f (+/- %0.2f)" % (scores_shuffle_MAE.mean(), scores_shuffle_MAE.std() * 2))
    print("MSE Accuracy: %0.2f (+/- %0.2f)" % (scores_shuffle_MSE.mean(), scores_shuffle_MSE.std() * 2))
    print("MedianAE Accuracy: %0.2f (+/- %0.2f)" % (scores_shuffle_MedianAE.mean(), scores_shuffle_MedianAE.std() * 2))


In [26]:
SVM = svm.LinearSVR(C =0.08)
svm_cols = [
    'weathersit', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday', 'season', 
    'hr', 'dow', 'yr','mnth','woy'
]
svm_cols1 = [
    'weathersit', 'temp', 'atemp', 'windspeed',
    'workingday', 'season', 'holiday', 'wet',
    'hr', 'dow', 'woy', 'peak'
]

svm_cols2 = [
    'w1', 'w1','w3','w4', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday', 'season', 
    'hr', 'dow', 'yr','mnth','woy'
]
# Split seasons into 3
svm_cols3 = [
    'weathersit', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday', 's1', 's2', 's3', 's4', 
    'hr', 'dow', 'yr','mnth','woy'
]
# Split both
svm_cols4= [
    'w1', 'w1','w3','w4', 'temp', 'wet', 'ideal', 
    'holiday', 'workingday','s1', 's2', 's3', 's4' , 
    'hr', 'dow', 'yr','mnth','woy'
]

from sklearn import linear_model, preprocessing, grid_search, cross_validation, metrics
print(scoring_on_validation_set(SVM, svm_cols))
print(scoring_on_validation_set(SVM, svm_cols1))
print(scoring_on_validation_set(SVM, svm_cols2))
print(scoring_on_validation_set(SVM, svm_cols3))
print(scoring_on_validation_set(SVM, svm_cols4))

R2 Accuracy: 0.46 (+/- 0.04)
MAE Accuracy: 0.83 (+/- 0.04)
MSE Accuracy: 1.13 (+/- 0.10)
MedianAE Accuracy: 0.66 (+/- 0.05)
None
R2 Accuracy: 0.55 (+/- 0.02)
MAE Accuracy: 0.75 (+/- 0.03)
MSE Accuracy: 0.93 (+/- 0.08)
MedianAE Accuracy: 0.59 (+/- 0.05)
None
R2 Accuracy: 0.47 (+/- 0.04)
MAE Accuracy: 0.82 (+/- 0.03)
MSE Accuracy: 1.10 (+/- 0.10)
MedianAE Accuracy: 0.67 (+/- 0.05)
None
R2 Accuracy: 0.47 (+/- 0.04)
MAE Accuracy: 0.82 (+/- 0.04)
MSE Accuracy: 1.11 (+/- 0.09)
MedianAE Accuracy: 0.64 (+/- 0.06)
None
R2 Accuracy: 0.48 (+/- 0.03)
MAE Accuracy: 0.81 (+/- 0.04)
MSE Accuracy: 1.09 (+/- 0.09)
MedianAE Accuracy: 0.66 (+/- 0.03)
None


In [24]:
param_grid = {'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]}
clf = grid_search.GridSearchCV(svm.LinearSVR(), param_grid)

def C(model, input_cols):
    pred = []
    data = get_data()
    train, test = train_test_split(data)
    
    X_train, y_train = get_feature_and_result(train, input_cols)
    X_test, y_test = get_feature_and_result(test, input_cols)
    
    
    cv = cross_validation.ShuffleSplit(n=len(train),n_iter=10, random_state=0)

    my_model = model.fit(X_train, y_train)
    print(my_model.best_params_)
    
C(clf, svm_cols)
C(clf, svm_cols1)
C(clf, svm_cols2)
C(clf, svm_cols3)
C(clf, svm_cols4)

{'C': 0.08}
{'C': 0.08}
{'C': 0.07}
{'C': 0.1}
{'C': 0.09}


In [27]:
svm_pred = predict_on_test_set(SVM, svm_cols1)
y_pred = np.round(svm_pred)
df_test['Prediction'] = y_pred
result = df_test[['dteday', 'Prediction']].copy()
result.to_csv('Data/svm_submit_new.csv', index=False)