In [1]:

# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
from datetime import datetime

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report



In [2]:
# Read the raw training and test files.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train_features = train.columns.tolist()

# The target is the final column; the features are everything except the last three columns.
y = train.iloc[:, -1]
X = train.iloc[:, :-3]

# Hold out a dev split with a fixed seed so the partition is repeatable.
X_train, X_dev, y_train, y_dev = train_test_split(X, y, random_state=1)

X_test = test.iloc[:, :]

# Keep the first test column as a one-column frame with a fresh 0..n-1 index,
# so it can be joined column-wise to the predictions later.
date_time = X_test.iloc[:, [0]].reset_index(drop=True)

In [3]:
# Preview the first rows of the training split.
X_train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
9639,2012-10-06 00:00:00,4,0,0,1,22.96,26.515,83,11.0014
7284,2012-05-02 21:00:00,2,0,1,1,21.32,25.0,77,15.0013
10463,2012-12-02 09:00:00,4,0,0,2,11.48,13.635,93,11.0014
3079,2011-07-17 05:00:00,3,0,0,1,26.24,30.305,69,15.0013
5630,2012-01-09 17:00:00,1,0,1,3,9.02,13.635,87,0.0


In [4]:
# Working frame used as the input to the feature-engineering cell that follows.
# NOTE(review): X_train is presumably already a DataFrame (it is previewed with
# .head() above), so this wrap may be redundant — confirm.
X_train_df = pd.DataFrame(X_train)

In [70]:
# Feature engineering for the training split: one-hot encode season, weather and
# day of week, and add the hour of day taken from the timestamp.
#
# Each dummy frame is reindexed against the complete list of category codes
# before it is labelled, so a code that happens to be absent from this random
# split still yields an all-zero column. A purely positional rename of the
# whole frame would instead fail with a length mismatch in that case.
df_season = pd.get_dummies(X_train_df['season']).reindex(columns=[1, 2, 3, 4], fill_value=0)
df_season.columns = ['spring', 'summer', 'fall', 'winter']

df_weather = pd.get_dummies(X_train_df['weather']).reindex(columns=[1, 2, 3, 4], fill_value=0)
df_weather.columns = ['clear', 'mist', 'snow', 'rain']

X_train_transformed_df = pd.concat([X_train_df, df_season, df_weather], axis=1)

# Unparseable timestamps become NaT instead of raising.
X_train_transformed_df['datetime'] = pd.to_datetime(X_train_transformed_df['datetime'], errors='coerce')
X_train_transformed_df['weekday'] = X_train_transformed_df['datetime'].dt.dayofweek
# Named 'timehour' (no space) so the training columns carry the same names as
# the test-set features.
X_train_transformed_df['timehour'] = X_train_transformed_df['datetime'].dt.hour

# dt.dayofweek numbers the days 0 (Monday) through 6 (Sunday).
# The 'satuday' spelling is kept because the test-set frame uses the same label.
df_weekday = pd.get_dummies(X_train_transformed_df['weekday']).reindex(columns=list(range(7)), fill_value=0)
df_weekday.columns = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'satuday', 'sunday']
X_train_transformed_df = pd.concat([X_train_transformed_df, df_weekday], axis=1)

# TODO: consider adding a binary indicator derived from temperature.

# Drop the raw categorical columns now that their dummy encodings are present.
X_train_transformed_and_reduced_df = X_train_transformed_df.drop(['season', 'weather', 'weekday'], axis=1)

In [71]:
# Show only the first rows; rendering the whole frame bloats the notebook output.
X_train_transformed_and_reduced_df.head(10)

Unnamed: 0,datetime,holiday,workingday,temp,atemp,humidity,windspeed,spring,summer,fall,...,snow,rain,time hour,monday,tuesday,wednesday,thursday,friday,satuday,sunday
9639,2012-10-06 00:00:00,0,0,22.96,26.515,83,11.0014,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7284,2012-05-02 21:00:00,0,1,21.32,25.000,77,15.0013,0,1,0,...,0,0,21,0,0,1,0,0,0,0
10463,2012-12-02 09:00:00,0,0,11.48,13.635,93,11.0014,0,0,0,...,0,0,9,0,0,0,0,0,0,1
3079,2011-07-17 05:00:00,0,0,26.24,30.305,69,15.0013,0,0,1,...,0,0,5,0,0,0,0,0,0,1
5630,2012-01-09 17:00:00,0,1,9.02,13.635,87,0.0000,1,0,0,...,1,0,17,1,0,0,0,0,0,0
1883,2011-05-05 09:00:00,0,1,18.86,22.725,44,19.0012,0,1,0,...,0,0,9,0,0,0,1,0,0,0
669,2011-02-11 05:00:00,0,1,3.28,6.060,63,6.0032,1,0,0,...,0,0,5,0,0,0,0,1,0,0
8554,2012-07-17 19:00:00,0,1,36.08,39.395,35,16.9979,0,0,1,...,0,0,19,0,1,0,0,0,0,0
9000,2012-08-17 09:00:00,0,1,28.70,12.120,58,7.0015,0,0,1,...,0,0,9,0,0,0,0,1,0,0
602,2011-02-08 06:00:00,0,1,10.66,11.365,70,22.0028,1,0,0,...,0,0,6,0,1,0,0,0,0,0


In [74]:
# Feature engineering for the test set: one-hot encode season, weather and day
# of week, and add the hour of day taken from the timestamp.
X_test_df = pd.DataFrame(X_test).reset_index(drop=True)

# Each dummy frame is reindexed against the complete list of category codes
# before it is labelled, so a code that never occurs in this data still yields
# an all-zero column. A purely positional rename of the whole frame would
# instead fail with a length mismatch in that case.
df_season = pd.get_dummies(X_test_df['season']).reindex(columns=[1, 2, 3, 4], fill_value=0)
df_season.columns = ['spring', 'summer', 'fall', 'winter']

df_weather = pd.get_dummies(X_test_df['weather']).reindex(columns=[1, 2, 3, 4], fill_value=0)
df_weather.columns = ['clear', 'mist', 'snow', 'rain']

X_test_transformed_df = pd.concat([X_test_df, df_season, df_weather], axis=1)

# Unparseable timestamps become NaT instead of raising.
X_test_transformed_df['datetime'] = pd.to_datetime(X_test_transformed_df['datetime'], errors='coerce')
X_test_transformed_df['weekday'] = X_test_transformed_df['datetime'].dt.dayofweek
X_test_transformed_df['timehour'] = X_test_transformed_df['datetime'].dt.hour

# dt.dayofweek numbers the days 0 (Monday) through 6 (Sunday).
# The 'satuday' spelling is kept because the training frame uses the same label.
df_weekday = pd.get_dummies(X_test_transformed_df['weekday']).reindex(columns=list(range(7)), fill_value=0)
df_weekday.columns = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'satuday', 'sunday']
X_test_transformed_df = pd.concat([X_test_transformed_df, df_weekday], axis=1)

# Drop the raw categorical columns now that their dummy encodings are present.
X_test_transformed_and_reduced_df = X_test_transformed_df.drop(['season', 'weather', 'weekday'], axis=1)


In [75]:
# Show only the first rows; rendering the whole frame bloats the notebook output.
X_test_transformed_and_reduced_df.head(10)

Unnamed: 0,datetime,holiday,workingday,temp,atemp,humidity,windspeed,spring,summer,fall,...,snow,rain,timehour,monday,tuesday,wednesday,thursday,friday,satuday,sunday
0,2011-01-20 00:00:00,0,1,10.66,11.365,56,26.0027,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2011-01-20 01:00:00,0,1,10.66,13.635,56,0.0000,1,0,0,...,0,0,1,0,0,0,1,0,0,0
2,2011-01-20 02:00:00,0,1,10.66,13.635,56,0.0000,1,0,0,...,0,0,2,0,0,0,1,0,0,0
3,2011-01-20 03:00:00,0,1,10.66,12.880,56,11.0014,1,0,0,...,0,0,3,0,0,0,1,0,0,0
4,2011-01-20 04:00:00,0,1,10.66,12.880,56,11.0014,1,0,0,...,0,0,4,0,0,0,1,0,0,0
5,2011-01-20 05:00:00,0,1,9.84,11.365,60,15.0013,1,0,0,...,0,0,5,0,0,0,1,0,0,0
6,2011-01-20 06:00:00,0,1,9.02,10.605,60,15.0013,1,0,0,...,0,0,6,0,0,0,1,0,0,0
7,2011-01-20 07:00:00,0,1,9.02,10.605,55,15.0013,1,0,0,...,0,0,7,0,0,0,1,0,0,0
8,2011-01-20 08:00:00,0,1,9.02,10.605,55,19.0012,1,0,0,...,0,0,8,0,0,0,1,0,0,0
9,2011-01-20 09:00:00,0,1,9.84,11.365,52,15.0013,1,0,0,...,0,0,9,0,0,0,1,0,0,0


In [78]:
# Linear regression: fit on the engineered training features, predict the test set.

# Round-trip both engineered frames through CSV; the reloaded copies are the
# inputs used by the model-fitting cells.
X_train_transformed_and_reduced_df.to_csv("train_data_transformed.csv", index=False)
X_test_transformed_and_reduced_df.to_csv("test_data_transformed.csv", index=False)
X_train_transformed = pd.read_csv('train_data_transformed.csv')
X_test_transformed = pd.read_csv('test_data_transformed.csv')

# Positional slice from the third column onward, i.e. the first two columns are left out.
# NOTE(review): the files are written without an index column, so this omits the
# second column as well as the leading datetime column — confirm that is intended.
lm2 = LinearRegression().fit(X_train_transformed.iloc[:, 2:], y_train)
predictionsLM2 = lm2.predict(X_test_transformed.iloc[:, 2:])

In [80]:
#X_train_transformed.iloc[:,2:]

In [81]:
# Linear-model submission: raise negative predictions to zero, then pair them
# column-wise with the test timestamps.
submission_countLM = pd.DataFrame(predictionsLM2).clip(lower=0)

submissionLM = pd.concat([date_time, submission_countLM], axis=1)
submissionLM.columns = ['datetime', 'count']

In [85]:
# K-nearest neighbours: choose k by cross-validated grid search.

# Base estimator; n_neighbors is replaced by each grid-search candidate.
KNN = KNeighborsClassifier(n_neighbors = 1)

# Candidate values k = 1..30, evaluated with cv = 10.
k_range = np.arange(30) + 1
kParameters = {'n_neighbors': k_range}
optimalKNN = GridSearchCV(KNN, kParameters, cv = 10)
optimalKNN.fit(X_train_transformed.iloc[:,2:], y_train)

# A single pre-formatted argument works both as a Python 2 print statement and
# as a Python 3 print() call, and emits the same text as the old statement form.
print("Optimal value of k for K Nearest Neighbors:  %s \n" % (optimalKNN.best_params_,))



Optimal value of k for K Nearest Neighbors:  {'n_neighbors': 7} 



In [86]:
# Multinomial Naive Bayes: choose the smoothing parameter alpha by cross-validated grid search.

# Base estimator; alpha is replaced by each grid-search candidate.
Mult = MultinomialNB(alpha = 0.5358)

# 100 evenly spaced alpha candidates in [0.001, 1], evaluated with cv = 10.
alpha_range = np.linspace(0.001, 1, num=100)
alphaParameters = {'alpha': alpha_range}
optimalMNB = GridSearchCV(Mult, alphaParameters, cv = 10)
optimalMNB.fit(X_train_transformed.iloc[:,2:], y_train)

# A single pre-formatted argument works both as a Python 2 print statement and
# as a Python 3 print() call, and emits the same text as the old statement form.
print("Optimal value of alpha for Multinomial Naive Bayes:  %s \n" % (optimalMNB.best_params_,))

Optimal value of alpha for Multinomial Naive Bayes:  {'alpha': 0.92936363636363639} 



In [87]:
# Refit both classifiers with the parameter values reported by the grid searches
# and predict the test set.
train_matrix = X_train_transformed.iloc[:, 2:]
test_matrix = X_test_transformed.iloc[:, 2:]

# K-nearest neighbours with k = 7.
KNN2 = KNeighborsClassifier(n_neighbors=7)
classifierKNN2 = KNN2.fit(train_matrix, y_train)
predictionsKNN2 = KNN2.predict(test_matrix)

# Multinomial Naive Bayes with the selected alpha.
Mult2 = MultinomialNB(alpha=0.92936363636363639)
classifierMNB2 = Mult2.fit(train_matrix, y_train)
predictionsMNB2 = Mult2.predict(test_matrix)

In [88]:
# Build the submission frames: raise negative predictions to zero, then pair
# them column-wise with the test timestamps.

# K-nearest neighbours
submission_countKNN = pd.DataFrame(predictionsKNN2).clip(lower=0)
submissionKNN = pd.concat([date_time, submission_countKNN], axis=1)
submissionKNN.columns = ['datetime', 'count']

# Multinomial Naive Bayes
submission_countMNB = pd.DataFrame(predictionsMNB2).clip(lower=0)
submissionMNB = pd.concat([date_time, submission_countMNB], axis=1)
submissionMNB.columns = ['datetime', 'count']

In [94]:
# Show only the first rows; rendering the whole frame bloats the notebook output.
submissionKNN.head(10)

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,3
1,2011-01-20 01:00:00,4
2,2011-01-20 02:00:00,4
3,2011-01-20 03:00:00,1
4,2011-01-20 04:00:00,27
5,2011-01-20 05:00:00,1
6,2011-01-20 06:00:00,1
7,2011-01-20 07:00:00,1
8,2011-01-20 08:00:00,124
9,2011-01-20 09:00:00,89


In [95]:
# Write the KNN submission frame to disk without the row-index column.
submissionKNN.to_csv("Trial 2 - KNN.csv", index=False)