In [1]:
# importing dependencies
import numpy as np
import pandas as pd
import datetime as dt

# For model building
from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

# Optimizations
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## This notebook will be used for building a random forest in Python.

Loading in the data to be used

In [2]:
# Load in the data
weather = pd.read_csv("~/Projects/NiceRide/Weather_data/01012010_12312017.csv") # Weather data

# Read one trip-history CSV per year (2010 through 2017) into a list, then
# concatenate once after all files are loaded instead of re-concatenating the
# growing list on every iteration.
# TODO: pass an explicit dtype= mapping to read_csv for "Start station ID",
#       "End station ID" and "Account type" to silence the load-time warning
#       (presumably a mixed-type DtypeWarning -- confirm). Note that `pd.int64`
#       is not a pandas attribute, so the mapping needs valid dtype names.
temp = [
    pd.read_csv("~/Projects/NiceRide/Nice_Ride_data/"+str(year)+"/NiceRide_trip_history_"+str(year)+".csv")
    for year in range(2010, 2018)
]
nr = pd.concat(temp)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Changing dates from objects into datetimes.
# pd.to_datetime parses the whole column at once with the explicit format and,
# unlike a row-wise strptime, leaves an already-converted column unchanged, so
# this cell can safely be re-run.
weather['DATE'] = pd.to_datetime(weather['DATE'], format='%Y-%m-%d')

nr['Start date'] = pd.to_datetime(nr['Start date'], format='%m/%d/%Y %H:%M')
nr['End date'] = pd.to_datetime(nr['End date'], format='%m/%d/%Y %H:%M')

Correcting missing data in the weather dataframe

In [4]:
# Where TAVG is missing, substitute the row-wise mean of TMAX and TMIN.
# fillna aligns on the index, so rows that already have a TAVG are untouched.
weather = weather.assign(
    TAVG=weather['TAVG'].fillna(weather[['TMAX', 'TMIN']].mean(axis=1))
)

Preparing the data to be used for the model

In [5]:
# Resample the ride data by calendar day of 'Start date' and count the rides
# per day; the result is 'daily_totals' with columns DATE and Dcount.
# The counted column is selected directly from the resampler, so the result
# does not depend on whether the `on` column is included in the resampler's
# output (that appears to vary between pandas versions and would make a
# separate .drop('Start date') step raise a KeyError).
daily_totals = (
    nr[['Start date', 'Total duration (seconds)']]
    .resample('D', on='Start date')['Total duration (seconds)']
    .count()
    .reset_index()
    # index=str is retained so the frame's index labels stay strings, as before.
    .rename(index=str, columns={"Total duration (seconds)": "Dcount", 'Start date': 'DATE'})
)

In [6]:
# Remove the station identifier/name and the snow columns from the weather frame.
weather.keys()
weather = weather.drop(columns=['STATION', 'NAME', 'SNOW', 'SNWD'])

In [7]:
# Preview the first rows of the per-day ride counts (DATE, Dcount).
daily_totals.head()

Unnamed: 0,DATE,Dcount
0,2010-06-07,1
1,2010-06-08,2
2,2010-06-09,7
3,2010-06-10,104
4,2010-06-11,287


In [8]:
# Summary statistics for Dcount. Days without any rides show up as 0 because
# the daily resample emits a row for every calendar day in the covered range.
daily_totals.describe()

Unnamed: 0,Dcount
count,2709.0
mean,994.531931
std,1058.519993
min,0.0
25%,0.0
50%,722.0
75%,1732.0
max,5625.0


In [9]:
# Preview the first rows of the trimmed weather frame.
weather.head()

Unnamed: 0,DATE,AWND,PRCP,TAVG,TMAX,TMIN
0,2010-01-01,8.95,0.0,-1.5,6,-9
1,2010-01-02,4.03,0.0,-7.0,1,-15
2,2010-01-03,2.91,0.0,-3.5,7,-14
3,2010-01-04,5.59,0.0,-1.5,7,-10
4,2010-01-05,3.13,0.0,0.5,10,-9


In [10]:
# Outer-join the weather rows with the daily ride counts on DATE, so dates
# present in only one of the two frames are kept (with NaN for the other side).
temp = pd.merge(weather, daily_totals, on='DATE', how='outer')
temp.info()
temp.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2922 entries, 0 to 2921
Data columns (total 7 columns):
DATE      2922 non-null datetime64[ns]
AWND      2922 non-null float64
PRCP      2922 non-null float64
TAVG      2922 non-null float64
TMAX      2922 non-null int64
TMIN      2922 non-null int64
Dcount    2709 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 182.6 KB


Unnamed: 0,DATE,AWND,PRCP,TAVG,TMAX,TMIN,Dcount
0,2010-01-01,8.95,0.0,-1.5,6,-9,
1,2010-01-02,4.03,0.0,-7.0,1,-15,
2,2010-01-03,2.91,0.0,-3.5,7,-14,
3,2010-01-04,5.59,0.0,-1.5,7,-10,
4,2010-01-05,3.13,0.0,0.5,10,-9,


In [11]:
# Where we don't have a daily count (Dcount) fill this with zeros instead.
# The fill is restricted to Dcount so that a missing weather reading is never
# silently turned into a 0 measurement.
features = temp.fillna(value={'Dcount': 0})
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2922 entries, 0 to 2921
Data columns (total 7 columns):
DATE      2922 non-null datetime64[ns]
AWND      2922 non-null float64
PRCP      2922 non-null float64
TAVG      2922 non-null float64
TMAX      2922 non-null int64
TMIN      2922 non-null int64
Dcount    2922 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 182.6 KB


In [12]:
# Check that the previously missing Dcount values now read 0.0.
features.head()

Unnamed: 0,DATE,AWND,PRCP,TAVG,TMAX,TMIN,Dcount
0,2010-01-01,8.95,0.0,-1.5,6,-9,0.0
1,2010-01-02,4.03,0.0,-7.0,1,-15,0.0
2,2010-01-03,2.91,0.0,-3.5,7,-14,0.0
3,2010-01-04,5.59,0.0,-1.5,7,-10,0.0
4,2010-01-05,3.13,0.0,0.5,10,-9,0.0


This next section splits our datetime `DATE` column into separate numeric year, month, and day columns.

In [13]:
# Derive integer YEAR / MONTH / DAY columns from the DATE timestamps.
features = features.assign(
    YEAR=features['DATE'].dt.year,
    MONTH=features['DATE'].dt.month,
    DAY=features['DATE'].dt.day,
)

In [14]:
# Check the newly added YEAR, MONTH and DAY columns.
features.head()

Unnamed: 0,DATE,AWND,PRCP,TAVG,TMAX,TMIN,Dcount,YEAR,MONTH,DAY
0,2010-01-01,8.95,0.0,-1.5,6,-9,0.0,2010,1,1
1,2010-01-02,4.03,0.0,-7.0,1,-15,0.0,2010,1,2
2,2010-01-03,2.91,0.0,-3.5,7,-14,0.0,2010,1,3
3,2010-01-04,5.59,0.0,-1.5,7,-10,0.0,2010,1,4
4,2010-01-05,3.13,0.0,0.5,10,-9,0.0,2010,1,5


In [15]:
# Summary statistics for every numeric column, including the zero-filled Dcount.
features.describe()

Unnamed: 0,AWND,PRCP,TAVG,TMAX,TMIN,Dcount,YEAR,MONTH,DAY
count,2922.0,2922.0,2922.0,2922.0,2922.0,2922.0,2922.0,2922.0,2922.0
mean,8.989535,0.091164,47.887235,56.341547,39.34976,922.03525,2013.500342,6.52293,15.729637
std,3.581173,0.279348,22.70818,23.899131,22.065311,1051.488192,2.291531,3.449293,8.801598
min,0.89,0.0,-17.0,-12.0,-23.0,0.0,2010.0,1.0,1.0
25%,6.49,0.0,30.5,36.0,24.0,0.0,2012.0,4.0,8.0
50%,8.5,0.0,50.5,60.0,41.0,579.5,2013.5,7.0,16.0
75%,11.18,0.03,68.0,77.75,59.0,1629.25,2015.75,10.0,23.0
max,24.61,4.13,91.0,103.0,81.0,5625.0,2017.0,12.0,31.0


In [16]:
# Keep only the days on which at least one ride was recorded (Dcount != 0),
# then renumber the rows from zero.
has_rides = features['Dcount'].ne(0)
features = features.loc[has_rides].reset_index(drop=True)