In [1]:
# importing dependencies
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt

# For model building
from sklearn.model_selection import train_test_split, cross_val_score

# Models
from sklearn.ensemble import RandomForestRegressor

## This notebook will be used for building a random forest in python.

Loading in the data to be used

In [2]:
# Load in the data
weather = pd.read_csv("~/Projects/NiceRide/Weather_data/01012010_12312017.csv") # Weather data

temp = []
for x in [2010 + x for x in range(8)] :
#   add dtype declaration to remove warning
    x = pd.read_csv("~/Projects/NiceRide/Nice_Ride_data/"+str(x)+"/NiceRide_trip_history_"+str(x)+".csv")
                   # dtype = {"Start station ID" : pd.int64, "End station ID" : pd.int64, "Account type": pd.Categorical}
    temp.append(x)
    nr = pd.concat(temp)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Changing dates from objects into datetimes
weather['DATE'] = weather['DATE'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%d'))

nr['Start date'] = nr['Start date'].apply(lambda x: dt.datetime.strptime(x,'%m/%d/%Y %H:%M'))
nr['End date'] = nr['End date'].apply(lambda x: dt.datetime.strptime(x,'%m/%d/%Y %H:%M'))

Correcting missing data in the weather dataframe

In [4]:
# Changing TAVG-NAN values to the mean of the TMAX and TMIN
weather = weather.fillna(value={'TAVG': weather[weather.TAVG.isna()][['TMAX', 'TMIN']]
                                .agg("mean", axis="columns")})

### Preparing the data to be used for the model

In [5]:
# Resampling our NR data so it takes the start date and Total duration, returns a count of rides per day 'daily_totals'

daily_totals = nr[['Start date', 'Total duration (seconds)']].resample('D', on='Start date').count()
daily_totals = daily_totals.drop('Start date', axis=1)
daily_totals = daily_totals.reset_index()
daily_totals = daily_totals.rename(index=str, columns={"Total duration (seconds)": "DCOUNT",'Start date':'DATE'})

In [6]:
print(daily_totals.head())
print(daily_totals.describe())

        DATE  DCOUNT
0 2010-06-07       1
1 2010-06-08       2
2 2010-06-09       7
3 2010-06-10     104
4 2010-06-11     287
            DCOUNT
count  2709.000000
mean    994.531931
std    1058.519993
min       0.000000
25%       0.000000
50%     722.000000
75%    1732.000000
max    5625.000000


The daily totals DataFrame looks good

In [7]:
print(weather.keys())
# The weather station and name won't be necessary or add anything to our analysis
weather = weather.drop(['STATION', 'NAME'], axis=1)

Index(['STATION', 'NAME', 'DATE', 'AWND', 'PRCP', 'SNOW', 'SNWD', 'TAVG',
       'TMAX', 'TMIN'],
      dtype='object')


In [8]:
weather.head()

Unnamed: 0,DATE,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN
0,2010-01-01,8.95,0.0,0.0,9,-1.5,6,-9
1,2010-01-02,4.03,0.0,0.0,9,-7.0,1,-15
2,2010-01-03,2.91,0.0,0.0,9,-3.5,7,-14
3,2010-01-04,5.59,0.0,0.0,9,-1.5,7,-10
4,2010-01-05,3.13,0.0,0.0,9,0.5,10,-9


In [9]:
temp = weather.merge(daily_totals, how='outer',on='DATE')
temp.info()
temp.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2922 entries, 0 to 2921
Data columns (total 9 columns):
DATE      2922 non-null datetime64[ns]
AWND      2922 non-null float64
PRCP      2922 non-null float64
SNOW      2922 non-null float64
SNWD      2922 non-null int64
TAVG      2922 non-null float64
TMAX      2922 non-null int64
TMIN      2922 non-null int64
DCOUNT    2709 non-null float64
dtypes: datetime64[ns](1), float64(5), int64(3)
memory usage: 228.3 KB


Unnamed: 0,DATE,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,DCOUNT
0,2010-01-01,8.95,0.0,0.0,9,-1.5,6,-9,
1,2010-01-02,4.03,0.0,0.0,9,-7.0,1,-15,
2,2010-01-03,2.91,0.0,0.0,9,-3.5,7,-14,
3,2010-01-04,5.59,0.0,0.0,9,-1.5,7,-10,
4,2010-01-05,3.13,0.0,0.0,9,0.5,10,-9,


In [10]:
# Notice how the count of DCOUNT varies from what the other columns have
features = temp.fillna(value=0,axis=0) # Where we don't have a daily count (Dcount) fill this with zeros instead
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2922 entries, 0 to 2921
Data columns (total 9 columns):
DATE      2922 non-null datetime64[ns]
AWND      2922 non-null float64
PRCP      2922 non-null float64
SNOW      2922 non-null float64
SNWD      2922 non-null int64
TAVG      2922 non-null float64
TMAX      2922 non-null int64
TMIN      2922 non-null int64
DCOUNT    2922 non-null float64
dtypes: datetime64[ns](1), float64(5), int64(3)
memory usage: 228.3 KB


In [11]:
features.head()

Unnamed: 0,DATE,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,DCOUNT
0,2010-01-01,8.95,0.0,0.0,9,-1.5,6,-9,0.0
1,2010-01-02,4.03,0.0,0.0,9,-7.0,1,-15,0.0
2,2010-01-03,2.91,0.0,0.0,9,-3.5,7,-14,0.0
3,2010-01-04,5.59,0.0,0.0,9,-1.5,7,-10,0.0
4,2010-01-05,3.13,0.0,0.0,9,0.5,10,-9,0.0


This next section will change our date column from dtype datetime into seperate numerical relevant date colummns

In [13]:
features['YEAR'] = features['DATE'].dt.year
features['MONTH'] = features['DATE'].dt.month
features['DAY'] = features['DATE'].dt.day

In [14]:
features.head() # Three new, seperate, columns that identify year, month, day

Unnamed: 0,DATE,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,DCOUNT,YEAR,MONTH,DAY,WEND
0,2010-01-01,8.95,0.0,0.0,9,-1.5,6,-9,0.0,2010,1,1,4
1,2010-01-02,4.03,0.0,0.0,9,-7.0,1,-15,0.0,2010,1,2,5
2,2010-01-03,2.91,0.0,0.0,9,-3.5,7,-14,0.0,2010,1,3,6
3,2010-01-04,5.59,0.0,0.0,9,-1.5,7,-10,0.0,2010,1,4,0
4,2010-01-05,3.13,0.0,0.0,9,0.5,10,-9,0.0,2010,1,5,1


In [32]:
# Create a weekday variable column Mon = 0, Sun = 7
features['WEND'] = features['DATE'].dt.weekday

In [33]:
features.head(7)

Unnamed: 0,DATE,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,DCOUNT,YEAR,MONTH,DAY,WEND
0,2010-01-01,8.95,0.0,0.0,9,-1.5,6,-9,0.0,2010,1,1,4
1,2010-01-02,4.03,0.0,0.0,9,-7.0,1,-15,0.0,2010,1,2,5
2,2010-01-03,2.91,0.0,0.0,9,-3.5,7,-14,0.0,2010,1,3,6
3,2010-01-04,5.59,0.0,0.0,9,-1.5,7,-10,0.0,2010,1,4,0
4,2010-01-05,3.13,0.0,0.0,9,0.5,10,-9,0.0,2010,1,5,1
5,2010-01-06,2.91,0.0,0.0,9,6.0,16,-4,0.0,2010,1,6,2
6,2010-01-07,15.88,0.06,2.2,11,10.0,16,4,0.0,2010,1,7,3


# This section:

I'm looking to create a numerical catagorical variable from features.WEND that will state if it's a weekend, yes or no.

In [39]:
# Adjust WEND column to be catagorical; If it's a weekday WEND = 0 else if weekend WEND = 1
features['WEND'].apply([(features['WEND'] = 1) if (features.WEND > 5) else (features['WEND'] = 0) for features.WEND])

SyntaxError: invalid syntax (<ipython-input-39-88e27a1c1074>, line 2)

In [31]:
features.head(7)

Unnamed: 0,DATE,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,DCOUNT,YEAR,MONTH,DAY,WEND
0,2010-01-01,8.95,0.0,0.0,9,-1.5,6,-9,0.0,2010,1,1,0
1,2010-01-02,4.03,0.0,0.0,9,-7.0,1,-15,0.0,2010,1,2,0
2,2010-01-03,2.91,0.0,0.0,9,-3.5,7,-14,0.0,2010,1,3,0
3,2010-01-04,5.59,0.0,0.0,9,-1.5,7,-10,0.0,2010,1,4,0
4,2010-01-05,3.13,0.0,0.0,9,0.5,10,-9,0.0,2010,1,5,0
5,2010-01-06,2.91,0.0,0.0,9,6.0,16,-4,0.0,2010,1,6,0
6,2010-01-07,15.88,0.06,2.2,11,10.0,16,4,0.0,2010,1,7,0
