In [1]:
# importing dependencies
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import seaborn as sns

# Load in dependencies

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Load in the data
weather = pd.read_csv("~/Projects/NiceRide/Weather_data/01012010_12312017.csv") # Weather data

# Read one trip-history CSV per year (2010 through 2017) and stitch them together.
# The frames are collected in a list and concatenated once after the loop;
# concatenating inside the loop would re-copy all earlier years on every pass.
# TODO: pass an explicit dtype for the station-ID columns to remove the warning
#       this cell emits, e.g. dtype={"Start station ID": str, "End station ID": str}.
temp = []
for year in range(2010, 2018):
    trips_path = "~/Projects/NiceRide/Nice_Ride_data/{0}/NiceRide_trip_history_{0}.csv".format(year)
    temp.append(pd.read_csv(trips_path))

# Each yearly frame keeps its own row labels, so index values repeat across years.
nr = pd.concat(temp)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Changing dates from objects into datetimes
# pd.to_datetime parses a whole column in one vectorized call (no per-row
# Python strptime) and accepts a column that is already datetime64, so this
# cell can be re-run without raising.
weather['DATE'] = pd.to_datetime(weather['DATE'], format='%Y-%m-%d')

# Trip timestamps are month/day/year with minute resolution.
nr['Start date'] = pd.to_datetime(nr['Start date'], format='%m/%d/%Y %H:%M')
nr['End date'] = pd.to_datetime(nr['End date'], format='%m/%d/%Y %H:%M')

In [4]:
# Where TAVG is missing, fill it with the mean of that row's TMAX and TMIN.
tavg_missing = weather['TAVG'].isna()
tavg_estimate = weather.loc[tavg_missing, ['TMAX', 'TMIN']].mean(axis=1)
# fillna aligns the estimate on the row index, so only the missing rows change.
weather = weather.fillna({'TAVG': tavg_estimate})

In [5]:
# Sanity check: column dtypes and non-null counts of the weather table after the TAVG fill.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2922 entries, 0 to 2921
Data columns (total 10 columns):
STATION    2922 non-null object
NAME       2922 non-null object
DATE       2922 non-null datetime64[ns]
AWND       2922 non-null float64
PRCP       2922 non-null float64
SNOW       2922 non-null float64
SNWD       2922 non-null int64
TAVG       2922 non-null float64
TMAX       2922 non-null int64
TMIN       2922 non-null int64
dtypes: datetime64[ns](1), float64(4), int64(3), object(2)
memory usage: 228.4+ KB


In [6]:
# Sanity check: column dtypes and memory footprint of the combined trip history.
nr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2694187 entries, 0 to 460715
Data columns (total 8 columns):
Start date                  datetime64[ns]
Start station               object
Start station ID            object
End date                    datetime64[ns]
End station                 object
End station ID              object
Account type                object
Total duration (seconds)    int64
dtypes: datetime64[ns](2), int64(1), object(5)
memory usage: 185.0+ MB


In [7]:
# Count rides per calendar day of 'Start date' and store the result in
# 'daily_totals' with two columns: DATE (the day) and Dcount (rides that day).
# NOTE(review): daily resampling presumably emits a row for every day in the
# covered range, including days with no rides (count 0) - confirm that is wanted.
rides_per_day = nr.resample('D', on='Start date')['Total duration (seconds)'].count()

daily_totals = (
    rides_per_day
    .reset_index()
    # index=str turns the row labels into strings, as in the original cell.
    .rename(index=str, columns={"Total duration (seconds)": "Dcount", 'Start date': 'DATE'})
)

In [8]:
# Inspect the per-day ride counts: schema summary first, then the first five rows.
daily_totals.info()
daily_totals.head()

<class 'pandas.core.frame.DataFrame'>
Index: 2709 entries, 0 to 2708
Data columns (total 2 columns):
DATE      2709 non-null datetime64[ns]
Dcount    2709 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 63.5+ KB


Unnamed: 0,DATE,Dcount
0,2010-06-07,1
1,2010-06-08,2
2,2010-06-09,7
3,2010-06-10,104
4,2010-06-11,287


In [9]:
# Drop the station identifiers and the snow columns, keeping DATE plus the
# wind, precipitation and temperature measurements.
# errors='ignore' makes the cell safe to re-run once the columns are gone.
weather = weather.drop(columns=['STATION', 'NAME', 'SNOW', 'SNWD'], errors='ignore')

In [10]:
# Confirm which columns remain in the weather table after the drop.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2922 entries, 0 to 2921
Data columns (total 6 columns):
DATE    2922 non-null datetime64[ns]
AWND    2922 non-null float64
PRCP    2922 non-null float64
TAVG    2922 non-null float64
TMAX    2922 non-null int64
TMIN    2922 non-null int64
dtypes: datetime64[ns](1), float64(3), int64(2)
memory usage: 137.0 KB


In [11]:
# Outer-join the weather table with the daily ride counts on DATE, so days
# present in only one of the two tables are kept (with NaN for the other side).
temp = pd.merge(weather, daily_totals, on='DATE', how='outer')
temp.info()
temp.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2922 entries, 0 to 2921
Data columns (total 7 columns):
DATE      2922 non-null datetime64[ns]
AWND      2922 non-null float64
PRCP      2922 non-null float64
TAVG      2922 non-null float64
TMAX      2922 non-null int64
TMIN      2922 non-null int64
Dcount    2709 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 182.6 KB


Unnamed: 0,DATE,AWND,PRCP,TAVG,TMAX,TMIN,Dcount
0,2010-01-01,8.95,0.0,-1.5,6,-9,
1,2010-01-02,4.03,0.0,-7.0,1,-15,
2,2010-01-03,2.91,0.0,-3.5,7,-14,
3,2010-01-04,5.59,0.0,-1.5,7,-10,
4,2010-01-05,3.13,0.0,0.5,10,-9,


In [12]:
# Drop every row that has a NaN in any column (axis=0 removes rows, not columns).
# Per temp.info(), Dcount is the only column with missing values, so this keeps
# exactly the days that have a ride count.
train_test_data = temp.dropna(axis=0)
train_test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2709 entries, 157 to 2865
Data columns (total 7 columns):
DATE      2709 non-null datetime64[ns]
AWND      2709 non-null float64
PRCP      2709 non-null float64
TAVG      2709 non-null float64
TMAX      2709 non-null int64
TMIN      2709 non-null int64
Dcount    2709 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 169.3 KB


In [13]:
# Features: every column except the date and the target.
X = train_test_data.drop(columns=['DATE', 'Dcount'])
# Target: the daily ride count, selected by name so that adding a feature
# column can never leak into y. Kept as a single-column DataFrame.
y = train_test_data[['Dcount']]

In [14]:
# Check that the feature matrix and the target have the same number of rows and no nulls.
X.info()
y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2709 entries, 157 to 2865
Data columns (total 5 columns):
AWND    2709 non-null float64
PRCP    2709 non-null float64
TAVG    2709 non-null float64
TMAX    2709 non-null int64
TMIN    2709 non-null int64
dtypes: float64(3), int64(2)
memory usage: 127.0 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2709 entries, 157 to 2865
Data columns (total 1 columns):
Dcount    2709 non-null float64
dtypes: float64(1)
memory usage: 42.3 KB


In [15]:
# Split the data into a training and test set.

Xlr, Xtestlr, ylr, ytestlr = train_test_split(X, y, random_state=42) # for Douglas Adams.

# NOTE(review): LogisticRegression is a classifier, so each distinct Dcount
# value is treated as its own class, and accuracy only rewards exact matches.
# A regressor (LinearRegression is already imported) with a regression metric
# is likely the better fit for a count target - confirm before building on this.
log_reg = LogisticRegression()

# Fit the model on the training data.
# y is a single-column DataFrame; flatten it to 1-D, which is the shape
# scikit-learn expects (avoids the DataConversionWarning).
log_reg.fit(Xlr, ylr.values.ravel())
# Print the accuracy from the testing data (y_true first, then y_pred).
print(accuracy_score(ytestlr.values.ravel(), log_reg.predict(Xtestlr)))

  y = column_or_1d(y, warn=True)


0.38495575221238937


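**Test accuracy of 0.385 - back to the drawing board.** Logistic regression is a classifier, so it treats every distinct daily ride count as a separate class; a regression model is a better match for predicting counts.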