In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


# Drivr 
## An interactive predictive model to give rideshare drivers more control over their day

### Time series predictions using a deep neural network (via keras) for each region. My current model for fares yields gives over 80% accuracy on the my test data.


#### Load in and Clean Data

In [4]:
filename = 'Transportation_Network_Providers_Trips.csv'
raw_df = pd.read_csv(filename,
#                       nrows=15000000,
                     usecols=['Trip ID',
                              'Fare','Tip',
                              'Trip Total', 
                              'Trip Start Timestamp',
                              'Pickup Community Area',
                              'Dropoff Community Area'])

In [None]:
plt.figure(figsize=(10, 6))
raw_df['Fare'].plot();

In [5]:
def clean_data(raw_data, region=8.0, metric='Fare'):
    area_df = raw_df[raw_df['Pickup Community Area'] == region]
    
    metric_df = area_df[['Trip Start Timestamp', metric]].copy()
    metric_df = metric_df.dropna()
    metric_df.reset_index(inplace=True, drop=True)
    metric_df['datetime'] = metric_df['Trip Start Timestamp'].apply(lambda x : datetime.strptime(x,"%m/%d/%Y %I:%M:%S %p"))

    grouped_df = metric_df[['datetime', metric]].groupby('datetime').mean() #or reduc_df
    grouped_df.reset_index(inplace=True)
    grouped_df.columns = ['datetime', 'metric']
    return grouped_df

def scale_data(train_data, test_data):
    '''
    This function takes the train and test data and scales the values to [-1, 1]
    The input arrays must be of shape (N_samples, 1) (may need to do something like train.values.reshape(-1,1))
    Returns:
        X_train: scaled train inputs
        y_train: scaled train targets
        X_test: scaled test inputs
        y_train: scaled train targets
    '''
    scaler = MinMaxScaler(feature_range=(-1, 1))
    train_sc = scaler.fit_transform(train_data)
    test_sc = scaler.transform(test_data)
    
    X_train = train_sc[:-1]
    y_train = train_sc[1:]

    X_test = test_sc[:-1]
    y_test = test_sc[1:]
    
    return X_train, y_train, X_test, y_test

def create_train_test_data(cleaned_data, test_size=1000):
    train = cleaned_data['metric'][:-test_size]
    test = cleaned_data['metric'][test_size:]

    train = train.values.reshape(-1,1)
    test = test.values.reshape(-1,1)
    
    X_train, y_train, X_test, y_test = scale_data(train, test)
    return X_train, y_train, X_test, y_test

def fit_model(X_train, y_train):
    '''
    Function that fits a Sequential neural net on training data and returns a trained model
    '''
    nn_model = Sequential()
    nn_model.add(Dense(12, input_dim=1, activation='relu'))
    nn_model.add(Dense(1))
    nn_model.compile(loss='mean_squared_error', optimizer='adam')
    early_stop = EarlyStopping(monitor='loss', patience=2, verbose=1)
    history = nn_model.fit(X_train, y_train, epochs=100, batch_size=100,
                           verbose=1, callbacks=[early_stop], shuffle=True)
    nn_model.save('./nn_mod_reg'+str(region)+'_tip.h5')
    return nn_model



#### Insert region numbers to run model over each. Train and pickle regions for faster access. Limit test_size to improve runtime.

In [6]:
regions = [70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0] #Regions 1-77
test_size=10000

for region in regions:

    cleaned_data = clean_data(raw_df, region, 'Tip')
    X_train, y_train, X_test, y_test = create_train_test_data(cleaned_data)
    model = fit_model(X_train, y_train)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 00011: early stopping


In [123]:
from keras.models import load_model
model = load_model('./nn_mod_reg71.0_tip.h5')

In [124]:
y_pred_test_nn = model.predict(X_test)
y_train_pred_nn = model.predict(X_train)
print("The R2 score on the Train set is:\t{:0.3f}".format(r2_score(y_train, y_train_pred_nn)))
print("The R2 score on the Test set is:\t{:0.3f}".format(r2_score(y_test, y_pred_test_nn)))

The R2 score on the Train set is:	0.833
The R2 score on the Test set is:	0.830
