# Task 2

This is the second task for the 'Practical Course Data Science' (M.Inf.1800). In this task you will tackle your first real data science problem with sort of big data: given a list of taxi rides, the goal is to predict the price of each taxi ride.

In [1]:
# Manage imports
import numpy as np
import pandas as pd
import seaborn as sns  
import matplotlib.pyplot as plt

## Clean Data

In [2]:
# read data in memory
data = pd.read_csv("clean.csv")

In [3]:
df = data

In [None]:
df.groupby(['RatecodeID']).mta_tax.mean()

In [None]:
df["tpep_pickup_datetime"] = pd.to_datetime(df.tpep_pickup_datetime)
df['tpep_dropoff_datetime'] = pd.to_datetime(df.tpep_dropoff_datetime)

Add duration

In [None]:
'''
Add duration element: How long was the trip
'''
# Get duration
df['duration'] = pd.to_datetime(df["tpep_dropoff_datetime"])-pd.to_datetime(df["tpep_pickup_datetime"])

#drop all negative and too short rides  (< 30s , 40s is lowest in test data!)
df = df.drop(df[df.duration < pd.Timedelta("30 seconds")].index)

# drop all > 3h (unreasonable long, 2:30 is longest in test data)
df = df.drop(df[df.duration > pd.Timedelta("3 hours")].index)

Remove outliers.
Check each category and remove all values which are more than 5 stds away!

In [None]:
'''
Add a weekday element
'''
df['day'] = (pd.to_datetime(df['tpep_pickup_datetime']).dt.weekday_name)

In [None]:
# Get Holidays
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
cal = calendar()
dr = pd.date_range(start=df['tpep_pickup_datetime'].min(), end=df['tpep_pickup_datetime'].max())
holidays = cal.holidays(start=dr.min(), end=dr.max())
df['holiday'] = df['tpep_pickup_datetime'].isin(holidays)

In [None]:
df['day'].value_counts()

In [None]:
df['day'] = df.day.astype('category')
df['RatecodeID'] = df.RatecodeID.astype('category')

In [None]:
df['meanDate'] = (  pd.to_datetime(df['tpep_pickup_datetime']) \
 + ((pd.to_datetime(df['tpep_dropoff_datetime'])-pd.to_datetime(df['tpep_pickup_datetime']))/2) \
)

In [None]:
df['time']= df.meanDate - pd.to_datetime(df.meanDate.dt.date)

In [None]:
df['pickup_time'] = df.tpep_pickup_datetime - pd.to_datetime(df.tpep_pickup_datetime.dt.date)

In [None]:
df['dropoff_time'] = df.tpep_dropoff_datetime - pd.to_datetime(df.tpep_dropoff_datetime.dt.date)

Check for overlapping Overnight and Rushhours (should be 1.5 tax)

As there is no overlapping, all 1.5 taxes have to be invalid -> Remove!

In [None]:
df = df.drop(df[df.extra == 1.5].index)

## Clean obvious errors

In [None]:
df.extra.value_counts()

In [None]:
# Rush Hour on Weekends
df = df.drop(df[(df.extra.isin([1,4.5])) & (df.day.isin(['Sunday','Saturday']))].index)
# Rush Hour picked up after 20
df = df.drop(df[(df.extra.isin([1,4.5])) & (pd.to_datetime(df.tpep_pickup_datetime).dt.hour >= 20)].index)
# Rush Hour dropped before 16
df = df.drop(df[(df.extra.isin([1,4.5])) & (pd.to_datetime(df.tpep_dropoff_datetime).dt.hour < 16)].index)
# Overnight picked up after 6 and dropped before 20
df = df.drop(df[(df.extra == 0.5) & \
    ((pd.to_datetime(df.tpep_pickup_datetime).dt.hour >= 6) & \
    (pd.to_datetime(df.tpep_dropoff_datetime).dt.hour < 20))].index)

In [None]:
df.extra.value_counts()

## Calculate Actual Overnight and Rush hour (Test when it actually applies!)

In [4]:
%run functions.py

In [None]:
df['rush_hour'] = getRushHour(df)

In [None]:
df['overnight'] = getOvernight(df)

In [None]:
conditions = [ \
    (df['overnight'] == True) & (df['rush_hour'] == False).isin([1,3,4]), \
    (df['overnight'] == True) & (df['rush_hour'] == True).isin([1,3,4]), \
    (df['overnight'] == False) & (df['rush_hour'] == True) & (df['RatecodeID'] == 2), \
    (df['overnight'] == False) & (df['rush_hour'] == True) & (df['RatecodeID'].isin([1,3,4])), \
]
choices = [.5, 1.5, 4.5, 1]
df['calcExtra'] = np.select(conditions, choices, default=0)

In [None]:
df['calcExtra'].value_counts()

In [None]:
err = df.extra-df.calcExtra
err.describe()

In [None]:
sns.distplot(err,hist_kws={"log":True},kde=False);

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

mean_squared_error(df.calcExtra,df.extra)

In [None]:
mean_absolute_error(df.calcExtra,df.extra)

## Check the differences of calculation and real extra. Why does it not match perfectly?

Check all values which are not overnight but still get the 0.5 charge

In [None]:
outlier = df[(df.extra == 0.5) & (df.overnight == False)]
sns.violinplot(x=outlier.extra,y=outlier.pickup_time.astype('timedelta64[m]'))
plt.yticks(np.arange(1440,step=60),np.arange(24,step=1))
plt.ylabel('Hour')
outlier[['pickup_time','dropoff_time']].describe()

There are 14k values wrongfully having a .5 extra cost.
The values range mostly from 19-20, however none was after 20 and so none should have the 0.5 extra cost. The same is true for the bottom, some are right after 6 (Pickup time!), in which case there shouldnt be any tax.

Check the opposite: Values which should have a 0.5 extra, but dont!

In [None]:
outlier = df[(df.RatecodeID == 1) & (df.extra != 0.5) & (df.overnight == True)]
sns.violinplot(x=outlier.extra,y=outlier.pickup_time.astype('timedelta64[m]'))
plt.yticks(np.arange(1440,step=60),np.arange(24,step=1))
plt.ylabel('Hour')
outlier[['pickup_time','dropoff_time']].describe()

Most values started before 20, for half the values the mean time is also below 50%, maybe the tax wasn't added then. However, there are many values which cant be correct

In [None]:
outlier.day.value_counts()

Check for overnight extra charge!
First check all with extra 1 and outside of an rush hour!

In [None]:
outlier = df[(df.extra ==1) & (df.rush_hour == False) & (df.RatecodeID.isin([1,3,4,5]))]
sns.violinplot(x=outlier.overnight,y=outlier.pickup_time.astype('timedelta64[m]'))
plt.yticks(np.arange(0,1440,step=120),np.arange(1,24,step=2))
plt.ylabel('Hour')
outlier[['pickup_time','dropoff_time']].describe()

In [None]:
outlier = df[(df.extra != 1) & (df.rush_hour == True) & (df.RatecodeID.isin([1,3,4,5]))]
sns.violinplot(x=outlier.extra,y=outlier.pickup_time.astype('timedelta64[m]'))
plt.yticks(np.arange(0,1440,step=120),np.arange(0,24,step=2))
plt.ylabel('Hour')
outlier[['pickup_time','dropoff_time','rush_hour']].describe()

# Train model to predict extra more precise

In [None]:
df['workday'] = (df.tpep_pickup_datetime.dt.weekday < 5)&(df.holiday==False)
df['pickup_time_s'] = df['dropoff_time'].astype('timedelta64[s]')
df['duration_s'] = df['duration'].astype('timedelta64[s]')
df['dropoff_time_s'] = df['pickup_time'].astype('timedelta64[s]')
df['weekday'] = df.tpep_pickup_datetime.dt.weekday

In [None]:
df[df.workday==True].day.value_counts()

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
sample = df.sample(10000)

le.fit(sample.extra)
extra_coded = le.transform(sample.extra)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test  = train_test_split(sample[['pickup_time_s','dropoff_time_s','workday']]\
                                                     ,extra_coded, test_size=0.2, random_state=1337)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
rfr = RandomForestClassifier(random_state=1337)
rfr.fit(X_train,y_train)


In [None]:
rfr_pred = rfr.predict(X_test)

In [None]:
# Decode
rfr_pred = le.inverse_transform(rfr_pred)
y_test = le.inverse_transform(y_test)

In [None]:
err = pd.DataFrame(y_test-rfr_pred)
err.describe()

In [None]:
ax = sns.distplot(err,hist_kws={"log":True},kde=False);

In [None]:
# Feature importance# Feature 
pd.DataFrame({"Features" : X_train.columns,"estimatedCoefficients": rfr.feature_importances_})

In [None]:
rfr_pred

In [None]:
mean_absolute_error(rfr_pred,y_test)

In [None]:
clf = svm.SVC()
clf.fit(X_train, y_train)

In [None]:
clf_pred = clf.predict(X_test)
# Decode
clf_pred = le.inverse_transform(clf_pred)
#y_test = le.inverse_transform(y_test)
err = pd.DataFrame(y_test-clf_pred)
err.describe()

In [None]:
ax = sns.distplot(err,hist_kws={"log":True},kde=False);

In [None]:
mean_absolute_error(clf_pred,y_test)

## Predict mta_tax

In [5]:
subset = df[df.RatecodeID==5]

In [6]:
subset.mta_tax.value_counts()

0.0    7764
0.5    3401
Name: mta_tax, dtype: int64

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [8]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

In [9]:
le.fit(subset.mta_tax)
tax_coded = subset.mta_tax*2

X_train, X_test, y_train, y_test  = train_test_split(subset[['dropoff_latitude','dropoff_longitude']]\
                                                     ,tax_coded, test_size=0.2, random_state=1337)
y_test_label = y_test/2

In [None]:
rfc = RandomForestClassifier(random_state=1337)
rfc.fit(X_train,y_train)

In [None]:
pred = rfc.predict(X_test)
pred = pred/2

In [None]:
mean_absolute_error(pred,y_test_label)

In [10]:
clf = svm.SVC()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
pred = pred/2
mean_absolute_error(pred,y_test_label)

0.14890282131661442

In [27]:
pred.max()

0.0

In [11]:
clf

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
from sklearn.externals import joblib

In [None]:
joblib.dump(clf, 'mtaPredSVC')

In [19]:
C_range = np.logspace(-1,1,3)
C_range

array([ 0.1,  1. , 10. ])

In [24]:


from sklearn.model_selection import GridSearchCV

model = svm.SVC()
C_range = np.logspace(-2, 0, 3)
C_range = np.logspace(-2, 0, 3)
param_grid = dict(C=C_range)
grid = GridSearchCV(model,param_grid=param_grid,error_score='neg_mean_absolute_error')
grid.fit(X_train, y_train)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 0.01} with a score of 0.69


In [25]:
pred = grid.predict(X_test)
pred = pred/2
mean_absolute_error(pred,y_test_label)

0.14890282131661442

# Predict tips amount

In [None]:
# Load packages
from pygeocoder import Geocoder
import pandas as pd
import numpy as np
import reverse_geocoder as rg


In [None]:
# Payment type 2 doesnt have tips
df['total_without_tips'] = df.total_amount-df.tip_amount

In [None]:
df = df.drop(['totalWithoutTip','tpep_pickup_datetime','tpep_dropoff_datetime','calcExtra','workday','pickup_area','dropoff_area'],axis=1)

In [None]:
df.info()

In [None]:
df['pickup_coord']=list(zip(df.pickup_latitude.round(2), df.pickup_longitude.round(2)))
df['dropoff_coord']=list(zip(df.dropoff_latitude.round(2), df.dropoff_longitude.round(2)))

In [None]:
df.pickup_coord.value_counts()

In [None]:
df = df.drop(df[df.pickup_coord == (0,0)].index)
df = df.drop(df[df.dropoff_coord == (0,0)].index)

In [None]:
df['avg_speed'] = df.trip_distance/df.duration.astype('timedelta64[s]')
df['pickup_hour'] = df.pickup_time.astype('timedelta64[h]')

In [None]:
import sklearn.preprocessing

In [None]:
lb = preprocessing.LabelBinarizer()
lb.fit(sample.pickup_area)
lb.transform(sample.pickup_area)
lb.transform(sample.dropoff_area)

In [None]:
df.pickup_area.head()

In [None]:
X_train, X_test, y_train, y_test \
    = train_test_split(df[['pickup_hour','day','RatecodeID','total_without_tips',\
                               'avg_speed','trip_distance','pickup_coord','dropoff_coord']]\
                     ,df['tip_amount'], test_size=0.2, random_state=1337)

In [None]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state=1337)

rfr.fit(X_train,y_train)

In [None]:
rfr_pred = rfr.predict(X_test)

In [None]:
err = pd.DataFrame(y_test-rfr_pred)
err.describe()

In [None]:
ax = sns.distplot(err,hist_kws={"log":True},kde=False);

In [None]:
# Feature importance# Feature 
pd.DataFrame({"Features" : X_train.columns,"estimatedCoefficients": rfr.feature_importances_})

In [None]:
mean_squared_error(rfr_pred,y_test)

# Exploration

In [None]:
import plotly.plotly as py
import plotly
plotly.offline.init_notebook_mode() # run at the start of every notebook
import plotly.graph_objs as go

In [None]:
sample = df[df.RatecodeID==1].sample(100)

In [None]:
layout = dict(
        title = '2011 US Agriculture Exports by State<br>(Hover for breakdown)',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)',
        ),
    )

scl = [0,"rgb(150,0,90)"],[0.125,"rgb(0, 0, 200)"],[0.25,"rgb(0, 25, 255)"],\
[0.375,"rgb(0, 152, 255)"],[0.5,"rgb(44, 255, 150)"],[0.625,"rgb(151, 255, 0)"],\
[0.75,"rgb(255, 234, 0)"],[0.875,"rgb(255, 111, 0)"],[1,"rgb(255, 0, 0)"]

data = [ dict(
    lat = sample['pickup_latitude'],
    lon = sample['pickup_longitude'],
    text = "$"+sample['tip_amount'].astype(str),
    marker = dict(
        color = df['tip_amount'],
        colorscale = scl,
        reversescale = True,
        opacity = 0.7,
        size = 2,
        colorbar = dict(
            thickness = 10,
            titleside = "right",
            outlinecolor = "rgba(68, 68, 68, 0)",
            ticks = "outside",
            ticklen = 3,
            showticksuffix = "last",
            ticksuffix = " $",
            dtick = 10
        ),
    ),
    type = 'scattergeo'
) ]

plotly.offline.iplot({ 'data':data, 'layout':layout })

In [None]:
import plotly
print(plotly.__version__)  # version >1.9.4 required
from plotly.graph_objs import Scatter, Layout
plotly.offline.plot({
"data": [
    Scatter(x=[1, 2, 3, 4], y=[4, 1, 3, 7])
],
"layout": Layout(
    title="hello world"
)
})

In [None]:
df[df.RatecodeID == 4].extra.value_counts()

In [None]:
df[df.payment_type==1].tip_amount.value_counts()

In [None]:
sns.distplot(df[df.payment_type==1].tip_amount)

In [None]:
df.info()

In [None]:
sample = df[df.payment_type==1].sample(10000)
sns.lmplot(y='tip_amount',x='totalWithoutTip', data=sample)

tipping is mostly categorical:
* Fixed sum
    * $0,$1,$2,$3,$4,$5... depending on the total. Round to the next full dollar, with around 20% tips
* Fixed percentage
    * There are 3 clear lines
        * lowest and biggest equals 20% tip
        * middle equals 24% tip
        * highest equals 26 % tip?
        
total regression is below the 20% line however
* Due to many 0 tippers around all ranges
* Many low tipper with fixed amount


### Predict 0 tippers
### Predict fixed tippers?
### Calculate regression w/ 0 tippers, use as base?

In [None]:
sns.lmplot(y='dropoff_longitude',x='dropoff_latitude', data=sample)

In [None]:
a = sns.distplot(df.time.astype('timedelta64[m]'))
plt.xticks(np.arange(1440,step=180),np.arange(24,step=3))
plt.xlabel('Hour')

In [None]:
sns.violinplot(x=df.extra,y=df.time.astype('timedelta64[m]'))

In [None]:
df.mta_tax.value_counts t()

sns.distplot(df.duration.astype('timedelta64[m]'))

In [None]:
plt.figure(figsize=(16,9))
plt.subplot(2,3,1)
sns.countplot(df.extra)
plt.subplot(2,3,2)
sns.countplot(df.overnight)
plt.subplot(2,3,3)
sns.countplot(df.rush_hour)
plt.subplot(2,3,4)
sns.countplot(df.day)
plt.subplot(2,3,5)
sns.countplot(df.passenger_count)
plt.subplot(2,3,6)
sns.countplot(df.RatecodeID)

In [None]:
sample["duration_m"] = sample.duration.astype('timedelta64[m]')

In [None]:
sns.lmplot(x="trip_distance", y="duration_m",hue="RatecodeID", data=df.sample(10000))

In [None]:
df.RatecodeID.value_counts()

# Check Test Data

In [None]:
test = pd.read_csv("test.csv")

In [None]:
test.info()

In [None]:
test.payment_type.value_counts()

In [None]:
test.describe()

In [None]:
test['duration'] = pd.to_datetime(test["tpep_dropoff_datetime"])-pd.to_datetime(test["tpep_pickup_datetime"])

In [None]:
test[pd.to_datetime(test["tpep_pickup_datetime"]).dt.day != pd.to_datetime(test["tpep_dropoff_datetime"]).dt.day].duration.describe()

In [None]:
test[test.duration > "2 hours"].RatecodeID.value_counts()

In [None]:
sns.countplot(test["RatecodeID"])

In [None]:
sns.countplot(df["extra"])

In [None]:
sns.countplot(df[df.RatecodeID == 6].extra)

In [None]:
df[df.mta_tax == 0].RatecodeID.value_counts()

In [None]:
sns.distplot(df.trip_distance)

In [None]:
sns.distplot(test.trip_distance)

In [None]:
sns.distplot(test.duration)

## Notes 

* *improvement_surgcharge* should be constant. Minimum seems to be a sign error. Maximum seems to be a outlier (mistake??)


In [None]:
df.info()

In [None]:
df = df.drop(['VendorID','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','Holiday','improvement_surcharge','mta_tax','store_and_fwd_flag','payment_type','duration_m'],axis=1)

In [None]:
df=df.drop('total',axis=1)

In [None]:
corr = df.drop(['tpep_pickup_datetime','tpep_dropoff_datetime'],axis=1).corr()

plt.figure(figsize=(16,9))

plt.title("Heatmap",fontsize=20)
sns.heatmap(corr, cmap="RdBu_r", square=True)

In [None]:
sns.distplot(df['total_amount'], hist=True)

In [None]:
sample = df.sample(10000)

In [None]:
sns.lmplot(x='extra',y='Overnight',hue='RushHour',data=sample)

In [None]:
sns.jointplot(x='total_amount',y='fare_amount',data=df, kind='hex')

In [None]:
sns.lmplot(x='total_amount',y='tip_amount',hue='extra',data=sample)

In [None]:
sns.lmplot(x='improvement_surcharge',y='mta_tax',data=sample)