# Clean train Data

In [2]:
# Manage imports
import numpy as np
import pandas as pd
import seaborn as sns  
import matplotlib.pyplot as plt

In [None]:
# Load the raw training set from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer="train.csv")

In [None]:
# How often does each rate code occur in the raw data?
data["RatecodeID"].value_counts()

## Obvious invalid values

Clean the data.
Remove all obvious outliers:
* Invalid RatecodeID (valid values: 1, 2, 3, 4, 5, 6; 6 is dropped as well because it does not occur in the test data)
* Invalid improvement_surcharge (only 0.3 is valid!)
* Invalid mta_tax (valid values: 0 or 0.5)
* Invalid extra (valid values: 0, 0.5, 1, 1.5, 4.5)
* All negative values (costs can't be negative!)
* Drop useless information
    * store_and_fwd_flag (Useless)
    * Vendor ID (Useless)
    * Improvement surcharge (Fixed value!)
* Calculate the total amount and drop outliers (rows whose recorded total_amount does not equal the sum of its components must be wrong!)
* Save pickup and dropoff time as datetime object

In [None]:
# Build one mask flagging every row that carries an impossible value, then
# drop the flagged rows from the raw data in a single pass.
invalid_rows = (
    # extra, improvement_surcharge, RatecodeID, mta_tax outside their valid sets
    ~data["extra"].isin([0, 0.5, 1, 1.5, 4.5])
    | (data["improvement_surcharge"] != 0.3)
    # Rate code 6 is rejected too -> No group rides in test data!
    | ~data["RatecodeID"].isin([1, 2, 3, 4, 5])
    | ~data["mta_tax"].isin([0, 0.5])
    # Negative tips/tolls and non-positive fares/totals
    | (data["tip_amount"] < 0)
    | (data["fare_amount"] <= 0)
    | (data["total_amount"] <= 0)
    | (data["tolls_amount"] < 0)
    # Coordinates that are exactly zero
    | (data["pickup_latitude"] == 0)
    | (data["pickup_longitude"] == 0)
    | (data["dropoff_latitude"] == 0)
    | (data["dropoff_longitude"] == 0)
)
df = data.drop(index=data.index[invalid_rows])

In [None]:
# Rate code distribution after the first round of filtering.
df["RatecodeID"].value_counts()

In [None]:
# Remove the columns that are not used any further.
df = df.drop(columns=['VendorID', 'store_and_fwd_flag', 'improvement_surcharge'])

In [None]:
# Drop rows whose surcharge, tax, fare or total contradicts the rate code or
# the pickup/dropoff times. Every rule is row-wise, so the rules are combined
# into masks on the same frame and the flagged rows are dropped in one pass.

# Parse the timestamp columns once instead of once per rule.
pickup_time = pd.to_datetime(df.tpep_pickup_datetime)
dropoff_time = pd.to_datetime(df.tpep_dropoff_datetime)

rate_is_2 = df.RatecodeID == 2

# drop invalid rush hour surtaxes
bad_surcharge = (
    (~rate_is_2 & ~df.extra.isin([0, 0.5, 1, 1.5]))
    | (rate_is_2 & ~df.extra.isin([0, 4.5]))
    | ((df.RatecodeID == 3) & (df.mta_tax == .5))
    | (df.RatecodeID.isin([2, 4]) & (df.mta_tax == 0))
)

# An extra of 1 or 4.5 is treated as the rush-hour surcharge. It is invalid on
# weekends, for pickups from 20:00 on, and for dropoffs before 16:00.
rush_extra = df.extra.isin([1, 4.5])
bad_rush_hour = rush_extra & (
    (pickup_time.dt.weekday >= 5)
    | (pickup_time.dt.hour >= 20)
    | (dropoff_time.dt.hour < 16)
)

# An extra of 0.5 is treated as the overnight surcharge. It is invalid when the
# whole ride lies between 06:00 and 20:00 of a single day. The same-day check
# keeps rides that cross midnight (e.g. 23:00 -> 00:30): the hour comparison
# alone would wrongly flag them as daytime rides.
bad_overnight = (
    (df.extra == 0.5)
    & (pickup_time.dt.hour >= 6)
    & (dropoff_time.dt.hour < 20)
    & (pickup_time.dt.normalize() == dropoff_time.dt.normalize())
)

# Remove Dependant on Ratecode invalid data
# All price steps are in 0.5! (excluding negotiated)
bad_fare = (
    ((df.RatecodeID != 5) & ((df.fare_amount % 0.5) != 0))
    | (rate_is_2 & (df.fare_amount != 52))
    | (df.RatecodeID.isin([1, 4, 5]) & (df.fare_amount < 3))
)

# Remove values where total isnt the sum of all its parts plus the constant 0.3.
# Compare with a half-cent tolerance: exact float equality also rejects valid
# rows whose computed sum differs from the recorded total only by binary
# rounding error (e.g. 9.459999999999999 vs 9.46).
total = df.extra+df.tip_amount+df.tolls_amount+df.mta_tax+df.fare_amount + 0.3
bad_total = ~np.isclose(df.total_amount, total, rtol=0, atol=0.005)

df = df.drop(
    df.index[bad_surcharge | bad_rush_hour | bad_overnight | bad_fare | bad_total]
)

In [None]:
# Discard rides with implausibly large tips, fares or tolls, and rides whose
# passenger count is below 1 or above 6.
extreme_rows = (
    (df["tip_amount"] > 100)
    | (df["fare_amount"] > 200)
    | (df["tolls_amount"] > 30)
    | (df["passenger_count"] < 1)
    | (df["passenger_count"] > 6)
)
df = df.drop(index=df.index[extreme_rows])

In [None]:
# Keep only rides whose pickup and dropoff coordinates both lie within
# `margin` of the Manhattan reference point.
manLat = 40.756716
manLong = -73.985368
margin = 1

print("With outliers:", df.shape)
near_manhattan = (
    ((df["dropoff_latitude"] - manLat).abs() < margin)
    & ((df["pickup_latitude"] - manLat).abs() < margin)
    & ((df["dropoff_longitude"] - manLong).abs() < margin)
    & ((df["pickup_longitude"] - manLong).abs() < margin)
)
df = df.loc[near_manhattan]
print("Without outliers:", df.shape)

Convert the datetime columns to datetime objects

In [None]:
# Replace the pickup and dropoff timestamp columns by their parsed datetime values.
for time_column in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
    df[time_column] = pd.to_datetime(df[time_column])

In [None]:
# Ride duration: dropoff time minus pickup time.
dropoff_time = pd.to_datetime(df["tpep_dropoff_datetime"])
pickup_time = pd.to_datetime(df["tpep_pickup_datetime"])
df["duration"] = dropoff_time - pickup_time

In [None]:
# Keep only rides lasting between 30 seconds and 3 hours.
# Lower bound: removes negative and too short rides (40s is lowest in test data!)
# Upper bound: unreasonably long rides (2:30 is longest in test data)
too_short = df["duration"] < pd.Timedelta("30 seconds")
too_long = df["duration"] > pd.Timedelta("3 hours")
df = df.drop(index=df.index[too_short | too_long])

Obviously wrongly labelled extras!

In [None]:
# Print a concise summary of df (columns, dtypes, non-null counts).
df.info()

In [None]:
# Rate code distribution of the cleaned data.
df["RatecodeID"].value_counts()

In [None]:
# Show the first rows of the cleaned data.
df.head()

# Check if data makes sense now

In [None]:
# Summary statistics per column, used to sanity-check the cleaned data.
df.describe()

In [None]:
# Largest trip distance left after cleaning.
df["trip_distance"].max()

# Check Test Data

In [3]:
# Load the test set so its value ranges can be compared with the training data.
test = pd.read_csv(filepath_or_buffer="test.csv")

In [6]:
# Frequency of each payment type in the test data.
test["payment_type"].value_counts()

1    42950
2    20635
3      310
4      105
Name: payment_type, dtype: int64

In [5]:
# Rate code distribution among test rides with a trip distance above 30.
test.loc[test["trip_distance"] > 30, "RatecodeID"].value_counts()

5    217
4    109
3    100
2     13
1      3
Name: RatecodeID, dtype: int64

In [None]:
# Which improvement_surcharge values occur in the test data, and how often?
test["improvement_surcharge"].value_counts()

In [None]:
# Passenger count distribution in the test data.
test["passenger_count"].value_counts()

In [None]:
# Frequency of each pickup latitude value in the test data.
test["pickup_latitude"].value_counts()

In [None]:
# Count how many test rides remain when pickup and dropoff must both lie
# within `margin` of the Manhattan reference point. The result goes to test2;
# test itself is left untouched.
manLat = 40.756716
manLong = -73.985368
margin = 1.0

print("With outliers:", test.shape)
near_manhattan = (
    ((test["dropoff_latitude"] - manLat).abs() < margin)
    & ((test["pickup_latitude"] - manLat).abs() < margin)
    & ((test["dropoff_longitude"] - manLong).abs() < margin)
    & ((test["pickup_longitude"] - manLong).abs() < margin)
)
test2 = test.loc[near_manhattan]
print("Without outliers:", test2.shape)

In [None]:
# Show the test rides whose dropoff longitude is more than `margin` away from
# manLong while the other three coordinates stay within `margin` of the
# reference point.
dropoff_longitude_far = (
    ((test["dropoff_latitude"] - manLat).abs() < margin)
    & ((test["pickup_latitude"] - manLat).abs() < margin)
    & ((test["dropoff_longitude"] - manLong).abs() > margin)
    & ((test["pickup_longitude"] - manLong).abs() < margin)
)
test.loc[dropoff_longitude_far]

# Save

In [None]:
# Write the cleaned training data, including its index, to disk.
df.to_csv(path_or_buf='clean.csv', index=True)