## Setup and Dependencies

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import scipy.stats as st

In [3]:
# Read the Citi Bike trip-data CSV into a DataFrame and preview the first rows
bike_data = pd.read_csv(
    "./Rolling_12M_citibike-tripdata.csv",
    encoding='utf-8',
)
bike_data.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,260,11/1/2019 0:02,11/1/2019 0:07,3186,Grove St PATH,40.719586,-74.043117,3203,Hamilton Park,40.727596,-74.044247,26314,Subscriber,1964,1
1,285,11/1/2019 0:05,11/1/2019 0:10,3206,Hilltop,40.731169,-74.057574,3212,Christ Hospital,40.734786,-74.050444,26238,Subscriber,1986,2
2,501,11/1/2019 0:05,11/1/2019 0:14,3186,Grove St PATH,40.719586,-74.043117,3192,Liberty Light Rail,40.711242,-74.055701,29645,Subscriber,1995,1
3,556,11/1/2019 0:07,11/1/2019 0:17,3209,Brunswick St,40.724176,-74.050656,3678,Fairmount Ave,40.725726,-74.071959,29221,Customer,1992,1
4,321,11/1/2019 0:11,11/1/2019 0:17,3273,Manila & 1st,40.721651,-74.042884,3273,Manila & 1st,40.721651,-74.042884,29591,Subscriber,1988,1


### Remove Outliers from Trip Duration

In [4]:
# Quartiles of trip duration, used to build the 1.5 * IQR outlier fences
quartiles = bike_data['tripduration'].quantile([.25, .5, .75])
lowerq, upperq = quartiles[0.25], quartiles[0.75]
iqr = upperq - lowerq

# Anything outside [lower_bound, upper_bound] is treated as a potential outlier
lower_bound, upper_bound = lowerq - (1.5 * iqr), upperq + (1.5 * iqr)

# Summary report, one item per line, followed by a blank line
print(
    "Trip Duration Outliers",
    "-----------------------------",
    f"Lower quartile: {lowerq}",
    f"Upper quartile: {upperq}",
    f"Interquartle range: {iqr}",
    f"Values below {lower_bound} could be outliers.",
    f"Values above {upper_bound} could be outliers.",
    sep="\n",
)
print()

Trip Duration Outliers
-----------------------------
Lower quartile: 301.0
Upper quartile: 1277.0
Interquartle range: 976.0
Values below -1163.0 could be outliers.
Values above 2741.0 could be outliers.



In [5]:
# Non-null count per column, shown before any outlier rows are removed
bike_data.count()

tripduration               354307
starttime                  354307
stoptime                   354307
start station id           354307
start station name         354307
start station latitude     354307
start station longitude    354307
end station id             354307
end station name           354307
end station latitude       354307
end station longitude      354307
bikeid                     354307
usertype                   354307
birth year                 354307
gender                     354307
dtype: int64

In [7]:
# Keep only trips whose duration lies within [lower_bound, upper_bound].
# One boolean mask replaces the two in-place, index-label-based drops: the
# filter runs in a single pass, selects rows by the mask itself (so duplicate
# index labels cannot remove extra rows), and avoids `inplace=True`.
# The comparisons match the original, so rows with a missing duration are kept.
is_outlier = (bike_data['tripduration'] < lower_bound) | (bike_data['tripduration'] > upper_bound)
bike_data = bike_data.loc[~is_outlier].copy()  # copy: independent frame, not a view

# Non-null count per column after filtering
bike_data.count()

tripduration               325044
starttime                  325044
stoptime                   325044
start station id           325044
start station name         325044
start station latitude     325044
start station longitude    325044
end station id             325044
end station name           325044
end station latitude       325044
end station longitude      325044
bikeid                     325044
usertype                   325044
birth year                 325044
gender                     325044
dtype: int64

## Export CSV

In [10]:
# Export the DataFrame as CSV (without the index column).
# A forward-slash relative path is used, matching the path style of the load
# cell: a backslash is only a path separator on Windows; on other systems it
# would become part of the file name.
bike_data.to_csv('./Rolling_12M_citibike-tripdata_final.csv', encoding='utf-8', index=False)