# Trip Cleaning

### Proudly presented by team mucki-schnucki

Import of packages

In [2]:
import pandas as pd
import numpy as np

Import of data

In [3]:
# Raw trip records; the path is relative to the notebook's working directory.
TRIPS_CSV_PATH = 'boston_2017.csv'
df_Trips = pd.read_csv(TRIPS_CSV_PATH)

###  1.1 Getting a first overview of the data

In [4]:
# Summarise the raw frame: column names, dtypes, non-null counts and memory usage.
df_Trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313774 entries, 0 to 1313773
Data columns (total 8 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   start_time          1313774 non-null  object
 1   end_time            1313774 non-null  object
 2   start_station_id    1313774 non-null  int64 
 3   end_station_id      1313774 non-null  int64 
 4   start_station_name  1313774 non-null  object
 5   end_station_name    1313774 non-null  object
 6   bike_id             1313774 non-null  int64 
 7   user_type           1313774 non-null  object
dtypes: int64(3), object(5)
memory usage: 80.2+ MB


In [5]:
# Number of rows (trips) in the raw frame.
df_Trips.shape[0]

1313774

In [6]:
# Column labels of the raw frame.
df_Trips.columns

Index(['start_time', 'end_time', 'start_station_id', 'end_station_id',
       'start_station_name', 'end_station_name', 'bike_id', 'user_type'],
      dtype='object')

In [7]:
# Preview the first rows (head() shows five rows by default).
df_Trips.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type
0,2017-01-01 00:06:58,2017-01-01 00:12:49,67,139,MIT at Mass Ave / Amherst St,Dana Park,644,Subscriber
1,2017-01-01 00:13:16,2017-01-01 00:28:07,36,10,Boston Public Library - 700 Boylston St.,B.U. Central - 725 Comm. Ave.,230,Subscriber
2,2017-01-01 00:16:17,2017-01-01 00:44:10,36,9,Boston Public Library - 700 Boylston St.,Agganis Arena - 925 Comm Ave.,980,Customer
3,2017-01-01 00:21:22,2017-01-01 00:33:50,46,19,Christian Science Plaza,Buswell St. at Park Dr.,1834,Subscriber
4,2017-01-01 00:30:06,2017-01-01 00:40:28,10,8,B.U. Central - 725 Comm. Ave.,Union Square - Brighton Ave. at Cambridge St.,230,Subscriber


In [8]:
# Preview the last rows (tail() shows five rows by default).
df_Trips.tail()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type
1313769,2017-12-31 23:46:18,2017-12-31 23:50:27,117,141,Binney St / Sixth St,Kendall Street,1846,Subscriber
1313770,2017-12-29 16:11:56,2017-12-29 16:16:18,54,42,Tremont St at West St,Boylston St at Arlington St TEMPORARY WINTER L...,2,Subscriber
1313771,2017-12-30 08:09:44,2017-12-30 08:26:08,54,58,Tremont St at West St,Beacon St at Arlington St,1534,Subscriber
1313772,2017-12-30 12:20:01,2017-12-30 12:49:12,54,46,Tremont St at West St,Christian Science Plaza - Massachusetts Ave at...,1978,Subscriber
1313773,2017-12-30 18:27:39,2017-12-30 18:53:54,54,21,Tremont St at West St,Prudential Center - Belvedere St,15,Subscriber


Now the range of the variables start_time, end_time, station_id, bike_id and user_type is examined. 

In [9]:
# Value ranges of the key columns, printed as a quick plausibility check.
# The time columns are still plain strings at this point, so min()/max() compare
# them lexicographically; that equals chronological order only if every value uses
# the zero-padded "YYYY-MM-DD HH:MM:SS" layout seen in the preview rows — TODO confirm.
start_times = df_Trips['start_time']
end_times = df_Trips['end_time']
print(f"Start times range from {start_times.min()} to {start_times.max()}")
print(f"End times range from {end_times.min()} to {end_times.max()} \n")

# Start and end station IDs are pooled so a station counts even if it only ever
# appears on one side of a trip.
station_ids_total = df_Trips[["start_station_id", "end_station_id"]].values
station_ids_unique = np.unique(station_ids_total)
print(
    f"Station IDs range from {station_ids_unique.min()} to {station_ids_unique.max()}. "
    f"{len(station_ids_unique)} different stations in total"
)

bike_ids_unique = np.unique(df_Trips[["bike_id"]].values)
print(f"Bike IDs range from {bike_ids_unique.min()} to {bike_ids_unique.max()}")

# Message previously read "user types types" (duplicated word).
print(f"Possible user types are {np.unique(df_Trips['user_type'].values)}")

Start times range from 2017-01-01 00:06:58 to 2017-12-31 23:46:18
End times range from 2017-01-01 00:12:49 to 2018-01-07 20:00:16 

Station IDs range from 1 to 232. 200 different stations in total
Bike IDs range from 1 to 1981
Possible user types types are ['Customer' 'Subscriber']


### 1.2 Identifying missing or wrong values and duplicates

In [10]:
# True if at least one cell anywhere in the frame is missing.
df_Trips.isna().to_numpy().any()

False

In [11]:
# duplicated() flags every repeat of an earlier, fully identical row as True;
# summing the boolean mask therefore counts those repeated rows.
df_Trips.duplicated(keep='first').sum()

0

As there are no missing or duplicate values in the dataframe, we can assume that the quality of the data is already very good and continue with the feature engineering. 

### 1.3 Feature Engineering 

Now additional features will be calculated out of the existing data. 
This will contain: 
- temporal data, e.g. duration of the ride, weekday or hour

In [12]:
# Convert both timestamp columns from strings to datetime64, one column at a time.
for time_col in ('start_time', 'end_time'):
    df_Trips[time_col] = pd.to_datetime(df_Trips[time_col])

In [13]:
# Derive temporal features from the trip start via the Series .dt accessor
# (the same accessor the duration filters further down use) instead of
# round-tripping through a temporary DatetimeIndex.
trip_start = df_Trips['start_time']
df_Trips['start_hour'] = trip_start.dt.hour      # hour of day, 0-23
df_Trips['weekday'] = trip_start.dt.weekday      # Monday = 0 ... Sunday = 6
# Trip length as a Timedelta; negative when end_time precedes start_time.
df_Trips['duration'] = df_Trips['end_time'] - trip_start

In [14]:
# Longest recorded trip duration.
df_Trips['duration'].max()

Timedelta('48 days 08:40:21')

In [15]:
# Shortest recorded duration; a negative value means end_time precedes start_time for at least one trip.
df_Trips['duration'].min()

Timedelta('-1 days +23:06:07')

In [16]:
# Count rows that share (bike_id, start_time) with another row;
# keep=False flags every member of such a group, not just the repeats.
repeated_bike_start = df_Trips.duplicated(subset=["bike_id", "start_time"], keep=False)
int(repeated_bike_start.sum())

0

There are no duplicates of the combination bike_id and start_time

Now implausible values of the duration are excluded from the data frame.

First, we look at trips that took less than one minute. 



In [30]:
# Inspect very short trips: strictly positive duration, but below one minute.
# The duration in seconds is computed once and reused for both bounds.
SHORT_TRIP_SECONDS = 60
duration_seconds = df_Trips['duration'].dt.total_seconds()
df2 = df_Trips[(duration_seconds > 0) & (duration_seconds < SHORT_TRIP_SECONDS)]
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3253 entries, 419 to 1313616
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   start_time          3253 non-null   datetime64[ns] 
 1   end_time            3253 non-null   datetime64[ns] 
 2   start_station_id    3253 non-null   int64          
 3   end_station_id      3253 non-null   int64          
 4   start_station_name  3253 non-null   object         
 5   end_station_name    3253 non-null   object         
 6   bike_id             3253 non-null   int64          
 7   user_type           3253 non-null   object         
 8   start_hour          3253 non-null   int64          
 9   weekday             3253 non-null   int64          
 10  duration            3253 non-null   timedelta64[ns]
dtypes: datetime64[ns](2), int64(5), object(3), timedelta64[ns](1)
memory usage: 305.0+ KB


Unnamed: 0,start_time,end_time,start_station_id,end_station_id,start_station_name,end_station_name,bike_id,user_type,start_hour,weekday,duration
419,2017-01-01 19:18:02,2017-01-01 19:19:08,189,189,Kendall T,Kendall T,1574,Subscriber,19,6,0 days 00:01:06
501,2017-01-02 06:59:45,2017-01-02 07:01:09,14,3,HMS / HSPH - Ave. Louis Pasteur at Longwood Ave.,Colleges of the Fenway,964,Subscriber,6,0,0 days 00:01:24
1208,2017-01-02 20:04:40,2017-01-02 20:05:45,107,189,Ames St at Main St,Kendall T,1035,Subscriber,20,0,0 days 00:01:05
1771,2017-01-03 15:59:41,2017-01-03 16:01:10,178,184,MIT Pacific St at Purrington St,Sidney Research Campus/ Erie Street at Waverly,767,Subscriber,15,1,0 days 00:01:29
2757,2017-01-04 14:52:03,2017-01-04 14:53:14,115,176,Porter Square Station,Lesley University,635,Subscriber,14,2,0 days 00:01:11
2838,2017-01-04 15:44:20,2017-01-04 15:45:36,44,23,Faneuil Hall - Union St. at North St.,Mayor Martin J Walsh - 28 State St,1107,Subscriber,15,2,0 days 00:01:16
2960,2017-01-04 16:43:58,2017-01-04 16:45:16,184,179,Sidney Research Campus/ Erie Street at Waverly,MIT Vassar St,1339,Subscriber,16,2,0 days 00:01:18
3061,2017-01-04 17:11:42,2017-01-04 17:13:00,178,184,MIT Pacific St at Purrington St,Sidney Research Campus/ Erie Street at Waverly,507,Subscriber,17,2,0 days 00:01:18
3079,2017-01-04 17:16:28,2017-01-04 17:17:55,218,218,Watermark Seaport,Watermark Seaport,356,Customer,17,2,0 days 00:01:27
3616,2017-01-05 07:50:02,2017-01-05 07:51:16,93,93,JFK / UMASS at MBTA Station,JFK / UMASS at MBTA Station,900,Subscriber,7,3,0 days 00:01:14


In [22]:
# Keep only plausible trips: strictly positive duration and shorter than five hours.
# NOTE(review): trips shorter than one minute are NOT removed by this filter —
# confirm that this is intended.
MAX_TRIP_SECONDS = 5 * 60 * 60  # 18000 s
duration_seconds = df_Trips['duration'].dt.total_seconds()
is_plausible = (duration_seconds > 0) & (duration_seconds < MAX_TRIP_SECONDS)
# .copy() makes the result an independent frame, so later column assignments on it
# do not raise SettingWithCopyWarning.
df_Trips_Modified = df_Trips[is_plausible].copy()

In [23]:
# Check how many rows remain after filtering on the duration.
df_Trips_Modified.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1310760 entries, 0 to 1313773
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype          
---  ------              --------------    -----          
 0   start_time          1310760 non-null  datetime64[ns] 
 1   end_time            1310760 non-null  datetime64[ns] 
 2   start_station_id    1310760 non-null  int64          
 3   end_station_id      1310760 non-null  int64          
 4   start_station_name  1310760 non-null  object         
 5   end_station_name    1310760 non-null  object         
 6   bike_id             1310760 non-null  int64          
 7   user_type           1310760 non-null  object         
 8   start_hour          1310760 non-null  int64          
 9   weekday             1310760 non-null  int64          
 10  duration            1310760 non-null  timedelta64[ns]
dtypes: datetime64[ns](2), int64(5), object(3), timedelta64[ns](1)
memory usage: 120.0+ MB


### Weather data Cleaning 

In [None]:
# NOTE(review): df_Weather is not created anywhere in the cells shown above; it must be
# loaded before this cell, otherwise a fresh "Restart & Run All" stops here with a NameError.
# Summarise the weather frame: column names, dtypes, non-null counts and memory usage.
df_Weather.info()

In [None]:
# Total number of missing cells across the whole weather frame.
df_Weather.isna().to_numpy().sum()

In [None]:
# Preview the first rows (head() shows five rows by default).
df_Weather.head()

In [None]:
# Preview the last rows (tail() shows five rows by default).
df_Weather.tail()

The dataset containing information about the weather covers the years 2015 to 2020. As only data from 2017 is needed, the data can be limited to this year. 

In [None]:
# Parse the timestamp column so it can be compared against calendar bounds.
df_Weather['date_time'] = pd.to_datetime(df_Weather['date_time'])

# Restrict to calendar year 2017 as the half-open interval [2017-01-01, 2018-01-01).
# The lower bound is inclusive: a strict ">" would drop an observation stamped exactly
# 2017-01-01 00:00:00, i.e. the first hour of the year.
# NOTE(review): row counts quoted in the notebook text may have been obtained with the
# strict lower bound — re-check them after re-running.
in_2017 = (df_Weather['date_time'] >= "2017-01-01") & (df_Weather['date_time'] < "2018-01-01")
# .copy() yields an independent frame, so later column assignments on it
# do not raise SettingWithCopyWarning.
df_Weather2 = df_Weather[in_2017].copy()


In [None]:
# Check the row count and dtypes of the weather frame after the 2017 filter.
df_Weather2.info()

To display information about every hour in 2017 the weather data should contain 365 * 24 = 8760 rows. As the filtered dataframe only has 8689 entries, information about 71 hours is missing. 

In [None]:
# Total number of missing cells in the 2017 weather subset.
df_Weather2.isna().to_numpy().sum()

Fortunately, there are no NA values left in the dataset.

In [None]:
# Preview the first rows of the 2017 subset (five rows by default).
df_Weather2.head()

In [None]:
# Preview the last rows of the 2017 subset (five rows by default).
df_Weather2.tail()

In [None]:
# NOTE(review): df_Stations is not created anywhere in the cells shown above; it must be
# loaded before this cell for a fresh "Restart & Run All" to succeed.
# Preview the first rows of the station frame.
df_Stations.head()