# UBER Data Analysis

## 1. Importing libraries and loading data

**Data Source : https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf**

In [15]:
import pandas as pd  
import numpy as np 

In [16]:
data = pd.read_csv(r"C:\Users\saiba\OneDrive - nsut.ac.in\Uber Data Analysis\data\uber_data.csv")

In [17]:
pd.set_option("display.max_rows", 10000)
pd.set_option("display.max_columns", 25)

In [18]:
data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,1,01-03-2016 00:00,01-03-2016 00:07,1,2.5,-73.976746,40.765152,1,N,-74.004265,40.746128,1,9.0,0.5,0.5,2.05,0.0,0.3,12.35
1,1,01-03-2016 00:00,01-03-2016 00:11,1,2.9,-73.983482,40.767925,1,N,-74.005943,40.733166,1,11.0,0.5,0.5,3.05,0.0,0.3,15.35
2,2,01-03-2016 00:00,01-03-2016 00:31,2,19.98,-73.782021,40.64481,1,N,-73.974541,40.67577,1,54.5,0.5,0.5,8.0,0.0,0.3,63.8
3,2,01-03-2016 00:00,01-03-2016 00:00,3,10.78,-73.863419,40.769814,1,N,-73.96965,40.757767,1,31.5,0.0,0.5,3.78,5.54,0.3,41.62
4,2,01-03-2016 00:00,01-03-2016 00:00,5,30.43,-73.971741,40.792183,3,N,-74.17717,40.695053,1,98.0,0.0,0.0,0.0,15.5,0.3,113.8


## 2. EDA

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VendorID               100000 non-null  int64  
 1   tpep_pickup_datetime   100000 non-null  object 
 2   tpep_dropoff_datetime  100000 non-null  object 
 3   passenger_count        100000 non-null  int64  
 4   trip_distance          100000 non-null  float64
 5   pickup_longitude       100000 non-null  float64
 6   pickup_latitude        100000 non-null  float64
 7   RatecodeID             100000 non-null  int64  
 8   store_and_fwd_flag     100000 non-null  object 
 9   dropoff_longitude      100000 non-null  float64
 10  dropoff_latitude       100000 non-null  float64
 11  payment_type           100000 non-null  int64  
 12  fare_amount            100000 non-null  float64
 13  extra                  100000 non-null  float64
 14  mta_tax                100000 non-nul

In [20]:
## changing timeline to datettime object
data['tpep_pickup_datetime'] = pd.to_datetime(data['tpep_pickup_datetime'])
data['tpep_dropoff_datetime'] = pd.to_datetime(data['tpep_dropoff_datetime'])

In [21]:
## changing `VendorID` to categorical format
data.loc[data.VendorID == 1, 'VendorID'] = 'Creative Mobile Technologies'
data.loc[data.VendorID == 2, 'VendorID'] = ' VeriFone Inc.'

In [22]:
data.VendorID.nunique()

2

In [23]:
data.VendorID.unique()

array(['Creative Mobile Technologies', ' VeriFone Inc.'], dtype=object)

In [24]:
data.RatecodeID.unique()

array([1, 3, 2, 5, 4, 6], dtype=int64)

In [25]:
## changing RateCodeID to categorical format
data.loc[data.RatecodeID == 1, 'RatecodeID'] = 'Standard rate'
data.loc[data.RatecodeID == 2, 'RatecodeID'] = 'JFK'
data.loc[data.RatecodeID == 3, 'RatecodeID'] = 'Newark'
data.loc[data.RatecodeID == 4, 'RatecodeID'] = 'Nassau or Westchester'
data.loc[data.RatecodeID == 5, 'RatecodeID'] = 'Negotiated fare'
data.loc[data.RatecodeID == 6, 'RatecodeID'] = 'Group ride'

In [26]:
data.RatecodeID.unique()

array(['Standard rate', 'Newark', 'JFK', 'Negotiated fare',
       'Nassau or Westchester', 'Group ride'], dtype=object)

In [29]:
data.store_and_fwd_flag.unique()

array(['N', 'Y'], dtype=object)

**Here `N` stands for  `not a store and forward trip` 
and `Y` stands for `store and forward trip`**

In [30]:
## payment type

data.payment_type.unique()

array([1, 2, 3, 4], dtype=int64)

In [31]:
## changing payment_type to categorical values
data.loc[data.payment_type == 1, 'payment_type'] = 'Credit card'
data.loc[data.payment_type == 2, 'payment_type'] = 'Cash'
data.loc[data.payment_type == 3, 'payment_type'] = 'No charge'
data.loc[data.payment_type == 4, 'payment_type'] = 'Dispute'
data.loc[data.payment_type == 5, 'payment_type'] = 'Unknown'
data.loc[data.payment_type == 6, 'payment_type'] = 'Voided trip'

In [32]:
data.payment_type.unique()

array(['Credit card', 'Cash', 'No charge', 'Dispute'], dtype=object)

In [27]:
data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,Creative Mobile Technologies,2016-01-03,2016-01-03 00:07:00,1,2.5,-73.976746,40.765152,Standard rate,N,-74.004265,40.746128,1,9.0,0.5,0.5,2.05,0.0,0.3,12.35
1,Creative Mobile Technologies,2016-01-03,2016-01-03 00:11:00,1,2.9,-73.983482,40.767925,Standard rate,N,-74.005943,40.733166,1,11.0,0.5,0.5,3.05,0.0,0.3,15.35
2,VeriFone Inc.,2016-01-03,2016-01-03 00:31:00,2,19.98,-73.782021,40.64481,Standard rate,N,-73.974541,40.67577,1,54.5,0.5,0.5,8.0,0.0,0.3,63.8
3,VeriFone Inc.,2016-01-03,2016-01-03 00:00:00,3,10.78,-73.863419,40.769814,Standard rate,N,-73.96965,40.757767,1,31.5,0.0,0.5,3.78,5.54,0.3,41.62
4,VeriFone Inc.,2016-01-03,2016-01-03 00:00:00,5,30.43,-73.971741,40.792183,Newark,N,-74.17717,40.695053,1,98.0,0.0,0.0,0.0,15.5,0.3,113.8


**`extra` - 1. Extra is calculated for 0.50 dollar for rush hour and  $1.0 for overnight charges**

**`mta_tax` - 2. $0.50 MTA tax that is automatically triggered based on the metered 
rate in use.**

**`improvement_surcharge` - 3. $0.30 improvement surcharge assessed trips at the flag drop**

In [35]:
## extra fares
data.extra.value_counts()

 0.0    79649
 0.5    20313
-0.5       19
 1.0       18
 4.5        1
Name: extra, dtype: int64

In [37]:
## MTA taxes
data.mta_tax.value_counts()

 0.5    99466
 0.0      468
-0.5       66
Name: mta_tax, dtype: int64

In [38]:
## improvement_surcharge
data.improvement_surcharge.value_counts()

 0.3    99902
-0.3       70
 0.0       28
Name: improvement_surcharge, dtype: int64

In [51]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   VendorID               100000 non-null  object        
 1   tpep_pickup_datetime   100000 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  100000 non-null  datetime64[ns]
 3   passenger_count        100000 non-null  int64         
 4   trip_distance          100000 non-null  float64       
 5   pickup_longitude       100000 non-null  float64       
 6   pickup_latitude        100000 non-null  float64       
 7   RatecodeID             100000 non-null  object        
 8   store_and_fwd_flag     100000 non-null  object        
 9   dropoff_longitude      100000 non-null  float64       
 10  dropoff_latitude       100000 non-null  float64       
 11  payment_type           100000 non-null  object        
 12  fare_amount            100000 non-null  float

In [52]:
data.to_csv(r"C:\Users\saiba\OneDrive - nsut.ac.in\Uber Data Analysis\data\UBER_DATA_update.csv")

![https://github.com/darshilparmar/uber-etl-pipeline-data-engineering-project/blob/main/data_model.jpeg?raw=true](https://github.com/darshilparmar/uber-etl-pipeline-data-engineering-project/blob/main/data_model.jpeg?raw=true)

In [39]:
data_parquet = pd.read_parquet(r"C:\Users\saiba\OneDrive - nsut.ac.in\Uber Data Analysis\data\yellow_tripdata_2023-01.parquet", engine = 'pyarrow')

In [43]:
data_parquet.head(10)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0
5,2,2023-01-01 00:50:34,2023-01-01 01:02:52,1.0,1.84,1.0,N,161,137,1,12.8,1.0,0.5,10.0,0.0,1.0,27.8,2.5,0.0
6,2,2023-01-01 00:09:22,2023-01-01 00:19:49,1.0,1.66,1.0,N,239,143,1,12.1,1.0,0.5,3.42,0.0,1.0,20.52,2.5,0.0
7,2,2023-01-01 00:27:12,2023-01-01 00:49:56,1.0,11.7,1.0,N,142,200,1,45.7,1.0,0.5,10.74,3.0,1.0,64.44,2.5,0.0
8,2,2023-01-01 00:21:44,2023-01-01 00:36:40,1.0,2.95,1.0,N,164,236,1,17.7,1.0,0.5,5.68,0.0,1.0,28.38,2.5,0.0
9,2,2023-01-01 00:39:42,2023-01-01 00:50:36,1.0,3.01,1.0,N,141,107,2,14.9,1.0,0.5,0.0,0.0,1.0,19.9,2.5,0.0


In [41]:
data_parquet.shape

(3066766, 19)

In [42]:
data_parquet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [46]:
data_parquet.PULocationID.value_counts()[:10]

132    160030
237    148074
236    138391
161    135417
186    109227
162    105334
142    100228
230     98991
138     89188
170     88346
Name: PULocationID, dtype: int64

In [47]:
data_parquet.DOLocationID.value_counts()[:10]

236    146348
237    132364
161    116149
230     89878
170     88783
239     87969
142     87969
141     87655
162     82739
48      77383
Name: DOLocationID, dtype: int64

In [50]:
data_parquet.PULocationID.nunique(), data_parquet.DOLocationID.nunique()

(257, 261)