In [39]:
import pandas as pd

**Task 6: Feature Engineering From Raw Data**

**Dataset:** Uber Pickups in New York City

In [40]:
# Read the raw Uber pickups CSV into a DataFrame
RAW_CSV_PATH = "/content/uber-raw-data-apr14.csv"
ub_df = pd.read_csv(RAW_CSV_PATH)

In [41]:
# Inspect the data: preview the first five rows of the raw frame
ub_df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


In [42]:
# (rows, columns) of the frame as loaded, before any cleaning
ub_df.shape

(564516, 4)

In [43]:
# List the column labels of the loaded frame
ub_df.columns

Index(['Date/Time', 'Lat', 'Lon', 'Base'], dtype='object')

In [44]:
# Count the missing entries in each column
missing_per_column = ub_df.isna().sum()
missing_per_column

Unnamed: 0,0
Date/Time,0
Lat,0
Lon,0
Base,0


In [45]:
# Summarize column dtypes, non-null counts and memory usage
ub_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564516 entries, 0 to 564515
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Date/Time  564516 non-null  object 
 1   Lat        564516 non-null  float64
 2   Lon        564516 non-null  float64
 3   Base       564516 non-null  object 
dtypes: float64(2), object(2)
memory usage: 17.2+ MB


In [46]:
# Find total duplicated rows in the dataset.
# Only the final expression of a cell is rendered, so the count is printed
# explicitly; as a bare mid-cell expression it would be computed and discarded.
n_duplicates = ub_df.duplicated().sum()
print(f"Duplicated rows: {n_duplicates}")

# Drop duplicated rows, keeping the first occurrence of each.
# Reassigning (rather than inplace=True) keeps the step explicit; the original
# row labels are preserved, so the index is no longer contiguous afterwards.
ub_df = ub_df.drop_duplicates()

# Shape after removing the duplicates
ub_df.shape

(556767, 4)

In [47]:
# Convert the Date/Time strings to real datetimes using an explicit format
PICKUP_TS_FORMAT = '%m/%d/%Y %H:%M:%S'
ub_df['Date/Time'] = pd.to_datetime(ub_df['Date/Time'], format=PICKUP_TS_FORMAT)

In [48]:
# Confirm the dtype of every column after the Date/Time conversion
ub_df.dtypes

Unnamed: 0,0
Date/Time,datetime64[ns]
Lat,float64
Lon,float64
Base,object


In [49]:
# Build new features from the pickup datetime, starting with the hour of day
pickup_ts = ub_df['Date/Time']
ub_df['hour_of_day'] = pickup_ts.dt.hour

In [50]:
# Add the weekday name of each pickup as a new column
ub_df = ub_df.assign(day_of_week=ub_df['Date/Time'].dt.day_name())

In [51]:
# Add the calendar month number of each pickup.
# NOTE(review): the file name and the previewed rows suggest this data covers
# a single month (April), in which case this column is constant -- confirm
# before relying on it as a feature.
ub_df['Month'] = ub_df['Date/Time'].dt.month

In [52]:
# Preview the first 10 rows, including the derived columns
ub_df.head(10)

Unnamed: 0,Date/Time,Lat,Lon,Base,hour_of_day,day_of_week,Month
0,2014-04-01 00:11:00,40.769,-73.9549,B02512,0,Tuesday,4
1,2014-04-01 00:17:00,40.7267,-74.0345,B02512,0,Tuesday,4
2,2014-04-01 00:21:00,40.7316,-73.9873,B02512,0,Tuesday,4
3,2014-04-01 00:28:00,40.7588,-73.9776,B02512,0,Tuesday,4
4,2014-04-01 00:33:00,40.7594,-73.9722,B02512,0,Tuesday,4
5,2014-04-01 00:33:00,40.7383,-74.0403,B02512,0,Tuesday,4
6,2014-04-01 00:39:00,40.7223,-73.9887,B02512,0,Tuesday,4
7,2014-04-01 00:45:00,40.762,-73.979,B02512,0,Tuesday,4
8,2014-04-01 00:55:00,40.7524,-73.996,B02512,0,Tuesday,4
9,2014-04-01 01:01:00,40.7575,-73.9846,B02512,1,Tuesday,4


In [53]:
# Preview the last 10 rows, including the derived columns
ub_df.tail(10)

Unnamed: 0,Date/Time,Lat,Lon,Base,hour_of_day,day_of_week,Month
564506,2014-04-30 23:00:00,40.7316,-73.9891,B02764,23,Wednesday,4
564507,2014-04-30 23:04:00,40.7267,-73.9937,B02764,23,Wednesday,4
564508,2014-04-30 23:05:00,40.7788,-73.96,B02764,23,Wednesday,4
564509,2014-04-30 23:15:00,40.742,-74.0037,B02764,23,Wednesday,4
564510,2014-04-30 23:18:00,40.7514,-74.0066,B02764,23,Wednesday,4
564511,2014-04-30 23:22:00,40.764,-73.9744,B02764,23,Wednesday,4
564512,2014-04-30 23:26:00,40.7629,-73.9672,B02764,23,Wednesday,4
564513,2014-04-30 23:31:00,40.7443,-73.9889,B02764,23,Wednesday,4
564514,2014-04-30 23:32:00,40.6756,-73.9405,B02764,23,Wednesday,4
564515,2014-04-30 23:48:00,40.688,-73.9608,B02764,23,Wednesday,4


In [54]:
# Re-check dtypes and non-null counts now that the derived columns exist
ub_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 556767 entries, 0 to 564515
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Date/Time    556767 non-null  datetime64[ns]
 1   Lat          556767 non-null  float64       
 2   Lon          556767 non-null  float64       
 3   Base         556767 non-null  object        
 4   hour_of_day  556767 non-null  int32         
 5   day_of_week  556767 non-null  object        
 6   Month        556767 non-null  int32         
dtypes: datetime64[ns](1), float64(2), int32(2), object(2)
memory usage: 29.7+ MB


In [55]:
# Persist the cleaned frame as a CSV file, leaving out the row index
CLEANED_CSV_NAME = "uber-raw-data-apr14-cleaned.csv"
ub_df.to_csv(CLEANED_CSV_NAME, index=False)

In [56]:
# Read the cleaned dataset back from the same relative path it was written to,
# so the round trip does not depend on the working directory being /content.
# parse_dates restores the datetime dtype of Date/Time; without it the column
# is loaded from CSV as plain strings.
df = pd.read_csv("uber-raw-data-apr14-cleaned.csv", parse_dates=["Date/Time"])
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,hour_of_day,day_of_week,Month
0,2014-04-01 00:11:00,40.769,-73.9549,B02512,0,Tuesday,4
1,2014-04-01 00:17:00,40.7267,-74.0345,B02512,0,Tuesday,4
2,2014-04-01 00:21:00,40.7316,-73.9873,B02512,0,Tuesday,4
3,2014-04-01 00:28:00,40.7588,-73.9776,B02512,0,Tuesday,4
4,2014-04-01 00:33:00,40.7594,-73.9722,B02512,0,Tuesday,4
