# UBER Pickups 
### PREPROCESSING

In [1]:
# Importing useful libraries

import pandas as pd
from datetime import datetime

### GETTING ACQUAINTED WITH THE DATASET

In [8]:
# Load the raw pickup records from the CSV located in the working directory
raw_csv_path = "uber-raw-data-sep14.csv"
dataset = pd.read_csv(raw_csv_path)

In [9]:
# Preview the first rows to see which columns were loaded
dataset.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,9/1/2014 0:01:00,40.2201,-74.0021,B02512
1,9/1/2014 0:01:00,40.75,-74.0027,B02512
2,9/1/2014 0:03:00,40.7559,-73.9864,B02512
3,9/1/2014 0:06:00,40.745,-73.9889,B02512
4,9/1/2014 0:11:00,40.8145,-73.9444,B02512


In [10]:
# Summary statistics for every column, non-numeric ones included

dataset.describe(include="all")

Unnamed: 0,Date/Time,Lat,Lon,Base
count,1028136,1028136.0,1028136.0,1028136
unique,42907,,,5
top,9/13/2014 18:44:00,,,B02617
freq,82,,,377695
mean,,40.73922,-73.97182,
std,,0.04082861,0.05831413,
min,,39.9897,-74.7736,
25%,,40.7204,-73.9962,
50%,,40.7418,-73.9831,
75%,,40.7612,-73.9628,


In [11]:
# Drop the 'Base' column, which is not used in the rest of the analysis.
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" keeps the cell re-runnable: running it a second time,
# once 'Base' is already gone, no longer raises a KeyError.

dataset = dataset.drop(columns="Base", errors="ignore")
dataset.head()

Unnamed: 0,Date/Time,Lat,Lon
0,9/1/2014 0:01:00,40.2201,-74.0021
1,9/1/2014 0:01:00,40.75,-74.0027
2,9/1/2014 0:03:00,40.7559,-73.9864
3,9/1/2014 0:06:00,40.745,-73.9889
4,9/1/2014 0:11:00,40.8145,-73.9444


In [12]:
# Inspect column dtypes, non-null counts and memory usage
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028136 entries, 0 to 1028135
Data columns (total 3 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   Date/Time  1028136 non-null  object 
 1   Lat        1028136 non-null  float64
 2   Lon        1028136 non-null  float64
dtypes: float64(2), object(1)
memory usage: 23.5+ MB


In [13]:
# Parse 'Date/Time' into a datetime column, then derive day of month, hour,
# week number and day of week from it
dataset['Date/Time'] = pd.to_datetime(dataset['Date/Time'], format = "%m/%d/%Y %H:%M:%S")
dataset["day"] = dataset['Date/Time'].dt.day
dataset["hour"] = dataset['Date/Time'].dt.hour
# Series.dt.weekofyear is deprecated (it emits a FutureWarning) and was removed
# in pandas 2.0; isocalendar().week returns the same ISO week number.
# The result is cast to int64 to keep the integer dtype of the old accessor.
dataset["weekofyear"] = dataset['Date/Time'].dt.isocalendar().week.astype("int64")
dataset["dayofweek"] = dataset['Date/Time'].dt.dayofweek
dataset.head()

  dataset["weekofyear"] = dataset['Date/Time'].dt.weekofyear


Unnamed: 0,Date/Time,Lat,Lon,day,hour,weekofyear,dayofweek
0,2014-09-01 00:01:00,40.2201,-74.0021,1,0,36,0
1,2014-09-01 00:01:00,40.75,-74.0027,1,0,36,0
2,2014-09-01 00:03:00,40.7559,-73.9864,1,0,36,0
3,2014-09-01 00:06:00,40.745,-73.9889,1,0,36,0
4,2014-09-01 00:11:00,40.8145,-73.9444,1,0,36,0


### SELECTING ONE PARTICULAR DAY AND ONE PARTICULAR HOUR FOR THE ANALYSIS

In [14]:
# Restrict the data to pickups made on day 1 of the month during the 18:00 hour (6pm)

mask = dataset["day"].eq(1) & dataset["hour"].eq(18)
dataset_day_hour = dataset.loc[mask].reset_index(drop=True)
dataset_day_hour.head()

Unnamed: 0,Date/Time,Lat,Lon,day,hour,weekofyear,dayofweek
0,2014-09-01 18:03:00,40.721,-73.9977,1,18,36,0
1,2014-09-01 18:03:00,40.7641,-73.967,1,18,36,0
2,2014-09-01 18:04:00,40.7469,-73.996,1,18,36,0
3,2014-09-01 18:05:00,40.6864,-73.9686,1,18,36,0
4,2014-09-01 18:07:00,40.7451,-73.9768,1,18,36,0


In [15]:
# Only the pickup coordinates are studied here, so every time-related column is removed
to_drop = ["Date/Time", "day", "hour", "weekofyear", "dayofweek"]
dataset_day_hour = dataset_day_hour.drop(columns=to_drop)
dataset_day_hour.head()

Unnamed: 0,Lat,Lon
0,40.721,-73.9977
1,40.7641,-73.967
2,40.7469,-73.996
3,40.6864,-73.9686
4,40.7451,-73.9768


In [16]:
# Write the filtered coordinates to a new CSV (row index omitted) to be used for clustering
dataset_day_hour.to_csv(path_or_buf='uber_pickups_1_09_6PM.csv', index=False)

### SELECTING A WEEK FOR THE ANALYSIS

In [17]:
# Restrict the data to pickups whose week number is 36

mask = dataset["weekofyear"].eq(36)
dataset_week = dataset.loc[mask].reset_index(drop=True)
dataset_week.head()

Unnamed: 0,Date/Time,Lat,Lon,day,hour,weekofyear,dayofweek
0,2014-09-01 00:01:00,40.2201,-74.0021,1,0,36,0
1,2014-09-01 00:01:00,40.75,-74.0027,1,0,36,0
2,2014-09-01 00:03:00,40.7559,-73.9864,1,0,36,0
3,2014-09-01 00:06:00,40.745,-73.9889,1,0,36,0
4,2014-09-01 00:11:00,40.8145,-73.9444,1,0,36,0


In [18]:
# Remove the time columns that are no longer needed; 'dayofweek' stays alongside the coordinates
to_drop = ["Date/Time", "day", "hour", "weekofyear"]
dataset_week = dataset_week.drop(columns=to_drop)
dataset_week.head()

Unnamed: 0,Lat,Lon,dayofweek
0,40.2201,-74.0021,0
1,40.75,-74.0027,0
2,40.7559,-73.9864,0
3,40.745,-73.9889,0
4,40.8145,-73.9444,0


In [117]:
# Write the week-36 subset to a new CSV (row index omitted) to be used for clustering
dataset_week.to_csv(path_or_buf='uber_pickups_week36.csv', index=False)