## Exploratory Data Analysis

In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings("ignore")

df = pd.read_csv('/mnt/c/wo_pessoal/uber_assessment/data_original/latam_aa_train_data_mlops.csv', low_memory=False)
print('\n\033[1mInference:\033[0m The Datset consists of {} features & {} samples.'.format(df.shape[1], df.shape[0]))
df.sample(5)


[1mInference:[0m The Datset consists of 18 features & 1500000 samples.


Unnamed: 0,uuid,pickup_ts,pick_lat,pick_lng,dropoff_ts,dropoff_lat,dropoff_lng,eta,ata,trip_distance,trip_duration,is_airport,pickup_airport_code,dropoff_airport_code,is_surged,surge_multiplier,driver_rating,lifetime_trips
971102,8b3f93c1-d7ff-4dc1-b788-03529264d4ad,2022-12-26 16:27:25,-23.64752,-46.78896,2022-12-26 16:33:31,-23.653177,-46.78805,153.0,171.0,1.060825,366,False,,,True,1.3,4.99,1858
651013,e1b05fa3-9a4c-477a-b1bc-f986641500e5,2023-03-01 09:15:04,-23.461946,-46.3495,2023-03-01 09:37:42,-23.491135,-46.352367,169.0,230.0,5.254688,1358,False,,,False,1.0,4.95,2178
1335503,bc15add4-08d3-485c-92bc-bdd820f3fa54,2023-03-03 23:00:41,-23.425392,-46.48116,2023-03-03 23:53:47,-23.663527,-46.68457,93.0,\N,43.42047,3186,True,GRU,,False,1.0,4.99,7886
613540,42cbad56-9d47-45ea-857e-bb418c26c136,2022-12-21 12:38:35,-23.559381,-46.200195,2022-12-21 12:49:51,-23.53248,-46.22585,79.0,73.0,4.542861,676,False,,,False,1.0,4.96,11017
716181,8a6514d3-06e2-4f57-83b3-4546f34b109d,2022-12-29 07:07:38,-23.702808,-46.62199,2022-12-29 07:23:12,-23.712301,-46.59458,192.0,203.0,5.337109,934,False,,,False,1.0,4.97,8103


In [2]:
# Checking for duplicated uuids
df['uuid'].value_counts()

uuid
402dbb9c-ad2e-4771-8ebd-d09d896f2301    1
227bdcb4-464b-41eb-8f6a-72f2216b75f3    1
efb20913-cca4-4b68-9496-5fc531c26118    1
a69eefe7-192c-428e-a5c8-792a6d38e5db    1
f8943099-577c-48e8-8a21-4964f6c7d256    1
                                       ..
f1739782-6ccb-4d80-a88d-fdd4af7807e9    1
9f8ad376-877f-4c75-8263-72a9aa02c0fa    1
7d858b04-9ee6-4953-9ac0-178e75551fe7    1
92047189-a07c-4f57-8d2f-1c17de393671    1
068eeda2-7666-4761-9937-8ae276e14a3b    1
Name: count, Length: 1500000, dtype: int64

In [3]:
# Convert `pickup_ts` column to datetime
df['pickup_ts'] = pd.to_datetime(df['pickup_ts'])
df['pickup_hour'] = df['pickup_ts'].dt.hour
df['pickup_day_of_the_week'] = df['pickup_ts'].dt.day_name()
df['pickup_month'] = df['pickup_ts'].dt.month

# pickup_hour into categories (e.g., morning, afternoon, evening, night)
def categorize_hour(hour):
    if 5 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 17:
        return 'afternoon'
    elif 17 <= hour < 21:
        return 'evening'
    else:
        return 'night'
df['pickup_hour_cat'] = df['pickup_hour'].apply(categorize_hour)

df[['pickup_ts', 'pickup_hour', 'pickup_day_of_the_week', 'pickup_month', 'pickup_hour_cat']].sample(5)

Unnamed: 0,pickup_ts,pickup_hour,pickup_day_of_the_week,pickup_month,pickup_hour_cat
206397,2023-02-04 16:16:26,16,Saturday,2,afternoon
1462622,2022-12-06 12:10:37,12,Tuesday,12,afternoon
898672,2023-02-07 08:20:49,8,Tuesday,2,morning
17499,2023-03-03 20:48:33,20,Friday,3,evening
1252141,2023-03-07 17:01:42,17,Tuesday,3,evening


In [4]:
contagem_df = df['pickup_airport_code'].value_counts(dropna=False).reset_index()
contagem_df.columns = ['pickup_airport_code', 'count']
contagem_df

Unnamed: 0,pickup_airport_code,count
0,,1487834
1,GRU,7947
2,CGH,4219


In [5]:
contagem_df = df['dropoff_airport_code'].value_counts(dropna=False).reset_index()
contagem_df.columns = ['dropoff_airport_code', 'count']
contagem_df

Unnamed: 0,dropoff_airport_code,count
0,,1496130
1,GRU,2509
2,CGH,1331
3,VCP,29
4,SOD,1


In [6]:
# Fill empty values ​​of `pickup_airport_code` and `dropoff_airport_code`
df['pickup_airport_code'] = df['pickup_airport_code'].fillna('Unknown').astype('category')
df['dropoff_airport_code'] = df['dropoff_airport_code'].fillna('Unknown').astype('category')

contagem_df = df['pickup_airport_code'].value_counts(dropna=False).reset_index()
contagem_df.columns = ['pickup_airport_code', 'count']
contagem_df

Unnamed: 0,pickup_airport_code,count
0,Unknown,1487834
1,GRU,7947
2,CGH,4219


In [7]:
# Filtering the DataFrame to ensure latitude and longitude coordinates are within valid limits
# Latitude must be between -90 and 90 degrees, and Longitude between -180 and 180 degrees
df = df[(df.pick_lat<90) & (df.dropoff_lat<90) &
        (df.pick_lat>-90) & (df.dropoff_lat>-90) &
        (df.pick_lng<180) & (df.dropoff_lng<180) &
        (df.pick_lng>-180) & (df.dropoff_lng>-180)]

In [10]:
df[df['driver_rating'] == '\\N']['driver_rating'].value_counts()

driver_rating
\N    4
Name: count, dtype: int64

In [11]:
df = df[df['driver_rating'] != '\\N']
df['driver_rating'] = df['driver_rating'].astype(float)
df['driver_rating'].value_counts()

driver_rating
4.99    285089
4.98    274748
4.97    220918
5.00    158805
4.96    150551
4.95    105008
4.94     71835
4.93     60236
4.92     46228
4.91     24716
4.90     24001
4.89     13386
4.88     13066
4.87     10652
4.85      9007
4.86      7289
4.80      4067
4.83      3999
4.84      3356
4.82      2992
4.78      1611
4.81      1556
4.73      1512
4.79      1416
4.75       941
4.71       687
4.74       552
4.77       528
4.76       447
4.56       271
4.67       173
4.70        73
4.64        44
4.68        43
4.46        42
4.65        25
4.66        25
4.72        24
4.51        22
4.50        20
4.62        15
4.00        10
4.47         7
4.69         2
4.25         1
Name: count, dtype: int64

#### Weekends
- **Column Added**: `is_weekend`
- **Explanation**: This column is based on `pickup_day_of_week`. Trips on weekends tend to face different traffic conditions compared to weekdays, which may significantly impact trip durations.

#### Outside São Paulo
- **Columns Added**: `pickup_outside_sp`, `dropoff_outside_sp`
- **Explanation**: These columns are based on the geographical boundaries of São Paulo, defined by specific latitude and longitude limits. The function checks whether the pickup or dropoff location is outside São Paulo. This is useful because trips that start or end outside the city may face different road conditions, potentially impacting trip durations.
  - **Latitude Range**: -23.68 (South) to -23.35 (North)
  - **Longitude Range**: -46.83 (West) to -46.40 (East)
- **Why it's useful**: Identifying trips that begin or end outside São Paulo is important for understanding different traffic patterns and road conditions, which can vary significantly from those within the city. These trips might be longer and subject to different regulations or transportation policies, which can affect trip duration predictions.

#### Holidays
- **Column Added**: `is_holiday`
- **Source**: https://portal.inmet.gov.br/dadoshistoricos#:~:text=Instituto%20Nacional%20de%20Meteorologia%20%2D%20INMET
- **Potential Enhancement**: Considering adding a column for **holidays on Fridays or Mondays**. These extended weekends often lead to traffic congestion, and thus, such holidays could have a greater impact on trip duration predictions.

#### Rush Hour
- **Column Added**: `is_rush_hour`
- **Source**: [Companhia de Engenharia de Tráfego - CET São Paulo](https://www.cetsp.com.br/)
- **Explanation**: This column flags trips that occur during peak traffic times in São Paulo, specifically during **morning rush hour** (6 AM - 9 AM) and **evening rush hour** (5 PM - 8 PM). These periods generally have heavier traffic due to commuters, which can significantly impact trip duration.
- **Why it's useful**: Identifying trips that occur during rush hour is crucial because traffic congestion is typically at its highest during these times. Trips taken during rush hour are more likely to be delayed or experience longer travel times due to increased traffic volume. This flag can help improve model accuracy by considering the impact of traffic congestion during these critical periods.


In [8]:
df['is_weekend'] = df['pickup_day_of_the_week'].apply(lambda x: 1 if x in ['Saturday', 'Sunday'] else 0)

# Limites geográficos de São Paulo
LAT_MIN = -23.68
LAT_MAX = -23.35
LNG_MIN = -46.83
LNG_MAX = -46.40

# Função para verificar se a coordenada está dentro dos limites de São Paulo
def is_outside_sao_paulo(lat, lng):
    if lat < LAT_MIN or lat > LAT_MAX or lng < LNG_MIN or lng > LNG_MAX:
        return 1  # Fora de São Paulo
    return 0  # Dentro de São Paulo

# Aplicar a função para criar colunas de flags
df['pickup_outside_sp'] = df.apply(lambda row: is_outside_sao_paulo(row['pick_lat'], row['pick_lng']), axis=1)
df['dropoff_outside_sp'] = df.apply(lambda row: is_outside_sao_paulo(row['dropoff_lat'], row['dropoff_lng']), axis=1)

holidays_df = pd.read_excel('/mnt/c/wo_pessoal/uber_assessment/data_extra/feriados_cidade_de_sao_paulo_2022-2023.xlsx')
holidays_df['date'] = pd.to_datetime(holidays_df['Data'])
df['pickup_date'] = pd.to_datetime(df['pickup_ts']).dt.date
df['is_holiday'] = df['pickup_date'].isin(holidays_df['date'].dt.date).astype(int)

def is_morning_rush(hour):
    return 1 if 6 <= hour <= 9 else 0

def is_evening_rush(hour):
    return 1 if 16 <= hour <= 19 else 0

# Step 4: Apply the new features
df['is_morning_rush_hour'] = df['pickup_hour'].apply(is_morning_rush)
df['is_evening_rush_hour'] = df['pickup_hour'].apply(is_evening_rush)

# Step 5: (Optional) Keep original rush hour feature if needed
def is_rush_hour(hour):
    return 1 if (6 <= hour <= 9) or (17 <= hour <= 20) else 0
df['is_rush_hour'] = df['pickup_hour'].apply(is_rush_hour)

df[['pickup_ts', 'is_weekend', 'pickup_outside_sp', 'dropoff_outside_sp', 'is_morning_rush_hour', 'is_evening_rush_hour', 'is_rush_hour']].sample(5)

Unnamed: 0,pickup_ts,is_weekend,pickup_outside_sp,dropoff_outside_sp,is_morning_rush_hour,is_evening_rush_hour,is_rush_hour
1136853,2023-01-29 20:05:19,1,0,0,0,0,1
1051615,2022-12-25 17:02:16,1,0,1,0,1,1
216015,2022-12-22 15:54:29,0,1,1,0,0,0
447984,2023-01-28 09:44:28,1,0,0,1,0,1
1201091,2023-02-06 16:17:11,0,1,1,0,1,0
