In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math



In [None]:
# Load the trip table and the driver table from CSV files in the working directory.
ride_df, driver_df = map(pd.read_csv, ('Trip_data.csv', 'driver_data_raw.csv'))

In [None]:
# Remove the 'Unnamed: 0' column from both frames
# (presumably a row index written out by an earlier to_csv — TODO confirm).
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell re-runnable: a second execution no longer raises KeyError.
driver_df = driver_df.drop(columns='Unnamed: 0', errors='ignore')
ride_df = ride_df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371):
    """Calculate the great-circle distance between two points on a sphere.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.
        radius_km: Sphere radius in kilometers. Defaults to 6371, the
            Earth radius this function has always used.

    Returns:
        The distance along the sphere's surface, in the same unit as
        ``radius_km`` (kilometers by default).
    """
    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Haversine formula
    half_dlat = (lat2_rad - lat1_rad) / 2
    half_dlon = (lon2_rad - lon1_rad) / 2
    a = (
        math.sin(half_dlat) ** 2
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(half_dlon) ** 2
    )

    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) raise a domain
    # error. NaN inputs fail both comparisons and pass through unchanged.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return radius_km * c

In [None]:
# Add 'distance_km': the haversine distance between each trip's origin and
# destination coordinates, computed one row at a time.
ride_df['distance_km'] = ride_df.apply(
    lambda trip: haversine_distance(
        trip['Origin_latitude'],
        trip['Origin_longitude'],
        trip['Dest_latitude'],
        trip['Dest_longitude'],
    ),
    axis=1,
)


In [None]:
# Parse 'Trip_StartTime' and write it back as plain 'YYYY-MM-DD HH:MM:SS'
# strings. The format has no offset directive, so any timezone suffix is
# dropped from the text (the column holds strings after this cell).
ride_df['Trip_StartTime'] = (
    pd.to_datetime(ride_df['Trip_StartTime'])
    .dt.strftime('%Y-%m-%d %H:%M:%S')
)


In [None]:
# Order the trips by 'Trip_StartTime', ascending.
ride_df = ride_df.sort_values('Trip_StartTime', ascending=True)

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels.
ride_df = ride_df.reset_index(drop=True)

In [None]:
# Parse 'Trip_StartTime' into pandas datetimes
# (a no-op if the column is already datetime).
ride_df['Trip_StartTime'] = ride_df['Trip_StartTime'].pipe(pd.to_datetime)

In [None]:
def remove_outliers_iqr(series, q3=0.75, whisker=1.5):
    """Remove outliers from a pandas Series using interquartile-range fences.

    Args:
        series: A pandas Series of numeric values.
        q3: Quantile *level* (between 0 and 1) used as the upper quartile.
            Defaults to 0.75, the conventional third quartile; a larger
            level such as 0.90 widens the fences.
        whisker: Multiplier applied to the inter-quantile range to place
            the fences. Defaults to 1.5.

    Returns:
        A pandas Series containing only the values inside
        ``[lower - whisker * range, upper + whisker * range]`` (inclusive).
        Kept values retain their original index labels; NaN entries are
        dropped because they fail both comparisons.
    """
    # The lower quartile is fixed at 0.25; the upper one is caller-selected.
    # The `q3` argument is kept as the level and never overwritten with the
    # quantile value.
    lower_quantile = series.quantile(0.25)
    upper_quantile = series.quantile(q3)

    spread = upper_quantile - lower_quantile

    lower_bound = lower_quantile - (whisker * spread)
    upper_bound = upper_quantile + (whisker * spread)

    within_fences = (series >= lower_bound) & (series <= upper_bound)
    return series[within_fences]


In [None]:
# Filter fare outliers, using the 0.90 quantile as the upper quartile level.
# Rows rejected by the filter are missing from the returned Series, so the
# index-aligned assignment leaves NaN in 'fare_amount' for them.
ride_df['fare_amount'] = remove_outliers_iqr(series=ride_df['fare_amount'], q3=0.90)

In [None]:
# Fill missing fares with the median fare.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that form is chained
# assignment, which warns in pandas 2.x and does not update ride_df at all
# under Copy-on-Write.
ride_df['fare_amount'] = ride_df['fare_amount'].fillna(ride_df['fare_amount'].median())

In [None]:
# Overwrite any distance above 2000 km with the 75th-percentile distance.
# The percentile is taken over the column as it stands before the overwrite.
ride_df.loc[ride_df['distance_km'].gt(2000), 'distance_km'] = (
    ride_df['distance_km'].quantile(0.75)
)

In [None]:
# Trips with a distance of exactly 0 km get a fare of 0.
ride_df.loc[ride_df['distance_km'].eq(0), ['fare_amount']] = 0

In [None]:
# Trips longer than 80 km that have a non-zero fare get their fare set to 49.
# The fare condition is an explicit comparison because `&` needs a boolean
# mask on both sides: a raw numeric column there is either rejected by pandas
# (float dtype) or combined bitwise (integer dtype) rather than by truthiness.
# NOTE(review): "non-zero fare" is the assumed intent — confirm whether a
# specific fare threshold was meant here.
ride_df.loc[(ride_df.distance_km > 80) & (ride_df.fare_amount != 0), ['fare_amount']] = 49

In [None]:
# Trips shorter than 1 km but charged more than 30 have their fare replaced
# by the 25th-percentile fare (computed before the replacement).
ride_df.loc[
    ride_df['distance_km'].lt(1) & ride_df['fare_amount'].gt(30),
    ['fare_amount'],
] = ride_df['fare_amount'].quantile(0.25)

In [None]:
# Bucket driver ages into labelled ranges.
# pd.cut bins are right-closed by default, so without include_lowest=True the
# first bin would be (25, 33] and a driver aged exactly 25 would get NaN,
# despite the '25-33' label. Ages outside 25-60 still get NaN.
driver_df['Driver_Age_range'] = pd.cut(
    driver_df.Age,
    bins=[25, 33, 40, 45, 60],
    labels=['25-33', '33-40', '40-45', '45-60'],
    include_lowest=True,
)

In [None]:
# Bucket trip distance into labelled ranges. Bins are right-closed, and the
# lowest edge is -1 so that a distance of 0 lands in the first bucket.
# Distances above 127 fall outside every bin and get NaN.
ride_df['distance_km_range'] = pd.cut(
    ride_df['distance_km'],
    bins=[-1, 2.0, 6, 15, 127],
    labels=['0-2KM', '2.1-6KM', '6.1-15KM', 'Above 15KM'],
)

In [None]:
# Discard trips whose origin longitude is -91 or less. Rows with a missing
# longitude fail the `<=` test and are therefore kept.
ride_df = ride_df.loc[~ride_df['Origin_longitude'].le(-91)]

In [None]:
# Keep only trips strictly shorter than 100 km (rows with NaN distance are dropped).
ride_df = ride_df.loc[ride_df['distance_km'].lt(100)]

In [None]:
# Renumber the rows 0..n-1 after the filters, discarding the old labels.
ride_df = ride_df.reset_index(drop=True)

In [None]:
# Index labels of every trip whose distance is not exactly 0, as a plain list.
s = ride_df.index[ride_df['distance_km'].ne(0).to_numpy()].to_list()

In [None]:
# Adding column of hours range: bucket the hour of day into 3-hour windows.
# right=False makes every bin [start, end), so the edges line up with the
# labels: hour 3 falls in '03:00AM-06:00AM', hour 6 in '06:00AM-09:00AM', and
# hours 21-23 in '09:00PM-12:00AM'.
# NOTE(review): assumes ride_df has an 'hour' column holding hour-of-day
# values 0-23; it is not created in the cells shown here — confirm its source.
ride_df['hours_range'] = pd.cut(
    ride_df.hour,
    bins=[0, 3, 6, 9, 12, 15, 18, 21, 24],
    right=False,
    labels=[
        '12:00AM-03:00AM',
        '03:00AM-06:00AM',
        '06:00AM-09:00AM',
        '09:00AM-12:00PM',
        '12:00PM-03:00PM',
        '03:00PM-06:00PM',
        '06:00PM-09:00PM',
        '09:00PM-12:00AM',
    ],
)


In [None]:
# Bucket fares into four labelled tiers. Bins are right-closed, and the lowest
# edge is -1 so that a fare of 0 lands in the first tier. Fares above 500 get NaN.
ride_df['fare_amount_range'] = pd.cut(
    ride_df['fare_amount'],
    bins=[-1, 5, 20, 40, 500],
    labels=['Inexpensive', 'Affordable', 'Economical', 'Expensive'],
)

In [None]:
# Number of non-null Trip_ID values per (year, month) group, as a NumPy array
# ordered by the sorted group keys.
# NOTE(review): 'year' and 'month' columns are not created in the cells shown
# here — confirm where they come from.
s2 = (
    ride_df
    .groupby(['year', 'month'])['Trip_ID']
    .count()
    .values
)

In [None]:
# Total fare collected by each driver, attached to driver_df by Driver_ID.
# The per-driver sums are looked up by key with .map rather than assigned
# positionally: a groupby result is ordered by sorted Driver_ID and has one
# row per driver that has trips, so positional assignment only lines up when
# driver_df is in exactly that order and every driver has at least one trip.
# Drivers with no trips in ride_df get a revenue of 0; trips whose Driver_ID
# is absent from driver_df are ignored.
revenue_by_driver = ride_df.groupby('Driver_ID')['fare_amount'].sum()
driver_df['REVENUE_GENERATED_BY_DRIVER'] = (
    driver_df['Driver_ID'].map(revenue_by_driver).fillna(0)
)

In [None]:
# Label each driver's revenue tier. The bin edges are -1, the 18th, 85th and
# 95th percentiles of driver revenue, and the maximum revenue (right-closed bins).
driver_df['REVENUE_GENERATED_BY_DRIVER_GROUP'] = pd.cut(
    driver_df['REVENUE_GENERATED_BY_DRIVER'],
    bins=[
        -1,
        *(
            driver_df['REVENUE_GENERATED_BY_DRIVER'].quantile(level)
            for level in (0.18, 0.85, 0.95)
        ),
        driver_df['REVENUE_GENERATED_BY_DRIVER'].max(),
    ],
    labels=['LOW REVENUE', 'MEDIUM REVENUE', 'HIGH REVENUE', 'VERY HIGH REVENUE'],
)

In [None]:
# Write the enriched driver table to CSV; index=False leaves the row index out of the file.
driver_df.to_csv(path_or_buf='cdg_demo_driver_data_nb.csv', index=False)