In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

In [2]:
cus_df=pd.read_csv('cust_data.csv')
ride_df=pd.read_csv('trip_data.csv')
driver_df=pd.read_csv('driver_data.csv')

In [3]:
cus_df.drop('Unnamed: 0',axis=1,inplace=True)
driver_df.drop('Unnamed: 0',axis=1,inplace=True)
ride_df.drop('Unnamed: 0',axis=1,inplace=True)

In [4]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Calculate the  distance between two points on a Earth."""


    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Haversine formula
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon/2) * math.sin(dlon/2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))

    # Radius of the Earth in kilometers 
    r = 6371

    # Calculate the distance
    distance = r * c
    return distance

In [5]:
#  DataFrame named 'ride_df' with columns 'latitude' and 'longitude'
ride_df['distance_km'] = ride_df.apply(lambda row: haversine_distance(row['Origin_latitude'], row['Origin_longitude'], row['Dest_latitude'], row['Dest_longitude']), axis=1)


In [6]:
# Convert the 'datetime_column' to datetime format
ride_df['Trip_StartTime'] = pd.to_datetime(ride_df['Trip_StartTime'])

# Create new columns for date and time (without UTC)
ride_df['Trip_StartTime'] = ride_df['Trip_StartTime'].dt.strftime('%Y-%m-%d %H:%M:%S')


In [7]:
ride_df=ride_df.sort_values(by='Trip_StartTime')

In [8]:
ride_df=ride_df.reset_index(drop=True)

In [9]:
# Convert 'pickup_datetime' to datetime format (skip this step if already in datetime format)
ride_df['Trip_StartTime'] = pd.to_datetime(ride_df['Trip_StartTime'])

In [10]:
def remove_outliers_iqr(series,q3):
    """Removes outliers from a pandas Series using the interquartile range (IQR) method.

    Args:
        series: A pandas Series.
        q3 : 75 percentile.

    Returns:
        A pandas Series with outliers removed.
    """
    # Calculate the first and third quartiles (Q1 and Q3) of the Series
    q1 = series.quantile(0.25)
    q3 = series.quantile(q3)
    
    # Calculate the interquartile range (IQR)
    iqr = q3 - q1
    
    # Define the lower and upper bounds for outliers
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    
    # Remove values outside the lower and upper bounds
    series_outliers_removed = series[(series >= lower_bound) & (series <= upper_bound)]
    
    
    return series_outliers_removed


In [11]:
ride_df['fare_amount']=remove_outliers_iqr(ride_df['fare_amount'],0.90)

In [12]:
# fill nan by median
ride_df['fare_amount'].fillna(ride_df['fare_amount'].median(),inplace=True)

In [13]:
ride_df.loc[ride_df['distance_km']>2000,'distance_km']=ride_df['distance_km'].quantile(0.75)

In [14]:
ride_df.loc[ride_df.distance_km==0,['fare_amount']]=0

In [15]:
ride_df.loc[(ride_df.distance_km >80) & (ride_df.fare_amount ),['fare_amount']]=49

In [16]:
ride_df.loc[(ride_df.distance_km < 1) & (ride_df.fare_amount >30 ),['fare_amount']] = ride_df.fare_amount.quantile(0.25)

In [17]:
driver_df['Driver_Age_range']=pd.cut(driver_df.Age,bins=[26,33,40,45,60],labels=['Young','Adult','Elder','Old age'])

In [18]:
ride_df['distance_km_range']=pd.cut(ride_df.distance_km,bins=[-1,2,4,8,127],labels=['very_short','short','long','very_long'])

In [19]:
cus_df['Cust_Age_range']=pd.cut(cus_df.Age,bins=[20,30,40,45,60],labels=['Young','Adult','Elder','Old age'])

In [21]:
ride_df=ride_df.loc[~(ride_df.Origin_longitude <= -91)]

In [22]:
driver_df['Driver_Age_range'].value_counts()

Old age    17
Young      15
Adult      11
Elder       6
Name: Driver_Age_range, dtype: int64

In [23]:
ride_df=ride_df.loc[ride_df['distance_km'] < 100]

In [24]:
ride_df.reset_index(drop=True,inplace=True)

In [25]:
s=ride_df.loc[ride_df.distance_km!=0].index.to_list()


In [26]:
for i in ride_df.iloc[s].index:
    ride_df.iloc[i]['distance_km'] = np.random.randint(0,3) + ride_df.iloc[i]['distance_km']
    



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ride_df.iloc[i]['distance_km'] = np.random.randint(0,3) + ride_df.iloc[i]['distance_km']


In [28]:
cancelled_trip_df=ride_df[ride_df.Trip_cancellation==1]

In [29]:
cancelled_trip_df=cancelled_trip_df[['Trip_ID','Trip_StartTime','Origin_longitude','Origin_latitude','Dest_longitude','Dest_latitude','Customer_ID','Driver_ID','driver_rated_for_trip','customer_rated_for_trip','Trip_cancellation','Cancelled_by','Reason','year','month']]

In [40]:
# cancelled_trip_df.to_csv('cancelled_data.csv',index=False)