In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Read the dataset

uberDS = pd.read_csv('data/uber.csv');

# Counting number of duplicated rows are present in dataset

print("No. of rows before cleaning the dataset:",len(uberDS));

uberDupCount = uberDS.duplicated().sum();
print("No. of duplicated rows:",uberDupCount);


# Droping duplicated rows from dataset

uberCDS = uberDS.drop_duplicates();
print("No. of rows after cleaning the dataset:",len(uberCDS));

No. of rows before cleaning the dataset: 150000
No. of duplicated rows: 0
No. of rows after cleaning the dataset: 150000


In [3]:
# Detecting and handling the missing values using atleast two imputating stratergies

#  fill the null values: mean => numerical column
# Taking Driver Ratings as a numerical column

print(f"No. of null values in Driver Ratings(Before):",uberDS['Driver Ratings'].isnull().sum());
uberDS['Driver Ratings'] = uberDS['Driver Ratings'].fillna(uberDS['Driver Ratings'].mean());
print(f"No. of null values in Driver Ratings(After):",uberDS['Driver Ratings'].isnull().sum());

#  fill the null values: mode => categorical columns
# Taking Payment Method as a categorical column
print(f"No. of null values in Payment Method(Before):",uberDS['Payment Method'].isnull().sum());
uberDS['Payment Method'] = uberDS['Payment Method'].fillna(uberDS['Payment Method'].mode()[0]);
print(f"No. of null values in Payment Method(After):",uberDS['Payment Method'].isnull().sum());

No. of null values in Driver Ratings(Before): 57000
No. of null values in Driver Ratings(After): 0
No. of null values in Payment Method(Before): 48000
No. of null values in Payment Method(After): 0


In [4]:
# using label encoder, OneHotEndcoder for categorival values
# Booking Status, Vehicle Type


le = LabelEncoder();

# Display the unique values and their counts before encoding (assuming uberDS was loaded and not yet encoded)

print("Unique values and counts in 'Vehicle Type' BEFORE Label Encoding:");
print(uberDS['Vehicle Type'].value_counts());

uberDS['Vehicle Type'] = le.fit_transform(uberDS['Vehicle Type']);

# Now, display the first few rows of the DataFrame to see the encoded values

print("\nFirst few rows of 'Vehicle Type' AFTER Label Encoding:");
display(uberDS[['Vehicle Type']].head());

# To see the unique encoded values and their counts

print("\nUnique values and counts in 'Vehicle Type' AFTER Label Encoding:");
print(uberDS['Vehicle Type'].value_counts());

Unique values and counts in 'Vehicle Type' BEFORE Label Encoding:
Vehicle Type
Auto             37419
Go Mini          29806
Go Sedan         27141
Bike             22517
Premier Sedan    18111
eBike            10557
Uber XL           4449
Name: count, dtype: int64

First few rows of 'Vehicle Type' AFTER Label Encoding:


Unnamed: 0,Vehicle Type
0,6
1,3
2,0
3,4
4,1



Unique values and counts in 'Vehicle Type' AFTER Label Encoding:
Vehicle Type
0    37419
2    29806
3    27141
1    22517
4    18111
6    10557
5     4449
Name: count, dtype: int64


In [5]:
# Perform One-Hot Encoding
# pd.get_dummies creates new columns and drops the original 'Payment Method' column by default

uberDS = pd.get_dummies(uberDS, columns=['Payment Method'], drop_first=True);

# After one-hot encoding, the original 'Payment Method' column no longer exists.
# You can check the new columns created by get_dummies instead.

print("\nDataFrame columns after OneHotEncoding:");
print(uberDS.columns)

# dropping unnecessary columns

uberDS = uberDS.drop(["Payment Method_Credit Card", "Payment Method_Debit Card"], axis=1, errors='ignore');
print(uberDS.head())


DataFrame columns after OneHotEncoding:
Index(['Date', 'Time', 'Booking ID', 'Booking Status', 'Customer ID',
       'Vehicle Type', 'Pickup Location', 'Drop Location', 'Avg VTAT',
       'Avg CTAT', 'Cancelled Rides by Customer',
       'Reason for cancelling by Customer', 'Cancelled Rides by Driver',
       'Driver Cancellation Reason', 'Incomplete Rides',
       'Incomplete Rides Reason', 'Booking Value', 'Ride Distance',
       'Driver Ratings', 'Customer Rating', 'Payment Method_Credit Card',
       'Payment Method_Debit Card', 'Payment Method_UPI',
       'Payment Method_Uber Wallet'],
      dtype='object')
         Date      Time    Booking ID   Booking Status   Customer ID  \
0  2024-03-23  12:29:38  "CNR5884300"  No Driver Found  "CID1982111"   
1  2024-11-29  18:01:39  "CNR1326809"       Incomplete  "CID4604802"   
2  2024-08-23  08:56:10  "CNR8494506"        Completed  "CID9202816"   
3  2024-10-21  17:17:25  "CNR8906825"        Completed  "CID2610914"   
4  2024-09-16  22:

In [6]:
#  Standard scaling (z-scores) and normalization (Min-Max sclaing) to numeric features

numeric_cols = uberDS.select_dtypes(include=['int64', 'float64']).columns

# Standardizing

scaler_standard = StandardScaler()
dsStandard = uberDS.copy()
dsStandard[numeric_cols] = scaler_standard.fit_transform(uberDS[numeric_cols]);
print(dsStandard[numeric_cols].head())

# normalizing

scaler_minmax = MinMaxScaler()
numeric_cols = uberDS.select_dtypes(include=['int64','float64']).columns
uberDS_minmax = uberDS.copy()
uberDS_minmax[numeric_cols] = scaler_minmax.fit_transform(uberDS[numeric_cols])
print(uberDS_minmax[numeric_cols].head())

   Vehicle Type  Avg VTAT  Avg CTAT  Cancelled Rides by Customer  \
0      2.160985       NaN       NaN                          NaN   
1      0.479770 -0.942442 -1.701722                          NaN   
2     -1.201444  1.310079 -0.376257                          NaN   
3      1.040175  1.230578 -0.072972                          NaN   
4     -0.641040 -0.836441 -1.072687                          NaN   

   Cancelled Rides by Driver  Incomplete Rides  Booking Value  Ride Distance  \
0                        NaN               NaN            NaN            NaN   
1                        NaN               0.0      -0.685430      -1.350301   
2                        NaN               NaN       0.299906      -0.789670   
3                        NaN               NaN      -0.233186       0.670114   
4                        NaN               NaN       0.577822       1.683536   

   Driver Ratings  Customer Rating  
0   -2.581981e-15              NaN  
1   -2.581981e-15              NaN  

In [7]:
# detecting the outliers using the IQR method

feature = 'Customer Rating'
Q1 = uberDS[feature].quantile(0.25)
Q3 = uberDS[feature].quantile(0.75)
IQR = Q3 - Q1
lowerBound = Q1 - 1.5 * IQR
upperBound = Q3 + 1.5 * IQR
outliers = uberDS[(uberDS[feature] < lowerBound) | (uberDS[feature] > upperBound)]
uberDS_removed_outliers = uberDS[(uberDS[feature] >= lowerBound) & (uberDS[feature] <= upperBound)]
print("shape before outliers removed",uberDS.shape);
print("shape after outliers removed",uberDS_removed_outliers.shape);

shape before outliers removed (150000, 22)
shape after outliers removed (89743, 22)


In [8]:
# Outliers in fare and distances

Q11 = uberDS['Ride Distance'].quantile(0.25)
Q12 = uberDS['Ride Distance'].quantile(0.75)

IQR = Q12 - Q11

lowerBound = Q11 - 1.5 * IQR
upperBound = Q12 + 1.5 * IQR

outlierss = uberDS[(uberDS['Ride Distance'] < lowerBound) | (uberDS['Ride Distance'] > upperBound)]
uberDS_removed_outlierss = uberDS[(uberDS['Ride Distance'] >= lowerBound) & (uberDS['Ride Distance'] <= upperBound)]
print("shape before outliers removed",uberDS.shape);
print("shape after outliers removed",uberDS_removed_outlierss.shape);

shape before outliers removed (150000, 22)
shape after outliers removed (102000, 22)
