Importing the needed libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import missingno as msno
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , mean_absolute_error
from sklearn.metrics import r2_score
warnings.filterwarnings('ignore')


Preparing the dataset & printing basic information

In [68]:
data_file = pd.read_csv("final_internship_data.csv")
print("Shape: ", data_file.shape); print("\n")
print("First rows:\n", data_file.head()); print("\n")
print("Data types:\n", data_file.dtypes); print("\n")
print("Duplicated rows:\n", data_file.duplicated().sum()); print("\n")

data_file.columns = [col.strip().lower().replace(" ", "_") for col in data_file.columns]

Shape:  (500000, 26)


First rows:
     User ID          User Name       Driver Name Car Condition Weather  \
0  KHVrEVlD     Kimberly Adams        Amy Butler     Very Good   windy   
1  lPxIuEri       Justin Tapia  Hannah Zimmerman     Excellent  cloudy   
2  gsVN8JLS    Elizabeth Lopez    Amanda Jackson           Bad  stormy   
3  9I7kWFgd      Steven Wilson          Amy Horn     Very Good  stormy   
4  8QN5ZaGN  Alexander Andrews  Cassandra Larson           Bad  stormy   

   Traffic Condition                            key  fare_amount  \
0  Congested Traffic    2009-06-15 17:26:21.0000001          4.5   
1       Flow Traffic    2010-01-05 16:52:16.0000002         16.9   
2  Congested Traffic   2011-08-18 00:35:00.00000049          5.7   
3       Flow Traffic    2012-04-21 04:30:42.0000001          7.7   
4  Congested Traffic  2010-03-09 07:51:00.000000135          5.3   

       pickup_datetime  pickup_longitude  ...  month  weekday  year  \
0  2009-06-15 17:26:21         -1.28882

Checking for any missing or duplicated values

In [69]:
#checking for missing values
missing_part = data_file.isnull().mean() * 100
print("Missing values (%):\n", missing_part)

#dropping duplicates
data_file.drop_duplicates(inplace=True)
print("The remaining columns: ", data_file.columns.tolist())

Missing values (%):
 user_id              0.000
user_name            0.000
driver_name          0.000
car_condition        0.000
weather              0.000
traffic_condition    0.000
key                  0.000
fare_amount          0.000
pickup_datetime      0.000
pickup_longitude     0.000
pickup_latitude      0.000
dropoff_longitude    0.001
dropoff_latitude     0.001
passenger_count      0.000
hour                 0.000
day                  0.000
month                0.000
weekday              0.000
year                 0.000
jfk_dist             0.001
ewr_dist             0.001
lga_dist             0.001
sol_dist             0.001
nyc_dist             0.001
distance             0.001
bearing              0.001
dtype: float64
The remaining columns:  ['user_id', 'user_name', 'driver_name', 'car_condition', 'weather', 'traffic_condition', 'key', 'fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'hour', '

Feature engineering

In [70]:
#fature Engineering - datetime features
if 'pickup_datetime' in data_file.columns:
    data_file['pickup_datetime'] = pd.to_datetime(data_file['pickup_datetime'])
    data_file['hour'] = data_file['pickup_datetime'].dt.hour
    data_file['day'] = data_file['pickup_datetime'].dt.day
    data_file['month'] = data_file['pickup_datetime'].dt.month
    data_file['weekday'] = data_file['pickup_datetime'].dt.weekday
    data_file['year'] = data_file['pickup_datetime'].dt.year


Creating some new features

In [71]:
#5 = saturday, 6 = sunday
data_file['is_weekend'] = data_file['weekday'].isin([5,6])

#mornings and evenings
data_file['rsuh_hours'] = data_file['hour'].between(7 , 10) | data_file['hour'].between(16,19)

Data cleansing Part

In [72]:
#--> filling the numerical missing values with median method
numer_col = data_file.select_dtypes(include=['int64','float64']).columns
for column in numer_col:
    data_file[column].fillna(data_file[column].median(), inplace=True)

#categorical with mode
cate_col = data_file.select_dtypes(include='object').columns
for col in cate_col:
    data_file[col].fillna(data_file[col].mode()[0], inplace= True)

#fixing the inconsistent values
if 'meal' in data_file.columns:
    data_file['meal'] = data_file['meal'].replace('Undefined', 'SC')

#handling outliers with IQR

def IQR_Function(data_file, columns):
    for col in columns:
        Quart_1 = data_file[col].quantile(0.25)
        Quart_3 = data_file[col].quantile(0.75)
        IQR = Quart_3 - Quart_1
        lower_b = Quart_1 - 1.5 * IQR
        upper_b = Quart_3 + 1.5 * IQR
        outliers = data_file[(data_file[col] < lower_b) | data_file[col] > upper_b]
        print(f"{col}: {len(outliers)} outliers have been removed")
        data_file = data_file[(data_file[col] >= lower_b) & (data_file[col]<=upper_b)]
    return data_file



data_file = IQR_Function(data_file, numer_col)

fare_amount: 0 outliers have been removed
pickup_longitude: 456669 outliers have been removed
pickup_latitude: 439229 outliers have been removed
dropoff_longitude: 432451 outliers have been removed
dropoff_latitude: 424984 outliers have been removed
passenger_count: 0 outliers have been removed
jfk_dist: 0 outliers have been removed
ewr_dist: 0 outliers have been removed
lga_dist: 0 outliers have been removed
sol_dist: 0 outliers have been removed
nyc_dist: 0 outliers have been removed
distance: 0 outliers have been removed
bearing: 0 outliers have been removed


Encoding

In [73]:
LE_Cols = ['user_id', 'user_name', 'driver_name']
LE = LabelEncoder()
for col in LE_Cols:
    data_file[col] = LE.fit_transform(data_file[col])


OH_Cols = ['car_condition', 'weather', 'traffic_condition']
data_file = pd.get_dummies(data_file, columns=OH_Cols, drop_first=True)

#non-useful columns
data_file.drop(['pickup_datetime', 'key'], axis=1, inplace=True, errors='ignore')

Linear Regression Algorithm

In [None]:
from sklearn.linear_model import LinearRegression


X = data_file.drop(columns=['fare_amount'])
Y = data_file['fare_amount']

X_train, X_test, Y_train, Y_Test = train_test_split(X , Y, test_size=0.4, random_state=30)

linearRegression = LinearRegression()
linearRegression.fit(X_train, Y_train)
predict = linearRegression.predict(X_test)

MAE = mean_absolute_error(Y_Test , predict)
acc_lin = 100 * (1 - (MAE / Y.mean()))
print(f"MAE = {MAE:.2f}")
print(f"Approximated Accuracy = {acc_lin:.2f}%")
#First try: 82.41%, 0.5 , 40
#Second try: 82.44%, 0.4 , 40
#Third try: 82.25%, 0.1 , 40
#Fourth try: 82.36, 0.4, 30


MAE = 1.46
Approximated Accuracy = 82.36%


RandomForest Algorithm!

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , mean_absolute_error
from sklearn.metrics import r2_score

X = data_file.drop(columns=['fare_amount'])
y = data_file['fare_amount']

X_train, X_test, Y_train, Y_Test = train_test_split(X , y , test_size = 0.3, random_state = 35)
R_Forest = RandomForestRegressor(n_estimators=50, max_depth= 2 , random_state= 40, n_jobs=-1)
R_Forest.fit(X_train , Y_train)

Results = R_Forest.predict(X_test)


R2 = r2_score(Y_Test , Results)
MAE = mean_absolute_error(Y_Test , Results)
print(f"R Sqaured = {R2:.4f}")
Accuracy = 100 * (1-(MAE/y.mean()))
print(f"Approximated accuracy = {Accuracy:.2f}%")
#First try: 82.60%, 100 , 5 , 30
#Second try: 85.63%, 150 , 15 , 40
#Third try: 85.73%, 250 , 20 , 40  
#Fourth try: 85.71%, 350 , 40 , 40


R Sqaured = 0.5578
Approximated accuracy = 79.77%


Linear Regression Algorithm

In [90]:
from sklearn.linear_model import LinearRegression


X = data_file.drop(columns=['fare_amount'])
Y = data_file['fare_amount']

X_train, X_test, Y_train, Y_Test = train_test_split(X , Y, test_size=0.3, random_state=40)

linearRegression = LinearRegression()
linearRegression.fit(X_train, Y_train)
predict = linearRegression.predict(X_test)

MAE = mean_absolute_error(Y_Test , predict)
acc_lin = 100 * (1 - (MAE / Y.mean()))
print(f"MAE = {MAE:.2f}")
print(f"Approximated Accuracy = {acc_lin:.2f}%")



MAE = 1.45
Approximated Accuracy = 82.44%
