In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
import joblib
import os

In [2]:
# 1. Load and filter dataset
dt = pd.read_csv('final_internship_data.csv')
sample_size = 0.3
dt_sample = dt.sample(frac=sample_size, random_state=42)

In [3]:
# Handle missing values
print("Initial Missing Values:")
print(dt_sample.isnull().sum())

Initial Missing Values:
User ID              0
User Name            0
Driver Name          0
Car Condition        0
Weather              0
Traffic Condition    0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
hour                 0
day                  0
month                0
weekday              0
year                 0
jfk_dist             0
ewr_dist             0
lga_dist             0
sol_dist             0
nyc_dist             0
distance             0
bearing              0
dtype: int64


In [4]:
dt_sample = dt_sample.dropna()

In [5]:
# Specify the features to be used
features = ['day', 'month', 'weekday', 'year', 'distance', 'bearing', 'Traffic Condition', 'pickup_datetime', 'dropoff_longitude']
target = 'fare_amount'

In [6]:
# Impute missing numerical values with median and categorical with mode
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

In [7]:
# Separate numeric and categorical columns
numerical_cols = ['day', 'month', 'weekday', 'year', 'distance', 'bearing', 'dropoff_longitude']
categorical_cols = ['Traffic Condition']

In [8]:
# Impute missing values
dt_sample[numerical_cols] = num_imputer.fit_transform(dt_sample[numerical_cols])
dt_sample[categorical_cols] = cat_imputer.fit_transform(dt_sample[categorical_cols])

In [9]:
# Outlier detection and removal for robustness
isolation_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = isolation_forest.fit_predict(dt_sample[numerical_cols])
dt_sample = dt_sample[outliers == 1]

In [10]:
# Convert 'pickup_datetime' to datetime format
dt_sample['pickup_datetime'] = pd.to_datetime(dt_sample['pickup_datetime'])

In [11]:
# Define the feature matrix and target variable
X = dt_sample[features]
y = dt_sample[target]

In [12]:
# Feature scaling and one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
    ]
)

In [13]:
# Dimensionality reduction using PCA
pca = PCA(n_components=0.95)

In [14]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
# Decision Tree with Grid Search
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', pca),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

In [16]:
dt_param_grid = {
    'regressor__max_depth': [5, 10, 15, None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

In [17]:
dt_grid_search = GridSearchCV(dt_pipeline, dt_param_grid, cv=cv, scoring='neg_mean_squared_error')
dt_grid_search.fit(X_train, y_train)

In [18]:
# Random Forest with Randomized Search
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', pca),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [19]:
rf_param_dist = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_features': ['sqrt', 'log2'],
    'regressor__max_depth': [10, 20, 30, None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__bootstrap': [True, False]
}

In [20]:
rf_random_search = RandomizedSearchCV(rf_pipeline, rf_param_dist, n_iter=50, cv=cv, scoring='neg_mean_squared_error', random_state=42)
rf_random_search.fit(X_train, y_train)

In [21]:
# Gradient Boosting
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', pca),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

In [22]:
gb_param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 5, 7],
    'regressor__min_samples_split': [2, 5, 10]
}

In [23]:
gb_grid_search = GridSearchCV(gb_pipeline, gb_param_grid, cv=cv, scoring='neg_mean_squared_error')
gb_grid_search.fit(X_train, y_train)

In [24]:
# Evaluate models
dt_best_model = dt_grid_search.best_estimator_
rf_best_model = rf_random_search.best_estimator_
gb_best_model = gb_grid_search.best_estimator_

In [25]:
dt_y_pred = dt_best_model.predict(X_test)
rf_y_pred = rf_best_model.predict(X_test)
gb_y_pred = gb_best_model.predict(X_test)

In [26]:
# Performance evaluation
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_y_pred))
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_y_pred))
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_y_pred))

In [27]:
dt_r2 = r2_score(y_test, dt_y_pred)
rf_r2 = r2_score(y_test, rf_y_pred)
gb_r2 = r2_score(y_test, gb_y_pred)

In [28]:
print(f'Decision Tree - RMSE: {dt_rmse:.2f}, R2: {dt_r2:.2f}')
print(f'Random Forest - RMSE: {rf_rmse:.2f}, R2: {rf_r2:.2f}')
print(f'Gradient Boosting - RMSE: {gb_rmse:.2f}, R2: {gb_r2:.2f}')

Decision Tree - RMSE: 5.16, R2: 0.60
Random Forest - RMSE: 4.75, R2: 0.66
Gradient Boosting - RMSE: 4.76, R2: 0.66


In [29]:
# Model comparison
best_model, best_rmse, best_r2 = min(
    [(dt_best_model, dt_rmse, dt_r2), (rf_best_model, rf_rmse, rf_r2), (gb_best_model, gb_rmse, gb_r2)],
    key=lambda x: x[1]
)
print(f'Best model: {best_model} with RMSE: {best_rmse:.2f} and R2: {best_r2:.2f}')

Best model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', RobustScaler(),
                                                  ['day', 'month', 'weekday',
                                                   'year', 'distance',
                                                   'bearing',
                                                   'dropoff_longitude']),
                                                 ('cat',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'),
                                                  ['Traffic Condition'])])),
                ('pca', PCA(n_components=0.95)),
                ('regressor',
                 RandomForestRegressor(max_depth=30, max_features='log2',
                                       min_samples_leaf=4, min_samples_split=5,
                                       n_estimators=200, rando

In [30]:
# Save the best model
os.makedirs('ml_app/models', exist_ok=True)
joblib.dump(best_model, 'ml_app/models/best_model.pkl')

['ml_app/models/best_model.pkl']