# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

import os
import joblib

# 1. Data Loading and Preprocessing

In [2]:
# load the dataset
dt = pd.read_csv(r'final_internship_data.csv')

In [3]:
dt.head()

Unnamed: 0,User ID,User Name,Driver Name,Car Condition,Weather,Traffic Condition,key,fare_amount,pickup_datetime,pickup_longitude,...,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
0,KHVrEVlD,Kimberly Adams,Amy Butler,Very Good,windy,Congested Traffic,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21,-1.288826,...,6,0,2009,20.26584,55.176046,14.342611,34.543548,27.572573,1.030764,-2.918897
1,lPxIuEri,Justin Tapia,Hannah Zimmerman,Excellent,cloudy,Flow Traffic,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16,-1.291824,...,1,1,2010,44.667679,31.832358,23.130775,15.125872,8.755732,8.450134,-0.375217
2,gsVN8JLS,Elizabeth Lopez,Amanda Jackson,Bad,stormy,Congested Traffic,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00,-1.291242,...,8,3,2011,43.597686,33.712082,19.865289,17.722624,9.847344,1.389525,2.599961
3,9I7kWFgd,Steven Wilson,Amy Horn,Very Good,stormy,Flow Traffic,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42,-1.291319,...,4,5,2012,42.642965,32.556289,21.063132,15.738963,7.703421,2.79927,0.133905
4,8QN5ZaGN,Alexander Andrews,Cassandra Larson,Bad,stormy,Congested Traffic,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00,-1.290987,...,3,1,2010,43.329953,39.406828,15.219339,23.732406,15.600745,1.999157,-0.502703


In [4]:
len(dt)

500000

In [5]:
dt.columns

Index(['User ID', 'User Name', 'Driver Name', 'Car Condition', 'Weather',
       'Traffic Condition', 'key', 'fare_amount', 'pickup_datetime',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count', 'hour', 'day', 'month',
       'weekday', 'year', 'jfk_dist', 'ewr_dist', 'lga_dist', 'sol_dist',
       'nyc_dist', 'distance', 'bearing'],
      dtype='object')

In [6]:
# suumery statistics
dt.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
count,500000.0,500000.0,500000.0,499995.0,499995.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,499995.0,499995.0,499995.0,499995.0,499995.0,499995.0,499995.0
mean,11.358361,-1.265712,0.69674,-1.265755,0.696675,1.683428,13.510834,15.684206,6.26865,3.042008,2011.739132,385.279367,380.503657,363.843772,363.674038,355.991423,19.468775,0.297145
std,9.916617,0.206941,0.140909,0.205903,0.128997,1.307395,6.511571,8.681066,3.437815,1.94924,1.860889,2419.087483,2428.80474,2425.075903,2428.348683,2428.730839,367.299601,1.804548
min,-44.9,-52.119764,-54.38944,-59.049665,-44.676047,0.0,0.0,1.0,1.0,0.0,2009.0,1.017646,1.460945,0.382119,0.532545,0.0805,0.0,-3.141593
25%,6.0,-1.291405,0.710958,-1.291393,0.710943,1.0,9.0,8.0,3.0,1.0,2010.0,41.341514,32.173712,17.100762,14.886989,7.147384,1.21455,-0.854721
50%,8.5,-1.291226,0.711268,-1.291197,0.711277,1.0,14.0,16.0,6.0,3.0,2012.0,42.523163,34.787507,19.591554,18.34758,10.458151,2.11697,-0.050442
75%,12.5,-1.29097,0.71152,-1.290908,0.711538,2.0,19.0,23.0,9.0,5.0,2013.0,43.785649,38.304502,22.214815,22.417812,14.448699,3.89007,2.206769
max,500.0,37.360538,29.724576,0.712985,7.061893,6.0,23.0,31.0,12.0,6.0,2015.0,30133.06788,30167.595967,30167.285794,30159.407296,30162.285356,12399.956433,3.141593


In [7]:
dt.shape

(500000, 26)

In [8]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 26 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   User ID            500000 non-null  object 
 1   User Name          500000 non-null  object 
 2   Driver Name        500000 non-null  object 
 3   Car Condition      500000 non-null  object 
 4   Weather            500000 non-null  object 
 5   Traffic Condition  500000 non-null  object 
 6   key                500000 non-null  object 
 7   fare_amount        500000 non-null  float64
 8   pickup_datetime    500000 non-null  object 
 9   pickup_longitude   500000 non-null  float64
 10  pickup_latitude    500000 non-null  float64
 11  dropoff_longitude  499995 non-null  float64
 12  dropoff_latitude   499995 non-null  float64
 13  passenger_count    500000 non-null  int64  
 14  hour               500000 non-null  int64  
 15  day                500000 non-null  int64  
 16  mo

In [9]:
# Use a sample of the dataset (e.g., 30% of the data)
sample_size = 0.3  # You can adjust the sample size as needed
dt_sample = dt.sample(frac=sample_size, random_state=42)

In [10]:
# Handle missing values
print("Initial Missing Values:")
print(dt_sample.isnull().sum())

Initial Missing Values:
User ID              0
User Name            0
Driver Name          0
Car Condition        0
Weather              0
Traffic Condition    0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
hour                 0
day                  0
month                0
weekday              0
year                 0
jfk_dist             1
ewr_dist             1
lga_dist             1
sol_dist             1
nyc_dist             1
distance             1
bearing              1
dtype: int64


In [11]:
# Numerical and categorical columns
numerical_cols = dt_sample.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = ['Car Condition', 'Weather', 'Traffic Condition']

# 2. Advanced Missing Value Imputation

In [12]:
# Impute missing numerical values with median and categorical with mode
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

dt_sample[numerical_cols] = num_imputer.fit_transform(dt_sample[numerical_cols])
dt_sample[categorical_cols] = cat_imputer.fit_transform(dt_sample[categorical_cols])

In [13]:
print("\nMissing Values After Imputation:")
print(dt_sample.isnull().sum())


Missing Values After Imputation:
User ID              0
User Name            0
Driver Name          0
Car Condition        0
Weather              0
Traffic Condition    0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
hour                 0
day                  0
month                0
weekday              0
year                 0
jfk_dist             0
ewr_dist             0
lga_dist             0
sol_dist             0
nyc_dist             0
distance             0
bearing              0
dtype: int64


In [14]:
# Handle outliers using Robust Scaler to make the model less sensitive to outliers
# scaler = RobustScaler()
# dt_sample[numerical_cols] = scaler.fit_transform(dt_sample[numerical_cols])

# 3. Feature Engineering

In [15]:
# Extract new features from 'pickup_datetime'
dt_sample['pickup_datetime'] = pd.to_datetime(dt_sample['pickup_datetime'])
dt_sample['hour'] = dt_sample['pickup_datetime'].dt.hour
dt_sample['day_of_week'] = dt_sample['pickup_datetime'].dt.weekday

In [16]:
# Drop unnecessary columns
X = dt_sample.drop(columns=['fare_amount','User ID', 'User Name', 'Driver Name', 'pickup_datetime', 'key'])
y = dt_sample['fare_amount']

In [17]:
X.columns

Index(['Car Condition', 'Weather', 'Traffic Condition', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'hour', 'day', 'month', 'weekday', 'year',
       'jfk_dist', 'ewr_dist', 'lga_dist', 'sol_dist', 'nyc_dist', 'distance',
       'bearing', 'day_of_week'],
      dtype='object')

In [18]:
numerical_cols=[ 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'hour', 'day', 'month', 'weekday', 'year',
       'jfk_dist', 'ewr_dist', 'lga_dist', 'sol_dist', 'nyc_dist', 'distance',
       'bearing', 'day_of_week']

# 4. Feature Scaling and One-Hot Encoding

In [19]:
# One-hot encode categorical features, scale numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Scale numerical features
        ('cat', OneHotEncoder(drop='first'), categorical_cols)  # One-hot encode categorical features
    ]
)
X = preprocessor.fit_transform(X)
# If you want to see the processed array
output_cols = numerical_cols + preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols).tolist()

X = pd.DataFrame(X, columns=output_cols)
X

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,weekday,year,...,day_of_week,Car Condition_Excellent,Car Condition_Good,Car Condition_Very Good,Weather_rainy,Weather_stormy,Weather_sunny,Weather_windy,Traffic Condition_Dense Traffic,Traffic Condition_Flow Traffic
0,-0.088194,0.064012,-0.093301,0.089885,-0.524747,0.688763,1.534128,0.214780,-0.535590,-1.468651,...,-0.535590,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.100552,0.075502,-0.099520,0.089778,-0.524747,-0.848696,0.613873,-0.658381,-1.562340,1.213392,...,-1.562340,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-0.100985,0.074270,-0.099353,0.091410,-0.524747,-0.233712,1.074001,0.214780,-1.562340,-0.395834,...,-1.562340,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5.135157,-3.509793,5.183554,-4.152960,-0.524747,0.073780,-1.456701,0.214780,-0.535590,0.676984,...,-0.535590,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,-0.102533,0.073719,-0.093722,0.090605,0.239529,0.535017,0.498841,-0.949435,1.517909,-0.395834,...,1.517909,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,-0.101990,0.071923,-0.102215,0.086885,-0.524747,-0.541204,1.419097,-1.531543,-0.535590,1.749801,...,-0.535590,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
149996,-0.095868,0.072805,-0.099558,0.089715,-0.524747,-1.002441,0.038714,-0.367327,-0.022215,0.676984,...,-0.022215,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
149997,-0.101942,0.073652,-0.101729,0.089077,-0.524747,0.073780,-0.076318,-0.949435,-1.048965,-0.395834,...,-1.048965,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
149998,-0.101197,0.074190,-0.099416,0.090035,0.239529,1.303746,-1.226637,-0.076274,-0.022215,1.213392,...,-0.022215,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [20]:
preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols).tolist()

['Car Condition_Excellent',
 'Car Condition_Good',
 'Car Condition_Very Good',
 'Weather_rainy',
 'Weather_stormy',
 'Weather_sunny',
 'Weather_windy',
 'Traffic Condition_Dense Traffic',
 'Traffic Condition_Flow Traffic']

# 5. Use PCA for Dimensionality Reduction

In [21]:
# Use PCA to reduce dimensionality, retain 95% variance
pca = PCA(n_components=0.95)

In [22]:
X

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,weekday,year,...,day_of_week,Car Condition_Excellent,Car Condition_Good,Car Condition_Very Good,Weather_rainy,Weather_stormy,Weather_sunny,Weather_windy,Traffic Condition_Dense Traffic,Traffic Condition_Flow Traffic
0,-0.088194,0.064012,-0.093301,0.089885,-0.524747,0.688763,1.534128,0.214780,-0.535590,-1.468651,...,-0.535590,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.100552,0.075502,-0.099520,0.089778,-0.524747,-0.848696,0.613873,-0.658381,-1.562340,1.213392,...,-1.562340,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-0.100985,0.074270,-0.099353,0.091410,-0.524747,-0.233712,1.074001,0.214780,-1.562340,-0.395834,...,-1.562340,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5.135157,-3.509793,5.183554,-4.152960,-0.524747,0.073780,-1.456701,0.214780,-0.535590,0.676984,...,-0.535590,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,-0.102533,0.073719,-0.093722,0.090605,0.239529,0.535017,0.498841,-0.949435,1.517909,-0.395834,...,1.517909,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,-0.101990,0.071923,-0.102215,0.086885,-0.524747,-0.541204,1.419097,-1.531543,-0.535590,1.749801,...,-0.535590,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
149996,-0.095868,0.072805,-0.099558,0.089715,-0.524747,-1.002441,0.038714,-0.367327,-0.022215,0.676984,...,-0.022215,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
149997,-0.101942,0.073652,-0.101729,0.089077,-0.524747,0.073780,-0.076318,-0.949435,-1.048965,-0.395834,...,-1.048965,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
149998,-0.101197,0.074190,-0.099416,0.090035,0.239529,1.303746,-1.226637,-0.076274,-0.022215,1.213392,...,-0.022215,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


# 6. Model Building and Hyperparameter Tuning

In [23]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Create a cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Feature Selection using SelectKBest

In [25]:
# Feature Selection using SelectKBest
selector = SelectKBest(f_classif, k=8)  # Select the top 8 features
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

In [26]:
# Apply PCA for dimensionality reduction
pca = PCA(n_components=8)  # Adjust to the number of selected features
X_train_pca = pca.fit_transform(X_train_selected)
X_test_pca = pca.transform(X_test_selected)

In [27]:
# Get the selected feature names
selected_features = X.columns[selector.get_support()]

In [28]:
# Display the top 8 selected features
print("Top 8 selected features for training:")
print(selected_features.tolist())

Top 8 selected features for training:
['month', 'year', 'jfk_dist', 'ewr_dist', 'lga_dist', 'sol_dist', 'nyc_dist', 'distance']


In [29]:
joblib.dump(preprocessor.named_transformers_['num'].fit(dt_sample[selected_features.tolist()]),'Task6\savedModels\scaler.pkl')


  joblib.dump(preprocessor.named_transformers_['num'].fit(dt_sample[selected_features.tolist()]),'Task6\savedModels\scaler.pkl')


['Task6\\savedModels\\scaler.pkl']

### Decision Tree Model with Cross-validation and Grid Search

In [30]:
dt_model = Pipeline(steps=[
    #('preprocessor', preprocessor),
    ('pca', pca),  # Apply PCA for dimensionality reduction
    ('regressor', DecisionTreeRegressor(random_state=42))
])

In [31]:
dt_param_grid = {
    'regressor__max_depth': [5, 10, 15, None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

In [32]:
# Perform grid search with cross-validation to find the best hyperparameters for Decision Tree
dt_grid_search = GridSearchCV(dt_model, dt_param_grid, cv=cv, scoring='neg_mean_squared_error')
dt_grid_search.fit(X_train_pca, y_train)

KeyboardInterrupt: 

In [None]:
# Best Decision Tree parameters
dt_best_params = dt_grid_search.best_params_
dt_best_model = dt_grid_search.best_estimator_

In [None]:
# Predict on the test set
dt_y_pred = dt_best_model.predict(X_test_pca)

In [None]:
# Performance metrics
dt_mse = mean_squared_error(y_test, dt_y_pred)
dt_rmse = np.sqrt(dt_mse)
dt_r2 = r2_score(y_test, dt_y_pred)

In [None]:
print(f'Decision Tree RMSE: {dt_rmse:.2f}')
print(f'Decision Tree R^2: {dt_r2:.2f}')
print(f'Best parameters for Decision Tree: {dt_best_params}')

Decision Tree RMSE: 0.95
Decision Tree R^2: 0.62
Best parameters for Decision Tree: {'regressor__max_depth': None, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 10}


In [None]:
# 7. Save the Best Models
os.makedirs('ml_app/models', exist_ok=True)
joblib.dump(dt_best_model, 'ml_app/models/decision_tree_model.pkl')

### Neural Network Model with Grid Search and Cross-validation

In [None]:
nn_model = Pipeline(steps=[
    #('preprocessor', preprocessor),
    ('pca', pca),
    ('regressor', MLPRegressor(max_iter=1000, random_state=42))
])

In [None]:
nn_param_grid = {
    'regressor__hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'regressor__activation': ['tanh', 'relu'],
    'regressor__solver': ['adam', 'sgd'],
    'regressor__alpha': [0.0001, 0.001],
    'regressor__learning_rate_init': [0.001, 0.01]
}

In [None]:
nn_grid_search = GridSearchCV(nn_model, nn_param_grid, cv=cv, scoring='neg_mean_squared_error')
nn_grid_search.fit(X_train_pca, y_train)

In [None]:
# Best Neural Network parameters
nn_best_params = nn_grid_search.best_params_
nn_best_model = nn_grid_search.best_estimator_

In [None]:
# Predict on the test set
nn_y_pred = nn_best_model.predict(X_test_pca)

In [None]:
# Performance metrics
nn_mse = mean_squared_error(y_test, nn_y_pred)
nn_rmse = np.sqrt(nn_mse)
nn_r2 = r2_score(y_test, nn_y_pred)

In [None]:
print(f'Neural Network RMSE: {nn_rmse:.2f}')
print(f'Neural Network R^2: {nn_r2:.2f}')
print(f'Best parameters for Neural Network: {nn_best_params}')

### Model Comparison Based on RMSE

In [None]:
# Model Comparison Between Decision Tree and Neural Network Based on RMSE
if dt_rmse < nn_rmse:
    print(f'Best model: Decision Tree with RMSE: {dt_rmse:.2f} and R^2: {dt_r2:.2f}')
else:
    print(f'Best model: Neural Network with RMSE: {nn_rmse:.2f} and R^2: {nn_r2:.2f}')

In [None]:
import joblib
import os

# 7. Save the Best Models
os.makedirs('ml_app/models', exist_ok=True)
joblib.dump(dt_best_model, 'ml_app/models/decision_tree_model.pkl')
joblib.dump(nn_best_model, 'ml_app/models/neural_network_model.pkl')
joblib.dump(preprocessor, 'ml_app/models/preprocessor.pkl')

In [None]:
X_train.columns