# Import the required libraries and functions

In [15]:
import pandas as pd
import numpy as np
#preprocessing
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
#model imports
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
#evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Read the dataset from CSV into a DataFrame

In [2]:
# Location of the source CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
path = r"D:\AI_road_map\python\dataSets\final_internship_data.csv"

# Load the complete file into a DataFrame.
data = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Print each column's name, non-null count and dtype to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 26 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   User ID            500000 non-null  object 
 1   User Name          500000 non-null  object 
 2   Driver Name        500000 non-null  object 
 3   Car Condition      500000 non-null  object 
 4   Weather            500000 non-null  object 
 5   Traffic Condition  500000 non-null  object 
 6   key                500000 non-null  object 
 7   fare_amount        500000 non-null  float64
 8   pickup_datetime    500000 non-null  object 
 9   pickup_longitude   500000 non-null  float64
 10  pickup_latitude    500000 non-null  float64
 11  dropoff_longitude  499995 non-null  float64
 12  dropoff_latitude   499995 non-null  float64
 13  passenger_count    500000 non-null  int64  
 14  hour               500000 non-null  int64  
 15  day                500000 non-null  int64  
 16  mo

## Excluded features that weren’t useful

In [3]:
# Columns that are not used as model inputs; they are removed from `data`
# in place, so this cell cannot be re-run without reloading the CSV.
columns_to_exclude = [
    'User ID',
    'User Name',
    'Driver Name',
    'key',
    'pickup_datetime',
    'Weather',
    'Traffic Condition',
    'Car Condition',
]
data.drop(columns=columns_to_exclude, inplace=True)

## Removed null values since they were few in number.

In [4]:
# Discard every row containing at least one missing value (mutates `data`).
data.dropna(axis="index", how="any", inplace=True)

## Removing outliers

In [6]:
def remove_outliers_zscore(data, column, threshold=3):
    """Return the rows of ``data`` whose ``column`` value is not a z-score outlier.

    A row is dropped only when the absolute z-score of its ``column`` value is
    defined and greater than or equal to ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    column : label
        Name of a numeric column of ``data`` used to compute the z-scores.
    threshold : float, default 3
        Absolute z-score at or above which a row is treated as an outlier.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that are not outliers, in their original order.

    Notes
    -----
    Rows whose z-score is undefined are kept. This covers two cases that used
    to empty the whole frame: a constant column (zero standard deviation) and
    a column containing NaN (which made every z-score NaN).
    """
    values = data[column].to_numpy(dtype=float)

    # nan_policy="omit" computes mean/std from the non-missing values only, so
    # one NaN no longer turns every z-score into NaN.
    z_scores = zscore(values, nan_policy="omit")

    # NaN >= threshold is False, so undefined z-scores are never flagged.
    is_outlier = abs(z_scores) >= threshold
    return data[~is_outlier]

In [7]:
# Columns screened for extreme values: the target (`fare_amount`) plus the
# coordinate and distance columns.
feauters = [
    'fare_amount',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'jfk_dist',
    'ewr_dist',
    'lga_dist',
    'sol_dist',
    'nyc_dist',
    'distance',
]

# Filter one column at a time. `data` is rebound on every pass, so each
# column's z-scores are computed on the rows that survived the earlier passes.
# NOTE(review): this runs before the train/test split, so the held-out rows
# are filtered with statistics computed from the full dataset.
for column_name in feauters:
    data = remove_outliers_zscore(data, column_name)

## Split the dataset into features and target, then divide it into train and test sets.

In [8]:
# Target: the fare column, copied so it is independent of `data`.
Y = data["fare_amount"].copy()

# Features: every remaining column.
X = data.drop(columns=["fare_amount"])

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

## Feature extraction and scaling

In [10]:
# Coordinate and *_dist columns are standardised and then reduced with PCA.
PCA_featuers = ['pickup_longitude', 'pickup_latitude',
            'dropoff_longitude', 'dropoff_latitude', 'jfk_dist', 'ewr_dist',
            'lga_dist','sol_dist', 'nyc_dist']
pca_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # A float n_components asks PCA to keep as many components as are needed
    # to explain that fraction (95%) of the variance.
    ('pca', PCA(random_state=42,n_components=0.95))
])

# Define column transformer: PCA for the columns above, plain standardisation
# for every other column of the training frame.
pca_colum_transformer = ColumnTransformer(
    transformers=[
        ('pca_features', pca_pipeline, PCA_featuers),
        ('scale_rest', StandardScaler(), [col for col in x_train.columns if col not in PCA_featuers])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Transform: statistics are fitted on the training rows only and re-used
# unchanged for the test rows.
x_train_scaled = pca_colum_transformer.fit_transform(x_train)
x_test_scaled = pca_colum_transformer.transform(x_test)

# To DataFrame: wrap BOTH matrices with the same column names. Previously only
# the training matrix was wrapped, so models fitted on named columns were asked
# to predict on a bare array (sklearn feature-name warning, no column check).
scaled_feature_names = pca_colum_transformer.get_feature_names_out()
x_train_scaled = pd.DataFrame(
    x_train_scaled,
    columns=scaled_feature_names
)
x_test_scaled = pd.DataFrame(
    x_test_scaled,
    columns=scaled_feature_names
)

## Check for multicollinearity

In [12]:
from sklearn.linear_model import LinearRegression
def compute_vif(X_df):
    """Compute the variance inflation factor (VIF) of every column of ``X_df``.

    Each column is regressed on all the other columns with an ordinary
    least-squares model; its VIF is ``1 / (1 - R²)`` of that regression.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Numeric feature matrix, one column per feature.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Feature`` and ``VIF`` (rounded to 2 decimals), one row
        per input column in the input column order. A column that the others
        reproduce (numerically) exactly gets ``inf``.
    """
    vif_dict = {}

    if X_df.shape[1] < 2:
        # With no other column to regress on, nothing can inflate the variance;
        # fitting a model on zero features would raise instead.
        for col in X_df.columns:
            vif_dict[col] = 1.0
        return pd.DataFrame({'Feature': list(vif_dict.keys()), 'VIF': list(vif_dict.values())})

    # Rounding can leave 1 - R² a hair above or below zero for perfectly
    # collinear columns; an exact `!= 0` test would then yield an enormous or
    # even negative VIF instead of infinity.
    min_tolerance = np.finfo(float).eps

    for col in X_df.columns:
        X_other = X_df.drop(columns=[col])
        y_target = X_df[col]
        model = LinearRegression().fit(X_other, y_target)
        r2 = model.score(X_other, y_target)
        tolerance = 1 - r2
        vif = 1 / tolerance if tolerance > min_tolerance else float('inf')
        vif_dict[col] = round(vif, 2)

    # Materialise the dict views so the constructor receives plain lists.
    return pd.DataFrame({'Feature': list(vif_dict.keys()), 'VIF': list(vif_dict.values())})
    
# VIF of every transformed training feature; values close to 1 mean the
# feature is barely explained by the other features.
vif_df = compute_vif(x_train_scaled)
print(vif_df)

            Feature   VIF
0              pca0  1.01
1              pca1  1.12
2              pca2  1.55
3   passenger_count  1.00
4              hour  1.01
5               day  1.00
6             month  1.01
7           weekday  1.01
8              year  1.01
9          distance  1.11
10          bearing  1.57


## Feature selection

In [128]:
# Fit a random forest purely to rank the transformed features by importance.
# The seed makes the ranking reproducible; n_jobs=-1 uses all cores and does
# not change the result for a fixed seed.
importance = RandomForestRegressor(n_jobs=-1, random_state=42)
importance.fit(x_train_scaled,y_train)

important = importance.feature_importances_
feauters = x_train_scaled.columns

# The sorted table is the cell's last expression, so it is displayed.
# (Sorting a temporary frame with inplace=True returned None and showed nothing.)
pd.DataFrame({
        'featuers' : feauters,
        'importance' : important
}).sort_values(by="importance",ascending=False)



In [131]:
# Importance table, most important feature first.
df = (
    pd.DataFrame({'featuers': feauters, 'importance': important})
    .sort_values(by="importance", ascending=False)
)
df

Unnamed: 0,featuers,importance
16,distance,0.733279
0,pca0,0.051372
1,pca1,0.040228
15,year,0.039447
17,bearing,0.034248
2,pca2,0.023259
11,hour,0.021657
12,day,0.01351
13,month,0.012565
14,weekday,0.009888


In [157]:
from sklearn.feature_selection import mutual_info_regression

# Mutual information between each transformed feature and the fare.
# The estimator adds a small random noise to continuous features, so a fixed
# seed is needed for the scores to be reproducible between runs.
mi = mutual_info_regression(x_train_scaled, y_train, random_state=42)
mi_series = pd.Series(mi, index=x_train_scaled.columns).sort_values(ascending=False)
print(mi_series)  # all features, highest mutual information first

distance           0.785844
pca2               0.479855
year               0.462017
pca1               0.090528
bearing            0.058874
pca0               0.054168
hour               0.014831
month              0.012855
passenger_count    0.008392
weekday            0.002283
day                0.001853
dtype: float64


## Training and evaluating the model

In [155]:
# Random forest with depth and leaf-size limits; `fit` returns the estimator
# itself, so construction and training are chained.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=4,
    n_jobs=-1,
    random_state=42,
).fit(x_train_scaled, y_train)

# In-sample and held-out predictions for the evaluation cell below.
y_pred_train = model.predict(x_train_scaled)
y_pred = model.predict(x_test_scaled)



In [None]:

# Report MAE, RMSE and R² of the random forest on the test and training sets.
for heading, y_true, y_hat in (
    ("Test Set:", y_test, y_pred),
    ("\nTrain Set:", y_train, y_pred_train),
):
    print(heading)
    print("MAE:", mean_absolute_error(y_true, y_hat))
    print("RMSE:", np.sqrt(mean_squared_error(y_true, y_hat)))
    print("R²:", r2_score(y_true, y_hat))

Test Set:
MAE: 1.364682667703858
RMSE: 2.2846530280952537
R²: 0.8219737503509096

Train Set:
MAE: 0.9201935920317754
RMSE: 1.5726088798368119
R²: 0.914680667937954


In [11]:
# Gradient-boosted trees with the library's default hyperparameters.
model2 = XGBRegressor().fit(x_train_scaled, y_train)

# In-sample and held-out predictions for the evaluation cell below.
y_pred_train2 = model2.predict(x_train_scaled)
y_pred2 = model2.predict(x_test_scaled)

In [12]:

# Report MAE, RMSE and R² of the XGBoost model on the test and training sets.
for heading, y_true, y_hat in (
    ("Test Set:", y_test, y_pred2),
    ("\nTrain Set:", y_train, y_pred_train2),
):
    print(heading)
    print("MAE:", mean_absolute_error(y_true, y_hat))
    print("RMSE:", np.sqrt(mean_squared_error(y_true, y_hat)))
    print("R²:", r2_score(y_true, y_hat))

Test Set:
MAE: 1.3183865309043195
RMSE: 2.241493292549924
R²: 0.8286364602178582

Train Set:
MAE: 1.2528237208896473
RMSE: 2.0503375468662712
R²: 0.8549703732666851


In [162]:
# Ordinary least-squares baseline.
model3 = LinearRegression().fit(x_train_scaled, y_train)

# In-sample and held-out predictions for the evaluation cell below.
y_pred_train3 = model3.predict(x_train_scaled)
y_pred3 = model3.predict(x_test_scaled)



In [163]:
# Report MAE, RMSE and R² of the linear model on the test and training sets.
for heading, y_true, y_hat in (
    ("Test Set:", y_test, y_pred3),
    ("\nTrain Set:", y_train, y_pred_train3),
):
    print(heading)
    print("MAE:", mean_absolute_error(y_true, y_hat))
    print("RMSE:", np.sqrt(mean_squared_error(y_true, y_hat)))
    print("R²:", r2_score(y_true, y_hat))

Test Set:
MAE: 1.8131996212865702
RMSE: 2.8079922582701884
R²: 0.7310724164261337

Train Set:
MAE: 1.7887366282621222
RMSE: 2.740245241342703
R²: 0.7409492439747407


In [164]:
# k-nearest-neighbours regressor with the library's default settings.
model4 = KNeighborsRegressor().fit(x_train_scaled, y_train)

# In-sample and held-out predictions for the evaluation cell below.
y_pred_train4 = model4.predict(x_train_scaled)
y_pred4 = model4.predict(x_test_scaled)



In [165]:
# Report MAE, RMSE and R² of the k-NN model on the test and training sets.
for heading, y_true, y_hat in (
    ("Test Set:", y_test, y_pred4),
    ("\nTrain Set:", y_train, y_pred_train4),
):
    print(heading)
    print("MAE:", mean_absolute_error(y_true, y_hat))
    print("RMSE:", np.sqrt(mean_squared_error(y_true, y_hat)))
    print("R²:", r2_score(y_true, y_hat))

Test Set:
MAE: 1.7528385575620018
RMSE: 2.6910778615036537
R²: 0.7530005055706375

Train Set:
MAE: 1.4170319572067602
RMSE: 2.135453495195475
R²: 0.842679167589044


In [None]:
# Single decision tree; only the seed is set, so no depth or leaf-size limit
# is applied.
model5 = DecisionTreeRegressor(random_state=42).fit(x_train_scaled, y_train)

# In-sample and held-out predictions for the evaluation cell below.
y_pred_train5 = model5.predict(x_train_scaled)
y_pred5 = model5.predict(x_test_scaled)



In [19]:
# Report MAE, RMSE and R² of the decision tree on the test and training sets.
for heading, y_true, y_hat in (
    ("Test Set:", y_test, y_pred5),
    ("\nTrain Set:", y_train, y_pred_train5),
):
    print(heading)
    print("MAE:", mean_absolute_error(y_true, y_hat))
    print("RMSE:", np.sqrt(mean_squared_error(y_true, y_hat)))
    print("R²:", r2_score(y_true, y_hat))

Test Set:
MAE: 1.9948588081046241
RMSE: 3.322270885889596
R²: 0.6235445246791123

Train Set:
MAE: 1.1017826843985587e-06
RMSE: 0.0004694214917072565
R²: 0.9999999923979287


## Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator: learning rate and tree count are fixed; the grid varies the rest.
xgb_model = XGBRegressor(n_estimators=300, learning_rate=0.1)

# Three candidate values for each of six parameters -> 3**6 = 729 combinations.
param_grid = dict(
    max_depth=[6, 10, 15],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 1],
    reg_alpha=[0, 0.1, 1.0],
    reg_lambda=[1.0, 1.5, 2.0],
)

# Exhaustive search: 729 combinations x 3 folds = 2187 model fits, scored by
# negated RMSE (higher is better).
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=3,
    n_jobs=-1,
)
grid_search.fit(x_train_scaled, y_train)

# Parameter combination with the best mean cross-validated score.
best_params = grid_search.best_params_



In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Base estimator: only the tree count is fixed; everything else is searched.
xgb_model = XGBRegressor(n_estimators=300)

# Candidate values to sample from (lists, so each value is equally likely).
param_grid = {
    'learning_rate' : [0.01,.03,0.1],
    'max_depth': [6, 10, 15],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0.1, 0.3, 1],
    'reg_alpha': [0, 0.1, 1.0],
    'reg_lambda': [1.0, 2.5, 5.0],
}

# Randomised search: 100 sampled combinations x 3 folds, scored by negated RMSE.
# random_state fixes which combinations are sampled; without it every run
# evaluated a different set of candidates and the result was not reproducible.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=100,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=3,
    n_jobs=-1,
    random_state=42
)
random_search.fit(x_train_scaled, y_train)