In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
# Mount Google Drive at /content/drive; the dataset read in the next cell
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the preprocessed 2020 CSV from the mounted Drive folder into a DataFrame.
csv_path = '/content/drive/MyDrive/NYC_DATA_SET/preprocessed_data_2020.csv'
df = pd.read_csv(csv_path)

In [None]:
# Keep only the rows whose month value is 6 through 12.
months_kept = [6, 7, 8, 9, 10, 11, 12]
in_selected_months = df['month'].isin(months_kept)
df_new = df[in_selected_months]

In [None]:
# (rows, columns) of the month-filtered frame.
df_new.shape

(542379, 7)

In [None]:
# Peek at the first two rows of the filtered frame.
df_new.head(2)

Unnamed: 0,time_category_encoded,month,day,PULocationID,DOLocationID,trip_distance,total_amount
1110493,1,6,1,74,75,1.49,7.3
1110494,1,6,1,74,75,1.43,6.8


In [None]:
# Correlation of every column with the target column, total_amount.
df_new.corr()['total_amount']

Unnamed: 0,total_amount
time_category_encoded,-0.12528
month,-0.012773
day,0.012356
PULocationID,0.172414
DOLocationID,0.078873
trip_distance,0.008135
total_amount,1.0


In [None]:
# Show the leading rows of the filtered frame (default row count of head()).
df_new.head()

Unnamed: 0,time_category_encoded,month,day,PULocationID,DOLocationID,trip_distance,total_amount
1110493,1,6,1,74,75,1.49,7.3
1110494,1,6,1,74,75,1.43,6.8
1110495,1,6,1,82,80,4.1,14.3
1110496,1,6,1,97,151,11.61,37.11
1110497,1,6,1,17,238,13.0,42.06


In [None]:
# Count the missing values in each column of the filtered frame.
missing_per_column = df_new.isnull().sum()
missing_per_column

Unnamed: 0,0
time_category_encoded,0
month,0
day,0
PULocationID,0
DOLocationID,0
trip_distance,0
total_amount,0


In [None]:
# Separate the regression target (y) from the remaining feature columns (x).
target_col = 'total_amount'
x = df_new.drop(columns=[target_col])
y = df_new[target_col]

In [None]:
from sklearn.model_selection import train_test_split

# Seeded hold-out split. No test_size is passed, so scikit-learn's default
# split proportion applies.
split_parts = train_test_split(x, y, random_state=7)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Print the shape of each split, in the order: x_train, y_train, x_test, y_test.
for split_part in (x_train, y_train, x_test, y_test):
    print(split_part.shape)

(406784, 6)
(406784,)
(135595, 6)
(135595,)


In [None]:
# Base XGBoost regressor handed to the grid search below: squared-error
# objective with a fixed random_state.
xgb_model = xgb.XGBRegressor(
    random_state=42,
    objective='reg:squarederror',
)


In [None]:
# Hyperparameter search space: two values per parameter, i.e.
# 2 x 2 x 2 x 2 = 16 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5],
    learning_rate=[0.1, 0.2],
    subsample=[0.8, 1.0],
)

In [None]:
# 3-fold cross-validated grid search over param_grid; candidates are ranked
# by negated mean squared error.
search_settings = dict(cv=3, scoring='neg_mean_squared_error', verbose=1)
grid_search = GridSearchCV(xgb_model, param_grid, **search_settings)

grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [None]:
# Best model & predictions
# Take the estimator that the grid search selected as best.
best_model = grid_search.best_estimator_

In [None]:
# Run the selected model over the training split and the held-out split.
y_train_pred, y_test_pred = (
    best_model.predict(features) for features in (x_train, x_test)
)

In [None]:
# Score the predictions on both splits so the train/test gap can be read
# side by side. RMSE is derived from MSE rather than computed separately.

# Training split
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

# Held-out test split
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Report: training block first, a blank line, then the test block.
print(f"Train MAE: {mae_train:.4f}")
print(f"Train MSE: {mse_train:.4f}")
print(f"Train RMSE: {rmse_train:.4f}")
print(f"Train R²: {r2_train:.4f}\n")

print(f"Test MAE: {mae_test:.4f}")
print(f"Test MSE: {mse_test:.4f}")
print(f"Test RMSE: {rmse_test:.4f}")
print(f"Test R²: {r2_test:.4f}")




Train MAE: 3.7280
Train MSE: 35.3837
Train RMSE: 5.9484
Train R²: 0.8681

Test MAE: 3.7797
Test MSE: 36.4500
Test RMSE: 6.0374
Test R²: 0.8632


In [None]:
# Overfitting / underfitting check: unrounded R2 and MSE for the training
# split next to the same metrics for the test split.
r2_by_split = (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred))
mse_by_split = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)

print("\nTraining R2 Score:", r2_by_split[0])
print("Testing R2 Score:", r2_by_split[1])
print("Training MSE:", mse_by_split[0])
print("Testing MSE:", mse_by_split[1])

In [None]:
# Scatter Plot (Actual vs Predicted)
# x-axis: true test targets; y-axis: the model's predictions for the same rows.
# (The y-axis previously re-used y_test, which put every point on the
# diagonal regardless of model quality.)
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_test_pred, alpha=0.7)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actual vs Predicted Values - XGBoost Regression")

# Ideal line y = x across the range of the actual values; a perfect model
# would place every point on it.
actual_min, actual_max = y_test.min(), y_test.max()
plt.plot([actual_min, actual_max], [actual_min, actual_max], color='red', linestyle='--')
plt.grid(True)
plt.show()

In [None]:
# Pickle
# Serialize the selected model with joblib to 'xgb_model.pkl' (a relative
# path, so it is written to the current working directory).
import joblib
joblib.dump(best_model, 'xgb_model.pkl')
print("Saved tuned model: xgb_model.pkl")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Search space for this tuning pass: six hyperparameters, every combination
# of the listed values is tried.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
}

# Estimator to tune. eval_metric is given to the constructor so the eval_set
# passed to fit() further down is tracked as RMSE. No early stopping is
# configured anywhere in this cell.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=42,
    verbosity=0,
)

# 3-fold grid search ranked by negated MSE, run with n_jobs=-1.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(x_train, y_train)

# Winner of the search
best_xgb_model_tuned_gs = grid_search.best_estimator_

# Fit the winner again, this time with an eval_set, so that per-round RMSE
# is recorded for the training split (first entry) and the test split
# (second entry).
eval_pairs = [(x_train, y_train), (x_test, y_test)]
best_xgb_model_tuned_gs.fit(x_train, y_train, eval_set=eval_pairs, verbose=False)

# Test-split predictions and scores
y_pred_xgb_tuned_gs = best_xgb_model_tuned_gs.predict(x_test)

mse = mean_squared_error(y_test, y_pred_xgb_tuned_gs)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_xgb_tuned_gs)
r2 = r2_score(y_test, y_pred_xgb_tuned_gs)

print("\nEvaluation of the best XGBoost model after tuning:")
print(f"MAE:  {mae:.4f}")
print(f"MSE:  {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²:   {r2:.4f}")

# Learning curves: 'validation_0' / 'validation_1' follow the order of the
# eval_set entries above (train, then test).
results = best_xgb_model_tuned_gs.evals_result()
train_rmse, test_rmse = (
    results[eval_name]['rmse'] for eval_name in ('validation_0', 'validation_1')
)

plt.figure(figsize=(10, 5))
for curve, curve_label, curve_color in (
    (train_rmse, 'Train RMSE', 'blue'),
    (test_rmse, 'Test RMSE', 'red'),
):
    plt.plot(curve, label=curve_label, color=curve_color)
plt.xlabel("Boosting Rounds")
plt.ylabel("RMSE")
plt.title("Train vs Test RMSE Over Boosting Rounds")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Third tuning pass: same search as the previous cell plus min_child_weight.
# Relies on the imports from the first cell (xgb, GridSearchCV) and on
# x_train / x_test / y_train / y_test produced by the train_test_split cell.

# Parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'min_child_weight': [1, 3, 5]
}

# Initialize XGBoost regressor
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    verbosity=0
)

# Initialize GridSearchCV (3-fold, ranked by negated MSE; training scores are
# kept as well so cv_results_ can be inspected for overfitting)
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=-1,
    return_train_score=True
)

# Perform grid search
grid_search.fit(x_train, y_train)

# Get the best estimator
best_model = grid_search.best_estimator_

# Fit the best model with evaluation sets.
# Each eval_set entry must pair features with the target of the SAME split:
# the previous (x_train, y) paired the training features with the full,
# unsplit target, whose length differs from x_train. The test split is added
# so evals_result() holds both curves, and verbose=False keeps the per-round
# log out of the cell output, matching the refit in the previous tuning cell.
best_model.fit(
    x_train, y_train,
    eval_set=[(x_train, y_train), (x_test, y_test)],
    verbose=False
)

In [None]:
# Pickle
# Serialize the model selected by the grid search in the cell above.
# NOTE: this writes to the same 'xgb_model.pkl' path as the earlier pickle
# cell, so the file saved there is replaced by this model.
import joblib
joblib.dump(best_model, 'xgb_model.pkl')
print("Saved tuned model: xgb_model.pkl")