In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

In [3]:
df_cars_final = pd.read_excel('all_processed_data.xlsx')

categorical_features = df_cars_final.select_dtypes(include=['object']).columns
numerical_features = df_cars_final.select_dtypes(include=['number']).columns

# Separate the target variable
X = df_cars_final.drop('price', axis=1)
y = df_cars_final['price']

# Apply one-hot encoding to categorical features
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

# The numerical columns are automatically included as they are

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

print(f"Training feature set shape: {X_train.shape}")
print(f"Testing feature set shape: {X_test.shape}")

Training feature set shape: (4708, 98)
Testing feature set shape: (1177, 98)


In [5]:
# Define the models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor()
}

# Store the results
results = {}

# Evaluate each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate metrics
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)
    r2_train = r2_score(y_train, y_train_pred)
    mape_train = mean_absolute_percentage_error(y_train, y_train_pred) * 100
    
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    r2_test = r2_score(y_test, y_test_pred)
    mape_test = mean_absolute_percentage_error(y_test, y_test_pred) * 100
    
    # Store the results
    results[model_name] = {
        'MSE_train': mse_train,
        'MAE_train': mae_train,
        'R2_train': r2_train,
        'MAPE_train': mape_train,
        'MSE_test': mse_test,
        'MAE_test':mae_test,
        'R2_test': r2_test,
        'MAPE_test': mape_test
    }

# Convert results to DataFrame
results_df = pd.DataFrame(results).T
print(results_df)

                      MSE_train     MAE_train  R2_train  MAPE_train  \
Linear Regression  2.227794e+10  86374.566035  0.873598   17.342779   
Decision Tree      2.424671e+07    476.847918  0.999862    0.087812   
Random Forest      1.670779e+09  24894.759658  0.990520    4.479399   
XGBoost            1.485230e+09  28003.323200  0.991573    5.478265   

                       MSE_test      MAE_test   R2_test  MAPE_test  
Linear Regression  2.349042e+10  89982.929662  0.882717  17.636654  
Decision Tree      2.331511e+10  89672.062872  0.883593  15.485004  
Random Forest      1.376649e+10  64724.712443  0.931267  11.469638  
XGBoost            1.040855e+10  57988.006419  0.948032  10.182868  


In [6]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_distributions_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)
rf_random_search = RandomizedSearchCV(
    rf_model,
    param_distributions=param_distributions_rf,
    n_iter=50,
    scoring='neg_mean_absolute_percentage_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

rf_random_search.fit(X_train, y_train)
best_rf_params = rf_random_search.best_params_
print(f"Best Random Forest Parameters: {best_rf_params}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Random Forest Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}


In [7]:
## For XGBoost

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_distributions_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'alpha': [0, 0.1, 0.5, 1],
    'lambda': [0, 0.1, 0.5, 1]
}

xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_distributions_xgb,
    n_iter=50,
    scoring='neg_mean_absolute_percentage_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

xgb_random_search.fit(X_train, y_train)
best_xgb_params = xgb_random_search.best_params_
print(f"Best XGBoost Parameters: {best_xgb_params}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best XGBoost Parameters: {'alpha': 1, 'colsample_bytree': 0.8329611783087483, 'lambda': 1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.7361074625809747}


In [8]:
# Revaluate with the best parameters 
# Best Random Forest Parameters: 
#{'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}

# Initialize and train the Random Forest model with best parameters
rf_model = RandomForestRegressor(n_estimators=300, 
                                 max_depth=30, 
                                 min_samples_split=2, 
                                 min_samples_leaf=1,
                                 random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_train_pred_rf = rf_model.predict(X_train)

# Calculate metrics for the training set
mse_train_rf = mean_squared_error(y_train, y_train_pred_rf)
r2_train_rf = r2_score(y_train, y_train_pred_rf)
mape_train_rf = mean_absolute_percentage_error(y_train, y_train_pred_rf) * 100

# Print metrics for the training set
print(f"Random Forest Train MSE: {mse_train_rf}")
print(f"Random Forest Train R^2: {r2_train_rf}")
print(f"Random Forest Train MAPE: {mape_train_rf}%")

# Make predictions on the test set
y_test_pred_rf = rf_model.predict(X_test)

# Calculate metrics for the test set
mse_test_rf = mean_squared_error(y_test, y_test_pred_rf)
r2_test_rf = r2_score(y_test, y_test_pred_rf)
mape_test_rf = mean_absolute_percentage_error(y_test, y_test_pred_rf) * 100

# Print metrics for the test set
print(f"Random Forest Test MSE: {mse_test_rf}")
print(f"Random Forest Test R^2: {r2_test_rf}")
print(f"Random Forest Test MAPE: {mape_test_rf}%")

Random Forest Train MSE: 1580786965.7633302
Random Forest Train R^2: 0.9910308323371625
Random Forest Train MAPE: 4.402445723898892%
Random Forest Test MSE: 13166113741.943487
Random Forest Test R^2: 0.9342643684964812
Random Forest Test MAPE: 11.39017146877129%


In [9]:
#Best XGBoost Parameters: 
#{'alpha': 1, 'colsample_bytree': 0.8329611783087483, 'lambda': 1, 
#'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.7361074625809747}

# Initialize and train the XGBoost model with best parameters
xgb_model = XGBRegressor(alpha=1, 
                         colsample_bytree=0.8329611783087483, 
                         lambda_=1, 
                         learning_rate=0.1, 
                         max_depth=5, 
                         n_estimators=300, 
                         subsample=0.7361074625809747, 
                         objective='reg:squarederror',
                         random_state=42)

xgb_model.fit(X_train, y_train)


# Make predictions
y_train_pred_xgb = xgb_model.predict(X_train)

# Calculate metrics
mse_train_xgb = mean_squared_error(y_train, y_train_pred_xgb)
r2_train_xgb = r2_score(y_train, y_train_pred_xgb)
mape_train_xgb = mean_absolute_percentage_error(y_train, y_train_pred_xgb) * 100

print(f"XGBoost Train MSE: {mse_train_xgb}")
print(f"XGBoost Train R^2: {r2_train_xgb}")
print(f"XGBoost Train MAPE: {mape_train_xgb}%")

# Make predictions
y_test_pred_xgb = xgb_model.predict(X_test)

# Calculate metrics
mse_test_xgb = mean_squared_error(y_test, y_test_pred_xgb)
r2_test_xgb = r2_score(y_test, y_test_pred_xgb)
mape_test_xgb = mean_absolute_percentage_error(y_test, y_test_pred_xgb) * 100

print(f"XGBoost Test MSE: {mse_test_xgb}")
print(f"XGBoost Test R^2: {r2_test_xgb}")
print(f"XGBoost Test MAPE: {mape_test_xgb}%")


Parameters: { "lambda_" } are not used.



XGBoost Train MSE: 2282918238.5075397
XGBoost Train R^2: 0.9870470361375757
XGBoost Train MAPE: 7.001669842309134%
XGBoost Test MSE: 8403454729.738378
XGBoost Test R^2: 0.9580433213400873
XGBoost Test MAPE: 9.541971031743254%


In [10]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

models = {
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
}
# Store the results
results = {}

# Evaluate each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate metrics
    mse_train = mean_squared_error(y_train, y_train_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)
    r2_train = r2_score(y_train, y_train_pred)
    mape_train = mean_absolute_percentage_error(y_train, y_train_pred) * 100
    
    mse_test = mean_squared_error(y_test, y_test_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    r2_test = r2_score(y_test, y_test_pred)
    mape_test = mean_absolute_percentage_error(y_test, y_test_pred) * 100
    
    # Store the results
    results[model_name] = {
        'MSE_train': mse_train,
        'MAE_train': mae_train,
        'R2_train': r2_train,
        'MAPE_train': mape_train,
        'MSE_test': mse_test,
        'MAE_test':mae_test,
        'R2_test': r2_test,
        'MAPE_test': mape_test
    }

# Convert results to DataFrame
results_df = pd.DataFrame(results).T
print(results_df)

                     MSE_train     MAE_train  R2_train  MAPE_train  \
Ridge Regression  2.250062e+10  87372.392955  0.872335   17.549932   
Lasso Regression  2.227805e+10  86377.428694  0.873597   17.342430   

                      MSE_test      MAE_test   R2_test  MAPE_test  
Ridge Regression  2.407698e+10  90804.128599  0.879789  17.710319  
Lasso Regression  2.349220e+10  89984.198237  0.882708  17.634649  


  model = cd_fast.enet_coordinate_descent(


In [11]:
import joblib

# Save the model
joblib.dump(xgb_model, 'xgboost_model.pkl')


['xgboost_model.pkl']

In [12]:
# Save column names for encoding
encoded_columns = list(X_encoded.columns)
joblib.dump(encoded_columns, 'encoded_columns.pkl')

['encoded_columns.pkl']