In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.sklearn
import logging
import warnings
warnings.filterwarnings('ignore')
import os

In [6]:
df = pd.read_csv('../data/Sport car price.csv')
df.head()

Unnamed: 0,Car Make,Car Model,Year,Engine Size (L),Horsepower,Torque (lb-ft),0-60 MPH Time (seconds),Price (in USD)
0,Porsche,911,2022,3.0,379,331,4.0,101200
1,Lamborghini,Huracan,2021,5.2,630,443,2.8,274390
2,Ferrari,488 GTB,2022,3.9,661,561,3.0,333750
3,Audi,R8,2022,5.2,562,406,3.2,142700
4,McLaren,720S,2021,4.0,710,568,2.7,298000


In [7]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('sports-car-price-prediction')

2025/03/24 15:09:19 INFO mlflow.tracking.fluent: Experiment with name 'sports-car-price-prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/sehej/Desktop/usf-msds/spring-module-2/ml-ops/notebooks/mlruns/2', creation_time=1742854159245, experiment_id='2', last_update_time=1742854159245, lifecycle_stage='active', name='sports-car-price-prediction', tags={}>

## Exploring Data

In [8]:
df.columns

Index(['Car Make', 'Car Model', 'Year', 'Engine Size (L)', 'Horsepower',
       'Torque (lb-ft)', '0-60 MPH Time (seconds)', 'Price (in USD)'],
      dtype='object')

In [9]:
df.shape

(1007, 8)

In [10]:
df.isnull().sum()

Car Make                    0
Car Model                   0
Year                        0
Engine Size (L)            10
Horsepower                  0
Torque (lb-ft)              3
0-60 MPH Time (seconds)     0
Price (in USD)              0
dtype: int64

In [12]:
data = df.rename(columns={
        'Car Make': 'Car_Make',
        'Car Model': 'Car_Model',
        'Engine Size (L)': 'Engine_Size',
        'Horsepower': 'Horsepower',
        'Torque (lb-ft)': 'Torque',
        '0-60 MPH Time (seconds)': 'Acceleration',
        'Price (in USD)': 'Price'
    })

In [17]:
data['Price'] = df['Price (in USD)'].str.replace(',', '').astype(float)

In [18]:
data

Unnamed: 0,Car_Make,Car_Model,Year,Engine_Size,Horsepower,Torque,Acceleration,Price
0,Porsche,911,2022,3,379,331,4,101200.0
1,Lamborghini,Huracan,2021,5.2,630,443,2.8,274390.0
2,Ferrari,488 GTB,2022,3.9,661,561,3,333750.0
3,Audi,R8,2022,5.2,562,406,3.2,142700.0
4,McLaren,720S,2021,4,710,568,2.7,298000.0
...,...,...,...,...,...,...,...,...
1002,Koenigsegg,Jesko,2022,5,1280,1106,2.5,3000000.0
1003,Lotus,Evija,2021,Electric Motor,1972,1254,2,2000000.0
1004,McLaren,Senna,2021,4,789,590,2.7,1000000.0
1005,Pagani,Huayra,2021,6,764,738,3,2600000.0


In [21]:
numeric_cols = ['Year', 'Engine_Size', 'Horsepower', 'Torque', 'Acceleration']
for col in numeric_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

In [23]:
data = data.dropna()

In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 946 entries, 0 to 1005
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car_Make      946 non-null    object 
 1   Car_Model     946 non-null    object 
 2   Year          946 non-null    int64  
 3   Engine_Size   946 non-null    float64
 4   Horsepower    946 non-null    float64
 5   Torque        946 non-null    float64
 6   Acceleration  946 non-null    float64
 7   Price         946 non-null    float64
dtypes: float64(5), int64(1), object(2)
memory usage: 66.5+ KB


## Feature Engineering

In [29]:
# Create feature: Is luxury brand
luxury_brands = ['Bugatti', 'Bentley', 'Lamborghini', 'Ferrari', 'Rolls-Royce', 
                    'Aston Martin', 'McLaren', 'Koenigsegg', 'Pagani']
data['Is_Luxury_Brand'] = data['Car_Make'].apply(
    lambda x: 1 if x in luxury_brands else 0
)

In [33]:
# Create feature: Is recent model
data['Is_Recent'] = data['Year'].apply(lambda x: 1 if x >= 2021 else 0)

In [34]:
data

Unnamed: 0,Car_Make,Car_Model,Year,Engine_Size,Horsepower,Torque,Acceleration,Price,Is_Luxury_Brand,Is_Recent
0,Porsche,911,2022,3.0,379.0,331.0,4.0,101200.0,0,1
1,Lamborghini,Huracan,2021,5.2,630.0,443.0,2.8,274390.0,1,1
2,Ferrari,488 GTB,2022,3.9,661.0,561.0,3.0,333750.0,1,1
3,Audi,R8,2022,5.2,562.0,406.0,3.2,142700.0,0,1
4,McLaren,720S,2021,4.0,710.0,568.0,2.7,298000.0,1,1
...,...,...,...,...,...,...,...,...,...,...
1000,Aston Martin,Vantage,2021,4.0,503.0,505.0,3.6,146000.0,1,1
1001,Bugatti,Chiron,2021,8.0,1479.0,1180.0,2.4,3000000.0,1,1
1002,Koenigsegg,Jesko,2022,5.0,1280.0,1106.0,2.5,3000000.0,1,1
1004,McLaren,Senna,2021,4.0,789.0,590.0,2.7,1000000.0,1,1


In [36]:
numeric_features = ['Horsepower', 'Torque', 'Engine_Size', 'Acceleration', 'Year', 'Is_Luxury_Brand', 'Is_Recent']
categorical_features = ['Car_Make']

In [43]:
X_numeric = data[numeric_features]
X_cat = data[categorical_features]

X = X_numeric
y = data['Price']

In [44]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [45]:
logging.info(f"Selected features: {X.columns.tolist()}")
logging.info(f"Feature matrix shape: {X.shape}")

2025-03-24 15:38:00,441 - INFO - Selected features: ['Horsepower', 'Torque', 'Engine_Size', 'Acceleration', 'Year', 'Is_Luxury_Brand', 'Is_Recent']
2025-03-24 15:38:00,443 - INFO - Feature matrix shape: (946, 7)


In [48]:
logging.info("Splitting data into train, validation, and test sets...")

test_size = 0.2
val_size = 0.2

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

val_ratio = val_size / (1 - test_size) 
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=val_ratio, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame to preserve column names
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns)

logging.info(f"Train set shape: {X_train.shape}")
logging.info(f"Validation set shape: {X_val.shape}")
logging.info(f"Test set shape: {X_test.shape}")

2025-03-24 15:39:38,289 - INFO - Splitting data into train, validation, and test sets...
2025-03-24 15:39:38,302 - INFO - Train set shape: (567, 7)
2025-03-24 15:39:38,302 - INFO - Validation set shape: (189, 7)
2025-03-24 15:39:38,302 - INFO - Test set shape: (190, 7)


## Model Training

In [49]:
with mlflow.start_run():
    # Set tags
    mlflow.set_tags({"Model": "linear-regression", "Train Data": "all data"})
    
    # Log parameters
    mlflow.log_param("features", numeric_features)
    
    # Train model
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    
    # Evaluate
    train_pred = lr.predict(X_train)
    test_pred = lr.predict(X_test)
    
    train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    train_r2 = r2_score(y_train, train_pred)
    test_r2 = r2_score(y_test, test_pred)
    
    # Log metrics
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("test_rmse", test_rmse)
    mlflow.log_metric("train_r2", train_r2)
    mlflow.log_metric("test_r2", test_r2)
    
    # Log model
    mlflow.sklearn.log_model(lr, "model")
    
    logging.info(f"Linear Regression - Test RMSE: {test_rmse:.2f}, Test R²: {test_r2:.2f}")

mlflow.end_run()

2025-03-24 15:39:41,900 - INFO - Linear Regression - Test RMSE: 332833.33, Test R²: 0.69


In [61]:
best_rmse_dt = float('inf')
best_model_dt = None

# Hyperparameters to tune
max_depths = [None, 5, 10, 15, 20]
min_samples_splits = [2, 5, 10, 20]

for depth in max_depths:
    for min_samples in min_samples_splits:
        with mlflow.start_run():
            # Set tags
            mlflow.set_tags({"Model": "decision-tree", "Train Data": "all data"})
            
            # Log parameters
            mlflow.log_param("features", X_numeric.columns.tolist())
            mlflow.log_param("max_depth", depth)
            mlflow.log_param("min_samples_split", min_samples)
            
            # Train model
            dt = DecisionTreeRegressor(max_depth=depth, min_samples_split=min_samples, random_state=42)
            dt.fit(X_train, y_train)
            
            # Evaluate
            train_pred = dt.predict(X_train)
            test_pred = dt.predict(X_test)
            
            train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
            test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
            train_r2 = r2_score(y_train, train_pred)
            test_r2 = r2_score(y_test, test_pred)
            
            # Log metrics
            mlflow.log_metric("train_rmse", train_rmse)
            mlflow.log_metric("test_rmse", test_rmse)
            mlflow.log_metric("train_r2", train_r2)
            mlflow.log_metric("test_r2", test_r2)
            
            # Log model
            mlflow.sklearn.log_model(dt, "model")
            
            # Track best model
            if test_rmse < best_rmse_dt:
                best_rmse_dt = test_rmse
                best_model_dt = dt
                best_r2_dt = test_r2

logging.info(f"Best Decision Tree - Test RMSE: {best_rmse_dt:.2f}, Test R²: {best_r2_dt:.2f}")
mlflow.end_run()

2025-03-24 15:47:43,307 - INFO - Best Decision Tree - Test RMSE: 85378.35, Test R²: 0.98


In [62]:
best_rmse_rf = float('inf')
best_model_rf = None

# Hyperparameters to tune
n_estimators_list = [50, 100, 150]
max_features_list = [1/3, 1/2, 2/3, 'sqrt']

for n_trees in n_estimators_list:
    for max_feat in max_features_list:
        with mlflow.start_run():
            # Set tags
            mlflow.set_tags({"Model": "random-forest", "Train Data": "all data"})
            
            # Log parameters
            mlflow.log_param("features", X_numeric.columns.tolist())
            mlflow.log_param("n_estimators", n_trees)
            mlflow.log_param("max_features", max_feat)
            
            # Train model
            rf = RandomForestRegressor(n_estimators=n_trees, max_features=max_feat, random_state=42)
            rf.fit(X_train, y_train)
            
            # Evaluate
            train_pred = rf.predict(X_train)
            test_pred = rf.predict(X_test)
            
            train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
            test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
            train_r2 = r2_score(y_train, train_pred)
            test_r2 = r2_score(y_test, test_pred)
            
            # Log metrics
            mlflow.log_metric("train_rmse", train_rmse)
            mlflow.log_metric("test_rmse", test_rmse)
            mlflow.log_metric("train_r2", train_r2)
            mlflow.log_metric("test_r2", test_r2)
            
            # Log model
            mlflow.sklearn.log_model(rf, "model")
            
            # Track best model
            if test_rmse < best_rmse_rf:
                best_rmse_rf = test_rmse
                best_model_rf = rf
                best_r2_rf = test_r2

logging.info(f"Best Random Forest - Test RMSE: {best_rmse_rf:.2f}, Test R²: {best_r2_rf:.2f}")
mlflow.end_run()

2025-03-24 15:47:53,855 - INFO - Best Random Forest - Test RMSE: 48958.83, Test R²: 0.99


In [55]:
best_model_dt

In [56]:
best_model_rf