In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pickle
from datetime import datetime

In [2]:
# Load the dataset from an Excel file
data = pd.read_excel('/content/vehicles.xlsx')

# Display the first few rows of the dataset
print("Original Dataset:")
print(data.head(10))

Original Dataset:
           id                                                url  \
0  7222695916  https://prescott.craigslist.org/cto/d/prescott...   
1  7218891961  https://fayar.craigslist.org/ctd/d/bentonville...   
2  7221797935  https://keys.craigslist.org/cto/d/summerland-k...   
3  7222270760  https://worcester.craigslist.org/cto/d/west-br...   
4  7210384030  https://greensboro.craigslist.org/cto/d/trinit...   
5  7222379453  https://hudsonvalley.craigslist.org/cto/d/west...   
6  7221952215  https://hudsonvalley.craigslist.org/cto/d/west...   
7  7220195662  https://hudsonvalley.craigslist.org/cto/d/poug...   
8  7209064557  https://medford.craigslist.org/cto/d/grants-pa...   
9  7219485069  https://erie.craigslist.org/cto/d/erie-2012-su...   

                   region                           region_url  price  year  \
0                prescott      https://prescott.craigslist.org   6000   NaN   
1            fayetteville         https://fayar.craigslist.org  11900   NaN

In [3]:
# Handle missing values by filling NaN values with appropriate defaults
data['region'] = data['region'].fillna("Unknown")
data['price'] = data['price'].fillna(0)
data['year'] = data['year'].fillna(0)
data['manufacturer'] = data['manufacturer'].fillna("Unknown")
data['model'] = data['model'].fillna("Unknown")
data['condition'] = data['condition'].fillna("Unknown")
data['cylinders'] = data['cylinders'].fillna("Unknown")
data['fuel'] = data['fuel'].fillna("Unknown")
data['odometer'] = data['odometer'].fillna(0)
data['title_status'] = data['title_status'].fillna("Unknown")
data['transmission'] = data['transmission'].fillna("Unknown")
data['VIN'] = data['VIN'].fillna("Unknown")
data['drive'] = data['drive'].fillna("Unknown")
data['size'] = data['size'].fillna("Unknown")
data['type'] = data['type'].fillna("Unknown")
data['paint_color'] = data['paint_color'].fillna("Unknown")
data['lat'] = data['lat'].fillna(0.0)
data['long'] = data['long'].fillna(0.0)

# Drop unnecessary columns
data = data.drop(columns=['county', 'state', 'image_url', 'description', 'posting_date'])

# Encode categorical variables
data = pd.get_dummies(data, columns=['region', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel', 'title_status', 'transmission', 'drive', 'size', 'type', 'paint_color'])

# Scale numerical features
scaler = StandardScaler()
data[['year', 'odometer', 'lat', 'long']] = scaler.fit_transform(data[['year', 'odometer', 'lat', 'long']])

# Extract year from the current year
current_year = datetime.now().year
data['car_age'] = current_year - data['year']

# Drop the 'year' column as it's now represented by 'car_age'
data = data.drop(columns=['year'])

# Ensure no NaN values
data = data.fillna(0)

# Display the updated dataset with 'car_age'
print("\nDataset with 'car_age':")
print(data.head())


Dataset with 'car_age':
           id                                                url  \
0  7222695916  https://prescott.craigslist.org/cto/d/prescott...   
1  7218891961  https://fayar.craigslist.org/ctd/d/bentonville...   
2  7221797935  https://keys.craigslist.org/cto/d/summerland-k...   
3  7222270760  https://worcester.craigslist.org/cto/d/west-br...   
4  7210384030  https://greensboro.craigslist.org/cto/d/trinit...   

                          region_url  price  odometer      VIN      lat  \
0    https://prescott.craigslist.org   6000  -0.19657  Unknown -6.42929   
1       https://fayar.craigslist.org  11900  -0.19657  Unknown -6.42929   
2        https://keys.craigslist.org  21000  -0.19657  Unknown -6.42929   
3   https://worcester.craigslist.org   1500  -0.19657  Unknown -6.42929   
4  https://greensboro.craigslist.org   4900  -0.19657  Unknown -6.42929   

       long  region_auburn  region_bellingham  ...  paint_color_custom  \
0  6.783784          False              F

In [4]:
# Split the dataset into training and testing sets
X = data.drop(columns=['price'])
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure training and testing sets have the same columns
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Hyperparameter tuning for RandomForestRegressor
param_grid = {
    'n_estimators': [50, 100],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

rf = RandomForestRegressor()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Save the best model
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Evaluate and print model performance
y_pred = best_model.predict(X_test)
print(f"\nRandom Forest Performance after Tuning:")
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred)}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred)}")
print(f"Root Mean Squared Error (RMSE): {mean_squared_error(y_test, y_pred, squared=False)}")
print(f"R-squared Score: {r2_score(y_test, y_pred)}")

Fitting 3 folds for each of 32 candidates, totalling 96 fits


  warn(



Random Forest Performance after Tuning:
Mean Absolute Error (MAE): 4394.193099166611
Mean Squared Error (MSE): 68579923.71923536
Root Mean Squared Error (RMSE): 8281.299639503171
R-squared Score: 0.7090032208499333
