In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
# Step 1: Data Collection
df = pd.read_csv('/content/vehicles.csv')
data=df.sample(2000)

In [3]:
# Step 2: Data Exploration and Preprocessing
print(data.info())  # Check the structure of the dataset

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 195401 to 350121
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            2000 non-null   int64  
 1   url           2000 non-null   object 
 2   region        2000 non-null   object 
 3   region_url    2000 non-null   object 
 4   price         2000 non-null   int64  
 5   year          1992 non-null   float64
 6   manufacturer  1909 non-null   object 
 7   model         1977 non-null   object 
 8   condition     1183 non-null   object 
 9   cylinders     1195 non-null   object 
 10  fuel          1991 non-null   object 
 11  odometer      1982 non-null   float64
 12  title_status  1959 non-null   object 
 13  transmission  1989 non-null   object 
 14  VIN           1264 non-null   object 
 15  drive         1393 non-null   object 
 16  size          578 non-null    object 
 17  type          1569 non-null   object 
 18  paint_color   1416 non-nul

In [4]:
print(data.describe())  # Summary statistics

                 id          price         year      odometer  county  \
count  2.000000e+03    2000.000000  1992.000000  1.982000e+03     0.0   
mean   7.311443e+09   17512.074000  2010.969378  9.780799e+04     NaN   
std    4.448566e+06   15915.545323    10.330868  2.335271e+05     NaN   
min    7.301594e+09       0.000000  1929.000000  0.000000e+00     NaN   
25%    7.307941e+09    5950.000000  2008.000000  3.791425e+04     NaN   
50%    7.312513e+09   13697.500000  2013.000000  8.716200e+04     NaN   
75%    7.315279e+09   26968.500000  2017.000000  1.325675e+05     NaN   
max    7.317087e+09  227995.000000  2022.000000  1.000000e+07     NaN   

               lat         long  
count  1974.000000  1974.000000  
mean     38.720097   -94.847800  
std       6.070520    18.302205  
min      19.725342  -158.030906  
25%      34.903495  -111.935911  
50%      39.675190   -88.930000  
75%      42.623488   -81.229050  
max      64.815520   -67.840490  


In [5]:
print(data.isnull().sum())  # Identify missing values

id                 0
url                0
region             0
region_url         0
price              0
year               8
manufacturer      91
model             23
condition        817
cylinders        805
fuel               9
odometer          18
title_status      41
transmission      11
VIN              736
drive            607
size            1422
type             431
paint_color      584
image_url          0
description        0
county          2000
state              0
lat               26
long              26
posting_date       0
dtype: int64


In [6]:
# Define features
categorical_features = ['manufacturer', 'model', 'condition', 'cylinders', 'fuel', 'title_status', 'transmission', 'drive', 'size', 'type', 'paint_color', 'state']
numerical_features = ['odometer', 'year']

In [7]:
# Define preprocessing for numerical and categorical features
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with mean
    ('scaler', StandardScaler())  # Scale numerical features
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))  # One-hot encode categorical features
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ]
)

In [8]:
# Separate features and target variable
X = data.drop(columns=['price', 'id', 'url', 'region', 'region_url', 'VIN', 'image_url', 'description', 'county', 'lat', 'long', 'posting_date'])
y = data['price']

In [9]:
# Create a pipeline that combines preprocessing with the model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

In [10]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Train the model
model_pipeline.fit(X_train, y_train)

In [12]:
# Step 5: Model Evaluation
# Make predictions
y_pred = model_pipeline.predict(X_test)



In [13]:
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')

MAE: 7026.099750000001
MSE: 204428440.8941315
RMSE: 14297.847421697139
R-squared: 0.36021078734025846


In [14]:
# Step 6: Prediction
def predict_price(input_data):
    # Convert input_data to a DataFrame
    input_df = pd.DataFrame([input_data])
    # Use the model pipeline to preprocess and predict
    prediction = model_pipeline.predict(input_df)
    return prediction[0]

In [15]:
# Extract feature names from the column transformer
def get_feature_names(preprocessor):
    feature_names = []
    for name, trans, column in preprocessor.transformers_:
        if hasattr(trans, 'named_steps'):  # This means trans is a Pipeline
            if 'onehot' in trans.named_steps:
                onehot_encoder = trans.named_steps['onehot']
                feature_names.extend(onehot_encoder.get_feature_names_out(column))
            else:
                feature_names.extend(column)
        else:
            feature_names.extend(column)
    return feature_names

In [16]:
# Get the feature names
feature_names = get_feature_names(preprocessor)
print(f'Feature names: {feature_names}')

Feature names: ['odometer', 'year', 'manufacturer_alfa-romeo', 'manufacturer_audi', 'manufacturer_bmw', 'manufacturer_buick', 'manufacturer_cadillac', 'manufacturer_chevrolet', 'manufacturer_chrysler', 'manufacturer_dodge', 'manufacturer_fiat', 'manufacturer_ford', 'manufacturer_gmc', 'manufacturer_honda', 'manufacturer_hyundai', 'manufacturer_infiniti', 'manufacturer_jaguar', 'manufacturer_jeep', 'manufacturer_kia', 'manufacturer_lexus', 'manufacturer_lincoln', 'manufacturer_mazda', 'manufacturer_mercedes-benz', 'manufacturer_mercury', 'manufacturer_mini', 'manufacturer_mitsubishi', 'manufacturer_nissan', 'manufacturer_pontiac', 'manufacturer_porsche', 'manufacturer_ram', 'manufacturer_rover', 'manufacturer_saturn', 'manufacturer_subaru', 'manufacturer_tesla', 'manufacturer_toyota', 'manufacturer_volkswagen', 'manufacturer_volvo', 'model_09 HONDS ODYSSEY EXL', 'model_128i convertible', 'model_135is', 'model_1500', 'model_1500 4x4', 'model_1500 big horn', 'model_1500 classic', 'model_1

In [17]:
# Example input
example_input = {
    'odometer': 50000,
    'year': 2019,
    'manufacturer': 'Toyota',
    'model': 'Camry',
    'condition': 'Excellent',
    'cylinders': '4 cylinders',
    'fuel': 'Gas',
    'title_status': 'Clean',
    'transmission': 'Automatic',
    'drive': '4WD',
    'size': 'Full-size',
    'type': 'SUV',
    'paint_color': 'Black',
    'state': 'CA'
}

In [18]:
# Ensure the example input includes all necessary features
input_df = pd.DataFrame([example_input], columns=feature_names)
predicted_price = predict_price(example_input)
print(f'Predicted price: ${predicted_price}')

Predicted price: $23656.18




In [19]:
# Advanced Techniques: Feature Importance, Hyperparameter Tuning, Cross-Validation
# Feature importance
importances = model_pipeline.named_steps['model'].feature_importances_
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df.sort_values(by='Importance', ascending=False, inplace=True)
print(feature_importance_df)

                          Feature    Importance
0                        odometer  2.079064e-01
1                            year  1.702130e-01
1007                     fuel_gas  5.304343e-02
1016                    drive_fwd  3.636587e-02
296      model_corvette sting ray  2.002446e-02
...                           ...           ...
654              model_new beetle  9.213365e-09
939                     model_van  8.630324e-09
471                      model_fo  8.034386e-09
89    model_300d 2.5 turbo diesel  1.621460e-10
309   model_crosstrek limited awd  0.000000e+00

[1094 rows x 2 columns]


In [20]:
# Hyperparameter tuning (example: using GridSearchCV)
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=model_pipeline, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(grid_search.best_params_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
{'model__max_depth': 30, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}


In [21]:
# Cross-validation
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(best_model, X, y, cv=5)
print(f'Cross-validation scores: {cv_scores}')
print(f'Average cross-validation score: {cv_scores.mean()}')



Cross-validation scores: [0.51694779 0.34204482 0.33966633 0.46947178 0.36605822]
Average cross-validation score: 0.40683778958611255


