In [None]:
!rm -rf /content/

In [None]:
#!pip install catboost --force-reinstall



In [None]:
import pandas as pd
import numpy as np
import joblib
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
import os

# Column groups used by the clustering and preprocessing steps.
categorical_cols = ['make', 'model', 'trim', 'state', 'color', 'interior']
numerical_cols = ['year', 'condition', 'odometer', 'mmr']
cluster_features = ['year', 'condition', 'odometer', 'mmr']

# Read the merged dataset and discard the three columns that are not kept as features.
df = pd.read_csv("/content/merged_data_2024_column.csv")
df = df.drop(columns=["sellingprice", "zipcode", "sale_year"])

# Standardise the clustering columns, then label every row with one of three K-Means clusters.
scaler_for_clustering = StandardScaler()
scaled_features = scaler_for_clustering.fit_transform(df[cluster_features])
kmeans = KMeans(n_clusters=3, random_state=42)
df["cluster"] = kmeans.fit_predict(scaled_features)

# Report how many rows fell into each cluster.
for cluster_id in (0, 1, 2):
    print(f"CLUSTER {cluster_id}: ", len(df[df["cluster"] == cluster_id]))

# Persist the clustering pipeline so predictions can be routed to the right per-cluster model.
joblib.dump(kmeans, "kmeans_model.joblib")
joblib.dump(scaler_for_clustering, "scaler_for_clustering.joblib")

# Preprocessing
def preprocess_data(df, categorical_cols, numerical_cols, label_encoders=None, scaler=None, fit=True):
    """Label-encode the categorical columns and scale the numerical columns of a copy of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing every column named in ``categorical_cols`` and ``numerical_cols``.
        It is copied first, so the caller's frame is not modified.
    categorical_cols : list of str
        Columns cast to ``str`` and passed through their encoder's ``transform``.
    numerical_cols : list of str
        Columns passed together through ``scaler.transform``.
    label_encoders : dict, optional
        Mapping of column name to a fitted encoder. Required when ``fit=False`` and
        ``categorical_cols`` is non-empty; ignored (replaced) when ``fit=True``.
    scaler : object with ``transform``, optional
        Fitted scaler. Required when ``fit=False``; ignored (replaced) when ``fit=True``.
    fit : bool, default True
        When True, a fresh ``LabelEncoder`` per categorical column and a fresh
        ``StandardScaler`` are fitted on ``df`` before transforming.

    Returns
    -------
    tuple
        ``(transformed_copy, label_encoders, scaler)``.

    Raises
    ------
    ValueError
        If ``fit=False`` but the encoders or the scaler needed for the transform are missing.
    """
    df = df.copy()
    if fit:
        label_encoders = {col: LabelEncoder().fit(df[col].astype(str)) for col in categorical_cols}
        scaler = StandardScaler().fit(df[numerical_cols])
    else:
        # Fail early with a clear message instead of an opaque error on None further down.
        if len(categorical_cols) > 0 and label_encoders is None:
            raise ValueError("fit=False requires fitted label_encoders for the categorical columns")
        if scaler is None:
            raise ValueError("fit=False requires a fitted scaler")
    for col in categorical_cols:
        df[col] = label_encoders[col].transform(df[col].astype(str))
    df[numerical_cols] = scaler.transform(df[numerical_cols])
    return df, label_encoders, scaler

# Folder that receives every per-cluster artifact (encoders, scaler, model).
os.makedirs("models_by_cluster", exist_ok=True)

# Cluster 1 is trained with its own tree size and learning rate; the other clusters share one setting.
default_params = {"num_leaves": 50, "learning_rate": 0.5}
cluster_1_params = {"num_leaves": 80, "learning_rate": 0.3}

# Fit one LightGBM regressor per cluster on log1p(2024_price) and save it with its preprocessing objects.
for cluster in [0, 1, 2]:
    df_cluster = df.loc[df['cluster'] == cluster].copy()
    y = np.log1p(df_cluster['2024_price'])
    X = df_cluster.drop(columns=['2024_price', 'cluster'])

    # Encoders and scaler are fitted on the whole cluster; 20% of the rows are then held out.
    X_encoded, label_encoders, scaler = preprocess_data(X, categorical_cols, numerical_cols, fit=True)
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

    joblib.dump(label_encoders, f"models_by_cluster/label_encoders_cluster_{cluster}.joblib", compress=3)
    joblib.dump(scaler, f"models_by_cluster/scaler_cluster_{cluster}.joblib", compress=3)

    params = cluster_1_params if cluster == 1 else default_params
    model = lgb.LGBMRegressor(n_estimators=100, random_state=42, **params)
    model.fit(X_train, y_train)

    # The model is stored inside a dict under the 'model' key; loaders index it the same way.
    joblib.dump({'model': model}, f"models_by_cluster/lightgbm_model_cluster_{cluster}.joblib", compress=3)

# ---- Prediction Function ---- #

def predict_cluster_and_price(car_input: pd.DataFrame):
    """Assign a car to a K-Means cluster and predict its 2024 price with that cluster's model.

    Parameters
    ----------
    car_input : pd.DataFrame
        Must contain every column in ``cluster_features``, ``categorical_cols`` and
        ``numerical_cols``. Only the first row's result is returned.

    Returns
    -------
    tuple
        ``(cluster, price)``: the cluster label of the first row and its predicted price,
        converted back from log space with ``expm1`` (training used ``log1p``).

    Notes
    -----
    The encoders loaded here raise on category values they were not fitted on
    (presumably a ``ValueError`` from ``LabelEncoder`` — confirm against the installed version).
    """
    # Route the car to a cluster using the in-memory clustering scaler and K-Means model.
    cluster_scaled = scaler_for_clustering.transform(car_input[cluster_features])
    cluster = kmeans.predict(cluster_scaled)[0]

    label_encoders = joblib.load(f"models_by_cluster/label_encoders_cluster_{cluster}.joblib")
    scaler = joblib.load(f"models_by_cluster/scaler_cluster_{cluster}.joblib")
    model = joblib.load(f"models_by_cluster/lightgbm_model_cluster_{cluster}.joblib")['model']

    # Apply the same transforms as training, keeping everything in a DataFrame.
    car = car_input.copy()
    for col in categorical_cols:
        car[col] = label_encoders[col].transform(car[col].astype(str))
    car[numerical_cols] = scaler.transform(car[numerical_cols])

    # The model was fitted on the training frame's own column order, not on
    # [categoricals | numericals]. Stacking the two groups positionally feeds the wrong
    # values to each feature, so select the columns by the names the model recorded.
    X_input = car[list(model.feature_name_)]

    y_log_pred = model.predict(X_input)
    y_pred = np.expm1(y_log_pred)
    return cluster, y_pred[0]


  df = pd.read_csv("/content/merged_data_2024_column.csv")


CLUSTER 0:  136622
CLUSTER 1:  461080
CLUSTER 2:  71190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011512 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1154
[LightGBM] [Info] Number of data points in the train set: 109297, number of used features: 10
[LightGBM] [Info] Start training from score 10.370771
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038363 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1391
[LightGBM] [Info] Number of data points in the train set: 368864, number of used features: 10
[LightGBM] [Info] Start training from score 9.128428
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001858 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11

In [None]:
import pandas as pd
import numpy as np
import joblib
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
import os

# === CONFIG === #
# Columns that get label-encoded.
categorical_cols = ['make', 'model', 'trim', 'state', 'color', 'interior']

# Numeric columns present in the raw data, and the derived ones added by feature engineering.
original_numerical = ['year', 'condition', 'odometer', 'mmr']
interaction_cols = ['age', 'odometer_per_year', 'mmr_odometer_ratio', 'year_condition_interaction']

# Everything that is scaled: raw numerics first, derived features after them.
numerical_cols = [*original_numerical, *interaction_cols]

# K-Means clusters on the raw numeric columns only.
cluster_features = ['year', 'condition', 'odometer', 'mmr']

# === FEATURE ENGINEERING === #
def add_interaction_features(df, reference_year=2024):
    """Return a copy of ``df`` with four derived feature columns added.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``year``, ``odometer``, ``mmr`` and ``condition``. Not modified.
    reference_year : int, default 2024
        Year the vehicle age is measured from. The default reproduces the previously
        hard-coded value.

    Returns
    -------
    pd.DataFrame
        New frame with ``age``, ``odometer_per_year``, ``mmr_odometer_ratio`` and
        ``year_condition_interaction`` appended in that order (or overwritten in place
        if they already exist).
    """
    age = reference_year - df["year"]
    return df.assign(
        age=age,
        # The 1e-3 offsets keep the divisions finite when age or odometer is zero.
        odometer_per_year=df["odometer"] / (age + 1e-3),
        mmr_odometer_ratio=df["mmr"] / (df["odometer"] + 1e-3),
        year_condition_interaction=df["year"] * df["condition"],
    )

# === PREPROCESSING === #
def preprocess_data(df, categorical_cols, numerical_cols, label_encoders=None, scaler=None, fit=True):
    """Label-encode the categorical columns and scale the numerical columns of a copy of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing every column named in ``categorical_cols`` and ``numerical_cols``.
        It is copied first, so the caller's frame is not modified.
    categorical_cols : list of str
        Columns cast to ``str`` and passed through their encoder's ``transform``.
    numerical_cols : list of str
        Columns passed together through ``scaler.transform``.
    label_encoders : dict, optional
        Mapping of column name to a fitted encoder. Required when ``fit=False`` and
        ``categorical_cols`` is non-empty; ignored (replaced) when ``fit=True``.
    scaler : object with ``transform``, optional
        Fitted scaler. Required when ``fit=False``; ignored (replaced) when ``fit=True``.
    fit : bool, default True
        When True, a fresh ``LabelEncoder`` per categorical column and a fresh
        ``StandardScaler`` are fitted on ``df`` before transforming.

    Returns
    -------
    tuple
        ``(transformed_copy, label_encoders, scaler)``.

    Raises
    ------
    ValueError
        If ``fit=False`` but the encoders or the scaler needed for the transform are missing.
    """
    df = df.copy()
    if fit:
        label_encoders = {col: LabelEncoder().fit(df[col].astype(str)) for col in categorical_cols}
        scaler = StandardScaler().fit(df[numerical_cols])
    else:
        # Fail early with a clear message instead of an opaque error on None further down.
        if len(categorical_cols) > 0 and label_encoders is None:
            raise ValueError("fit=False requires fitted label_encoders for the categorical columns")
        if scaler is None:
            raise ValueError("fit=False requires a fitted scaler")
    for col in categorical_cols:
        df[col] = label_encoders[col].transform(df[col].astype(str))
    df[numerical_cols] = scaler.transform(df[numerical_cols])
    return df, label_encoders, scaler

# === LOAD DATA & FEATURE ENGINEERING === #
# Read the CSV, discard the three unused columns, then append the engineered features.
df = add_interaction_features(
    pd.read_csv("/content/merged_data_2024_column.csv")
    .drop(columns=["sellingprice", "zipcode", "sale_year"])
)

# === CLUSTERING === #
# K-Means (3 clusters) is fitted on the standardised clustering columns; the label is stored per row.
scaler_for_clustering = StandardScaler()
scaled_features = scaler_for_clustering.fit_transform(df[cluster_features])
kmeans = KMeans(n_clusters=3, random_state=42)
df["cluster"] = kmeans.fit_predict(scaled_features)

# Persist both clustering objects next to the notebook.
joblib.dump(kmeans, "kmeans_model.joblib")
joblib.dump(scaler_for_clustering, "scaler_for_clustering.joblib")

# === TRAINING === #
# One model per cluster: cluster 1 is grid-searched, the others use fixed hyperparameters.
os.makedirs("models_by_cluster", exist_ok=True)

for cluster in [0, 1, 2]:
    df_cluster = df.loc[df['cluster'] == cluster].copy()
    y = np.log1p(df_cluster['2024_price'])
    X = df_cluster.drop(columns=['2024_price', 'cluster'])

    # Encoders and scaler are fitted on the whole cluster before the 80/20 split.
    X_encoded, label_encoders, scaler = preprocess_data(X, categorical_cols, numerical_cols, fit=True)
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

    joblib.dump(label_encoders, f"models_by_cluster/label_encoders_cluster_{cluster}.joblib", compress=3)
    joblib.dump(scaler, f"models_by_cluster/scaler_cluster_{cluster}.joblib", compress=3)

    if cluster != 1:
        # Fixed configuration for clusters 0 and 2.
        best_model = lgb.LGBMRegressor(num_leaves=50, n_estimators=100, learning_rate=0.5, random_state=42)
        best_model.fit(X_train, y_train)
    else:
        print("🔍 Tuning LightGBM for Cluster 1...")
        param_grid = {
            'num_leaves': [40, 60, 80],
            'learning_rate': [0.05, 0.1, 0.2],
            'n_estimators': [100, 200],
            'max_depth': [5, 10, 15]
        }

        # 3-fold search scored by negative RMSE (on the log1p target); the best estimator is kept.
        grid = GridSearchCV(
            lgb.LGBMRegressor(random_state=42),
            param_grid,
            cv=3,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1,
            verbose=1,
        )
        grid.fit(X_train, y_train)

        best_model = grid.best_estimator_
        print(f"✅ Best model for cluster 1: {grid.best_params_}")

    joblib.dump({'model': best_model}, f"models_by_cluster/lightgbm_model_cluster_{cluster}.joblib", compress=3)

# === PREDICTION FUNCTION === #
def predict_cluster_and_price(car_input: pd.DataFrame):
    """Assign a car to a K-Means cluster and predict its 2024 price with that cluster's model.

    Parameters
    ----------
    car_input : pd.DataFrame
        Must contain every column in ``cluster_features`` and ``categorical_cols`` plus the
        raw numeric columns the interaction features are derived from. Only the first row's
        result is returned.

    Returns
    -------
    tuple
        ``(cluster, price)``: the cluster label of the first row and its predicted price,
        converted back from log space with ``expm1`` (training used ``log1p``).

    Notes
    -----
    The encoders loaded here raise on category values they were not fitted on
    (presumably a ``ValueError`` from ``LabelEncoder`` — confirm against the installed version).
    """
    # Route the car to a cluster using the in-memory clustering scaler and K-Means model.
    cluster_scaled = scaler_for_clustering.transform(car_input[cluster_features])
    cluster = kmeans.predict(cluster_scaled)[0]

    label_encoders = joblib.load(f"models_by_cluster/label_encoders_cluster_{cluster}.joblib")
    scaler = joblib.load(f"models_by_cluster/scaler_cluster_{cluster}.joblib")
    model = joblib.load(f"models_by_cluster/lightgbm_model_cluster_{cluster}.joblib")['model']

    # Same pipeline as training: derive features, encode, scale. add_interaction_features
    # returns a copy, so car_input is left untouched.
    car = add_interaction_features(car_input)
    for col in categorical_cols:
        car[col] = label_encoders[col].transform(car[col].astype(str))
    car[numerical_cols] = scaler.transform(car[numerical_cols])

    # The model was fitted on the training frame's own column order, not on
    # [categoricals | numericals]. Stacking the two groups positionally feeds the wrong
    # values to each feature, so select the columns by the names the model recorded.
    X_input = car[list(model.feature_name_)]

    y_log_pred = model.predict(X_input)
    y_pred = np.expm1(y_log_pred)
    return cluster, y_pred[0]

# === EVALUATION === #
# Metrics are computed on the 20% hold-out only. Scoring the full cluster would include the
# 80% of rows the model was fitted on and overstate accuracy.
print("\n=== Final Evaluation per Cluster (With Feature Engineering + Tuning) ===")
for cluster in [0, 1, 2]:
    df_cluster = df[df['cluster'] == cluster].copy()
    # df already carries the interaction features (added right after loading), so they are
    # not recomputed here.
    X = df_cluster.drop(columns=['2024_price', 'cluster'])
    y_true = df_cluster['2024_price'].values

    label_encoders = joblib.load(f"models_by_cluster/label_encoders_cluster_{cluster}.joblib")
    scaler = joblib.load(f"models_by_cluster/scaler_cluster_{cluster}.joblib")

    for col in categorical_cols:
        X[col] = label_encoders[col].transform(X[col].astype(str))
    X[numerical_cols] = scaler.transform(X[numerical_cols])

    # Same row count, test_size and random_state as the training split, so this reproduces
    # exactly the rows that were held out from fitting.
    _, X_holdout, _, y_holdout = train_test_split(X, y_true, test_size=0.2, random_state=42)

    model = joblib.load(f"models_by_cluster/lightgbm_model_cluster_{cluster}.joblib")['model']
    # Pass the DataFrame so columns keep the names and order the model was trained with.
    y_log_pred = model.predict(X_holdout)
    y_pred = np.expm1(y_log_pred)

    rmse = np.sqrt(mean_squared_error(y_holdout, y_pred))
    r2 = r2_score(y_holdout, y_pred)
    print(f"Cluster {cluster} - RMSE: {rmse:.2f}, R²: {r2:.4f}")


  df = pd.read_csv("/content/merged_data_2024_column.csv")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059814 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1883
[LightGBM] [Info] Number of data points in the train set: 109297, number of used features: 14
[LightGBM] [Info] Start training from score 10.370771
🔍 Tuning LightGBM for Cluster 1...
Fitting 3 folds for each of 54 candidates, totalling 162 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066176 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2267
[LightGBM] [Info] Number of data points in the train set: 368864, number of used features: 14
[LightGBM] [Info] Start training from score 9.128428
✅ Best model for cluster 1: {'learning_rate': 0.2, 'max_depth': 15, 'n_estimators': 200, 'num_leaves': 80}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010000 seconds.
You c



Cluster 0 - RMSE: 2503.47, R²: 0.9612




Cluster 1 - RMSE: 5595.76, R²: 0.7072




Cluster 2 - RMSE: 2124.32, R²: 0.9717


In [None]:
# A single example listing, written column-by-column, used to smoke-test the predictor.
car = pd.DataFrame({
    'year': [2024],
    'make': ['Kia'],
    'model': ['Sorento'],
    'trim': ['LX'],
    'state': ['ca'],
    'condition': [5],
    'odometer': [0],
    'color': ['white'],
    'interior': ['black'],
    'mmr': [20500],
})

# Returns the assigned cluster and the price prediction for the first (only) row.
cluster_id, predicted_price = predict_cluster_and_price(car)
print(f"Cluster: {cluster_id}, Predicted 2024 Price: ${predicted_price:.2f}")


Cluster: 2, Predicted 2024 Price: $13212.11




In [None]:
import pandas as pd

# Sample data: five illustrative rows with the same columns as the clustered training frame.
data = {
    'year': [2015, 2015, 2014, 2015, 2014],
    'make': ['Kia', 'Kia', 'BMW', 'Volvo', 'BMW'],
    'model': ['Sorento', 'Sorento', '3 Series', 'S60', '6 Series Gran Coupe'],
    'trim': ['LX', 'LX', '328i SULEV', 'T5', '650i'],
    'state': ['CA', 'CA', 'CA', 'CA', 'CA'],
    'condition': [5.0, 5.0, 4.5, 4.1, 4.3],
    'odometer': [16639.0, 9393.0, 1331.0, 14282.0, 2641.0],
    'color': ['white', 'white', 'gray', 'white', 'gray'],
    'interior': ['black', 'beige', 'black', 'black', 'black'],
    'mmr': [20500.0, 20800.0, 31900.0, 27500.0, 66000.0],
    '2024_price': [28666.67, 28666.67, 39949.37, 36953.16, 89333.33],
    'cluster': [2, 2, 0, 0, 0]
}

# Create DataFrame under its own name. Binding it to `df` would replace the full training
# frame that the training and evaluation cells read, breaking any re-run of those cells.
sample_df = pd.DataFrame(data)

# Display DataFrame
print(sample_df)


In [None]:
import requests

API_URL = "https://autohaggleapi.onrender.com/predict"
# Upper bound on how long to wait for the service. Without a timeout, requests.post can
# block the cell indefinitely if the host never answers.
REQUEST_TIMEOUT_SECONDS = 60

# Payload with the same fields as the local test car, but for a 2015 model year.
test_data = {
    "year": 2015,
    "make": "Kia",
    "model": "Sorento",
    "trim": "LX",
    "state": "ca",
    "condition": 5,
    "odometer": 0,
    "color": "white",
    "interior": "black",
    "mmr": 20500
}

try:
    response = requests.post(API_URL, json=test_data, timeout=REQUEST_TIMEOUT_SECONDS)
except requests.RequestException as exc:
    # Connection failures and timeouts are reported the same way as HTTP errors below.
    print("Error: request failed:", exc)
else:
    if response.status_code == 200:
        print("Success! Prediction received:")
        print(response.json())
    else:
        print("Error:", response.status_code, response.text)


Success! Prediction received:
{'cluster': 2, 'predicted_price': 12538.93}


In [None]:
# Bundle the models_by_cluster directory (per-cluster encoders, scalers and models) into one archive.
# NOTE(review): kmeans_model.joblib and scaler_for_clustering.joblib are dumped outside
# models_by_cluster/ by the training cell, so they are not part of this zip — confirm whether
# whatever consumes the archive needs them as well.
!zip -r /content/models_by_cluster.zip /content/models_by_cluster

  adding: content/models_by_cluster/ (stored 0%)
  adding: content/models_by_cluster/lightgbm_model_cluster_1.joblib (deflated 0%)
  adding: content/models_by_cluster/scaler_cluster_1.joblib (stored 0%)
  adding: content/models_by_cluster/lightgbm_model_cluster_0.joblib (deflated 0%)
  adding: content/models_by_cluster/label_encoders_cluster_2.joblib (stored 0%)
  adding: content/models_by_cluster/lightgbm_model_cluster_2.joblib (deflated 0%)
  adding: content/models_by_cluster/label_encoders_cluster_1.joblib (deflated 0%)
  adding: content/models_by_cluster/scaler_cluster_2.joblib (stored 0%)
  adding: content/models_by_cluster/label_encoders_cluster_0.joblib (stored 0%)
  adding: content/models_by_cluster/scaler_cluster_0.joblib (stored 0%)
