In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.base import clone
import re

import optuna
from optuna.samplers import TPESampler

from sklearn.model_selection import *
from sklearn.preprocessing import *

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from sklearn.metrics import *

pd.set_option('display.max_columns', None)
from IPython.display import clear_output
from tqdm import tqdm, trange
from tabulate import tabulate
import random
import time
import logging
from IPython.display import display
from IPython.display import display, HTML
from colorama import Fore
from datetime import datetime
from sklearn.ensemble import *

In [2]:
%%time

train = pd.read_csv('delete_USA_cars/train.csv')
test = pd.read_csv('delete_USA_cars/test.csv')



def update(df):
    
    t = 100
    
    df['accident'] = df['accident'].map({
        'None reported': 'not_reported',
        'At least 1 accident or damage reported': 'reported'
    })
    df['transmission'] = df['transmission'].str.replace('/', '').str.replace('-', '')
    df['transmission'] = df['transmission'].str.replace(' ', '_')
    
    cat_c = ['brand','model','fuel_type','engine','transmission','ext_col','int_col','accident','clean_title']
    re_ = ['model','engine','transmission','ext_col','int_col']
    
    for col in re_:
        df.loc[df[col].value_counts(dropna=False)[df[col]].values < t, col] = "noise"
        
    for col in cat_c:
        df[col] = df[col].fillna('missing')
        df[col] = df[col].astype('category')
        
    return df

train  = update(train)
test   = update(test)

CPU times: user 634 ms, sys: 60.5 ms, total: 694 ms
Wall time: 730 ms


In [3]:
%%time

def feature(df):
    current_year = datetime.now().year

    df['Vehicle_Age'] = current_year - df['model_year']

    df['Mileage_per_Year'] = df['milage'] / df['Vehicle_Age']

    def extract_horsepower(engine):
        try:
            return float(engine.split('HP')[0])
        except:
            return None

    def extract_engine_size(engine):
        try:
            return float(engine.split(' ')[1].replace('L', ''))
        except:
            return None

    df['Horsepower'] = df['engine'].apply(extract_horsepower)
    df['Engine_Size'] = df['engine'].apply(extract_engine_size)
    df['Power_to_Weight_Ratio'] = df['Horsepower'] / df['Engine_Size']

    luxury_brands =  ['Mercedes-Benz', 'BMW', 'Audi', 'Porsche', 'Land', 
                    'Lexus', 'Jaguar', 'Bentley', 'Maserati', 'Lamborghini', 
                    'Rolls-Royce', 'Ferrari', 'McLaren', 'Aston', 'Maybach']
    df['Is_Luxury_Brand'] = df['brand'].apply(lambda x: 1 if x in luxury_brands else 0)

    df['Accident_Impact'] = df.apply(lambda x: 1 if x['accident'] == 1 and x['clean_title'] == 0 else 0, axis=1)
    
    return df

train = feature(train)
test = feature(test)

CPU times: user 533 ms, sys: 27.9 ms, total: 561 ms
Wall time: 581 ms


In [4]:
train.drop_duplicates(inplace = True)

In [5]:
train.shape

(192542, 21)

In [6]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,Vehicle_Age,Mileage_per_Year,Horsepower,Engine_Size,Power_to_Weight_Ratio,Is_Luxury_Brand,Accident_Impact
0,0,0.0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,AT,Yellow,Gray,missing,Yes,4200,17,12529.411765,172.0,1.6,107.5,0,0
1,1,1.0,Lincoln,noise,2002,143250,Gasoline,noise,AT,Silver,Beige,missing,Yes,4999,22,6511.363636,,,,0,0
2,2,2.0,Chevrolet,noise,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,AT,Blue,Gray,missing,Yes,13900,22,6215.045455,320.0,5.3,60.377358,0,0
3,3,3.0,Genesis,noise,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission_wDual_Shift_Mode,Black,Black,missing,Yes,45000,7,2785.714286,420.0,5.0,84.0,0,0
4,4,4.0,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7Speed_AT,Black,Beige,missing,Yes,97500,3,2462.666667,208.0,2.0,104.0,1,0


In [7]:
train.drop(columns = ['Unnamed: 0', 'id'], inplace = True)

In [8]:
test.drop(columns = ['Unnamed: 0', 'id'], inplace = True)

In [9]:
%%time

X = train.drop(['price'], axis=1)
y = train['price']
cat_features = X.select_dtypes(include=['category']).columns.tolist()

SEED = 601
n_splits = 5

def Train_ML(X, y, model, test, n_splits=n_splits):
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    rmse_scores = []
    test_preds = np.zeros((test.shape[0], n_splits)) 
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        val_predictions = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
        rmse_scores.append(rmse)
        
        test_preds[:, fold] = model.predict(test)
    
    avg_rmse = np.mean(rmse_scores)
    
    mean_test_preds = np.mean(test_preds, axis=1)
    
    header = f"\n{'Final Validation RMSE:':<25} {avg_rmse:.5f}\n"
    print(header)
    
    return mean_test_preds

lgb_params ={'learning_rate': 0.017521301504983752, 'max_depth': 42, 'reg_alpha': 0.06876635751774487, 
 'reg_lambda': 9.738899198284985, 'num_leaves': 131, 'subsample': 0.2683765421728044, 
 'colsample_bytree': 0.44346036599709887} 

params1 = {'learning_rate': 0.015387355282525047, 'num_leaves': 287, 'max_depth': 10, 'min_child_samples': 32,
 'subsample': 0.5678602068076838, 'colsample_bytree': 0.5254867750210618, 'reg_alpha': 8.515713311140541e-05,
 'reg_lambda': 9.929128235845939, 'scale_pos_weight': 1.031529653438031, 'max_bin': 2894,
 'min_split_gain': 8.135732868325528e-05, 'min_child_weight': 0.9684228603448732,'boosting_type': 'gbdt',
 'objective': 'regression','metric': 'rmse'}

params2 = {'learning_rate': 0.01383980028736371, 'num_leaves': 185, 'max_depth': 11,
'min_child_samples': 47, 'subsample': 0.5179868722209913, 'colsample_bytree': 0.5099485937874763,
'reg_alpha': 2.6213681351209853e-08, 'reg_lambda': 5.24590000227586, 'scale_pos_weight': 3.6179756319298417, 
'max_bin': 2910, 'min_split_gain': 0.007117579745695504, 'min_child_weight': 0.21739378651138622}

lgb1 = LGBMRegressor(**lgb_params, random_state=SEED, verbose=-1, n_estimators=200)
lgb2 = LGBMRegressor(**params1, random_state=SEED, verbose=-1, n_estimators=200)
lgb3 = LGBMRegressor(**params2, random_state=SEED, verbose=-1, n_estimators=200)


estimator1 = VotingRegressor([
    ('LGBM_Tunned', lgb1), 
    ('LGBM_Tunned_1', lgb2), 
    ('LGBM_Tunned_2', lgb3)
], weights=[0.2, 0.7, 0.1])

vmp = Train_ML(X, y, estimator1, test)


Final Validation RMSE:    72505.12125

CPU times: user 4min 20s, sys: 36.5 s, total: 4min 57s
Wall time: 39.9 s


In [10]:
import joblib as jo


In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df):
    df_processed = df.copy()
    categorical_columns = df_processed.select_dtypes(include=['object', 'category']).columns
    
    le = LabelEncoder()
    
    for col in categorical_columns:
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        
    jo.dump(le,'scaler.joblib')
    
    return df_processed

def Train_ML(X, y, model, test, n_splits=5):
    SEED = 601
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    rmse_scores = []
    test_preds = np.zeros((test.shape[0], n_splits))
    
    # Preprocess X and test data
    X_processed = preprocess_data(X)
    test_processed = preprocess_data(test)
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_processed)):
        X_train, X_val = X_processed.iloc[train_idx], X_processed.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        val_predictions = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
        rmse_scores.append(rmse)
        
        test_preds[:, fold] = model.predict(test_processed)
    
    avg_rmse = np.mean(rmse_scores)
    
    mean_test_preds = np.mean(test_preds, axis=1)
    
    header = f"\n{'Final Validation RMSE:':<25} {avg_rmse:.5f}\n"
    print(header)
    
    return mean_test_preds

# Assume train and test are your DataFrames
X = train.drop(['price'], axis=1)
y = train['price']

lgb_params = {'learning_rate': 0.017521301504983752, 'max_depth': 42, 'reg_alpha': 0.06876635751774487, 
              'reg_lambda': 9.738899198284985, 'num_leaves': 131, 'subsample': 0.2683765421728044, 
              'colsample_bytree': 0.44346036599709887}

params1 = {'learning_rate': 0.015387355282525047, 'num_leaves': 287, 'max_depth': 10, 'min_child_samples': 32,
           'subsample': 0.5678602068076838, 'colsample_bytree': 0.5254867750210618, 'reg_alpha': 8.515713311140541e-05,
           'reg_lambda': 9.929128235845939, 'scale_pos_weight': 1.031529653438031, 'max_bin': 2894,
           'min_split_gain': 8.135732868325528e-05, 'min_child_weight': 0.9684228603448732, 'boosting_type': 'gbdt',
           'objective': 'regression', 'metric': 'rmse'}

params2 = {'learning_rate': 0.01383980028736371, 'num_leaves': 185, 'max_depth': 11,
           'min_child_samples': 47, 'subsample': 0.5179868722209913, 'colsample_bytree': 0.5099485937874763,
           'reg_alpha': 2.6213681351209853e-08, 'reg_lambda': 5.24590000227586, 'scale_pos_weight': 3.6179756319298417, 
           'max_bin': 2910, 'min_split_gain': 0.007117579745695504, 'min_child_weight': 0.21739378651138622}

lgb1 = LGBMRegressor(**lgb_params, random_state=SEED, verbose=-1, n_estimators=200,n_jobs=-1)
lgb2 = LGBMRegressor(**params1, random_state=SEED, verbose=-1, n_estimators=200,n_jobs=-1)
lgb3 = LGBMRegressor(**params2, random_state=SEED, verbose=-1, n_estimators=200,n_jobs=-1)

estimator1 = VotingRegressor([
    ('LGBM_Tunned', lgb1), 
    ('LGBM_Tunned_1', lgb2), 
    ('LGBM_Tunned_2', lgb3)
], weights=[0.2, 0.7, 0.1])

vmp = Train_ML(X, y, estimator1, test)


Final Validation RMSE:    72490.09187



In [12]:
test

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,Vehicle_Age,Mileage_per_Year,Horsepower,Engine_Size,Power_to_Weight_Ratio,Is_Luxury_Brand,Accident_Impact
0,Land,noise,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6Speed_AT,White,Beige,missing,Yes,9,10888.888889,240.0,2.0,120.000000,1,0
1,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8Speed_AT,Silver,Black,missing,Yes,4,2285.500000,395.0,3.0,131.666667,1,0
2,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10Speed_Automatic,White,Ebony,missing,missing,2,14060.500000,,,,0,0
3,Audi,noise,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,noise,Black,missing,missing,8,7657.250000,,,,1,0
4,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,AT,Gray,Black,missing,Yes,6,9833.333333,252.0,2.0,126.000000,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125685,Mercedes-Benz,GL-Class GL 450 4MATIC,2014,83315,Gasoline,362.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,7Speed_AT,Black,Black,missing,Yes,10,8331.500000,362.0,3.0,120.666667,1,0
125686,Audi,Q7 55 Prestige,2019,29336,Gasoline,3.0 Liter Turbo,Automatic,White,Black,missing,missing,5,5867.200000,,,,1,0
125687,Audi,A6 3.0T Premium Plus,2012,77634,Gasoline,333.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,AT,Black,Black,missing,Yes,12,6469.500000,333.0,3.0,111.000000,1,0
125688,Audi,noise,2012,112000,Gasoline,333.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,AT,Black,Black,missing,Yes,12,9333.333333,333.0,3.0,111.000000,1,0


In [13]:
# df_processed = df.copy()


NameError: name 'df' is not defined

In [57]:
# categorical_columns = test.select_dtypes(include=['object', 'category']).columns
# categorical_columns

Index(['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col',
       'int_col', 'accident', 'clean_title'],
      dtype='object')

In [37]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [39]:
# y_predict_train = estimator1.predict(x_train)

In [40]:
X.columns

Index(['brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'Vehicle_Age', 'Mileage_per_Year', 'Horsepower', 'Engine_Size',
       'Power_to_Weight_Ratio', 'Is_Luxury_Brand', 'Accident_Impact'],
      dtype='object')

In [14]:
import joblib as jo
jo.dump(estimator1,'usa_model.joblib')

['usa_model.joblib']

In [None]:
srgrehgreherhre.(ASda)

In [2]:
import requests

In [None]:
url = "https://thammenha.onrender.com/"

response = requests.get(url)
print(response.json())

In [9]:
url = "https://thammenha.onrender.com/predict/"

params = {
    'brand': 'MINI',
    'model': 'Cooper S Base',
    'model_year': 2011,
    'milage': 15000,
    'fuel_type': 'Gasoline',
    'engine': '190.0HP 2.0L 4 Cylinder Engine',
    'transmission': 'AT',
    'ext_col': 'Black',
    'int_col': 'Black',
    'accident': 'not_reported',
    'clean_title': 'Yes',
    'Vehicle_Age': 6,
    'Mileage_per_Year': 1234.123,
    'Horsepower': 190.0,
    'Engine_Size': 2.0,
    'Power_to_Weight_Ratio': 110.5,
    'Is_Luxury_Brand': 0,
    'Accident_Impact': 0

}

response = requests.get(url, params=params)
print(response.json())

{'detail': 'Method Not Allowed'}


brand    model    model_year    milage    fuel_type    engine    transmission    ext_col    int_col    accident    clean_title    Vehicle_Age    Mileage_per_Year    Horsepower    Engine_Size    Power_to_Weight_Ratio    Is_Luxury_Brand    Accident_Impact
0    

MINI    Cooper S Base    2007    213000    Gasoline    172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel    AT    Yellow    Gray    not_reported    Yes    17    12529.411765    172.0    1.6    107.5    0    0


In [48]:
X.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,Vehicle_Age,Mileage_per_Year,Horsepower,Engine_Size,Power_to_Weight_Ratio,Is_Luxury_Brand,Accident_Impact
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,AT,Yellow,Gray,missing,Yes,17,12529.411765,172.0,1.6,107.5,0,0
1,Lincoln,noise,2002,143250,Gasoline,noise,AT,Silver,Beige,missing,Yes,22,6511.363636,,,,0,0
2,Chevrolet,noise,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,AT,Blue,Gray,missing,Yes,22,6215.045455,320.0,5.3,60.377358,0,0
3,Genesis,noise,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission_wDual_Shift_Mode,Black,Black,missing,Yes,7,2785.714286,420.0,5.0,84.0,0,0
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7Speed_AT,Black,Beige,missing,Yes,3,2462.666667,208.0,2.0,104.0,1,0


In [3]:
url = "http://0.0.0.0:8000/predict"

params = {
    'brand': 'MINI',
    'model': 'Cooper S Base',
    'model_year': 2011,
    'milage': 15000,
    'fuel_type': 'Gasoline',
    'engine': '190.0HP 2.0L 4 Cylinder Engine Gasoline Fuel',
    'transmission': 'AT',
    'ext_col': 'Black',
    'int_col': 'Black',
    'accident': 'not_reported',
    'clean_title': 'Yes',
    'Vehicle_Age': 6,
    'Mileage_per_Year': 1234.123,
    'Horsepower': 190.0,
    'Engine_Size': 2.0,
    'Power_to_Weight_Ratio': 110.5,
    'Is_Luxury_Brand': 0,
    'Accident_Impact': 0

}

response = requests.post(url, params=params)
print(response.json())

87934
