In [36]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler,minmax_scale
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt 

In [25]:
# Load the pre-processed vehicle listings; the path is relative to the notebook's working directory.
df = pd.read_csv("vehicles_model_ready_data.csv")

In [26]:
# Discard two columns that are not used as model inputs.
# Not idempotent: re-running after the columns are gone raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'paint_encoded'])

In [27]:
# NOTE(review): disabled alternative that integer label-encodes 'manufacturer'
# and 'model' with sklearn's LabelEncoder. Presumably superseded by the
# 'category' dtype conversion in the following cell; confirm, then delete.
#col_cat=['manufacturer','model']
#le = LabelEncoder()
#for col in col_cat:
#    df[col] = le.fit_transform(df[col])

In [28]:
# Cast the identifier columns and the one-hot indicator columns to pandas'
# 'category' dtype in a single astype call.
col_cat = [
    'manufacturer', 'model',
    'fuel_diesel', 'fuel_electric', 'fuel_gas', 'fuel_hybrid', 'fuel_other',
    'transmission_automatic', 'transmission_manual', 'transmission_other',
    'drive_4wd', 'drive_fwd', 'drive_rwd',
    'type_SUV', 'type_bus', 'type_convertible', 'type_coupe',
    'type_hatchback', 'type_mini-van', 'type_offroad', 'type_other',
    'type_pickup', 'type_sedan', 'type_truck', 'type_van', 'type_wagon',
]
df = df.astype({name: 'category' for name in col_cat}, copy=False)

In [29]:
# Print column dtypes and non-null counts to check the frame after the dtype conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81424 entries, 0 to 81423
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   price                   81424 non-null  int64   
 1   manufacturer            81424 non-null  category
 2   model                   81424 non-null  category
 3   condition               81424 non-null  float64 
 4   cylinders               81424 non-null  float64 
 5   odometer                81424 non-null  float64 
 6   title_status            81424 non-null  float64 
 7   age                     81424 non-null  float64 
 8   region_enc              81424 non-null  float64 
 9   fuel_diesel             81424 non-null  category
 10  fuel_electric           81424 non-null  category
 11  fuel_gas                81424 non-null  category
 12  fuel_hybrid             81424 non-null  category
 13  fuel_other              81424 non-null  category
 14  transmission_automatic

In [30]:
# Min-max scale the odometer readings, then take the square root.
# Gotchas:
#   * the column is overwritten with a function of itself, so re-running this
#     cell transforms the already-transformed values again;
#   * the scaling range is computed on the full frame, i.e. before the
#     train/test split performed later in the notebook.
odometer_scaled = minmax_scale(df["odometer"])
df["odometer"] = np.sqrt(odometer_scaled)

In [31]:
# Target is the 'price' column; every other column is used as a feature.
y = df["price"]
X = df.drop(columns=["price"])

In [32]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)

In [33]:
# Baseline model: gradient-boosted decision trees with the 'regression'
# objective. Only these two options are passed explicitly; no random_state is
# set on the estimator.
lgbm_base = lgbm.LGBMRegressor(objective='regression', boosting_type='gbdt')
lgbm_base.fit(X_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

In [34]:
# Predict prices for the held-out rows with the fitted baseline model.
lgbm_base_pred = lgbm_base.predict(X_test)

In [37]:
# Evaluate the baseline predictions on the held-out set.
# MSE is computed once and reused for the RMSE line.
mae = metrics.mean_absolute_error(y_test, lgbm_base_pred)
mse = metrics.mean_squared_error(y_test, lgbm_base_pred)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_test, lgbm_base_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('R2: ', r2)

Mean Absolute Error: 1446.8984910428094
Mean Squared Error: 5106361.933806228
Root Mean Squared Error: 2259.7260749494017
R2:  0.9481439024138041


In [None]:
# Scale each feature importance of the fitted model by a constant factor of
# 1,000,000 and collect the results in a plain list (same order as the model's
# feature_importances_ array).
fscores = [importance * 1000000 for importance in lgbm_base.feature_importances_]

In [None]:
# Label each scaled importance with its feature name, then order the Series
# from smallest to largest for plotting.
index_columns = X.columns
coef = pd.Series(data=fscores, index=index_columns)
imp_coef = coef.sort_values(ascending=True)

In [None]:
# Horizontal bar chart of the baseline model's feature importances, one bar per
# entry of `imp_coef`.
matplotlib.rcParams['figure.figsize'] = (20, 10)  # global default figure size, affects later plots too

ax = imp_coef.plot( kind = "barh" )
ax.set_ylabel('',fontdict={'fontsize':4})
# Fix: the title used to say "Catboost", but these importances come from
# lgbm_base, which is a LightGBM LGBMRegressor.
plt.title( "Feature importance using LightGBM Regression Model" )
# NOTE(review): disabled export; the file name still refers to XGBoost and the
# path is an absolute local path. Revise both before re-enabling.
#plt.savefig('/home/ubuntu/01-Nuera/charts/XGBoost_Tuned_Features_Importance.jpg')
plt.show()