In [22]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.3-py3-none-win_amd64.whl (1.0 MB)


You should consider upgrading via the 'C:\Users\somnath.ojha\Anaconda3\python.exe -m pip install --upgrade pip' command.


Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.3


# Getting Started

In [23]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
import lightgbm as lgb

from sklearn.metrics import mean_squared_error

In [3]:
data = pd.read_excel('used_car_pred.xlsx')
data.sample(5)

Unnamed: 0,id,price,mark,model,year,mileage,engine_capacity,transmission,drive,hand_drive,fuel
318,322,620,mazda,demio,2010,111132,1300,at,2wd,rhd,gasoline
156,158,490,toyota,passo,2010,96000,1000,at,2wd,rhd,gasoline
2274,2292,1390,toyota,avensis,2004,20000,2000,at,2wd,rhd,gasoline
325,329,625,daihatsu,boon,2005,86154,1300,at,2wd,rhd,gasoline
1490,1506,1122,toyota,regius wagon,1999,75012,2700,at,2wd,rhd,gasoline


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2318 entries, 0 to 2317
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               2318 non-null   int64 
 1   price            2318 non-null   int64 
 2   mark             2318 non-null   object
 3   model            2318 non-null   object
 4   year             2318 non-null   int64 
 5   mileage          2318 non-null   int64 
 6   engine_capacity  2318 non-null   int64 
 7   transmission     2318 non-null   object
 8   drive            2318 non-null   object
 9   hand_drive       2318 non-null   object
 10  fuel             2318 non-null   object
dtypes: int64(5), object(6)
memory usage: 199.3+ KB


# Preprocessing

In [5]:
data.isna().sum()

id                 0
price              0
mark               0
model              0
year               0
mileage            0
engine_capacity    0
transmission       0
drive              0
hand_drive         0
fuel               0
dtype: int64

In [6]:
unneeded_columns = ['id']

data = data.drop(unneeded_columns, axis=1)

In [7]:
data

Unnamed: 0,price,mark,model,year,mileage,engine_capacity,transmission,drive,hand_drive,fuel
0,80,nissan,march,2003,80000,1240,at,2wd,rhd,gasoline
1,110,nissan,march,2010,53000,1200,at,2wd,rhd,gasoline
2,165,nissan,lafesta,2005,47690,2000,at,2wd,rhd,gasoline
3,190,toyota,avensis,2008,130661,1990,at,2wd,rhd,gasoline
4,190,daihatsu,mira,2006,66300,660,at,2wd,rhd,gasoline
...,...,...,...,...,...,...,...,...,...,...
2313,1400,toyota,vitz,2009,121000,996,at,2wd,rhd,gasoline
2314,1400,toyota,estima,2003,101000,3000,at,2wd,rhd,gasoline
2315,1400,subaru,r2,2005,101000,660,cvt,2wd,rhd,gasoline
2316,1400,honda,z,2000,170000,660,at,4wd,rhd,gasoline


In [8]:
{column: len(data[column].unique()) for column in data.columns if data.dtypes[column] == 'object'}

{'mark': 28,
 'model': 258,
 'transmission': 3,
 'drive': 3,
 'hand_drive': 3,
 'fuel': 5}

In [9]:
data = data.drop('model', axis=1)

In [14]:
data = pd.get_dummies(data)

In [15]:
data

Unnamed: 0,price,year,mileage,engine_capacity,mark_audi,mark_bmw,mark_chrysler,mark_citroen,mark_daihatsu,mark_ford,...,drive_4wd,drive_awd,hand_drive_center,hand_drive_lhd,hand_drive_rhd,fuel_cng,fuel_diesel,fuel_gasoline,fuel_hybrid,fuel_lpg
0,80,2003,80000,1240,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,110,2010,53000,1200,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,165,2005,47690,2000,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,190,2008,130661,1990,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,190,2006,66300,660,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2313,1400,2009,121000,996,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2314,1400,2003,101000,3000,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2315,1400,2005,101000,660,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2316,1400,2000,170000,660,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0


# Splitting and Scaling

In [16]:
y = data.loc[:, 'price']
X = data.drop('price', axis=1)

In [17]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=34)

# Training

In [19]:
lin_model = LinearRegression()

lin_model.fit(X_train, y_train)

lin_y_preds = lin_model.predict(X_test)

In [24]:
lgb_model = lgb.LGBMRegressor(
    boosting_type='gbdt',
    num_leaves=31,
    n_estimators=100,
    reg_lambda=1.0
)

lgb_model.fit(X_train, y_train)

lgb_y_preds = lgb_model.predict(X_test)

In [25]:
lin_loss = np.sqrt(mean_squared_error(y_test, lin_y_preds))
lgb_loss = np.sqrt(mean_squared_error(y_test, lgb_y_preds))

In [26]:
print("Linear Regression RMSE:", lin_loss)
print("Gradient Boosted RMSE:", lgb_loss)

Linear Regression RMSE: 69561865684147.63
Gradient Boosted RMSE: 245.86541213229475


In [27]:
print("Linear Regression R^2 Score:", lin_model.score(X_test, y_test))
print("Gradient Boosted R^2 Score:", lgb_model.score(X_test, y_test))

Linear Regression R^2 Score: -6.413742665349091e+22
Gradient Boosted R^2 Score: 0.19875752700334026
