In [1]:
# This cell installs the GPU version of LightGBM
# Fetch the LightGBM sources together with their git submodules.
# When /content/LightGBM already exists (e.g. the cell is re-run) the clone
# reports a fatal error, but the remaining commands still run against the
# existing checkout.
! git clone --recursive https://github.com/Microsoft/LightGBM

%cd /content/LightGBM
! mkdir -p build

# Configure the build with the GPU backend switched on (USE_GPU=1) and compile
# with one make job per available core.
%cd build
! cmake -DUSE_GPU=1 /content/LightGBM
! make -j$(nproc)
# Make sure pip is available, then upgrade the Python packages listed below.
! sudo apt-get -y install python-pip
! sudo -H pip install setuptools numpy scipy scikit-learn -U

# Install the LightGBM Python package.
# NOTE(review): --precompile is expected to reuse the library built above
# instead of compiling again — confirm against the LightGBM install docs.
%cd /content/LightGBM/python-package
! sudo python setup.py install --precompile

# Go back to /content/, where the following cells download and read their files.
%cd /content/

fatal: destination path 'LightGBM' already exists and is not an empty directory.
/content/LightGBM
/content/LightGBM/build
-- OpenCL include directory: /usr/include
-- Boost version: 1.65.1
-- Found the following Boost libraries:
--   filesystem
--   system
-- Using _mm_prefetch
-- Using _mm_malloc
-- Configuring done
-- Generating done
-- Build files have been written to: /content/LightGBM/build
[  2%] Built target lightgbm_capi_objs
[ 89%] Built target lightgbm_objs
[ 94%] Built target _lightgbm
[100%] Built target lightgbm
Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-pip is already the newest version (9.0.1-2.3~ubuntu1.18.04.5).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.
/content/LightGBM/python-package
running install
running build
running build_py
running egg_info
writing lightgbm.egg-info/PKG-INFO
writing dependency_links to lightgbm.egg-info/dependency_links.txt
writing requirements to lightgbm.egg-in

In [2]:
# Install gdown (used by the next cells to download files from Google Drive)
# and catboost.
# NOTE(review): catboost is not referenced directly in the visible cells —
# presumably it is needed by the components package or the pickled model; confirm.
# NOTE(review): versions are not pinned, so a fresh run may pull newer releases.
!pip install catboost gdown



In [3]:
!gdown --id 1-6TLVgf4ksqyaSkodFMtnJdsDcc86A_6 #* download dataset

Downloading...
From: https://drive.google.com/uc?id=1-6TLVgf4ksqyaSkodFMtnJdsDcc86A_6
To: /content/used_cars_data.csv
100% 9.98G/9.98G [01:36<00:00, 104MB/s] 


In [4]:
!gdown --id 18cueGRNfsaQo5UnWERD03cdxMXz1yFjd #* pretrained model

Downloading...
From: https://drive.google.com/uc?id=18cueGRNfsaQo5UnWERD03cdxMXz1yFjd
To: /content/model
100% 2.50G/2.50G [00:22<00:00, 109MB/s]


In [5]:
!gdown --id 1gPBKmgaGccrf3T2AKDYNjaDZZzpZBL4I #* testset index
!gdown --id 1RzcPJStyqsfk643PfpScoMM5RflnyluK #* trainset index

Downloading...
From: https://drive.google.com/uc?id=1gPBKmgaGccrf3T2AKDYNjaDZZzpZBL4I
To: /content/train_index
100% 19.2M/19.2M [00:00<00:00, 117MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1RzcPJStyqsfk643PfpScoMM5RflnyluK
To: /content/test_index
100% 4.80M/4.80M [00:00<00:00, 42.3MB/s]


In [6]:
#* Download code of components
# Clones the project repository into ./DS-5220-Final-Project; a later cell adds
# that directory to sys.path so the `components` package can be imported.
!git clone https://github.com/Wp-Zhang/DS-5220-Final-Project.git

Cloning into 'DS-5220-Final-Project'...
remote: Enumerating objects: 29, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 29 (delta 6), reused 20 (delta 3), pack-reused 0[K
Unpacking objects: 100% (29/29), done.


In [7]:
import sys

#* Add components to the system path so they can be imported
# Guarded so that re-running the cell does not append duplicate entries.
components_dir = '/content/DS-5220-Final-Project/'
if components_dir not in sys.path:
    sys.path.append(components_dir)

In [8]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

import warnings
import pickle

In [9]:
from components.data_preprocessor import DataPreprocessor
from components.feature_engineer import FeatureEngineer
from components.learner import Learner

In [10]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None
# Silence all warnings for the remainder of the session.
warnings.filterwarnings(action='ignore')

In [11]:
# Directory holding the downloaded dataset, split indices and model; it is also
# where the prediction file is written. Paths are built by plain string
# concatenation, so the trailing slash is required.
data_loc = "/content/"

In [12]:
#* Load original data
# Reads the full CSV into memory as a single DataFrame (the download above
# reports the file as roughly 10 GB). The train/test split happens in later cells.
df = pd.read_csv(data_loc + "used_cars_data.csv")

In [13]:
#* Load train and test index
# NOTE: pickle.load can execute arbitrary code; these files are the ones
# downloaded above and are treated as trusted here.
# `with` blocks make sure the file handles are closed after loading.
with open(data_loc + "train_index", 'rb') as index_file:
    train_index = pickle.load(index_file)
with open(data_loc + "test_index", 'rb') as index_file:
    test_index = pickle.load(index_file)

# Flag every row as training data, then clear the flag on the test rows.
# A single .loc assignment replaces the chained `df['is_train'][test_index] = 0`,
# which raises SettingWithCopyWarning and silently does nothing once pandas
# Copy-on-Write is enabled.
df['is_train'] = 1
df.loc[test_index, 'is_train'] = 0

In [14]:
# Instantiate the project helpers imported from the components package; both
# are created with their default arguments.
data_preprocessor = DataPreprocessor()
feat_engineer = FeatureEngineer()

In [15]:
# Run the preprocessing pipeline and then feature generation on the full frame
# (train and test rows together), rebinding `df` to each result.
# According to the log below, preprocessing drops, cleans, retypes and imputes
# data, drops rows that still contain NA, and downcasts dtypes to save memory.
# NOTE(review): `df` is overwritten, so re-running this cell feeds already
# processed data back in; reload the CSV first if a clean re-run is needed.
df = data_preprocessor.preprocess(df)
df = feat_engineer.generate_feats(df)

Dropping useless data...
Dropping useless data is done
Cleaning data...
Cleaning data is done
Transforming feature type...
Transforming feature type is done
Imputing data...
Imputing data is done
524430 rows with na are dropped
Reducing memory usage...
Memory usage of dataframe is : 641.0807905197144  MB
******************************
Column:  back_legroom
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  city_fuel_economy
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  engine_displacement
dtype before:  float64
dtype after:  uint16
******************************
******************************
Column:  front_legroom
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  fuel_tank_volume
dtype before:  float64
dtype after:  float32
******************************
******************************
Co

In [16]:
# Split the processed frame into its train and test partitions using the
# is_train flag; each partition gets a fresh 0..n-1 index.
train_rows = df['is_train'] == 1
test_rows = df['is_train'] == 0
df_train = df.loc[train_rows].reset_index(drop=True)
df_test = df.loc[test_rows].reset_index(drop=True)

In [17]:
# Name of the column the models predict.
target = 'price'

# Categorical features: every object-typed column except the 'vin' identifier.
cat_feats = [
    col
    for col, col_dtype in df_train.dtypes.items()
    if col_dtype == 'object' and col != 'vin'
]

# Categorical features handed to LightGBM: all of the above except 'model_name'.
lgb_cat_feats = [col for col in cat_feats if col != 'model_name']

# Remaining model inputs: every column that is neither categorical nor one of
# the bookkeeping/target columns listed here.
non_feature_cols = cat_feats + ['vin', 'price', 'is_train', 'fold', 'bins']
feats = [col for col in df_train.columns if col not in non_feature_cols]

In [18]:
# Label-encode the LightGBM categorical columns in the train and test frames.
# NOTE(review): the two frames are encoded in separate calls. If label_encode
# fits a new encoding on each call, the same category could map to different
# integers in train and test — confirm against components.data_preprocessor.
df_train = data_preprocessor.label_encode(df_train, lgb_cat_feats)
df_test = data_preprocessor.label_encode(df_test, lgb_cat_feats)

Label encoding features...
Label encoding features is done
Label encoding features...
Label encoding features is done


In [19]:
# Instantiate a Learner. The following cell rebinds `learner` to the pretrained
# object loaded from the pickled model file, so this instance is not the one
# used for the predictions below.
learner = Learner()

In [20]:
# Replace `learner` with the pretrained object unpickled from the downloaded
# `model` file.
# NOTE(review): pickle.load can execute arbitrary code — only load this file if
# its source is trusted.
with open(data_loc+'model', 'rb') as h:
    learner = pickle.load(h)

In [21]:
# Predictions of every pretrained base model on the training frame, requested
# one model at a time with is_train=True, in the order LR, Ridge, Lasso, DT,
# LGB, XGB. Only LGB receives the label-encoded categorical columns; the other
# models get an empty categorical list.
(
    trn_pred_lr,
    trn_pred_ridge,
    trn_pred_lasso,
    trn_pred_dt,
    trn_pred_lgb,
    trn_pred_xgb,
) = (
    learner.predict(df_train, feats, model_cat_feats, [model_key], is_train=True)
    for model_key, model_cat_feats in (
        ('LR', []),
        ('Ridge', []),
        ('Lasso', []),
        ('DT', []),
        ('LGB', lgb_cat_feats),
        ('XGB', []),
    )
)

Predicting: 1it [00:05,  5.62s/it]
Predicting: 1it [00:04,  4.87s/it]
Predicting: 1it [00:04,  4.86s/it]
Predicting: 1it [00:05,  5.22s/it]
Predicting: 1it [02:49, 169.27s/it]
Predicting: 1it [00:12, 12.82s/it]


In [22]:
# Predictions of every pretrained base model on the test frame, requested one
# model at a time in the order LR, Ridge, Lasso, DT, LGB, XGB. Only LGB
# receives the label-encoded categorical columns; the other models get an
# empty categorical list.
(
    test_pred_lr,
    test_pred_ridge,
    test_pred_lasso,
    test_pred_dt,
    test_pred_lgb,
    test_pred_xgb,
) = (
    learner.predict(df_test, feats, model_cat_feats, [model_key])
    for model_key, model_cat_feats in (
        ('LR', []),
        ('Ridge', []),
        ('Lasso', []),
        ('DT', []),
        ('LGB', lgb_cat_feats),
        ('XGB', []),
    )
)

Predicting: 1it [00:02,  2.05s/it]
Predicting: 1it [00:01,  1.99s/it]
Predicting: 1it [00:01,  1.99s/it]
Predicting: 1it [00:02,  2.49s/it]
Predicting: 1it [03:28, 208.01s/it]
Predicting: 1it [00:11, 11.47s/it]


In [23]:
# Store each base model's predictions as a column on the matching frame so
# they can serve as inputs to the second-stage model.
for col_name, trn_values, test_values in (
    ('lr', trn_pred_lr, test_pred_lr),
    ('ridge', trn_pred_ridge, test_pred_ridge),
    ('lasso', trn_pred_lasso, test_pred_lasso),
    ('dt', trn_pred_dt, test_pred_dt),
    ('lgb', trn_pred_lgb, test_pred_lgb),
    ('xgb', trn_pred_xgb, test_pred_xgb),
):
    df_train[col_name] = trn_values
    df_test[col_name] = test_values

In [24]:
# Fit the min-max scaler on the training frame only: the six base-model
# prediction columns plus 'mileage'.
scaler_inputs = df_train[['lr','ridge','lasso','dt','xgb','lgb','mileage']]
scaler = MinMaxScaler().fit(scaler_inputs)

In [25]:
# Apply the scaler fitted on the training frame to both frames, overwriting
# the columns in place. Re-running this cell would rescale values that have
# already been scaled.
scaled_cols = ['lr','ridge','lasso','dt','xgb','lgb','mileage']
df_train[scaled_cols] = scaler.transform(df_train[scaled_cols])
df_test[scaled_cols] = scaler.transform(df_test[scaled_cols])

In [26]:
# Second-stage (stacking) model: a fresh Learner trained on the scaled
# base-model prediction columns. 'lgb' is not among the inputs.
# mean_squared_error is passed as the metric, so the "avg metric" printed
# below is an MSE value.
# NOTE(review): the positional arguments are presumably (data, target column,
# features, categorical features, number of folds, model name, model params,
# metric) — confirm against components.learner.
learner2 = Learner()
_ = learner2.train(df_train, target, ['lr','ridge','lasso','dt','xgb'], [], 5, 'Ridge', {}, mean_squared_error)

100%|██████████| 5/5 [00:02<00:00,  2.03it/s]

Ridge End of training, avg metric: 8183824.71222311





In [27]:
# Predict the test set with the second-stage Ridge model, using the same five
# input columns it was trained on.
pred_ensemble = learner2.predict(df_test, ['lr','ridge','lasso','dt','xgb'], [], ['Ridge'])
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6,
# while the square root of the MSE works on every version.
mean_squared_error(df_test[target], pred_ensemble) ** 0.5 # 2854

Predicting: 1it [00:00,  1.78it/s]


2853.935126944673

In [28]:
# Test-set predictions of the six base models followed by the ensemble; the
# order must match the 'model' labels used for metric_df below.
pred_l = [test_pred_lr, test_pred_ridge, test_pred_lasso, test_pred_dt, test_pred_lgb, test_pred_xgb, pred_ensemble]
rmse_l = []
mse_l = []
mae_l = []
for pred in pred_l:
    # MSE is computed once and RMSE derived from it. This also avoids the
    # `squared=False` argument, which was deprecated in scikit-learn 1.4 and
    # removed in 1.6.
    mse = mean_squared_error(df_test[target], pred)
    mse_l.append(mse)
    rmse_l.append(mse ** 0.5)
    mae_l.append(mean_absolute_error(df_test[target], pred))
# One row per model with its RMSE, MSE and MAE on the test set.
metric_df = pd.DataFrame({'model':['lr','ridge','lasso','dt','lgb','xgb','ensemble'],'rmse':rmse_l,'mse':mse_l,'mae':mae_l})

In [29]:
# Display the per-model test metrics (RMSE, MSE, MAE) as the cell output.
metric_df

Unnamed: 0,model,rmse,mse,mae
0,lr,4179.112239,17464980.0,2938.365156
1,ridge,4179.112242,17464980.0,2938.363303
2,lasso,4180.510277,17476670.0,2938.706503
3,dt,3052.642992,9318629.0,2144.105633
4,lgb,2990.714435,8944373.0,2129.761951
5,xgb,2854.050668,8145605.0,2010.68262
6,ensemble,2853.935127,8144946.0,2010.451266


In [30]:
# Attach the ensemble predictions to the test frame, then export one row per
# vehicle (vin, pred) to pred.csv under data_loc, without the index column.
df_test['pred'] = pred_ensemble
pred_path = data_loc+'pred.csv'
submission = df_test[['vin','pred']]
submission.to_csv(pred_path, index=None)