In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
import warnings
from sklearn.exceptions import DataConversionWarning
# ignore all scikit-learn warnings
warnings.filterwarnings(action='ignore')


#### Read Data

In [2]:
df=pd.read_csv("./Dataset/bikesCleaned.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,name,brand,price,max power,max torque,cooling system,transmission,transmission type,displacement,...,front tyre pressure (rider & pillion),rear tyre pressure (rider & pillion),kerb weight,overall length,overall width,wheelbase,ground clearance,seat height,overall height,chassis type
0,0,Royal Enfield Hunter 350,Royal Enfield,149900,20.2,27.0,air/oil cooled,5 speed manual,chain drive,349.0,...,29.0,33.0,177.0,2055,800.0,1370,150.0,790,1055,double-downtube frame
1,1,Royal Enfield Classic 350,Royal Enfield,190229,20.2,27.0,air/oil cooled,5 speed manual,chain drive,349.0,...,22.0,32.0,195.0,2145,785.0,1390,170.0,805,1090,twin downtube spine frame
2,2,Royal Enfield Bullet 350,Royal Enfield,157391,19.1,28.0,air cooled,5 speed manual,chain drive,346.0,...,22.0,32.0,186.0,2170,810.0,1395,135.0,800,1120,"single downtube,using engine as stressed member"
3,3,Royal Enfield Continental GT 650,Royal Enfield,304945,47.0,52.0,air/oil cooled,6 speed manual,chain drive,648.0,...,32.0,39.0,198.0,2122,744.0,1398,174.0,793,1024,"steel tubular, double cradle frame"
4,4,Royal Enfield Meteor 350,Royal Enfield,200924,20.2,27.0,air/oil cooled,5 speed manual,chain drive,349.0,...,32.0,36.0,191.0,2140,845.0,1400,170.0,765,1140,twin downtube spine frame


### Data Preprocessing
#### Using pandas_profiling for EDA

In [3]:
# from pandas_profiling import ProfileReport
# prof = ProfileReport(df)
# prof.to_file(output_file='output.html')

In [4]:
df.shape

(173, 45)

In [5]:
df.count()

Unnamed: 0                               173
name                                     173
brand                                    173
price                                    173
max power                                173
max torque                               173
cooling system                           173
transmission                             173
transmission type                        173
displacement                             173
cylinders                                173
bore                                     173
stroke                                   173
valves per cylinder                      173
spark plugs                              173
gear shifting pattern                    173
clutch                                   173
fuel tank capacity                       173
mileage - arai                            38
mileage - owner reported                 110
top speed                                173
braking system                           173
front brak

In [6]:
# find correlation of price with other features
h=df.corr()['price']
# get features which has correlation of price with other features greater than 50%
h = df.corr()['price']
h = h[h > 0.5]
h

price                                    1.000000
max power                                0.899191
max torque                               0.849956
displacement                             0.761933
cylinders                                0.773806
bore                                     0.553585
fuel tank capacity                       0.534874
top speed                                0.804045
rear brake size                          0.501127
front tyre pressure (rider)              0.679589
rear tyre pressure (rider)               0.600355
front tyre pressure (rider & pillion)    0.690404
kerb weight                              0.641060
wheelbase                                0.605229
Name: price, dtype: float64

In [7]:
## selectd columns are name,brand,max power,max torque ,fuel tank capacity,top speed,kerb weight,overall_height,overall_lenght,overall_width,wheelbase,braking system
columns=["name","brand","max power","max torque" ,"fuel tank capacity","top speed","kerb weight","overall height","overall length","overall width","wheelbase","front brake type","price"]

In [8]:
# get remainnig columns other than selected columns
remaining_cols = set(df.columns) - set(columns)
print(list(remaining_cols))

['seat height', 'ground clearance', 'rear tyre pressure (rider)', 'radial tyres', 'bore', 'rear wheel size', 'valves per cylinder', 'mileage - owner reported', 'wheel type', 'front brake size', 'front tyre size', 'front tyre pressure (rider & pillion)', 'braking system', 'rear brake size', 'transmission', 'spark plugs', 'mileage - arai', 'gear shifting pattern', 'tyre type', 'displacement', 'rear brake type', 'rear tyre pressure (rider & pillion)', 'transmission type', 'rear tyre size', 'stroke', 'front tyre pressure (rider)', 'Unnamed: 0', 'clutch', 'cylinders', 'chassis type', 'front wheel size', 'cooling system']


In [9]:
# drop all remaining columns
df_updated = df.drop(remaining_cols, axis=1)
df_updated

Unnamed: 0,name,brand,price,max power,max torque,fuel tank capacity,top speed,front brake type,kerb weight,overall length,overall width,wheelbase,overall height
0,Royal Enfield Hunter 350,Royal Enfield,149900,20.20,27.0,13.0,114.0,disc,177.0,2055,800.0,1370,1055
1,Royal Enfield Classic 350,Royal Enfield,190229,20.20,27.0,13.0,114.0,disc,195.0,2145,785.0,1390,1090
2,Royal Enfield Bullet 350,Royal Enfield,157391,19.10,28.0,13.5,110.0,disc,186.0,2170,810.0,1395,1120
3,Royal Enfield Continental GT 650,Royal Enfield,304945,47.00,52.0,12.5,169.0,disc,198.0,2122,744.0,1398,1024
4,Royal Enfield Meteor 350,Royal Enfield,200924,20.20,27.0,15.0,112.0,disc,191.0,2140,845.0,1400,1140
...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,Vespa SXL 125,Vespa,134313,9.78,9.6,7.4,90.0,disc,114.0,1770,690.0,1290,1140
169,Vespa SXL 150,Vespa,148261,10.32,10.6,7.4,95.0,disc,114.0,1770,690.0,1290,1140
170,Vespa VXL 125,Vespa,130438,9.78,9.6,7.4,90.0,disc,114.0,1770,690.0,1290,1140
171,Vespa VXL 150,Vespa,143895,10.32,10.6,7.4,95.0,disc,114.0,1770,690.0,1290,1140


In [10]:
df_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                173 non-null    object 
 1   brand               173 non-null    object 
 2   price               173 non-null    int64  
 3   max power           173 non-null    float64
 4   max torque          173 non-null    float64
 5   fuel tank capacity  173 non-null    float64
 6   top speed           173 non-null    float64
 7   front brake type    173 non-null    object 
 8   kerb weight         173 non-null    float64
 9   overall length      173 non-null    int64  
 10  overall width       173 non-null    float64
 11  wheelbase           173 non-null    int64  
 12  overall height      173 non-null    int64  
dtypes: float64(6), int64(4), object(3)
memory usage: 17.7+ KB


In [11]:
#unique values of front brake type feature
print(df_updated["front brake type"].unique())
print("Unique Count : ",df_updated["front brake type"].nunique())

['disc' 'drum']
Unique Count :  2


In [12]:
#unique values of brand feature
print(df_updated["brand"].unique())
print("Unique Count : ",df_updated["brand"].nunique())

['Royal Enfield' 'TVS' 'Hero' 'Bajaj' 'Honda' 'Yamaha' 'Suzuki' 'KTM'
 'Jawa' 'Kawasaki' 'Aprilia' 'Benelli' 'CFMoto' 'Ducati' 'Husqvarna'
 'Triumph' 'Vespa']
Unique Count :  17


In [13]:
#unique values of name feature
print(df_updated["name"].unique())
print("Unique Count : ",df_updated["name"].nunique())

['Royal Enfield Hunter 350' 'Royal Enfield Classic 350'
 'Royal Enfield Bullet 350' 'Royal Enfield Continental GT 650'
 'Royal Enfield Meteor 350' 'Royal Enfield Himalayan'
 'Royal Enfield Interceptor 650' 'Royal Enfield Scram 411'
 'TVS Raider 125' 'Apache' 'TVS Ronin' 'TVS Ntorq 125' 'TVS Sport'
 'TVS Jupiter' 'TVS Jupiter 125' 'TVS Star City Plus' 'TVS Radeon'
 'TVS Scooty Pep Plus' 'TVS XL100 Heavy Duty' 'TVS Scooty Zest 110'
 'TVS XL100 Comfort' 'Splendor' 'Hero Splendor Plus Xtec' 'Hero HF Deluxe'
 'Hero Xtreme 160R' 'Hero Glamour' 'Hero Passion Pro' 'Hero Xpulse 200 4V'
 'Hero Pleasure +' 'Hero Passion Xtec' 'Hero Glamour Xtec'
 'Hero Xpulse 200T 4V' 'Hero Xtreme 200S' 'Hero Pleasure + Xtec'
 'Hero Maestro Edge 125' 'Hero Destini 125 Xtec' 'Hero Maestro Edge 110'
 'Pulsar' 'Platina' 'Bajaj Dominar 400' 'Bajaj Dominar 250' 'Bajaj CT 110'
 'Bajaj CT 125X' 'Avenger' 'Honda SP 125' 'Activa' 'Honda Shine'
 'Honda Unicorn' 'Honda Dio' 'Honda Hornet 2.0' 'Honda Hness CB350'
 'Honda Liv

In [14]:
#Null value check
df_updated.isnull().sum()

name                  0
brand                 0
price                 0
max power             0
max torque            0
fuel tank capacity    0
top speed             0
front brake type      0
kerb weight           0
overall length        0
overall width         0
wheelbase             0
overall height        0
dtype: int64

### Split the data

In [15]:
#define X and Y
X=df_updated.drop(columns=['price'])
y=df_updated['price']

In [16]:
X

Unnamed: 0,name,brand,max power,max torque,fuel tank capacity,top speed,front brake type,kerb weight,overall length,overall width,wheelbase,overall height
0,Royal Enfield Hunter 350,Royal Enfield,20.20,27.0,13.0,114.0,disc,177.0,2055,800.0,1370,1055
1,Royal Enfield Classic 350,Royal Enfield,20.20,27.0,13.0,114.0,disc,195.0,2145,785.0,1390,1090
2,Royal Enfield Bullet 350,Royal Enfield,19.10,28.0,13.5,110.0,disc,186.0,2170,810.0,1395,1120
3,Royal Enfield Continental GT 650,Royal Enfield,47.00,52.0,12.5,169.0,disc,198.0,2122,744.0,1398,1024
4,Royal Enfield Meteor 350,Royal Enfield,20.20,27.0,15.0,112.0,disc,191.0,2140,845.0,1400,1140
...,...,...,...,...,...,...,...,...,...,...,...,...
168,Vespa SXL 125,Vespa,9.78,9.6,7.4,90.0,disc,114.0,1770,690.0,1290,1140
169,Vespa SXL 150,Vespa,10.32,10.6,7.4,95.0,disc,114.0,1770,690.0,1290,1140
170,Vespa VXL 125,Vespa,9.78,9.6,7.4,90.0,disc,114.0,1770,690.0,1290,1140
171,Vespa VXL 150,Vespa,10.32,10.6,7.4,95.0,disc,114.0,1770,690.0,1290,1140


In [17]:
# train/test/split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)


### Using ColumnTransformer to perform columnwise transformation

In [18]:
from sklearn.compose import ColumnTransformer

In [19]:
# one hot encoding on name,brand,front_brake_type columns
trf1 = ColumnTransformer([
    ('ohe_name', OneHotEncoder(sparse=False, handle_unknown='ignore'), [0]),
    ('ohe_brand', OneHotEncoder(sparse=False, handle_unknown='ignore'), [1]),
    ('ohe_front_brake_type', OneHotEncoder(sparse=False, handle_unknown='ignore'), [6])
], remainder='passthrough')

In [20]:
trf1.fit_transform(X_train).shape

(138, 146)

In [21]:
trf1.transform(X_test).shape

(35, 146)

### Train the model with pipeline

In [22]:
# train the model
from sklearn.ensemble import RandomForestRegressor
trf2 = RandomForestRegressor(n_estimators=200, criterion='squared_error')

In [23]:
# crating the pipeline based on above column transformation
from sklearn.pipeline import Pipeline,make_pipeline
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),])

In [24]:
pipe

In [25]:
# Display Pipeline diagram
from sklearn import set_config
set_config(display='diagram')

In [26]:
# train pipeline
pipe.fit(X_train,y_train)

In [27]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_name',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [0]),
                                 ('ohe_brand',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1]),
                                 ('ohe_front_brake_type',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [6])]),
 'trf2': RandomForestRegressor(n_estimators=200)}

In [28]:
# Predict
y_pred = pipe.predict(X_test)
y_pred

array([1265918.03166667,  102672.5       ,   81371.805     ,
         72229.535     ,  795558.045     , 2095312.72      ,
         72248.305     ,  144311.105     , 2211490.14166667,
       2772981.085     ,  126568.315     ,  121297.085     ,
       1451296.245     ,   69884.75      ,   76550.85      ,
        129597.745     ,  715681.43833333,  126456.61      ,
         54422.22      ,  226643.305     ,  123611.83      ,
        164814.035     , 1051106.66      ,  127034.065     ,
         81408.785     ,  194006.16      , 2116317.1       ,
         71245.235     ,   72942.55      , 2067451.8       ,
         79737.03      ,  132832.89      ,   82336.955     ,
       1429600.33833333, 2380599.855     ])

In [29]:
# Evaluate the performance of the model on the test data using R-squared
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

R-squared: 0.8646008997627787


In [30]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=15).mean()

0.8183045282675985

### Save and load model

In [31]:
# export 
import pickle
#pickle.dump(pipe,open('models/pipe3.pkl','wb'))

In [32]:
pipe = pickle.load(open('models/pipe2.pkl','rb'))

In [33]:
features=["name","brand","max power","max torque" ,"fuel tank capacity","top speed","front brake type","kerb weight","overall length","overall width","wheelbase","overall height","price"]
features_without_price =["name","brand","max power","max torque" ,"fuel tank capacity","top speed","front brake type","kerb weight","overall length","overall width","wheelbase","overall height"]
print(df[features].loc[[15], :])
print("----------------------------------------------------------------------------------------")
print(df[features_without_price].loc[[15], :])

      name brand  max power  max torque  fuel tank capacity  top speed  \
15  Apache   TVS      20.54       17.25                12.0      127.0   

   front brake type  kerb weight  overall length  overall width  wheelbase  \
15             disc        152.0            2050          790.0       1353   

    overall height   price  
15            1050  141058  
----------------------------------------------------------------------------------------
      name brand  max power  max torque  fuel tank capacity  top speed  \
15  Apache   TVS      20.54       17.25                12.0      127.0   

   front brake type  kerb weight  overall length  overall width  wheelbase  \
15             disc        152.0            2050          790.0       1353   

    overall height  
15            1050  


In [34]:
test_input = df[features_without_price].loc[[15], :].to_numpy()
test_input

array([['Apache', 'TVS', 20.54, 17.25, 12.0, 127.0, 'disc', 152.0, 2050,
        790.0, 1353, 1050]], dtype=object)

In [35]:
#pipe.predict(test_input)