In [49]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
import pickle

In [46]:
# Show the pickle module's format version string (pickle is used later to save the model).
print(pickle.format_version)

4.0


In [50]:
# Show the installed scikit-learn version.
# NOTE(review): worth recording next to the saved model, since pickled estimators
# are not guaranteed to load under a different scikit-learn version.
print(sklearn.__version__)


1.2.2


In [31]:
# Load the used-bikes dataset into a DataFrame.
# NOTE(review): hard-coded absolute path; it only resolves where /content exists
# (presumably a Colab session — confirm) and should become a configurable data dir.
df = pd.read_csv("/content/Used_Bikes.csv")

In [32]:
# Display the loaded frame (pandas truncates the rendering to the first/last rows).
df


Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha
...,...,...,...,...,...,...,...,...
32643,Hero Passion Pro 100cc,39000.0,Delhi,22000.0,First Owner,4.0,100.0,Hero
32644,TVS Apache RTR 180cc,30000.0,Karnal,6639.0,First Owner,9.0,180.0,TVS
32645,Bajaj Avenger Street 220,60000.0,Delhi,20373.0,First Owner,6.0,220.0,Bajaj
32646,Hero Super Splendor 125cc,15600.0,Jaipur,84186.0,First Owner,16.0,125.0,Hero


In [33]:
# Summarise each column's dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32648 entries, 0 to 32647
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bike_name   32648 non-null  object 
 1   price       32648 non-null  float64
 2   city        32648 non-null  object 
 3   kms_driven  32648 non-null  float64
 4   owner       32648 non-null  object 
 5   age         32648 non-null  float64
 6   power       32648 non-null  float64
 7   brand       32648 non-null  object 
dtypes: float64(4), object(4)
memory usage: 2.0+ MB


In [34]:
# List the distinct values of the `owner` column.
df["owner"].unique()

array(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth Owner Or More'], dtype=object)

In [35]:
# List the distinct values of the `power` column.
df["power"].unique()

array([ 110.,  350.,  675.,  180.,  150.,  160.,  100.,  500.,  250.,
        200.,  125.,  302.,  220.,  390.,  600.,  900.,  650.,  223.,
        410.,  135., 1100.,  765.,  300., 1299.,  750.,  400., 1300.,
        821., 1198.,  883.,  959.,  295.,  320.,  310.,  899., 1800.,
        535., 1130., 1000., 1200., 1050.,  800., 1262., 1700.,  175.,
        865.,  797.,  796.,  149.,  850., 1090.,  502.,  107.])

In [36]:
# Count missing values per column.
df.isna().sum()

bike_name     0
price         0
city          0
kms_driven    0
owner         0
age           0
power         0
brand         0
dtype: int64

In [37]:
# Model inputs: three categorical columns plus three numeric ones.
# `city` is deliberately not selected; `price` is the regression target.
feature_cols = ["brand", "bike_name", "kms_driven", "owner", "age", "power"]
target_col = "price"

X = df[feature_cols]
Y = df[target_col]

In [38]:
# Hold out 30% of the rows for evaluation.
# A fixed random_state makes the split (and every metric derived from it)
# reproducible under Restart & Run All; the value 42 itself is arbitrary.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [39]:
# Learn the full category vocabulary of the categorical columns from ALL rows
# (not just the training split); the learned `categories_` are reused by the
# column transformer in the next cell.
categorical_cols = ["bike_name", "brand", "owner"]

ohe = OneHotEncoder()
ohe.fit(X[categorical_cols])

In [40]:
# One-hot encode the categorical columns using the categories learned by `ohe`;
# every other column is passed through unchanged.
categorical_encoder = OneHotEncoder(categories=ohe.categories_)
column_trans = make_column_transformer(
    (categorical_encoder, ['bike_name', 'brand', 'owner']),
    remainder="passthrough",
)

In [41]:
# Unfitted linear regression estimator with default settings.
lr = LinearRegression()

In [42]:
# Chain preprocessing (column transformer) and the regressor into a single estimator.
pipe=make_pipeline(column_trans,lr)

In [15]:
# Fit the whole pipeline (encoding + regression) on the training split.
pipe.fit(X_train,Y_train)

In [16]:
# Predict the target (price) for the held-out rows.
Y_pred= pipe.predict(X_test)

In [17]:
# Display the predicted values (numpy abbreviates long arrays).
Y_pred

array([82681.1659658 , 10743.61306767, 28817.02318387, ...,
       20707.28874891, 65342.54677011, 20144.72533003])

In [18]:
# R^2 (coefficient of determination) of the predictions against the held-out targets.
r2_score(Y_test,Y_pred)

0.9290975842628173

In [22]:
# Repeat split -> fit -> score for random_state 0..4999 and record each
# held-out R^2, so that scores[i] is the score obtained with random_state=i.
# NOTE(review): `X_text` looks like a typo for `X_test`; the name is left as-is
# because renaming it would change which globals this cell leaves behind.
scores = []
for i in range(5000):
    X_train, X_text, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, random_state=i
    )
    lr = LinearRegression()
    pipe = make_pipeline(column_trans, lr)
    pipe.fit(X_train, Y_train)
    Y_pred = pipe.predict(X_text)
    scores.append(r2_score(Y_test, Y_pred))

In [23]:
# Print every collected R^2 score.
# NOTE(review): this dumps all 5000 values; a summary (min/mean/max) would be easier to read.
print(scores)

[0.9290858511383642, 0.9001814115536052, 0.9240692926234715, 0.9329932896424662, 0.8992829673962714, 0.887431868129737, 0.9062445990591259, 0.883923460136547, 0.8914358468340489, 0.9069736678227666, 0.9122521473278477, 0.9418870010437101, 0.9033707060679529, 0.9149370566338819, 0.8853194517073808, 0.9086036743813533, 0.9100557202516073, 0.9081776111520634, 0.8948510010088091, 0.9145982369863455, 0.9169764107625558, 0.8996768495837972, 0.9094771093168926, 0.9121655040368357, 0.9193853519896642, 0.9154474128292791, 0.9076164310902015, 0.9323253052860018, 0.9041447324820301, 0.9291902339916008, 0.9050351330835964, 0.8905083703059298, 0.9254230808442739, 0.9054134470821755, 0.924364810984704, 0.9407909372761352, 0.919622088542708, 0.9305494067083824, 0.8766725925416005, 0.9090924426425058, 0.9122154884647375, 0.9309079942991051, 0.8868707723165118, 0.9164501327291192, 0.9139396051023703, 0.9113438527073037, 0.9139431571032247, 0.9167018503374945, 0.9037831925393189, 0.9093337117591764, 0.9

In [24]:
# Index of the highest R^2 in `scores`; because scores[i] was produced with
# random_state=i, this index is also the best-scoring split seed.
# NOTE(review): choosing the split by its test score makes the reported R^2
# optimistic — it is no longer an unbiased estimate of held-out performance.
np.argmax(scores)

1287

In [43]:
# Refit on the split produced by random_state=1287 (the seed the seed-search
# cell reported as best) and score it on that same split's held-out rows.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=1287
)
lr = LinearRegression()
pipe = make_pipeline(column_trans, lr)
pipe.fit(X_train, Y_train)
# Predict on X_test from THIS split. The previous `X_text` was a stale global
# left over from another cell, so predictions were scored against a Y_test
# from a different split (or raised NameError on a fresh kernel).
Y_pred = pipe.predict(X_test)
r2_score(Y_test, Y_pred)


0.9504612448928291

In [44]:
# Persist the fitted pipeline to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the bare open(...) previously leaked the handle.
with open("trained_model.pkl", "wb") as model_file:
    pickle.dump(pipe, model_file)