## Importing libraries and data 

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
dataset = pd.read_csv('Data/Bike price dataset/Used_Bikes.csv')

In [5]:
dataset.head()

Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha


In [6]:
dataset.shape

(32648, 8)

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32648 entries, 0 to 32647
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bike_name   32648 non-null  object 
 1   price       32648 non-null  float64
 2   city        32648 non-null  object 
 3   kms_driven  32648 non-null  float64
 4   owner       32648 non-null  object 
 5   age         32648 non-null  float64
 6   power       32648 non-null  float64
 7   brand       32648 non-null  object 
dtypes: float64(4), object(4)
memory usage: 2.0+ MB


### Some changes needed in the data
- convert float values to int to help with the data size
- use only the first 4 words of the names

In [8]:
dataset['price'] = dataset['price'].astype(int) 
dataset['kms_driven'] = dataset['kms_driven'].astype(int)
dataset['age'] = dataset['age'].astype(int)
dataset['power'] = dataset['power'].astype(int)

In [9]:
dataset['bike_name'] = dataset['bike_name'].str.split(" ").str.slice(0,4).str.join(' ')

In [10]:
dataset['bike_name'].unique()

array(['TVS Star City Plus', 'Royal Enfield Classic 350cc',
       'Triumph Daytona 675R', 'TVS Apache RTR 180cc', 'Yamaha FZ S V',
       'Yamaha FZs 150cc', 'Honda CB Hornet 160R',
       'Hero Splendor Plus Self', 'Royal Enfield Thunderbird X',
       'Royal Enfield Classic Desert', 'Yamaha YZF-R15 2.0 150cc',
       'Yamaha FZ25 250cc', 'Bajaj Pulsar NS200', 'Bajaj Discover 100M',
       'Bajaj Discover 125M', 'Bajaj Pulsar NS200 ABS',
       'Bajaj Pulsar RS200 ABS', 'Suzuki Gixxer SF 150cc',
       'Benelli 302R 300CC', 'Hero Splendor iSmart Plus',
       'Royal Enfield Classic Chrome', 'Yamaha FZ V 2.0',
       'Hero Super Splendor 125cc', 'Honda CBF Stunner 125cc',
       'Bajaj Pulsar 150cc', 'Honda X-Blade 160CC ABS',
       'Bajaj Avenger 220cc', 'KTM RC 390cc', 'Honda CB Unicorn 150cc',
       'KTM Duke 200cc', 'Honda CBR 150R 150cc', 'KTM RC 200cc ABS',
       'Royal Enfield Thunderbird 350cc', 'Royal Enfield Bullet Electra',
       'Bajaj Avenger Street 220', 'Mahindra Ce

In [11]:
X = dataset.drop(columns= 'price') 
y = dataset['price']

In [12]:
X.head()

Unnamed: 0,bike_name,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus,Ahmedabad,17654,First Owner,3,110,TVS
1,Royal Enfield Classic 350cc,Delhi,11000,First Owner,4,350,Royal Enfield
2,Triumph Daytona 675R,Delhi,110,First Owner,8,675,Triumph
3,TVS Apache RTR 180cc,Bangalore,16329,First Owner,4,180,TVS
4,Yamaha FZ S V,Bangalore,10000,First Owner,3,150,Yamaha


In [13]:
from posixpath import split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.2)

In [14]:
X_train.head()

Unnamed: 0,bike_name,city,kms_driven,owner,age,power,brand
5464,TVS Apache RTR 160cc,Ghaziabad,19542,First Owner,7,160,TVS
11900,Yamaha FZ 150cc,Chennai,18742,First Owner,6,150,Yamaha
21234,Hero Passion 100cc,Hyderabad,3869,First Owner,16,100,Hero
28083,Hero Passion Pro 100cc,Delhi,22000,First Owner,4,100,Hero
28181,Hero Super Splendor 125cc,Jaipur,84186,First Owner,16,125,Hero


## Creating a model pipeline 
- Using onehotencoder for categorical data
- applying linear regression to the data

In [15]:
# Importing the libraries
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [16]:
encoder = OneHotEncoder()
encoder.fit(X[['bike_name', 'city', 'owner', 'brand']])

In [17]:
transformer = make_column_transformer(
    (OneHotEncoder(categories = encoder.categories_), ['bike_name', 'city', 'owner', 'brand']),
    remainder = 'passthrough'
)


In [18]:
regressor = LinearRegression()

pipeline = make_pipeline(
    transformer, regressor
)

In [19]:
pipeline.fit(X_train, y_train)

In [20]:
y_pred = pipeline.predict(X_test)

In [21]:
y_pred

array([35182.16058826, 14616.41811594, 80592.07120956, ...,
       -7596.96452737, 32298.81618518, 94082.87311093])

In [22]:
r2_score(y_test, y_pred)

0.8986057035929698

### model results and further steps
- SimpleLinearRegression works! and it works quite well, 90% is a good result
- though here are some other models i wanna test and i'll make the code compact using previous pipelines and a small section to pick the best one

In [24]:
# Trying the DecisionTreeRegression
from sklearn.tree import DecisionTreeRegressor
DecisionTree = DecisionTreeRegressor(random_state = 0)

pipeline2 = make_pipeline(
    transformer, DecisionTree
)

In [25]:
# Trying the RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
RandomForest = RandomForestRegressor(n_estimators = 10, random_state = 0)

pipeline3 = make_pipeline(
    transformer, RandomForest
)

In [26]:
pipelines = [pipeline, pipeline2, pipeline3]

bes_acc = 0.0
best_classifier = 0
best_pipeline = ""

In [27]:
pipe_dict = {0: 'Linear_Regression', 1: 'Decision_tree', 2: 'Random_forest'}

for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [30]:
y_pred2 = pipeline2.predict(X_test)
y_pred3 = pipeline3.predict(X_test)

predictions = [y_pred, y_pred2, y_pred3]

In [31]:
for i,model in enumerate(pipelines):
    print('{} Test Accuracy: {}'.format(pipe_dict[i], r2_score(y_test, predictions[i])))

Linear_Regression Test Accuracy: 0.8986057035929698
Decision_tree Test Accuracy: 0.9030475254554818
Random_forest Test Accuracy: 0.9191467861660407
