# REGRESSION

In [1]:
import pandas as pd

### Gathering Data ( Step-1 )

In [2]:
df = pd.read_csv("Used_Bikes.csv")
df

Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha
...,...,...,...,...,...,...,...,...
32643,Hero Passion Pro 100cc,39000.0,Delhi,22000.0,First Owner,4.0,100.0,Hero
32644,TVS Apache RTR 180cc,30000.0,Karnal,6639.0,First Owner,9.0,180.0,TVS
32645,Bajaj Avenger Street 220,60000.0,Delhi,20373.0,First Owner,6.0,220.0,Bajaj
32646,Hero Super Splendor 125cc,15600.0,Jaipur,84186.0,First Owner,16.0,125.0,Hero


### Data Preparation

In [3]:
df.isna().sum()

bike_name     0
price         0
city          0
kms_driven    0
owner         0
age           0
power         0
brand         0
dtype: int64

In [4]:
df.duplicated().sum()

25324

In [5]:
df = df.drop_duplicates()
df

Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha
...,...,...,...,...,...,...,...,...
9362,Hero Hunk Rear Disc 150cc,25000.0,Delhi,48587.0,First Owner,8.0,150.0,Hero
9369,Bajaj Avenger 220cc,35000.0,Bangalore,60000.0,First Owner,9.0,220.0,Bajaj
9370,Harley-Davidson Street 750 ABS,450000.0,Jodhpur,3430.0,First Owner,4.0,750.0,Harley-Davidson
9371,Bajaj Dominar 400 ABS,139000.0,Hyderabad,21300.0,First Owner,4.0,400.0,Bajaj


In [6]:
cat_col = df.select_dtypes(include = "O") # Separates the categorical data 
cat_col

Unnamed: 0,bike_name,city,owner,brand
0,TVS Star City Plus Dual Tone 110cc,Ahmedabad,First Owner,TVS
1,Royal Enfield Classic 350cc,Delhi,First Owner,Royal Enfield
2,Triumph Daytona 675R,Delhi,First Owner,Triumph
3,TVS Apache RTR 180cc,Bangalore,First Owner,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,Bangalore,First Owner,Yamaha
...,...,...,...,...
9362,Hero Hunk Rear Disc 150cc,Delhi,First Owner,Hero
9369,Bajaj Avenger 220cc,Bangalore,First Owner,Bajaj
9370,Harley-Davidson Street 750 ABS,Jodhpur,First Owner,Harley-Davidson
9371,Bajaj Dominar 400 ABS,Hyderabad,First Owner,Bajaj


In [7]:
num_col = df.select_dtypes(exclude = "O") # Separates the categorical data 
num_col

Unnamed: 0,price,kms_driven,age,power
0,35000.0,17654.0,3.0,110.0
1,119900.0,11000.0,4.0,350.0
2,600000.0,110.0,8.0,675.0
3,65000.0,16329.0,4.0,180.0
4,80000.0,10000.0,3.0,150.0
...,...,...,...,...
9362,25000.0,48587.0,8.0,150.0
9369,35000.0,60000.0,9.0,220.0
9370,450000.0,3430.0,4.0,750.0
9371,139000.0,21300.0,4.0,400.0


#### Feature Selection ( Selecting the features which does not affect the data  )

In [8]:
cat_col = cat_col.drop(['bike_name','city'], axis = 1)
cat_col

Unnamed: 0,owner,brand
0,First Owner,TVS
1,First Owner,Royal Enfield
2,First Owner,Triumph
3,First Owner,TVS
4,First Owner,Yamaha
...,...,...
9362,First Owner,Hero
9369,First Owner,Bajaj
9370,First Owner,Harley-Davidson
9371,First Owner,Bajaj


#### Label encoding ( Replacing the categorical data to numerical data )

In [9]:
cat_col.value_counts('owner')

owner
First Owner             6642
Second Owner             588
Third Owner               84
Fourth Owner Or More      10
dtype: int64

In [10]:
dt = {'First Owner':1,
      'Second Owner':2,
      'Third Owner':3,
      'Fourth Owner Or More':4 }
dt

{'First Owner': 1,
 'Second Owner': 2,
 'Third Owner': 3,
 'Fourth Owner Or More': 4}

In [11]:
# Method 1 
for i in cat_col['owner']:
    pass

In [12]:
# Method 2

cat_col['owner'] = cat_col['owner'].map(dt) # Maps the respective keys to the values

In [13]:
cat_col

Unnamed: 0,owner,brand
0,1,TVS
1,1,Royal Enfield
2,1,Triumph
3,1,TVS
4,1,Yamaha
...,...,...
9362,1,Hero
9369,1,Bajaj
9370,1,Harley-Davidson
9371,1,Bajaj


In [14]:
cat_col['brand'].nunique()

23

In [15]:
num = 1
dt2 = {}
for i in cat_col['brand'].unique():
    dt2[i] = num
    num+=1

In [16]:
dt2

{'TVS': 1,
 'Royal Enfield': 2,
 'Triumph': 3,
 'Yamaha': 4,
 'Honda': 5,
 'Hero': 6,
 'Bajaj': 7,
 'Suzuki': 8,
 'Benelli': 9,
 'KTM': 10,
 'Mahindra': 11,
 'Kawasaki': 12,
 'Ducati': 13,
 'Hyosung': 14,
 'Harley-Davidson': 15,
 'Jawa': 16,
 'BMW': 17,
 'Indian': 18,
 'Rajdoot': 19,
 'LML': 20,
 'Yezdi': 21,
 'MV': 22,
 'Ideal': 23}

In [17]:
cat_col['brand'] = cat_col['brand'].map(dt2)

In [18]:
cat_col

Unnamed: 0,owner,brand
0,1,1
1,1,2
2,1,3
3,1,1
4,1,4
...,...,...
9362,1,6
9369,1,7
9370,1,15
9371,1,7


In [19]:
df2 = pd.concat([cat_col, num_col],axis = 1)
df2

Unnamed: 0,owner,brand,price,kms_driven,age,power
0,1,1,35000.0,17654.0,3.0,110.0
1,1,2,119900.0,11000.0,4.0,350.0
2,1,3,600000.0,110.0,8.0,675.0
3,1,1,65000.0,16329.0,4.0,180.0
4,1,4,80000.0,10000.0,3.0,150.0
...,...,...,...,...,...,...
9362,1,6,25000.0,48587.0,8.0,150.0
9369,1,7,35000.0,60000.0,9.0,220.0
9370,1,15,450000.0,3430.0,4.0,750.0
9371,1,7,139000.0,21300.0,4.0,400.0


In [20]:
df2.isnull().sum()

owner         0
brand         0
price         0
kms_driven    0
age           0
power         0
dtype: int64

In [21]:
df2.duplicated().sum()

18

In [22]:
x = df2.drop('price' , axis = 1)
y = df2[['price']]

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2) # 80% training 20% testing

In [25]:
x_train

Unnamed: 0,owner,brand,kms_driven,age,power
2756,2,7,37000.0,9.0,220.0
4277,1,7,48203.0,5.0,150.0
8976,1,10,4500.0,7.0,390.0
5678,2,2,5148.0,5.0,350.0
110,1,14,1800.0,4.0,650.0
...,...,...,...,...,...
8357,1,5,22167.0,10.0,110.0
7340,1,2,20241.0,6.0,350.0
5290,1,6,50000.0,13.0,150.0
9008,1,7,28054.0,13.0,150.0


### Model Training

In [26]:
from sklearn.linear_model import LinearRegression

In [27]:
lr = LinearRegression()

In [28]:
lr.fit(x_train,y_train)

LinearRegression()

In [29]:
prediction = lr.predict(x_test)

In [30]:
y_test['prediction_price'] = prediction
y_test

Unnamed: 0,price,prediction_price
5478,115000.0,129351.661789
8091,17000.0,28002.618357
7315,23500.0,13358.869629
1699,80000.0,32185.993068
522,129000.0,85356.194299
...,...,...
7701,28000.0,63707.534236
8634,22000.0,36953.694417
2544,153400.0,216466.069230
2482,87000.0,85562.936378


In [31]:
lr.score(x_train,y_train) # Train Score

0.7170244262515442

In [32]:
lr.score(x_test,y_test['price']) # Test Score

0.7298874964037607

In [33]:
#using pandas-> data filter krte h
#using matplotlib and seaborn -> charts bnate h

In [34]:
y_test

Unnamed: 0,price,prediction_price
5478,115000.0,129351.661789
8091,17000.0,28002.618357
7315,23500.0,13358.869629
1699,80000.0,32185.993068
522,129000.0,85356.194299
...,...,...
7701,28000.0,63707.534236
8634,22000.0,36953.694417
2544,153400.0,216466.069230
2482,87000.0,85562.936378


In [66]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [68]:
mse = mean_squared_error(y_test['price'] , y_test['prediction_price'])
mse

4782437763.428773

In [67]:
mean_absolute_error(y_test['price'] , y_test['prediction_price'])

35675.875861590735

In [71]:
import numpy as np
rmse = np.sqrt(mse)
rmse

69155.17163183657

In [79]:
# MSE FUNCTION

mse = 0
for price,predicted_price in y_test.values:
    mse+=((price-predicted_price)**2)
mse/=len(y_test)
mse

4782437763.428773

In [80]:
# MAE FUNCTION

mae = 0
for price,predicted_price in y_test.values:
    mae+=(abs(price-predicted_price))
mae/=len(y_test)
mae

35675.87586159074

In [82]:
# RMSE FUNCTION

rmse = 0
for price,predicted_price in y_test.values:
    rmse+=((price-predicted_price)**2)
rmse/=len(y_test)
rmse = np.sqrt(rmse)
rmse

69155.17163183657

In [85]:
from sklearn.metrics import r2_score
r = r2_score(y_test['price'] , y_test['prediction_price'])
r

0.7298874964037607

In [84]:
# lr.score() is same as r2_score

In [87]:
1 - ((1-r)*(y_test.shape[0]-1)/(y_test.shape[0] - x_test.shape[1] - 1))

0.7289618195579888

In [89]:
 import joblib

In [90]:
ls = ['shreyanshsahayacer@gmail.com','hello@123456']

In [93]:
joblib.dump(ls,'email_id.lb')

['email_id.lb']

In [94]:
joblib.load('email_id.lb')

['shreyanshsahayacer@gmail.com', 'hello@123456']

In [95]:
joblib.dump(lr,'linear_regression_model.lb')

['linear_regression_model.lb']