In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import pickle



In [2]:
df=pd.read_csv(r"C:\Users\kannu\OneDrive\Desktop\Pandas_csvs\Datasets\Regression_data_sets\used_bikes.csv")

In [3]:
df.head()

Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha


In [4]:
df.shape

(32648, 8)

### EDA

In [5]:
df.isna().sum()

bike_name     0
price         0
city          0
kms_driven    0
owner         0
age           0
power         0
brand         0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32648 entries, 0 to 32647
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bike_name   32648 non-null  object 
 1   price       32648 non-null  float64
 2   city        32648 non-null  object 
 3   kms_driven  32648 non-null  float64
 4   owner       32648 non-null  object 
 5   age         32648 non-null  float64
 6   power       32648 non-null  float64
 7   brand       32648 non-null  object 
dtypes: float64(4), object(4)
memory usage: 2.0+ MB


In [7]:
#bike_name

In [8]:
df["bike_name"].value_counts().head(60)

Bajaj Pulsar 150cc                      2776
Bajaj Avenger Street 220                2531
Bajaj Avenger 220cc                     2060
Royal Enfield Classic 350cc             1673
Hero Passion Pro 100cc                  1432
Hero Passion 100cc                      1238
Royal Enfield Thunderbird 350cc          919
Yamaha YZF-R15 2.0 150cc                 769
Royal Enfield Bullet Electra 350cc       756
Bajaj Pulsar NS200                       708
KTM RC 390cc                             705
Bajaj Dominar 400 ABS                    700
Hero CD Deluxe 100cc                     693
Bajaj Platina 100cc                      686
Hero CBZ Xtreme 150cc                    686
Honda CB Trigger 150cc                   681
TVS Apache RTR 180cc                     675
Yamaha Fazer 150cc                       674
Yamaha FZ 150cc                          661
Hero Super Splendor 125cc                649
Honda CB Hornet 160R STD                 641
Harley-Davidson Street 750 ABS           631
Hero Hunk 

In [9]:
## city
df["city"].value_counts().head(60)

Delhi                  7318
Bangalore              2723
Mumbai                 2591
Hyderabad              2160
Pune                   1724
Chennai                1619
Lucknow                1294
Jaipur                 1007
Ghaziabad               938
Ahmedabad               905
Noida                   776
Bhopal                  651
Gautam Buddha Nagar     649
Kanchipuram             640
Jodhpur                 635
Karnal                  625
Rupnagar                621
Allahabad               621
Gurgaon                 617
Godhara                 611
Faridabad               609
Kadapa                  608
Perumbavoor             608
Ludhiana                100
Kolkata                  97
Thane                    94
Jhansi                   87
Vadodara                 75
Surat                    57
Jalandhar                52
Chandigarh               46
Rajkot                   36
Indore                   33
Dehradun                 30
Patna                    29
Navi Mumbai         

In [10]:
df["owner"].value_counts()

First Owner             29964
Second Owner             2564
Third Owner               108
Fourth Owner Or More       12
Name: owner, dtype: int64

In [11]:
#brand
df['brand'].value_counts().sort_values(ascending=False).head(10)

Bajaj              11213
Hero                6368
Royal Enfield       4178
Yamaha              3916
Honda               2108
Suzuki              1464
TVS                 1247
KTM                 1077
Harley-Davidson      737
Kawasaki              79
Name: brand, dtype: int64

In [12]:
df["city"].value_counts()

Delhi            7318
Bangalore        2723
Mumbai           2591
Hyderabad        2160
Pune             1724
                 ... 
Surendranagar       1
Khandela            1
Mohammadabad        1
Shimla              1
Sidhi               1
Name: city, Length: 443, dtype: int64

In [13]:
df["bike_name"].value_counts()

Bajaj Pulsar 150cc                                   2776
Bajaj Avenger Street 220                             2531
Bajaj Avenger 220cc                                  2060
Royal Enfield Classic 350cc                          1673
Hero Passion Pro 100cc                               1432
                                                     ... 
Indian Chief Classic 1800cc                             1
TVS Apache RTR 160cc White Race Edition Rear Disc       1
Triumph Bonneville T100 900cc                           1
Bajaj CT110 ES Alloy                                    1
Yamaha FZS FI 150cc Special Edition                     1
Name: bike_name, Length: 471, dtype: int64

In [14]:
x=df.drop("price",axis=1)
y=df["price"]

In [15]:
def top_ten_encode(df):
    top_10=df.value_counts().sort_values(ascending=False).head(10).index.to_list()
    df.loc[df.isin (top_10)==False]="Uncommon"
    return df
    


for feature in ["city","bike_name","brand"]:
    top_ten_encode(x[feature])
    
ohe=OneHotEncoder(drop="first",sparse=False,dtype="int")   #only two things
oe=OrdinalEncoder(categories=[["First Owner","Second Owner","Third Owner","Fourth Owner Or More"]],dtype="int")
transformer=ColumnTransformer(transformers=[("ohe",ohe,["city","bike_name","brand"]),("oe",oe,["owner"])],remainder="passthrough")  #passthrough inside quotes
new_array=transformer.fit_transform(x)
new_x=pd.DataFrame(new_array)

new_x.columns=transformer.get_feature_names_out()
new_x

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.isin (top_10)==False]="Uncommon"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.isin (top_10)==False]="Uncommon"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df.isin (top_10)==False]="Uncommon"


Unnamed: 0,ohe__city_Bangalore,ohe__city_Chennai,ohe__city_Delhi,ohe__city_Ghaziabad,ohe__city_Hyderabad,ohe__city_Jaipur,ohe__city_Lucknow,ohe__city_Mumbai,ohe__city_Pune,ohe__city_Uncommon,...,ohe__brand_Kawasaki,ohe__brand_Royal Enfield,ohe__brand_Suzuki,ohe__brand_TVS,ohe__brand_Uncommon,ohe__brand_Yamaha,oe__owner,remainder__kms_driven,remainder__age,remainder__power
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,17654.0,3.0,110.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,11000.0,4.0,350.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,110.0,8.0,675.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,16329.0,4.0,180.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10000.0,3.0,150.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32643,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22000.0,4.0,100.0
32644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6639.0,9.0,180.0
32645,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20373.0,6.0,220.0
32646,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84186.0,16.0,125.0


In [16]:
x_train,x_test,y_train,y_test=train_test_split(new_x,y,test_size=.2,random_state=4)

In [17]:
lin_reg=LinearRegression()
lin_reg.fit(x_train,y_train)

In [18]:
lin_reg.score(x_train,y_train)

0.890538244757788

In [19]:
lin_reg.score(x_test,y_test)

0.8813696545694797

In [20]:
def training_results(x_train,y_train,model):
        y_pred_train=model.predict(x_train)
        mse_train=mean_squared_error(y_train,y_pred_train)
        mae_train=mean_absolute_error(y_train,y_pred_train)
        rmse_train=np.sqrt(mse_train)
        r2_score_train=r2_score(y_train,y_pred_train)
        print(f"{mse_train=}\n{mae_train=}\n{rmse_train=}\n{r2_score_train=}")
        
def testing_results(x_test,y_test,model):
        y_pred_test=model.predict(x_test)
        mse_test=mean_squared_error(y_test,y_pred_test)
        mae_test=mean_absolute_error(y_test,y_pred_test)
        rmse_test=np.sqrt(mse_test)
        r2_score_test=r2_score(y_test,y_pred_test)
        print(f"{mse_test=}\n{mae_test=}\n{rmse_test=}\n{r2_score_test=}")

In [21]:
training_results(x_train,y_train,lin_reg)

mse_train=908506851.41072
mae_train=12397.645671333943
rmse_train=30141.4474007258
r2_score_train=0.890538244757788


In [22]:
testing_results(x_test,y_test,lin_reg)

mse_test=942994770.2125176
mae_test=12367.964867904922
rmse_test=30708.219912793993
r2_score_test=0.8813696545694797


In [23]:
#knn 

In [24]:
std_scaler=StandardScaler()

In [25]:
knn_reg=KNeighborsRegressor()

In [26]:
knn_reg.fit(x_train,y_train)

In [27]:
training_results(x_train,y_train,knn_reg)

mse_train=737363009.0477036
mae_train=5708.309150777242
rmse_train=27154.428902993037
r2_score_train=0.91115856848442


In [28]:
testing_results(x_test,y_test,knn_reg)

mse_test=1153980412.2544136
mae_test=7136.794578866769
rmse_test=33970.28719711409
r2_score_test=0.8548273020698263


In [29]:
hyper_parm={"n_neighbors":np.arange(2,30),"p":[1,2]}

randomized_search_knn=RandomizedSearchCV(knn_reg,hyper_parm)

In [30]:
randomized_search_knn.fit(x_train,y_train)

In [31]:
knn_reg_tunned=randomized_search_knn.best_estimator_

In [32]:
knn_reg_tunned.fit(x_train,y_train)

In [33]:
training_results(x_train,y_train,knn_reg_tunned)

mse_train=1056122551.1200697
mae_train=6860.212818745693
rmse_train=32498.039188850606
r2_score_train=0.8727527172558188


In [34]:
testing_results(x_test,y_test,knn_reg_tunned)

mse_test=1297559193.9712343
mae_test=7491.376692189892
rmse_test=36021.64896241196
r2_score_test=0.8367648472083629


In [35]:
# dt_reg=DecisionTreeRegressor(random_state=6)
# dt_reg.fit(x_train,y_train)
# hyper_parm_dt={"criterion":["squared_error","absolute_error"],
#                 "max_depth":[2,3,4],
#                "min_samples_split":[2,3,4],
#                "min_samples_leaf":[5,6,7],
#                "random_state":[5]
    
# }

In [36]:
# randomized_search_dt=RandomizedSearchCV(dt_reg,hyper_parm_dt,n_jobs=-1)

In [37]:
# randomized_search_dt.fit(x_train,y_train)

In [38]:
random_forest_reg=RandomForestRegressor(random_state=5)
random_forest_reg.fit(x_train,y_train)
# hyper_param_rf={"criterion":["squared_error","absolute_error"],
#                 "max_depth":np.arange(2,7),
#                "min_samples_split":np.arange(4,10),
#                "min_samples_leaf":np.arange(4,8),
#                 "random_state":[5],
#                 "n_estimators":np.arange(30,150,10),
#                }

In [39]:
# randomized_search_rf=RandomizedSearchCV(random_forest_reg,hyper_param_rf)
# randomized_search_rf.fit(x_train,y_train)

In [40]:
random_forest_reg.score(x_train,y_train)

0.993069240794693

In [41]:
random_forest_reg.score(x_test,y_test)

0.9515221074775344

In [42]:
training_results(x_train,y_train,random_forest_reg)

mse_train=57523673.08167419
mae_train=1244.9909894733769
rmse_train=7584.436240200994
r2_score_train=0.993069240794693


In [43]:
testing_results(x_test,y_test,random_forest_reg)

mse_test=385351648.0434064
mae_test=3033.5589295945124
rmse_test=19630.375647027402
r2_score_test=0.9515221074775344


In [44]:
y_test_pred=random_forest_reg.predict(x_test)

In [45]:
y_test_pred

array([ 21620.66666667,  14227.        ,  61152.        , ...,
        25036.        ,  71214.38      , 123000.        ])

In [46]:
y_test

3079      25000.0
10185     14227.0
543       62000.0
19141     55000.0
28220     22000.0
           ...   
21118     11900.0
13295     25000.0
7495      12000.0
4623      65949.0
11890    123000.0
Name: price, Length: 6530, dtype: float64

In [47]:
#making pickle file
with open ("bike_price_modrandom_forest_regel.pkl","wb") as file:
    pickle.dump(,file)

In [48]:
#with pickle file of encoder
with open("encoder.pkl","wb") as file:
    pickle.dump(transformer,file)

In [49]:
df.loc[0:10]

Unnamed: 0,bike_name,price,city,kms_driven,owner,age,power,brand
0,TVS Star City Plus Dual Tone 110cc,35000.0,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,119900.0,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Triumph Daytona 675R,600000.0,Delhi,110.0,First Owner,8.0,675.0,Triumph
3,TVS Apache RTR 180cc,65000.0,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Yamaha FZ S V 2.0 150cc-Ltd. Edition,80000.0,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha
5,Yamaha FZs 150cc,53499.0,Delhi,25000.0,First Owner,6.0,150.0,Yamaha
6,Honda CB Hornet 160R ABS DLX,85000.0,Delhi,8200.0,First Owner,3.0,160.0,Honda
7,Hero Splendor Plus Self Alloy 100cc,45000.0,Delhi,12645.0,First Owner,3.0,100.0,Hero
8,Royal Enfield Thunderbird X 350cc,145000.0,Bangalore,9190.0,First Owner,3.0,350.0,Royal Enfield
9,Royal Enfield Classic Desert Storm 500cc,88000.0,Delhi,19000.0,Second Owner,7.0,500.0,Royal Enfield


In [50]:
test_query=x.loc[0:10]
test_query

Unnamed: 0,bike_name,city,kms_driven,owner,age,power,brand
0,Uncommon,Ahmedabad,17654.0,First Owner,3.0,110.0,TVS
1,Royal Enfield Classic 350cc,Delhi,11000.0,First Owner,4.0,350.0,Royal Enfield
2,Uncommon,Delhi,110.0,First Owner,8.0,675.0,Uncommon
3,Uncommon,Bangalore,16329.0,First Owner,4.0,180.0,TVS
4,Uncommon,Bangalore,10000.0,First Owner,3.0,150.0,Yamaha
5,Uncommon,Delhi,25000.0,First Owner,6.0,150.0,Yamaha
6,Uncommon,Delhi,8200.0,First Owner,3.0,160.0,Honda
7,Uncommon,Delhi,12645.0,First Owner,3.0,100.0,Hero
8,Uncommon,Bangalore,9190.0,First Owner,3.0,350.0,Royal Enfield
9,Uncommon,Delhi,19000.0,Second Owner,7.0,500.0,Royal Enfield


In [51]:
transformed_test_query=transformer.transform(test_query)


In [52]:
random_forest_reg.predict(transformed_test_query)



array([ 39987.02      , 117980.        , 470302.844     ,  66449.98      ,
       114720.        ,  48571.38      ,  80333.        ,  48225.83333333,
       145891.        , 112252.        ,  73314.        ])

In [53]:
###making code , to seggregate values which are not in top 10.

def not_top_10(df):
    top_10=df.value_counts().sort_values(ascending=False).head(10).index.to_list()
    not_top_10=df.loc[df.isin (top_10)==False].unique()
    a=list(not_top_10)
    return a 

In [54]:
not_top_10_city=not_top_10(df["city"])
not_top_10_bike=not_top_10(df["bike_name"])
not_top_10_brand=not_top_10(df["brand"])

In [55]:
print(not_top_10_brand)

['Triumph', 'Benelli', 'Mahindra', 'Ducati', 'Hyosung', 'Jawa', 'BMW', 'Indian', 'Rajdoot', 'LML', 'Yezdi', 'MV', 'Ideal']


In [56]:
df1=pd.read_csv(r"C:\Users\kannu\OneDrive\Desktop\Pandas_csvs\Datasets\Regression_data_sets\used_bikes.csv")

In [57]:
df1["bike_name"].loc[df1["bike_name"].isin (not_top_10_bike) ]="Uncommon"


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["bike_name"].loc[df1["bike_name"].isin (not_top_10_bike) ]="Uncommon"


In [58]:
df1["bike_name"]

0                           Uncommon
1        Royal Enfield Classic 350cc
2                           Uncommon
3                           Uncommon
4                           Uncommon
                    ...             
32643         Hero Passion Pro 100cc
32644                       Uncommon
32645       Bajaj Avenger Street 220
32646                       Uncommon
32647             Bajaj Pulsar 150cc
Name: bike_name, Length: 32648, dtype: object

In [59]:
with open ("encoder.pkl","rb") as file:
        transformer=pickle.load(file)


In [63]:
df["owner"].value_counts().to_dict()


{'First Owner': 29964,
 'Second Owner': 2564,
 'Third Owner': 108,
 'Fourth Owner Or More': 12}

In [66]:
a=x[0:5]

In [76]:
encoded=transformer.transform(a)
df_encoded=pd.DataFrame(encoded)

In [77]:
random_forest_reg.predict(df_encoded)



array([ 39987.02 , 117980.   , 470302.844,  66449.98 , 114720.   ])

![image.png](attachment:image.png)

In [None]:
37049

In [None]:
x=pd.DataFrame({"bike_name":["Bajaj Pulsar 150cc"],"city":"Delhi","brand":"Bajaj","owner":"First owner","age":4,"kms_driven":343,"power":67})