In [1]:
import pandas as pd
import numpy as np

# from sklearn.datasets import load_boston, load_diabetes
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import pickle

In [2]:
df = pd.read_csv("Bikes_data.csv")

In [3]:
df

Unnamed: 0,Company,Country of Origin,Model,Number of cc,Horsepower,Torque,Transmission Type,Drivetrain,Number of Seating,Price (in INR),Year,Looks,Body Type,Engine Type,Number of Cylinders
0,Aprilia,Italy,RS 660,659,100 hp,67 Nm,6-speed quickshifter,Chain,2,"INR 10,99,000",2021,Sport,Naked,Parallel-twin,2
1,Aprilia,Italy,Tuono 660,659,100 hp,67 Nm,6-speed quickshifter,Chain,2,"INR 11,99,000",2021,Sport,Naked,Parallel-twin,2
2,Aprilia,Italy,RS 125,124.9,15 hp,12 Nm,6-speed manual,Chain,2,"INR 4,49,000",2022,Sport,Racing,Single-cylinder,1
3,Aprilia,Italy,Shiver 900,896,95 hp,90 Nm,6-speed manual,Shaft,2,"INR 13,99,000",2022,Adventure,Naked,V-twin,2
4,Aprilia,Italy,Tuono 1100,1077,175 hp,121 Nm,6-speed manual,Shaft,2,"INR 19,99,000",2022,Adventure,Naked,V-twin,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,Energica Motor Company,Italy,EsseEsse9,110 kW,163 hp,195 lb-ft,Automatic,Electric,2,INR 25 lakhs,2019,Retro,Cruiser,Electric,1
358,Energica Motor Company,Italy,EGO+,126 kW,171 hp,222 lb-ft,Automatic,Electric,1,INR 23.75 lakhs,2022,Futuristic,Naked,Electric,1
359,Energica Motor Company,Italy,Eva Ribelle RS,126 kW,171 hp,222 lb-ft,Automatic,Electric,2,INR 25.25 lakhs,2022,Modern,Roadster,Electric,1
360,Moto Morini,Italy,X-Cape,649,60.8 PS,62 Nm,6-speed manual,Parallel twin,2,720000,2023,Modern,Adventure touring,"Liquid-cooled, DOHC",2


In [4]:
df.columns

Index(['Company', 'Country of Origin', 'Model', 'Number of cc', 'Horsepower',
       'Torque', 'Transmission Type', 'Drivetrain', 'Number of Seating',
       'Price (in INR)', 'Year', 'Looks', 'Body Type', 'Engine Type',
       'Number of Cylinders'],
      dtype='object')

In [5]:
df = df.rename(columns={'Country of Origin': 'Country_of_Origin', 'Number of cc': 'Number_of_cc','Transmission Type':'Transmission_Type','Number of Seating':'Number_of_Seating','Number of Cylinders':'Number_of_Cylinders'})

In [6]:
df["Price (in INR)"]

0        INR 10,99,000
1        INR 11,99,000
2         INR 4,49,000
3        INR 13,99,000
4        INR 19,99,000
            ...       
357       INR 25 lakhs
358    INR 23.75 lakhs
359    INR 25.25 lakhs
360           7,20,000
361           6,89,000
Name: Price (in INR), Length: 362, dtype: object

## 1 -  PROBLEM STATEMENT 

## 2 - DATA GATHERING 

In [7]:
df

Unnamed: 0,Company,Country_of_Origin,Model,Number_of_cc,Horsepower,Torque,Transmission_Type,Drivetrain,Number_of_Seating,Price (in INR),Year,Looks,Body Type,Engine Type,Number_of_Cylinders
0,Aprilia,Italy,RS 660,659,100 hp,67 Nm,6-speed quickshifter,Chain,2,"INR 10,99,000",2021,Sport,Naked,Parallel-twin,2
1,Aprilia,Italy,Tuono 660,659,100 hp,67 Nm,6-speed quickshifter,Chain,2,"INR 11,99,000",2021,Sport,Naked,Parallel-twin,2
2,Aprilia,Italy,RS 125,124.9,15 hp,12 Nm,6-speed manual,Chain,2,"INR 4,49,000",2022,Sport,Racing,Single-cylinder,1
3,Aprilia,Italy,Shiver 900,896,95 hp,90 Nm,6-speed manual,Shaft,2,"INR 13,99,000",2022,Adventure,Naked,V-twin,2
4,Aprilia,Italy,Tuono 1100,1077,175 hp,121 Nm,6-speed manual,Shaft,2,"INR 19,99,000",2022,Adventure,Naked,V-twin,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,Energica Motor Company,Italy,EsseEsse9,110 kW,163 hp,195 lb-ft,Automatic,Electric,2,INR 25 lakhs,2019,Retro,Cruiser,Electric,1
358,Energica Motor Company,Italy,EGO+,126 kW,171 hp,222 lb-ft,Automatic,Electric,1,INR 23.75 lakhs,2022,Futuristic,Naked,Electric,1
359,Energica Motor Company,Italy,Eva Ribelle RS,126 kW,171 hp,222 lb-ft,Automatic,Electric,2,INR 25.25 lakhs,2022,Modern,Roadster,Electric,1
360,Moto Morini,Italy,X-Cape,649,60.8 PS,62 Nm,6-speed manual,Parallel twin,2,720000,2023,Modern,Adventure touring,"Liquid-cooled, DOHC",2


## 3-  EDA

1. Missing Values
2. Outliers
3. Categorical Data(Object DataTypes)


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Company              362 non-null    object
 1   Country_of_Origin    362 non-null    object
 2   Model                362 non-null    object
 3   Number_of_cc         362 non-null    object
 4   Horsepower           362 non-null    object
 5   Torque               362 non-null    object
 6   Transmission_Type    362 non-null    object
 7   Drivetrain           362 non-null    object
 8   Number_of_Seating    362 non-null    int64 
 9   Price (in INR)       362 non-null    object
 10  Year                 362 non-null    int64 
 11  Looks                362 non-null    object
 12  Body Type            362 non-null    object
 13  Engine Type          362 non-null    object
 14  Number_of_Cylinders  361 non-null    object
dtypes: int64(2), object(13)
memory usage: 42.5+ KB


### 3.1 Company

In [9]:
df["Company"].value_counts()

Company
Benelli             16
Husqvarna           12
Mutt Motorcycles    11
KTM                  9
Genuine Scooters     9
                    ..
Italjet              1
Derbi                1
Ontrack              1
Ampere               1
Fantic               1
Name: count, Length: 75, dtype: int64

In [10]:
df["Company"].value_counts()

Company
Benelli             16
Husqvarna           12
Mutt Motorcycles    11
KTM                  9
Genuine Scooters     9
                    ..
Italjet              1
Derbi                1
Ontrack              1
Ampere               1
Fantic               1
Name: count, Length: 75, dtype: int64

In [11]:
df["Company"].value_counts().to_dict()

{'Benelli': 16,
 'Husqvarna': 12,
 'Mutt Motorcycles': 11,
 'KTM': 9,
 'Genuine Scooters': 9,
 'Triumph': 9,
 'Harley-Davidson': 9,
 'Beta': 8,
 'GasGas': 8,
 'BMW': 8,
 'TVS': 7,
 'Royal Enfield': 7,
 'Honda': 7,
 'Hero': 6,
 'Suzuki': 6,
 'Moto Guzzi': 6,
 'Bajaj': 6,
 'Jawa': 6,
 'Keeway': 6,
 'Yamaha': 6,
 'Kawasaki': 6,
 'Hanway': 5,
 'Victory': 5,
 'Sherco': 5,
 'Zero Motorcycles': 5,
 'UM Motorcycles': 5,
 'NIU': 5,
 'Sinnis': 5,
 'SWM': 5,
 'Super Soco': 5,
 'Royal Alloy': 5,
 'Aprilia': 5,
 'Vespa': 5,
 'SYM': 5,
 'Brixton Motorcycles': 5,
 'Buell Motorcycle Company': 5,
 'Cagiva': 5,
 'Ducati': 5,
 'Indian': 5,
 'Energica Motor Company': 5,
 'MV Agusta': 5,
 'Mahindra': 5,
 'Piaggio': 5,
 'Mash': 4,
 'Lexmoto': 4,
 'Aeon': 4,
 'Daelim': 4,
 'CFMOTO': 4,
 'CCM': 4,
 'Norton': 4,
 'Kymco': 4,
 'Leonart': 4,
 'Lifan': 4,
 'Vmoto Soco': 4,
 'Moto Morini': 4,
 'Zontes': 4,
 'Scomadi': 3,
 'Confederate': 3,
 'AJS': 3,
 'FB Mondial': 3,
 'Peugeot': 3,
 'Can-Am': 3,
 'Lambretta': 2,


In [12]:
df["Company"].replace({"Benelli" : 1,
 'Husqvarna': 2,
 'Mutt Motorcycles': 3,
 'KTM': 4,
 'Genuine Scooters': 5,
 'Triumph': 6,
 'Harley-Davidson': 7,
 'Beta': 8,
 'GasGas':9,
 'BMW': 10,
 'TVS': 12,
 'Royal Enfield': 13,
 'Honda': 14,
 'Hero': 15,
 'Suzuki': 16,
 'Moto Guzzi': 17,
 'Bajaj': 18,
 'Jawa': 19,
 'Keeway': 20,
 'Yamaha': 21,
 'Kawasaki': 22,
 'Hanway': 23,
 'Victory': 24,
 'Sherco': 25,
 'Zero Motorcycles': 26,
 'UM Motorcycles': 27,
 'NIU': 28,
 'Sinnis': 29,
 'SWM': 30,
 'Super Soco': 31,
 'Royal Alloy': 31,
 'Aprilia': 33,
 'Vespa': 34,
 'SYM': 35,
 'Brixton Motorcycles': 36,
 'Buell Motorcycle Company': 37,
 'Cagiva': 39,
 'Ducati': 40,
 'Indian': 41,
 'Energica Motor Company': 42,
 'MV Agusta': 43,
 'Mahindra': 43,
 'Piaggio': 43,
 'Mash': 44,
 'Lexmoto': 45,
 'Aeon': 45,
 'Daelim': 45,
 'CFMOTO': 45,
 'CCM': 45,
 'Norton': 45,
 'Kymco': 45,
 'Leonart': 45,
 'Lifan': 45,
 'Vmoto Soco': 45,
 'Moto Morini': 45,
 'Zontes': 45,
 'Scomadi': 46,
 'Confederate': 46,
 'AJS': 46,
 'FB Mondial': 46,
 'Peugeot': 46,
 'Can-Am': 46,
 'Lambretta': 47,
 'Apollo': 47,
 'Larry vs Harry': 47,
 'Hero Electric': 47,
 'Hyosung': 47,
 'Arcfox': 47,
 'Segway': 47,
 'Triton Electric Bikes': 47,
 'Italjet': 47,
 'Derbi': 47,
 'Ontrack': 47,
 'Ampere': 47,
 'Fantic': 47},inplace = True)

In [13]:
df["Company"]

0      33
1      33
2      33
3      33
4      33
       ..
357    42
358    42
359    42
360    45
361    45
Name: Company, Length: 362, dtype: int64

In [14]:
Company_data = {"Benelli" : 1,
 'Husqvarna': 2,
 'Mutt Motorcycles': 3,
 'KTM': 4,
 'Genuine Scooters': 5,
 'Triumph': 6,
 'Harley-Davidson': 7,
 'Beta': 8,
 'GasGas':9,
 'BMW': 10,
 'TVS': 12,
 'Royal Enfield': 13,
 'Honda': 14,
 'Hero': 15,
 'Suzuki': 16,
 'Moto Guzzi': 17,
 'Bajaj': 18,
 'Jawa': 19,
 'Keeway': 20,
 'Yamaha': 21,
 'Kawasaki': 22,
 'Hanway': 23,
 'Victory': 24,
 'Sherco': 25,
 'Zero Motorcycles': 26,
 'UM Motorcycles': 27,
 'NIU': 28,
 'Sinnis': 29,
 'SWM': 30,
 'Super Soco': 31,
 'Royal Alloy': 31,
 'Aprilia': 33,
 'Vespa': 34,
 'SYM': 35,
 'Brixton Motorcycles': 36,
 'Buell Motorcycle Company': 37,
 'Cagiva': 39,
 'Ducati': 40,
 'Indian': 41,
 'Energica Motor Company': 42,
 'MV Agusta': 43,
 'Mahindra': 43,
 'Piaggio': 43,
 'Mash': 44,
 'Lexmoto': 45,
 'Aeon': 45,
 'Daelim': 45,
 'CFMOTO': 45,
 'CCM': 45,
 'Norton': 45,
 'Kymco': 45,
 'Leonart': 45,
 'Lifan': 45,
 'Vmoto Soco': 45,
 'Moto Morini': 45,
 'Zontes': 45,
 'Scomadi': 46,
 'Confederate': 46,
 'AJS': 46,
 'FB Mondial': 46,
 'Peugeot': 46,
 'Can-Am': 46,
 'Lambretta': 47,
 'Apollo': 47,
 'Larry vs Harry': 47,
 'Hero Electric': 47,
 'Hyosung': 47,
 'Arcfox': 47,
 'Segway': 47,
 'Triton Electric Bikes': 47,
 'Italjet': 47,
 'Derbi': 47,
 'Ontrack': 47,
 'Ampere': 47,
 'Fantic': 47}


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Company              362 non-null    int64 
 1   Country_of_Origin    362 non-null    object
 2   Model                362 non-null    object
 3   Number_of_cc         362 non-null    object
 4   Horsepower           362 non-null    object
 5   Torque               362 non-null    object
 6   Transmission_Type    362 non-null    object
 7   Drivetrain           362 non-null    object
 8   Number_of_Seating    362 non-null    int64 
 9   Price (in INR)       362 non-null    object
 10  Year                 362 non-null    int64 
 11  Looks                362 non-null    object
 12  Body Type            362 non-null    object
 13  Engine Type          362 non-null    object
 14  Number_of_Cylinders  361 non-null    object
dtypes: int64(3), object(12)
memory usage: 42.5+ KB


### 3.2 Country of Origin

In [16]:
df["Country_of_Origin"]

0      Italy
1      Italy
2      Italy
3      Italy
4      Italy
       ...  
357    Italy
358    Italy
359    Italy
360    Italy
361    Italy
Name: Country_of_Origin, Length: 362, dtype: object

In [17]:
df["Country_of_Origin"].value_counts().to_dict()

{'Italy': 92,
 'India': 47,
 'China': 44,
 'UK': 28,
 'Japan': 24,
 'United States': 19,
 'Taiwan': 18,
 'United Kingdom': 17,
 'Austria': 14,
 'USA': 13,
 'Sweden': 12,
 'Spain': 9,
 'Germany': 8,
 'France': 8,
 'South Korea': 5,
 'Canada': 3,
 'Denmark': 1}

In [18]:
df["Country_of_Origin"].replace({'Italy': 1,
 'India': 2,
 'China':3,
 'UK': 4,
 'Japan': 4,
 'United States': 5,
 'Taiwan': 6,
 'United Kingdom': 7,
 'Austria': 8,
 'USA': 9,
 'Sweden': 10,
 'Spain': 11,
 'Germany': 12,
 'France': 13,
 'South Korea': 14,
 'Canada': 15,
                                 
 'Denmark': 16},inplace = True)

In [19]:
Country_of_Origin_data = {'Italy': 1,
 'India': 2,
 'China':3,
 'UK': 4,
 'Japan': 4,
 'United States': 5,
 'Taiwan': 6,
 'United Kingdom': 7,
 'Austria': 8,
 'USA': 9,
 'Sweden': 10,
 'Spain': 11,
 'Germany': 12,
 'France': 13,
 'South Korea': 14,
 'Canada': 15,
                                 
 'Denmark': 16}

In [20]:
df["Country_of_Origin"].astype(int)

0      1
1      1
2      1
3      1
4      1
      ..
357    1
358    1
359    1
360    1
361    1
Name: Country_of_Origin, Length: 362, dtype: int32

In [21]:
Country_of_Origin_data.keys()

dict_keys(['Italy', 'India', 'China', 'UK', 'Japan', 'United States', 'Taiwan', 'United Kingdom', 'Austria', 'USA', 'Sweden', 'Spain', 'Germany', 'France', 'South Korea', 'Canada', 'Denmark'])

In [22]:
df

Unnamed: 0,Company,Country_of_Origin,Model,Number_of_cc,Horsepower,Torque,Transmission_Type,Drivetrain,Number_of_Seating,Price (in INR),Year,Looks,Body Type,Engine Type,Number_of_Cylinders
0,33,1,RS 660,659,100 hp,67 Nm,6-speed quickshifter,Chain,2,"INR 10,99,000",2021,Sport,Naked,Parallel-twin,2
1,33,1,Tuono 660,659,100 hp,67 Nm,6-speed quickshifter,Chain,2,"INR 11,99,000",2021,Sport,Naked,Parallel-twin,2
2,33,1,RS 125,124.9,15 hp,12 Nm,6-speed manual,Chain,2,"INR 4,49,000",2022,Sport,Racing,Single-cylinder,1
3,33,1,Shiver 900,896,95 hp,90 Nm,6-speed manual,Shaft,2,"INR 13,99,000",2022,Adventure,Naked,V-twin,2
4,33,1,Tuono 1100,1077,175 hp,121 Nm,6-speed manual,Shaft,2,"INR 19,99,000",2022,Adventure,Naked,V-twin,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,42,1,EsseEsse9,110 kW,163 hp,195 lb-ft,Automatic,Electric,2,INR 25 lakhs,2019,Retro,Cruiser,Electric,1
358,42,1,EGO+,126 kW,171 hp,222 lb-ft,Automatic,Electric,1,INR 23.75 lakhs,2022,Futuristic,Naked,Electric,1
359,42,1,Eva Ribelle RS,126 kW,171 hp,222 lb-ft,Automatic,Electric,2,INR 25.25 lakhs,2022,Modern,Roadster,Electric,1
360,45,1,X-Cape,649,60.8 PS,62 Nm,6-speed manual,Parallel twin,2,720000,2023,Modern,Adventure touring,"Liquid-cooled, DOHC",2


In [23]:
df

Unnamed: 0,Company,Country_of_Origin,Model,Number_of_cc,Horsepower,Torque,Transmission_Type,Drivetrain,Number_of_Seating,Price (in INR),Year,Looks,Body Type,Engine Type,Number_of_Cylinders
0,33,1,RS 660,659,100 hp,67 Nm,6-speed quickshifter,Chain,2,"INR 10,99,000",2021,Sport,Naked,Parallel-twin,2
1,33,1,Tuono 660,659,100 hp,67 Nm,6-speed quickshifter,Chain,2,"INR 11,99,000",2021,Sport,Naked,Parallel-twin,2
2,33,1,RS 125,124.9,15 hp,12 Nm,6-speed manual,Chain,2,"INR 4,49,000",2022,Sport,Racing,Single-cylinder,1
3,33,1,Shiver 900,896,95 hp,90 Nm,6-speed manual,Shaft,2,"INR 13,99,000",2022,Adventure,Naked,V-twin,2
4,33,1,Tuono 1100,1077,175 hp,121 Nm,6-speed manual,Shaft,2,"INR 19,99,000",2022,Adventure,Naked,V-twin,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,42,1,EsseEsse9,110 kW,163 hp,195 lb-ft,Automatic,Electric,2,INR 25 lakhs,2019,Retro,Cruiser,Electric,1
358,42,1,EGO+,126 kW,171 hp,222 lb-ft,Automatic,Electric,1,INR 23.75 lakhs,2022,Futuristic,Naked,Electric,1
359,42,1,Eva Ribelle RS,126 kW,171 hp,222 lb-ft,Automatic,Electric,2,INR 25.25 lakhs,2022,Modern,Roadster,Electric,1
360,45,1,X-Cape,649,60.8 PS,62 Nm,6-speed manual,Parallel twin,2,720000,2023,Modern,Adventure touring,"Liquid-cooled, DOHC",2


In [24]:
Country_of_Origin_data.keys()

dict_keys(['Italy', 'India', 'China', 'UK', 'Japan', 'United States', 'Taiwan', 'United Kingdom', 'Austria', 'USA', 'Sweden', 'Spain', 'Germany', 'France', 'South Korea', 'Canada', 'Denmark'])

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Company              362 non-null    int64 
 1   Country_of_Origin    362 non-null    int64 
 2   Model                362 non-null    object
 3   Number_of_cc         362 non-null    object
 4   Horsepower           362 non-null    object
 5   Torque               362 non-null    object
 6   Transmission_Type    362 non-null    object
 7   Drivetrain           362 non-null    object
 8   Number_of_Seating    362 non-null    int64 
 9   Price (in INR)       362 non-null    object
 10  Year                 362 non-null    int64 
 11  Looks                362 non-null    object
 12  Body Type            362 non-null    object
 13  Engine Type          362 non-null    object
 14  Number_of_Cylinders  361 non-null    object
dtypes: int64(4), object(11)
memory usage: 42.5+ KB


### 3.3 Model

In [26]:
df["Model"]

0              RS 660
1           Tuono 660
2              RS 125
3          Shiver 900
4          Tuono 1100
            ...      
357         EsseEsse9
358              EGO+
359    Eva Ribelle RS
360            X-Cape
361    Seiemmezzo 6 ½
Name: Model, Length: 362, dtype: object

In [27]:
df["Model"].value_counts().to_dict()

{'Artemis': 4,
 'TC': 2,
 'Svartpilen 401': 2,
 'X-Cape': 2,
 'Seiemmezzo 6 ½': 2,
 'TC Max': 2,
 'Leoncino 500': 2,
 'TRK 502': 2,
 'TRK 251': 2,
 'TS Street Hunter': 2,
 'TNT 600i': 2,
 'Himalayan': 2,
 'Meteor 350': 2,
 'Interceptor 650': 2,
 'Vitpilen 401': 2,
 'Octane': 1,
 'TC Wanderer': 1,
 'Dragster 200': 1,
 'SE 125 Factory 2T': 1,
 'SE 250 Factory 2T': 1,
 'STX 150': 1,
 'Metropolis 400': 1,
 'SE 300 Factory 2T': 1,
 'SEF 250i-S': 1,
 'SEF 300i-S': 1,
 'Cross Country': 1,
 'Judge': 1,
 'Hammer': 1,
 'Brutus Single Speed MTB 29': 1,
 'RS 660': 1,
 'Vision': 1,
 'Django 300': 1,
 'Vendetta 250': 1,
 'Milano 125': 1,
 'ZSX 125': 1,
 'LXR 125': 1,
 'Jawa Perak': 1,
 'Jawa 42 Bobber': 1,
 'Jawa 42': 1,
 'SR125': 1,
 'Sixties 300i': 1,
 'K300 R': 1,
 'K-Light 250V': 1,
 'Vieste 300': 1,
 'V302C': 1,
 'Raw 125': 1,
 'Scrambler 125': 1,
 'Muscle 125': 1,
 'Black Cafe 125': 1,
 'Furious 125': 1,
 'Radeon': 1,
 'Django 125': 1,
 'Two Fifty': 1,
 'Pulsar 125': 1,
 'MT-10': 1,
 'Rocket 3

In [28]:
df["Model"].replace({'Artemis': 4,
 'TC': 2,
 'Svartpilen 401': 2,
 'X-Cape': 2,
 'Seiemmezzo 6 ½': 2,
 'TC Max': 2,
 'Leoncino 500': 2,
 'TRK 502': 2,
 'TRK 251': 2,
 'TS Street Hunter': 2,
 'TNT 600i': 2,
 'Himalayan': 2,
 'Meteor 350': 2,
 'Interceptor 650': 2,
 'Vitpilen 401': 2,
 'Octane': 3,
 'TC Wanderer': 3,
 'Dragster 200': 3,
 'SE 125 Factory 2T': 3,
 'SE 250 Factory 2T': 3,
 'STX 150': 3,
 'Metropolis 400': 3,
 'SE 300 Factory 2T': 3,
 'SEF 250i-S': 1,
 'SEF 300i-S': 3,
 'Cross Country': 3,
 'Judge': 3,
 'Hammer': 3,
 'Brutus Single Speed MTB 29': 3,
 'RS 660': 3,
 'Vision': 3,
 'Django 300': 3,
 'Vendetta 250': 3,
 'Milano 125': 3,
 'ZSX 125': 3,
 'LXR 125': 3,
 'Jawa Perak': 3,
 'Jawa 42 Bobber': 3,
 'Jawa 42': 3,
 'SR125': 4,
 'Sixties 300i': 1,
 'K300 R': 1,
 'K-Light 250V': 1,
 'Vieste 300': 1,
 'V302C': 1,
 'Raw 125': 1,
 'Scrambler 125': 2,
 'Muscle 125': 2,
 'Black Cafe 125': 2,
 'Furious 125': 2,
 'Radeon': 2,
 'Django 125': 2,
 'Two Fifty': 2,
 'Pulsar 125': 2,
 'MT-10': 2,
 'Rocket 3': 3,
 'Hayabusa': 3,
 '1290 Super Duke R': 4,
 'TE 501i': 4,
 'FE 501': 4,
 'TE 300i': 4,
 'FE 350': 4,
 'Svartpilen 250': 3,
 'Vitpilen 250': 3,
 'Super Cub': 3,
 'CT125': 3,
 'LiveWire': 3,
 'SM 700': 3,
 'EC 300F': 3,
 'EC 250F': 3,
 'Pulsar 150': 1,
 'Pulsar NS200': 1,
 'Dominar 400': 1,
 'Forty Two': 1,
 'Sport 125': 1,
 'Jupiter': 1,
 'Ntorq 125': 1,
 'Apache RTR 160 2V': 1,
 'Apache RTR 200 4V FI': 1,
 'Apache RTR 200 4V': 1,
 'Perak': 1,
 'Jawa': 1,
 'Avenger 220': 1,
 'HF Deluxe': 1,
 'Glamour 125': 1,
 'XPulse 200T': 1,
 'XPulse 200': 1,
 'Passion Pro': 1,
 'Splendor Plus': 1,
 'Platina 110': 1,
 'Black Seven 125': 1,
 'Alpha S': 1,
 'Dirt Track 125': 1,
 'CUx': 1,
 'Cadwell 125': 1,
 'TNT 600RR': 1,
 '502 C': 1,
 'Leoncino 250': 1,
 'Imperiale 400': 1,
 'Kent 125': 1,
 'CT 125': 1,
 'CPx': 1,
 'GRAN TURISMO 125': 1,
 'BQi-C3 Pro': 1,
 'RACER 125': 1,
 'Viale 650': 1,
 'RS 650': 1,
 'Superdual T': 1,
 'SM-XE 125': 1,
 'Terrain 125': 1,
 'City 125': 1,
 'Meteor 125': 1,
 'Tempest Scrambler 125': 1,
 'Tempest Roadster 125': 1,
 'Akita 125': 1,
 '350R': 1,
 'EGO+': 1,
 'EsseEsse9': 1,
 'Eva Ribelle': 1,
 'Ego': 1,
 'Sabbath': 1,
 'RS-13': 1,
 'Razorback': 1,
 'Mongrel': 1,
 'Mastiff': 1,
 'Hilts': 1,
 'GT-SS': 1,
 'GT-SR': 1,
 'FSR': 1,
 'Akita': 1,
 'GK350': 1,
 '350X': 1,
 '350T': 1,
 'Apache 125': 1,
 'NQi Sport': 1,
 'Five Hundred': 1,
 'Zero SR/F': 1,
 'Spyder RT': 1,
 'Mio 110': 1,
 'Neo 110': 1,
 'Revo 125': 1,
 'Revo 300': 1,
 'Bullitt': 1,
 'Zero FX': 1,
 'Zero DS': 1,
 'Zero SR': 1,
 'MQi Plus': 1,
 'Zero S': 1,
 '125 SuperSport': 1,
 '300 SuperSport': 1,
 'HPS 300': 1,
 'Caballero 125': 1,
 'Aquila GV650': 1,
 'EC 250': 1,
 'Dirt eBike X260': 1,
 'Ryker': 1,
 'F3': 1,
 'TL125': 1,
 'Sixty2': 1,
 'NQi GTS': 1,
 'MQi GT': 1,
 'Renegade Thor': 1,
 'Renegade Duty Ace': 1,
 'Renegade Duty S': 1,
 'Renegade Sport S': 1,
 'Renegade Commando': 1,
 'Venture 50': 1,
 'Rattler 50': 1,
 'Roughhouse 50 SPORT': 1,
 'Roughhouse 50': 1,
 'Hooligan 170i': 1,
 'Buddy KICK': 1,
 'Buddy 170i': 1,
 'Buddy 125': 1,
 'Buddy 50': 1,
 'TT125': 1,
 'EC 300': 1,
 'Alp 200': 1,
 'MC 450F': 1,
 'Hness CB350': 1,
 'CBR250R': 1,
 'Heritage Classic': 1,
 'Fat Boy': 1,
 'Fat Bob': 1,
 'PAN America 1250': 1,
 'Street Glide Special': 1,
 'Road Glide Special': 1,
 'Sportster S': 1,
 'Nightster': 1,
 'Multistrada V4': 1,
 'Monster': 1,
 'Scrambler 800': 1,
 'Diavel': 1,
 'Panigale V4': 1,
 'Roadwin 250': 1,
 'NS 250 FI': 1,
 'V125 FI': 1,
 'CB350R': 1,
 'CB Shine': 1,
 'X132 Hellcat Combat Fighter': 1,
 'Activa 125': 1,
 '390 Duke': 4,
 '250 Duke': 4,
 '200 Duke': 4,
 '125 Duke': 4,
 'Vulcan S': 4,
 'Z H2': 4,
 'Z900': 4,
 'Ninja 650':4 ,
 'Ninja 400': 4,
 'Ninja 300': 4,
 'Scout Bobber': 4,
 'FTR 1200 S': 4,
 'Challenger Dark Horse': 4,
 'Chief Bobber Dark Horse': 4,
 'Chief Dark Horse': 4,
 '701 Enduro': 4,
 '701 Supermoto': 4,
 'VL 125': 4,
 'Wraith': 4,
 'RC 390': 4,
 'Crossfire 500 X': 1,
 'BX 150': 1,
 'Cromwell 125': 1,
 'S 1000 R': 1,
 'S 1000 RR': 1,
 'R 1250 RT': 1,
 'R 1250 GS': 1,
 'F 850 GS Adventure': 1,
 'F 850 GS': 1,
 'G 310 R': 1,
 'G 310 GS': 1,
 'TNT 899': 1,
 'TNT 300': 1,
 'TNT 250': 1,
 'TNT 125': 1,
 'Tuono 1100': 1,
 'Shiver 900': 1,
 'RS 125': 1,
 'Crossfire 500': 1,
 'Sunray 500': 1,
 'Hellcat': 1,
 'Blast': 1,
 '650GT': 1,
 '650MT': 1,
 '650NK': 1,
 '300NK': 1,
 'Storm 1200': 1,
 'Interceptor 900': 1,
 'GP450R': 1,
 'Spitfire 650': 1,
 'Gran Canyon 1100': 1,
 'V-Raptor 1000': 1,
 'Elefant 900': 1,
 'Mito 125': 1,
 'Raptor 125': 1,
 'XB12S Lightning': 1,
 'XB9R Firebolt': 1,
 'Ulysses': 1,
 'Lightning': 1,
 'RC 200': 1,
 '390 Adventure': 1,
 'MC 350F': 1,
 'Bonneville T120': 1,
 'Speed Twin 900': 3,
 'Tiger 850 Sport': 3,
 'Tiger 900 GT Pro': 3,
 'Trident 660': 3,
 'Speed Triple 1200 RR': 3,
 'Rocket 3 R': 3,
 'HD 200': 3,
 'Fiddle III 125': 3,
 'Joyride 125': 3,
 'Maxsym 400i': 3,
 'Joymax Z 300': 3,
 'Burgman Street 125': 3,
 'Access 125': 3,
 'Intruder 150': 3,
 'Gixxer 250': 3,
 'Gixxer SF 250': 3,
 'Super Meteor 650': 3,
 'Bonneville T100': 3,
 'Primavera': 3,
 'Bullet 500': 3,
 'Sprint': 3,
 'MC 250F': 1,
 'Alp 125': 1,
 'Tuono 660': 1,
 'RR 125 4-Stroke': 1,
 'RR 250 4-Stroke': 1,
 'RR 390 4-Stroke': 1,
 'RR 125 2-Stroke': 1,
 'RR 250 2-Stroke': 1,
 'RR 350 2-Stroke': 1,
 'XSR155': 1,
 'FZ-25': 1,
 'R3': 1,
 'MT-09': 1,
 'R1': 1,
 'Elettrica': 1,
 '946': 1,
 'GTS 300': 1,
 'Continental GT 650': 1,
 'Bullet 350': 1,
 '790 Adventure': 1,
 'Centuro 100': 1,
 'Gusto 125': 1,
 'Xpulse 200': 1,
 'Mojo 300': 1,
 'LF250GY-3': 1,
 'LF250K': 1,
 'LF200GY': 1,
 'LF125S': 1,
 'V200': 1,
 'V125': 1,
 'Scrambler 650': 1,
 'Cafe Racer 500': 1,
 'Gran Turismo 300': 1,
 'Daytona 125': 1,
 'AK 550': 1,
 'K-XCT 300': 1,
 'Downtown 300i': 4,
 'Agility 125': 4,
 'Duro DZire': 4,
 'V85 TT': 4,
 'Vespa GTS300': 4,
 'V9 Bobber': 4,
 'Vespa VX125': 4,
 'Aprilia Tuono 660': 4,
 'Aprilia RS150': 4,
 'Aprilia SR160': 1,
 'V4SV': 3,
 'Atlas': 3,
 'Commando 961 Sport': 1,
 'Commando 961 Café Racer': 1,
 'Superveloce 800': 1,
 'Dragster RR': 1,
 'Turismo Veloce Lusso': 1,
 'Brutale 1000 RR': 1,
 'F3 800 RR': 1,
 'MGX-21': 1,
 'Eldorado': 1,
 'Audace': 1,
 'V9 Roamer': 1,
 'Eva Ribelle RS': 1},inplace = True)

In [29]:
Model_data = {'Artemis': 4,
 'TC': 2,
 'Svartpilen 401': 2,
 'X-Cape': 2,
 'Seiemmezzo 6 ½': 2,
 'TC Max': 2,
 'Leoncino 500': 2,
 'TRK 502': 2,
 'TRK 251': 2,
 'TS Street Hunter': 2,
 'TNT 600i': 2,
 'Himalayan': 2,
 'Meteor 350': 2,
 'Interceptor 650': 2,
 'Vitpilen 401': 2,
 'Octane': 3,
 'TC Wanderer': 3,
 'Dragster 200': 3,
 'SE 125 Factory 2T': 3,
 'SE 250 Factory 2T': 3,
 'STX 150': 3,
 'Metropolis 400': 3,
 'SE 300 Factory 2T': 3,
 'SEF 250i-S': 1,
 'SEF 300i-S': 3,
 'Cross Country': 3,
 'Judge': 3,
 'Hammer': 3,
 'Brutus Single Speed MTB 29': 3,
 'RS 660': 3,
 'Vision': 3,
 'Django 300': 3,
 'Vendetta 250': 3,
 'Milano 125': 3,
 'ZSX 125': 3,
 'LXR 125': 3,
 'Jawa Perak': 3,
 'Jawa 42 Bobber': 3,
 'Jawa 42': 3,
 'SR125': 4,
 'Sixties 300i': 1,
 'K300 R': 1,
 'K-Light 250V': 1,
 'Vieste 300': 1,
 'V302C': 1,
 'Raw 125': 1,
 'Scrambler 125': 2,
 'Muscle 125': 2,
 'Black Cafe 125': 2,
 'Furious 125': 2,
 'Radeon': 2,
 'Django 125': 2,
 'Two Fifty': 2,
 'Pulsar 125': 2,
 'MT-10': 2,
 'Rocket 3': 3,
 'Hayabusa': 3,
 '1290 Super Duke R': 4,
 'TE 501i': 4,
 'FE 501': 4,
 'TE 300i': 4,
 'FE 350': 4,
 'Svartpilen 250': 3,
 'Vitpilen 250': 3,
 'Super Cub': 3,
 'CT125': 3,
 'LiveWire': 3,
 'SM 700': 3,
 'EC 300F': 3,
 'EC 250F': 3,
 'Pulsar 150': 1,
 'Pulsar NS200': 1,
 'Dominar 400': 1,
 'Forty Two': 1,
 'Sport 125': 1,
 'Jupiter': 1,
 'Ntorq 125': 1,
 'Apache RTR 160 2V': 1,
 'Apache RTR 200 4V FI': 1,
 'Apache RTR 200 4V': 1,
 'Perak': 1,
 'Jawa': 1,
 'Avenger 220': 1,
 'HF Deluxe': 1,
 'Glamour 125': 1,
 'XPulse 200T': 1,
 'XPulse 200': 1,
 'Passion Pro': 1,
 'Splendor Plus': 1,
 'Platina 110': 1,
 'Black Seven 125': 1,
 'Alpha S': 1,
 'Dirt Track 125': 1,
 'CUx': 1,
 'Cadwell 125': 1,
 'TNT 600RR': 1,
 '502 C': 1,
 'Leoncino 250': 1,
 'Imperiale 400': 1,
 'Kent 125': 1,
 'CT 125': 1,
 'CPx': 1,
 'GRAN TURISMO 125': 1,
 'BQi-C3 Pro': 1,
 'RACER 125': 1,
 'Viale 650': 1,
 'RS 650': 1,
 'Superdual T': 1,
 'SM-XE 125': 1,
 'Terrain 125': 1,
 'City 125': 1,
 'Meteor 125': 1,
 'Tempest Scrambler 125': 1,
 'Tempest Roadster 125': 1,
 'Akita 125': 1,
 '350R': 1,
 'EGO+': 1,
 'EsseEsse9': 1,
 'Eva Ribelle': 1,
 'Ego': 1,
 'Sabbath': 1,
 'RS-13': 1,
 'Razorback': 1,
 'Mongrel': 1,
 'Mastiff': 1,
 'Hilts': 1,
 'GT-SS': 1,
 'GT-SR': 1,
 'FSR': 1,
 'Akita': 1,
 'GK350': 1,
 '350X': 1,
 '350T': 1,
 'Apache 125': 1,
 'NQi Sport': 1,
 'Five Hundred': 1,
 'Zero SR/F': 1,
 'Spyder RT': 1,
 'Mio 110': 1,
 'Neo 110': 1,
 'Revo 125': 1,
 'Revo 300': 1,
 'Bullitt': 1,
 'Zero FX': 1,
 'Zero DS': 1,
 'Zero SR': 1,
 'MQi Plus': 1,
 'Zero S': 1,
 '125 SuperSport': 1,
 '300 SuperSport': 1,
 'HPS 300': 1,
 'Caballero 125': 1,
 'Aquila GV650': 1,
 'EC 250': 1,
 'Dirt eBike X260': 1,
 'Ryker': 1,
 'F3': 1,
 'TL125': 1,
 'Sixty2': 1,
 'NQi GTS': 1,
 'MQi GT': 1,
 'Renegade Thor': 1,
 'Renegade Duty Ace': 1,
 'Renegade Duty S': 1,
 'Renegade Sport S': 1,
 'Renegade Commando': 1,
 'Venture 50': 1,
 'Rattler 50': 1,
 'Roughhouse 50 SPORT': 1,
 'Roughhouse 50': 1,
 'Hooligan 170i': 1,
 'Buddy KICK': 1,
 'Buddy 170i': 1,
 'Buddy 125': 1,
 'Buddy 50': 1,
 'TT125': 1,
 'EC 300': 1,
 'Alp 200': 1,
 'MC 450F': 1,
 'Hness CB350': 1,
 'CBR250R': 1,
 'Heritage Classic': 1,
 'Fat Boy': 1,
 'Fat Bob': 1,
 'PAN America 1250': 1,
 'Street Glide Special': 1,
 'Road Glide Special': 1,
 'Sportster S': 1,
 'Nightster': 1,
 'Multistrada V4': 1,
 'Monster': 1,
 'Scrambler 800': 1,
 'Diavel': 1,
 'Panigale V4': 1,
 'Roadwin 250': 1,
 'NS 250 FI': 1,
 'V125 FI': 1,
 'CB350R': 1,
 'CB Shine': 1,
 'X132 Hellcat Combat Fighter': 1,
 'Activa 125': 1,
 '390 Duke': 4,
 '250 Duke': 4,
 '200 Duke': 4,
 '125 Duke': 4,
 'Vulcan S': 4,
 'Z H2': 4,
 'Z900': 4,
 'Ninja 650':4 ,
 'Ninja 400': 4,
 'Ninja 300': 4,
 'Scout Bobber': 4,
 'FTR 1200 S': 4,
 'Challenger Dark Horse': 4,
 'Chief Bobber Dark Horse': 4,
 'Chief Dark Horse': 4,
 '701 Enduro': 4,
 '701 Supermoto': 4,
 'VL 125': 4,
 'Wraith': 4,
 'RC 390': 4,
 'Crossfire 500 X': 1,
 'BX 150': 1,
 'Cromwell 125': 1,
 'S 1000 R': 1,
 'S 1000 RR': 1,
 'R 1250 RT': 1,
 'R 1250 GS': 1,
 'F 850 GS Adventure': 1,
 'F 850 GS': 1,
 'G 310 R': 1,
 'G 310 GS': 1,
 'TNT 899': 1,
 'TNT 300': 1,
 'TNT 250': 1,
 'TNT 125': 1,
 'Tuono 1100': 1,
 'Shiver 900': 1,
 'RS 125': 1,
 'Crossfire 500': 1,
 'Sunray 500': 1,
 'Hellcat': 1,
 'Blast': 1,
 '650GT': 1,
 '650MT': 1,
 '650NK': 1,
 '300NK': 1,
 'Storm 1200': 1,
 'Interceptor 900': 1,
 'GP450R': 1,
 'Spitfire 650': 1,
 'Gran Canyon 1100': 1,
 'V-Raptor 1000': 1,
 'Elefant 900': 1,
 'Mito 125': 1,
 'Raptor 125': 1,
 'XB12S Lightning': 1,
 'XB9R Firebolt': 1,
 'Ulysses': 1,
 'Lightning': 1,
 'RC 200': 1,
 '390 Adventure': 1,
 'MC 350F': 1,
 'Bonneville T120': 1,
 'Speed Twin 900': 3,
 'Tiger 850 Sport': 3,
 'Tiger 900 GT Pro': 3,
 'Trident 660': 3,
 'Speed Triple 1200 RR': 3,
 'Rocket 3 R': 3,
 'HD 200': 3,
 'Fiddle III 125': 3,
 'Joyride 125': 3,
 'Maxsym 400i': 3,
 'Joymax Z 300': 3,
 'Burgman Street 125': 3,
 'Access 125': 3,
 'Intruder 150': 3,
 'Gixxer 250': 3,
 'Gixxer SF 250': 3,
 'Super Meteor 650': 3,
 'Bonneville T100': 3,
 'Primavera': 3,
 'Bullet 500': 3,
 'Sprint': 3,
 'MC 250F': 1,
 'Alp 125': 1,
 'Tuono 660': 1,
 'RR 125 4-Stroke': 1,
 'RR 250 4-Stroke': 1,
 'RR 390 4-Stroke': 1,
 'RR 125 2-Stroke': 1,
 'RR 250 2-Stroke': 1,
 'RR 350 2-Stroke': 1,
 'XSR155': 1,
 'FZ-25': 1,
 'R3': 1,
 'MT-09': 1,
 'R1': 1,
 'Elettrica': 1,
 '946': 1,
 'GTS 300': 1,
 'Continental GT 650': 1,
 'Bullet 350': 1,
 '790 Adventure': 1,
 'Centuro 100': 1,
 'Gusto 125': 1,
 'Xpulse 200': 1,
 'Mojo 300': 1,
 'LF250GY-3': 1,
 'LF250K': 1,
 'LF200GY': 1,
 'LF125S': 1,
 'V200': 1,
 'V125': 1,
 'Scrambler 650': 1,
 'Cafe Racer 500': 1,
 'Gran Turismo 300': 1,
 'Daytona 125': 1,
 'AK 550': 1,
 'K-XCT 300': 1,
 'Downtown 300i': 4,
 'Agility 125': 4,
 'Duro DZire': 4,
 'V85 TT': 4,
 'Vespa GTS300': 4,
 'V9 Bobber': 4,
 'Vespa VX125': 4,
 'Aprilia Tuono 660': 4,
 'Aprilia RS150': 4,
 'Aprilia SR160': 1,
 'V4SV': 3,
 'Atlas': 3,
 'Commando 961 Sport': 1,
 'Commando 961 Café Racer': 1,
 'Superveloce 800': 1,
 'Dragster RR': 1,
 'Turismo Veloce Lusso': 1,
 'Brutale 1000 RR': 1,
 'F3 800 RR': 1,
 'MGX-21': 1,
 'Eldorado': 1,
 'Audace': 1,
 'V9 Roamer': 1,
 'Eva Ribelle RS': 1}

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Company              362 non-null    int64 
 1   Country_of_Origin    362 non-null    int64 
 2   Model                362 non-null    int64 
 3   Number_of_cc         362 non-null    object
 4   Horsepower           362 non-null    object
 5   Torque               362 non-null    object
 6   Transmission_Type    362 non-null    object
 7   Drivetrain           362 non-null    object
 8   Number_of_Seating    362 non-null    int64 
 9   Price (in INR)       362 non-null    object
 10  Year                 362 non-null    int64 
 11  Looks                362 non-null    object
 12  Body Type            362 non-null    object
 13  Engine Type          362 non-null    object
 14  Number_of_Cylinders  361 non-null    object
dtypes: int64(5), object(10)
memory usage: 42.5+ KB


In [31]:
Model_data.keys()

dict_keys(['Artemis', 'TC', 'Svartpilen 401', 'X-Cape', 'Seiemmezzo 6 ½', 'TC Max', 'Leoncino 500', 'TRK 502', 'TRK 251', 'TS Street Hunter', 'TNT 600i', 'Himalayan', 'Meteor 350', 'Interceptor 650', 'Vitpilen 401', 'Octane', 'TC Wanderer', 'Dragster 200', 'SE 125 Factory 2T', 'SE 250 Factory 2T', 'STX 150', 'Metropolis 400', 'SE 300 Factory 2T', 'SEF 250i-S', 'SEF 300i-S', 'Cross Country', 'Judge', 'Hammer', 'Brutus Single Speed MTB 29', 'RS 660', 'Vision', 'Django 300', 'Vendetta 250', 'Milano 125', 'ZSX 125', 'LXR 125', 'Jawa Perak', 'Jawa 42 Bobber', 'Jawa 42', 'SR125', 'Sixties 300i', 'K300 R', 'K-Light 250V', 'Vieste 300', 'V302C', 'Raw 125', 'Scrambler 125', 'Muscle 125', 'Black Cafe 125', 'Furious 125', 'Radeon', 'Django 125', 'Two Fifty', 'Pulsar 125', 'MT-10', 'Rocket 3', 'Hayabusa', '1290 Super Duke R', 'TE 501i', 'FE 501', 'TE 300i', 'FE 350', 'Svartpilen 250', 'Vitpilen 250', 'Super Cub', 'CT125', 'LiveWire', 'SM 700', 'EC 300F', 'EC 250F', 'Pulsar 150', 'Pulsar NS200', 'D

### 3.4 Number_of_cc

In [32]:
df["Number_of_cc"]

0         659
1         659
2       124.9
3         896
4        1077
        ...  
357    110 kW
358    126 kW
359    126 kW
360       649
361       649
Name: Number_of_cc, Length: 362, dtype: object

In [33]:
df["Number_of_cc"].value_counts()


Number_of_cc
125     60
250     24
300     12
500      8
110      7
        ..
1133     1
693      1
1158     1
321      1
346      1
Name: count, Length: 139, dtype: int64

In [34]:
df["Number_of_cc"]

0         659
1         659
2       124.9
3         896
4        1077
        ...  
357    110 kW
358    126 kW
359    126 kW
360       649
361       649
Name: Number_of_cc, Length: 362, dtype: object

In [35]:
data = df["Number_of_cc"].unique()

In [36]:
df["Number_of_cc"] = [value.replace(' cc', '').replace(' kW', '').replace(",","") for value in df["Number_of_cc"]]

In [37]:
df["Number_of_cc"].value_counts()

Number_of_cc
125      60
250      24
300      12
500      11
110      10
         ..
1340      1
2458      1
149.5     1
2163      1
160       1
Name: count, Length: 128, dtype: int64

In [38]:
df["Number_of_cc"].tail(20)

342    348
343    348
344    348
345    125
346    250
347    250
348    250
349    250
350    250
351    250
352    250
353    250
354    250
355    110
356    110
357    110
358    126
359    126
360    649
361    649
Name: Number_of_cc, dtype: object

In [39]:
df["Number_of_cc"].head(50)

0       659
1       659
2     124.9
3       896
4      1077
5       125
6       250
7       300
8       600
9       899
10      250
11      500
12      500
13      313
14      313
15      853
16      853
17     1170
18     1170
19      999
20      999
21      125
22      149
23      500
24      500
25      500
26      492
27      984
28     1125
29      984
30     1201
31    124.9
32      125
33      893
34      998
35     1078
36      650
37      450
38      900
39     1200
40    292.4
41    649.3
42      649
43    649.3
44     1967
45     1967
46     2163
47    124.9
48    124.9
49    249.6
Name: Number_of_cc, dtype: object

In [40]:
import re


# Remove "cc" suffix from the data
df["Number_of_cc"] = [re.sub(r'cc$', '', value) for value in df["Number_of_cc"]]

df["Number_of_cc"]


0        659
1        659
2      124.9
3        896
4       1077
       ...  
357      110
358      126
359      126
360      649
361      649
Name: Number_of_cc, Length: 362, dtype: object

In [41]:
import pandas as pd
import re

# # Create a sample dataframe
# df = pd.DataFrame({
#     'Column1': ['125cc', 'ABC', '300cc'],
#     'Column2': ['XYZ', '500cc', '600cc'],
#     'Column3': ['700cc', 'PQR', '900cc']
# })

# Remove pure strings from the dataframe
df["Number_of_cc"] = df[["Number_of_cc"]].applymap(lambda x: re.sub(r'[A-Za-z]+', '', str(x)) if isinstance(x, str) else x)

df["Number_of_cc"]


0        659
1        659
2      124.9
3        896
4       1077
       ...  
357      110
358      126
359      126
360      649
361      649
Name: Number_of_cc, Length: 362, dtype: object

In [42]:
df["Number_of_cc"].isna().sum()

0

In [43]:
import pandas as pd
import numpy as np

# Remove spaces and replace with null
df["Number_of_cc"] = df["Number_of_cc"].replace(r'^\s*$', np.nan, regex=True)




In [44]:
df["Number_of_cc"].isna().sum()

1

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Company              362 non-null    int64 
 1   Country_of_Origin    362 non-null    int64 
 2   Model                362 non-null    int64 
 3   Number_of_cc         361 non-null    object
 4   Horsepower           362 non-null    object
 5   Torque               362 non-null    object
 6   Transmission_Type    362 non-null    object
 7   Drivetrain           362 non-null    object
 8   Number_of_Seating    362 non-null    int64 
 9   Price (in INR)       362 non-null    object
 10  Year                 362 non-null    int64 
 11  Looks                362 non-null    object
 12  Body Type            362 non-null    object
 13  Engine Type          362 non-null    object
 14  Number_of_Cylinders  361 non-null    object
dtypes: int64(5), object(10)
memory usage: 42.5+ KB


In [46]:
df["Number_of_cc"] = df["Number_of_cc"].replace(np.nan, "1", regex=True)

In [47]:
df["Number_of_cc"] = df["Number_of_cc"].astype(float)

In [48]:
df["Number_of_cc"].head(20)

0      659.0
1      659.0
2      124.9
3      896.0
4     1077.0
5      125.0
6      250.0
7      300.0
8      600.0
9      899.0
10     250.0
11     500.0
12     500.0
13     313.0
14     313.0
15     853.0
16     853.0
17    1170.0
18    1170.0
19     999.0
Name: Number_of_cc, dtype: float64

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    object 
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    object 
 7   Drivetrain           362 non-null    object 
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    object 
 12  Body Type            362 non-null    object 
 13  Engine Type          362 non-null    object 
 14  Number_of_Cylinders  361 non-null    object 
dtypes: float64(1), int64(5), object(9)
memor

In [50]:
df["Number_of_cc"].isna().sum()

0

### 3.5 Horsepower

In [51]:
df["Horsepower"]

0       100 hp
1       100 hp
2        15 hp
3        95 hp
4       175 hp
        ...   
357     163 hp
358     171 hp
359     171 hp
360    60.8 PS
361    55.7 PS
Name: Horsepower, Length: 362, dtype: object

In [52]:
df["Horsepower"].value_counts()

Horsepower
11 hp                  6
20 bhp                 6
95 hp                  5
15 hp                  5
18.4 bhp               5
                      ..
26.5 hp @ 8,500 rpm    1
38 hp                  1
24 hp                  1
94                     1
42 hp                  1
Name: count, Length: 245, dtype: int64

In [53]:
df["Horsepower"].unique()

array(['100 hp', '15 hp', '95 hp', '175 hp', '11 hp', '27 hp', '39 hp',
       '85 hp', '116 hp', '47 hp', '34 hp', '136 hp', '199 hp', '165 hp',
       '11.7 hp', '41 hp', '45 hp', '110 hp', '96 hp', '105 hp', '80 hp',
       '65 hp', '60 hp', '86 hp', '115 hp', '33.99 PS', '61.18 PS',
       '70.70 PS', '62.54 PS', '205 hp', '190 hp', '227 hp', '10.8 hp',
       '11.2 hp', '27.6 hp', '221 hp', '159.6 hp', '73 hp', '111 hp',
       '168 hp', '88.5', '120.7', '92.5', '150.2', '94',
       '26.5 hp @ 8,500 rpm', '31 hp @ 8,500 rpm', '29.6 hp @ 5,500 rpm',
       '10.5 hp @ 7,500 rpm', '8.75 hp @ 7,500 rpm', '43 hp @ 9,000 rpm',
       '75 hp @ 8,500 rpm', '108 hp @ 5,600 rpm', '122 hp @ 6,000 rpm',
       '120 hp @ 8,250 rpm', '94 hp @ 6,000 rpm', '39 PS @ 11000 rpm',
       '45 PS @ 10,000 rpm', '67 PS @ 8000 rpm', '125 PS @ 9500 rpm',
       '197 PS @ 11,000 rpm', '60 PS @ 6500 rpm', '25 hp', '30 hp',
       '43 hp', '10 hp', '29 hp', '12.50 HP @ 10000 RPM',
       '25.8 HP @ 9000 RPM

In [54]:
df["Horsepower"] = [re.sub(r'(\d+)[^\d]*', r'\1', value) for value in df["Horsepower"]]

In [55]:
df["Horsepower"].isna().sum()

0

In [56]:
df["Horsepower"]

0      100
1      100
2       15
3       95
4      175
      ... 
357    163
358    171
359    171
360    608
361    557
Name: Horsepower, Length: 362, dtype: object

In [57]:
df["Horsepower"].unique()

array(['100', '15', '95', '175', '11', '27', '39', '85', '116', '47',
       '34', '136', '199', '165', '117', '41', '45', '110', '96', '105',
       '80', '65', '60', '86', '115', '3399', '6118', '7070', '6254',
       '205', '190', '227', '108', '112', '276', '221', '1596', '73',
       '111', '168', '885', '1207', '925', '1502', '94', '2658500',
       '318500', '2965500', '1057500', '8757500', '439000', '758500',
       '1085600', '1226000', '1208250', '946000', '3911000', '4510000',
       '678000', '1259500', '19711000', '606500', '25', '30', '43', '10',
       '29', '125010000', '2589000', '389000', '479000', '1019', '129',
       '12', '185', '225', '1818500', '1078500', '857500', '75', '147',
       '208', '140', '177', '113', '238', '198', '272', '2026100',
       '2456500', '4767250', '477250', '2659700', '2659000', '1488000',
       '877000', '32', '33', '9', '17', '167', '89', '97', '79', '121',
       '107', '4', '200', '42', '26', '18', '40', '35', '38', '24', '20',
    

In [58]:
df["Horsepower"] = df["Horsepower"].astype(int)

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    object 
 7   Drivetrain           362 non-null    object 
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    object 
 12  Body Type            362 non-null    object 
 13  Engine Type          362 non-null    object 
 14  Number_of_Cylinders  361 non-null    object 
dtypes: float64(1), int32(1), int64(5), objec

### 3.6 Torque

In [60]:
df["Torque"]

0          67 Nm
1          67 Nm
2          12 Nm
3          90 Nm
4         121 Nm
         ...    
357    195 lb-ft
358    222 lb-ft
359    222 lb-ft
360        62 Nm
361        58 Nm
Name: Torque, Length: 362, dtype: object

In [61]:
df["Torque"][359]

'222 lb-ft'

In [62]:
df["Torque"].isna().sum()

0

In [63]:
import re
df["Torque"] =  [re.sub(r'(-\d+)[^\d]*', r'\1', value) for value in df["Torque"]]

In [64]:
df["Torque"].value_counts()

Torque
10.5 Nm               10
18 Nm                  9
10 Nm                  8
11 Nm                  6
12 Nm                  6
                      ..
32 Nm@4250 rpm         1
52.3 Nm @ 5650 rpm     1
14 Nm @ 6000 rpm       1
163 lb-ft              1
58 Nm                  1
Name: count, Length: 219, dtype: int64

In [65]:
df["Torque"]

0          67 Nm
1          67 Nm
2          12 Nm
3          90 Nm
4         121 Nm
         ...    
357    195 lb-ft
358    222 lb-ft
359    222 lb-ft
360        62 Nm
361        58 Nm
Name: Torque, Length: 362, dtype: object

In [66]:
df["Torque"] = df["Torque"].replace({"_",""})

In [67]:
df["Torque"].unique(
)

array(['67 Nm', '12 Nm', '90 Nm', '121 Nm', '10.8 Nm', '21.2 Nm',
       '26.5 Nm', '54.6 Nm', '86.3 Nm', '46 Nm', '28 Nm', '92 Nm',
       '112 Nm', '113 Nm', '11.9 Nm', '38 Nm', '37 Nm', '97 Nm', '98 Nm',
       '103 Nm', '66 Nm', '95 Nm', '54 Nm', '40 Nm', '77 Nm', '105 Nm',
       '21.5 Nm', '45 Nm', '55 Nm', '49 Nm', '135 lb-ft', '125 lb-ft',
       '143 lb-ft', '10.2 Nm', '22.4 Nm', '124 lb-ft', '93 lb-ft',
       '47 lb-ft', '69 lb-ft', '65 ft-lbs', '94 ft-lbs', '118 ft-lbs',
       '112 ft-lbs', '19.4 Nm @ 7,000 rpm', '27.3 Nm @ 6,500 rpm',
       '30.0 Nm @ 3,000 rpm', '10.3 Nm @ 5,500 rpm', '37 Nm @ 7,000 rpm',
       '72 Nm @ 6,750 rpm', '114 Nm @ 3,200 rpm', '178 Nm @ 3,800 rpm',
       '110 Nm @ 6,000 rpm', '95 Nm @ 3,750 rpm', '27 Nm @ 10000 rpm',
       '39 Nm @ 8000 rpm', '66 Nm @ 6500 rpm', '98.6 Nm @ 7700 rpm',
       '140 Nm @ 8500 rpm', '62 Nm @ 6500 rpm', '19 Nm', '24 Nm', '88 Nm',
       '9.5 Nm', '27 Nm', '10.2 Nm @ 8000 RPM', '23.5 Nm @ 7000 RPM',
       '38 Nm 

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    object 
 7   Drivetrain           362 non-null    object 
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    object 
 12  Body Type            362 non-null    object 
 13  Engine Type          362 non-null    object 
 14  Number_of_Cylinders  361 non-null    object 
dtypes: float64(1), int32(1), int64(5), objec

In [69]:
# import re

# # def extract_integer_value(string):
# #     # Use regular expressions to find the first integer in the string
# match = re.search(r'\d+', df["Torque"])
    
# if match:
#     return int(match.group())
# else:
#     return None

# # Example usage:
# input_string = "32 Nm @4250"
# integer_value = extract_integer_value(input_string)
# print(integer_value)


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    object 
 7   Drivetrain           362 non-null    object 
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    object 
 12  Body Type            362 non-null    object 
 13  Engine Type          362 non-null    object 
 14  Number_of_Cylinders  361 non-null    object 
dtypes: float64(1), int32(1), int64(5), objec

In [71]:
# df["Torque"] = re.findall(r'\b\d+\b', df["Torque"])

In [72]:

# # Extract numeric values using regular expressions
# df['Torque'] = df['Torque'].apply(lambda x: re.findall(r'\b\d+\b', x))


In [73]:
# df['Torque'] = df['Torque'].apply(lambda x: re.sub(r'[^0-9]', '', x))

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    object 
 7   Drivetrain           362 non-null    object 
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    object 
 12  Body Type            362 non-null    object 
 13  Engine Type          362 non-null    object 
 14  Number_of_Cylinders  361 non-null    object 
dtypes: float64(1), int32(1), int64(5), objec

### 3.7 Transmission_Type

In [75]:
df["Transmission_Type"]

0      6-speed quickshifter
1      6-speed quickshifter
2            6-speed manual
3            6-speed manual
4            6-speed manual
               ...         
357               Automatic
358               Automatic
359               Automatic
360          6-speed manual
361          6-speed manual
Name: Transmission_Type, Length: 362, dtype: object

In [76]:
df["Transmission_Type"].value_counts()

Transmission_Type
6-speed manual                  114
Manual                           50
CVT                              42
Automatic                        36
5-speed manual                   27
6-speed Manual                    8
Six-speed                         8
5-speed                           7
Five-speed constant mesh          7
4-speed constant mesh             6
6-speed sequential                6
Six-speed sequential              6
Six-speed constant mesh           5
5-speed constant mesh             4
6-Speed                           4
6-speed automatic                 4
4-speed manual                    3
CVT Automatic                     3
4-speed                           3
6 Speed Manual                    3
BLDC hub motor                    2
6-speed sequential manual         2
6-speed quickshifter              2
6-speed constant mesh             2
Direct drive                      1
Single Speed                      1
Bafang BBS02 mid-drive motor      1
BAFANG M60

In [77]:
Transmission_Type_data  = {'6-speed manual':1,
 'Manual': 40,
 'CVT': 13,
 'Automatic': 12,
 '5-speed manual': 14,
 '6-speed Manual': 15,
 'Six-speed': 16,
 '5-speed': 17,
 'Five-speed constant mesh': 18,
 '4-speed constant mesh': 6,
 '6-speed sequential': 19,
 'Six-speed sequential': 20,
 'Six-speed constant mesh': 21,
 '5-speed constant mesh':22,
 '6-Speed': 23,
 '6-speed automatic': 24,
 '4-speed manual': 25,
 'CVT Automatic': 26,
 '4-speed': 39,
 '6 Speed Manual': 27,
 'BLDC hub motor': 2,
 '6-speed sequential manual': 28,
 '6-speed quickshifter': 29,
 '6-speed constant mesh': 30,
 'Direct drive': 31,
 'Single Speed': 38,
 'Bafang BBS02 mid-drive motor': 31,
 'BAFANG M600 mid-drive motor': 33,
 'CVT automatic': 35,
 '6-speed': 34,
 'Pedal-assist and throttle': 36,
 'Constant Mesh, 6-speed': 37}
Transmission_Type_data

{'6-speed manual': 1,
 'Manual': 40,
 'CVT': 13,
 'Automatic': 12,
 '5-speed manual': 14,
 '6-speed Manual': 15,
 'Six-speed': 16,
 '5-speed': 17,
 'Five-speed constant mesh': 18,
 '4-speed constant mesh': 6,
 '6-speed sequential': 19,
 'Six-speed sequential': 20,
 'Six-speed constant mesh': 21,
 '5-speed constant mesh': 22,
 '6-Speed': 23,
 '6-speed automatic': 24,
 '4-speed manual': 25,
 'CVT Automatic': 26,
 '4-speed': 39,
 '6 Speed Manual': 27,
 'BLDC hub motor': 2,
 '6-speed sequential manual': 28,
 '6-speed quickshifter': 29,
 '6-speed constant mesh': 30,
 'Direct drive': 31,
 'Single Speed': 38,
 'Bafang BBS02 mid-drive motor': 31,
 'BAFANG M600 mid-drive motor': 33,
 'CVT automatic': 35,
 '6-speed': 34,
 'Pedal-assist and throttle': 36,
 'Constant Mesh, 6-speed': 37}

In [78]:
df["Transmission_Type"].replace({'6-speed manual':1,
 'Manual': 40,
 'CVT': 13,
 'Automatic': 12,
 '5-speed manual': 14,
 '6-speed Manual': 15,
 'Six-speed': 16,
 '5-speed': 17,
 'Five-speed constant mesh': 18,
 '4-speed constant mesh': 6,
 '6-speed sequential': 19,
 'Six-speed sequential': 20,
 'Six-speed constant mesh': 21,
 '5-speed constant mesh':22,
 '6-Speed': 23,
 '6-speed automatic': 24,
 '4-speed manual': 25,
 'CVT Automatic': 26,
 '4-speed': 39,
 '6 Speed Manual': 27,
 'BLDC hub motor': 2,
 '6-speed sequential manual': 28,
 '6-speed quickshifter': 29,
 '6-speed constant mesh': 30,
 'Direct drive': 31,
 'Single Speed': 38,
 'Bafang BBS02 mid-drive motor': 31,
 'BAFANG M600 mid-drive motor': 33,
 'CVT automatic': 35,
 '6-speed': 34,
 'Pedal-assist and throttle': 36,
 'Constant Mesh, 6-speed': 37},inplace  = True)

In [79]:
df["Transmission_Type"].astype(int)

0      29
1      29
2       1
3       1
4       1
       ..
357    12
358    12
359    12
360     1
361     1
Name: Transmission_Type, Length: 362, dtype: int32

In [80]:
Transmission_Type_data.keys()

dict_keys(['6-speed manual', 'Manual', 'CVT', 'Automatic', '5-speed manual', '6-speed Manual', 'Six-speed', '5-speed', 'Five-speed constant mesh', '4-speed constant mesh', '6-speed sequential', 'Six-speed sequential', 'Six-speed constant mesh', '5-speed constant mesh', '6-Speed', '6-speed automatic', '4-speed manual', 'CVT Automatic', '4-speed', '6 Speed Manual', 'BLDC hub motor', '6-speed sequential manual', '6-speed quickshifter', '6-speed constant mesh', 'Direct drive', 'Single Speed', 'Bafang BBS02 mid-drive motor', 'BAFANG M600 mid-drive motor', 'CVT automatic', '6-speed', 'Pedal-assist and throttle', 'Constant Mesh, 6-speed'])

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    int64  
 7   Drivetrain           362 non-null    object 
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    object 
 12  Body Type            362 non-null    object 
 13  Engine Type          362 non-null    object 
 14  Number_of_Cylinders  361 non-null    object 
dtypes: float64(1), int32(1), int64(6), objec

### 3.8 Drivetrain

In [82]:
df["Drivetrain"].value_counts().to_dict()

{'Chain': 110,
 'Chain drive': 93,
 'Automatic': 35,
 'Chain Drive': 22,
 'Belt drive': 20,
 'Electric': 16,
 'Shaft drive': 16,
 'CVT': 15,
 'Shaft': 11,
 'Belt': 4,
 'Shimano drivetrain': 4,
 'Automatic CVT': 3,
 'Front wheel drive': 3,
 'Belt Drive': 2,
 'Parallel twin': 2,
 'Permanent magnet synchronous motor': 1,
 'Front Suspension': 1,
 'Single-gear': 1,
 'Dual-motor, all-wheel drive': 1,
 'V-twin, liquid-cooled, 4-stroke': 1,
 'Rear wheel': 1}

In [83]:
df["Drivetrain"].replace({'Chain': 1,
 'Chain drive': 2,
 'Automatic': 3,
 'Chain Drive': 4,
 'Belt drive': 5,
 'Electric': 6,
 'Shaft drive': 7,
 'CVT': 8,
 'Shaft': 9,
 'Belt': 10,
 'Shimano drivetrain': 10,
 'Automatic CVT': 11,
 'Front wheel drive': 12,
 'Belt Drive': 12,
 'Parallel twin': 12,
 'Permanent magnet synchronous motor': 13,
 'Front Suspension': 13,
 'Single-gear': 13,
 'Dual-motor, all-wheel drive': 13,
 'V-twin, liquid-cooled, 4-stroke': 13,
 'Rear wheel': 13},inplace = True)

In [84]:
Drivetrain_data = {'Chain': 1,
 'Chain drive': 2,
 'Automatic': 3,
 'Chain Drive': 4,
 'Belt drive': 5,
 'Electric': 6,
 'Shaft drive': 7,
 'CVT': 8,
 'Shaft': 9,
 'Belt': 10,
 'Shimano drivetrain': 10,
 'Automatic CVT': 11,
 'Front wheel drive': 12,
 'Belt Drive': 12,
 'Parallel twin': 12,
 'Permanent magnet synchronous motor': 13,
 'Front Suspension': 13,
 'Single-gear': 13,
 'Dual-motor, all-wheel drive': 13,
 'V-twin, liquid-cooled, 4-stroke': 13,
 'Rear wheel': 13}

In [85]:
Drivetrain_data.keys()

dict_keys(['Chain', 'Chain drive', 'Automatic', 'Chain Drive', 'Belt drive', 'Electric', 'Shaft drive', 'CVT', 'Shaft', 'Belt', 'Shimano drivetrain', 'Automatic CVT', 'Front wheel drive', 'Belt Drive', 'Parallel twin', 'Permanent magnet synchronous motor', 'Front Suspension', 'Single-gear', 'Dual-motor, all-wheel drive', 'V-twin, liquid-cooled, 4-stroke', 'Rear wheel'])

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    int64  
 7   Drivetrain           362 non-null    int64  
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    object 
 12  Body Type            362 non-null    object 
 13  Engine Type          362 non-null    object 
 14  Number_of_Cylinders  361 non-null    object 
dtypes: float64(1), int32(1), int64(7), objec

### 3.9 Number_of_Seating

In [87]:
df["Number_of_Seating"]

0      2
1      2
2      2
3      2
4      2
      ..
357    2
358    1
359    2
360    2
361    2
Name: Number_of_Seating, Length: 362, dtype: int64

In [88]:
df["Number_of_Seating"].value_counts()

Number_of_Seating
2    222
1    139
3      1
Name: count, dtype: int64

In [89]:
df["Number_of_Seating"].isna().sum()

0

### 3.10  Price

In [90]:
df["Price (in INR)"]

0        INR 10,99,000
1        INR 11,99,000
2         INR 4,49,000
3        INR 13,99,000
4        INR 19,99,000
            ...       
357       INR 25 lakhs
358    INR 23.75 lakhs
359    INR 25.25 lakhs
360           7,20,000
361           6,89,000
Name: Price (in INR), Length: 362, dtype: object

In [91]:
import re
cleaned_data = []

for value in df["Price (in INR)"]:
    match = re.search(r'(\d+(?:,\d+)*)', value)
    if match:
        number = match.group(1).replace(',', '')
        if 'lakh' in value:
            number = int(float(number) * 100000)
        elif 'thousand' in value:
            number = int(float(number) * 1000)
        else:
            number = int(number)
        cleaned_data.append(number)
    else:
        cleaned_data.append(None)
x = pd.DataFrame(cleaned_data)
x[0].value_counts().to_dict()


{100000: 27,
 1: 14,
 200000: 9,
 300000: 7,
 150000: 6,
 1300000: 6,
 1200000: 6,
 400000: 6,
 2000000: 5,
 800000: 5,
 2: 5,
 600000: 5,
 3: 4,
 250000: 4,
 110000: 4,
 1400000: 4,
 500000: 4,
 1100000: 4,
 1500000: 4,
 325000: 4,
 6: 4,
 1700000: 3,
 1450000: 3,
 1499000: 3,
 105000: 3,
 80000: 3,
 140000: 3,
 350000: 3,
 129999: 3,
 165000: 3,
 115000: 3,
 1000000: 3,
 2500000: 3,
 175000: 3,
 2200000: 3,
 145000: 3,
 750000: 2,
 90000: 2,
 5: 2,
 1800000: 2,
 45000: 2,
 160000: 2,
 70000: 2,
 88000: 2,
 35000: 2,
 40000: 2,
 119999: 2,
 149999: 2,
 2100000: 2,
 3500000: 2,
 650000: 2,
 550000: 2,
 62000: 2,
 1250000: 2,
 50000: 2,
 7: 2,
 65000: 2,
 219000: 2,
 135000: 2,
 9: 2,
 21: 2,
 1050000: 2,
 225000: 2,
 2449000: 2,
 850000: 2,
 55000: 2,
 72750: 1,
 130000: 1,
 240000: 1,
 78750: 1,
 167000: 1,
 62275: 1,
 14250: 1,
 69900: 1,
 329984: 1,
 1399990: 1,
 265053: 1,
 999990: 1,
 1169990: 1,
 1149990: 1,
 215000: 1,
 319983: 1,
 324992: 1,
 389000: 1,
 120000: 1,
 1379990: 1,

In [92]:
x[0].replace({1: 100000,2:200000,3:300000,6:600000,5:500000,7:700000,9:90000,21:210000,4:400000,8:80000,23:230000,12:120000,25:250000,27:270000,16:160000},inplace = True)

In [93]:
df["Price"] = x[0].to_list()
df

Unnamed: 0,Company,Country_of_Origin,Model,Number_of_cc,Horsepower,Torque,Transmission_Type,Drivetrain,Number_of_Seating,Price (in INR),Year,Looks,Body Type,Engine Type,Number_of_Cylinders,Price
0,33,1,3,659.0,100,67 Nm,29,1,2,"INR 10,99,000",2021,Sport,Naked,Parallel-twin,2,1099000
1,33,1,1,659.0,100,67 Nm,29,1,2,"INR 11,99,000",2021,Sport,Naked,Parallel-twin,2,1199000
2,33,1,1,124.9,15,12 Nm,1,1,2,"INR 4,49,000",2022,Sport,Racing,Single-cylinder,1,449000
3,33,1,1,896.0,95,90 Nm,1,9,2,"INR 13,99,000",2022,Adventure,Naked,V-twin,2,1399000
4,33,1,1,1077.0,175,121 Nm,1,9,2,"INR 19,99,000",2022,Adventure,Naked,V-twin,2,1999000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,42,1,1,110.0,163,195 lb-ft,12,6,2,INR 25 lakhs,2019,Retro,Cruiser,Electric,1,2500000
358,42,1,1,126.0,171,222 lb-ft,12,6,1,INR 23.75 lakhs,2022,Futuristic,Naked,Electric,1,2300000
359,42,1,1,126.0,171,222 lb-ft,12,6,2,INR 25.25 lakhs,2022,Modern,Roadster,Electric,1,2500000
360,45,1,2,649.0,608,62 Nm,1,12,2,720000,2023,Modern,Adventure touring,"Liquid-cooled, DOHC",2,720000


In [94]:
x[0].astype(float)

0      1099000.0
1      1199000.0
2       449000.0
3      1399000.0
4      1999000.0
         ...    
357    2500000.0
358    2300000.0
359    2500000.0
360     720000.0
361     689000.0
Name: 0, Length: 362, dtype: float64

#### Dropping Price (in INR) as new Price columns is imported ``

In [95]:
# df.drop(columns = "Price (in INR)" ,inplace = True)

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    int64  
 7   Drivetrain           362 non-null    int64  
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    object 
 12  Body Type            362 non-null    object 
 13  Engine Type          362 non-null    object 
 14  Number_of_Cylinders  361 non-null    object 
 15  Price                362 non-null    int

In [97]:
# def clean_numerical_value(value):
#     # Remove commas and extra spaces
#     value = re.sub(r'[^\d.]', '', value)

#     # Convert lakh notation to million
#     if 'lakh' in value.lower():
#         value = re.sub(r'[^\d.]', '', value)
#         value = float(value) * 10 ** 5

#     # Convert lakh/crore notation to million/billion
#     if 'lakh' in value.lower() or 'crore' in value.lower():
#         value = re.sub(r'[^\d.]', '', value)
#         value = float(value) * 10 ** 5

#     # Convert currency notation to numerical value
#     if 'rs.' in value.lower() or '₹' in value:
#         value = re.sub(r'[^\d.]', '', value)
    
#     return float(value)
# for value in data:
#     cleaned_value = re.sub(r'\.(?=\d)', '', value)
#     cleaned_data.append(cleaned_value)

# # Clean and convert each value in the dataset
# df["Price (in INR)"] = [clean_numerical_value(value) for value in df["Price (in INR)"]]
# df["Price (in INR)"]

In [98]:
# def clean_numerical_value(value):
#     # Remove commas and extra spaces
#     value = re.sub(r'[^\d.]', '', value)

#     # Convert lakh notation to million
#     if 'lakh' in value.lower():
#         value = re.sub(r'[^\d.]', '', value)
#         value = float(value) * 10 ** 5

#     # Convert lakh/crore notation to million/billion
#     if 'lakh' in value.lower() or 'crore' in value.lower():
#         value = re.sub(r'[^\d.]', '', value)
#         value = float(value) * 10 ** 5

#     # Convert currency notation to numerical value
#     if 'rs.' in value.lower() or '₹' in value:
#         value = re.sub(r'[^\d.]', '', value)
    
#     return float(value)
# for value in data:
#     cleaned_value = re.sub(r'\.(?=\d)', '', value)
#     cleaned_data.append(cleaned_value)

# # Clean and convert each value in the dataset
# df["Price (in INR)"] = [clean_numerical_value(value) for value in df["Price (in INR)"]]





In [99]:
# df['Price (in INR)']=df['Price (in INR)'].apply(lambda x:x.replace('lakh','00000'))



# df['Price (in INR)']=df['Price (in INR)'].apply(lambda x:x.replace('Lakh','00000'))


In [100]:
# df['Price (in INR)']

In [101]:
# l1=['Rs.','s','INR ','$','₹','(expected)','(ex-showroom)','(ex-howroom)',',']

# for i in l1:    
#     df['Price (in INR)']=df['Price (in INR)'].apply(lambda x:x.replace(i,''))
# df['Price (in INR)'].unique()
# df['Price (in INR)']

In [102]:
# df['Price (in INR)']=df['Price (in INR)'].apply(lambda x:re.sub('[ ]*','',x))


In [103]:
# x1 = df['Price (in INR)'].apply(lambda x:re.findall('[0-9]+[.][0-9]+',x))
# x1

In [104]:
# df['Price (in INR)'].astype(float)

In [105]:
# [float(sublist[0]) * 100000 for sublist in df['Price (in INR)'] if sublist]

In [106]:
# result = []
# for item in df['Price (in INR)'].items():
#     if '-' in item:
#         values = item.split('-')
#         values = [float(val.replace(' ', '')) for val in values]
#         average = sum(values) / len(values)
#         result.append(str(int(average)))
#     else:
#         result.append(item)
# df['Price (in INR)']=df['Price (in INR)'].apply(lambda x:re.sub('[-][0-9]+','',x))
# df['Price (in INR)'].value_counts().head(20)

In [107]:
# df['Price (in INR)'].unique()

In [108]:
# df['Price (in INR)']=df['Price (in INR)'].apply(lambda x:re.sub('[-][0-9]+','',x))
# df['Price (in INR)']

In [109]:
# df["Price (in INR)"] = df["Price (in INR)"].str.replace('lakhs', '').str.replace(',', '').str.replace('Lakhs', '').str.replace('INR', '').str.replace('s', '')

In [110]:
# df["Price (in INR)"].unique()

### 3.11 Year                 

In [111]:
df["Year"]

0      2021
1      2021
2      2022
3      2022
4      2022
       ... 
357    2019
358    2022
359    2022
360    2023
361    2023
Name: Year, Length: 362, dtype: int64

In [112]:
df["Year"].value_counts()


Year
2023    216
2022     90
2020     14
2021      7
2017      7
2019      7
2018      5
2016      3
2005      3
2003      2
2014      2
1994      1
2002      1
2000      1
2015      1
2006      1
1998      1
Name: count, dtype: int64

In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    int64  
 7   Drivetrain           362 non-null    int64  
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    object 
 12  Body Type            362 non-null    object 
 13  Engine Type          362 non-null    object 
 14  Number_of_Cylinders  361 non-null    object 
 15  Price                362 non-null    int

In [114]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    int64  
 7   Drivetrain           362 non-null    int64  
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    object 
 12  Body Type            362 non-null    object 
 13  Engine Type          362 non-null    object 
 14  Number_of_Cylinders  361 non-null    object 
 15  Price                362 non-null    int

### 3.12 Looks

In [115]:
df["Looks"].value_counts().to_dict()

{'Modern': 75,
 'Classy': 48,
 'Adventure': 42,
 'Sporty': 30,
 'Retro': 30,
 'Classic': 29,
 'Retro-inspired': 10,
 'Modern, sporty': 10,
 'Futuristic': 7,
 'Sport': 7,
 'Modern, stylish': 4,
 'Classic, vintage': 4,
 'Cruiser': 4,
 'Aggressive': 4,
 'Commuter': 3,
 'Sharp': 3,
 'Simple, reliable': 3,
 'Motocross': 3,
 'Sporty, aggressive': 3,
 'Classic, stylish': 3,
 'Practical': 3,
 'Sleek': 3,
 'Enduro': 2,
 'Naked': 2,
 'Modern, aggressive': 2,
 'Retro-modern': 2,
 'Rugged': 2,
 'Race-inspired': 2,
 'Touring': 2,
 'Adventure, sporty': 1,
 'Modern, muscular': 1,
 'Urban': 1,
 'Stylish': 1,
 'Dirt': 1,
 'Modern, bobber': 1,
 'Retro, classic': 1,
 'Bold': 1,
 'Classy, cruiser': 1,
 'Budget': 1,
 'Modern, rugged': 1,
 'Fun': 1,
 'Modern, off-road': 1,
 'Bold, aggressive': 1,
 'Bobber': 1,
 'Adventure, touring': 1,
 'Classic, rugged': 1,
 'Sporty, beginner-friendly': 1,
 'Classic, bobber': 1,
 'Classy, aggressive': 1}

In [116]:
df["Looks"].replace({'Modern': 1,
 'Classy': 2,
 'Adventure': 3,
 'Sporty': 4,
 'Retro': 4,
 'Classic': 4,
 'Retro-inspired': 5,
 'Modern, sporty': 5,
 'Futuristic': 6,
 'Sport': 6,
 'Modern, stylish': 7,
 'Classic, vintage': 7,
 'Cruiser': 7,
 'Aggressive': 7,
 'Commuter': 7,
 'Sharp': 8,
 'Simple, reliable': 8,
 'Motocross': 8,
 'Sporty, aggressive': 8,
 'Classic, stylish': 8,
 'Practical': 8,
 'Sleek': 8,
 'Enduro': 9,
 'Naked': 9,
 'Modern, aggressive': 9,
 'Retro-modern':9,
 'Rugged': 9,
 'Race-inspired': 9,
 'Touring': 9,
 'Adventure, sporty': 10,
 'Modern, muscular': 10,
 'Urban': 10,
 'Stylish': 10,
 'Dirt': 10,
 'Modern, bobber': 10,
 'Retro, classic': 10,
 'Bold': 10,
 'Classy, cruiser': 10,
 'Budget': 10,
 'Modern, rugged': 10,
 'Fun': 10,
 'Modern, off-road': 10,
 'Bold, aggressive': 10,
 'Bobber': 10,
 'Adventure, touring': 10,
 'Classic, rugged': 10,
 'Sporty, beginner-friendly': 10,
 'Classic, bobber': 10,
 'Classy, aggressive': 10},inplace = True)

In [117]:
Looks_data = {'Modern': 1,
 'Classy': 2,
 'Adventure': 3,
 'Sporty': 4,
 'Retro': 4,
 'Classic': 4,
 'Retro-inspired': 5,
 'Modern, sporty': 5,
 'Futuristic': 6,
 'Sport': 6,
 'Modern, stylish': 7,
 'Classic, vintage': 7,
 'Cruiser': 7,
 'Aggressive': 7,
 'Commuter': 7,
 'Sharp': 8,
 'Simple, reliable': 8,
 'Motocross': 8,
 'Sporty, aggressive': 8,
 'Classic, stylish': 8,
 'Practical': 8,
 'Sleek': 8,
 'Enduro': 9,
 'Naked': 9,
 'Modern, aggressive': 9,
 'Retro-modern':9,
 'Rugged': 9,
 'Race-inspired': 9,
 'Touring': 9,
 'Adventure, sporty': 10,
 'Modern, muscular': 10,
 'Urban': 10,
 'Stylish': 10,
 'Dirt': 10,
 'Modern, bobber': 10,
 'Retro, classic': 10,
 'Bold': 10,
 'Classy, cruiser': 10,
 'Budget': 10,
 'Modern, rugged': 10,
 'Fun': 10,
 'Modern, off-road': 10,
 'Bold, aggressive': 10,
 'Bobber': 10,
 'Adventure, touring': 10,
 'Classic, rugged': 10,
 'Sporty, beginner-friendly': 10,
 'Classic, bobber': 10,
 'Classy, aggressive': 10}

In [118]:
Looks_data.keys()

dict_keys(['Modern', 'Classy', 'Adventure', 'Sporty', 'Retro', 'Classic', 'Retro-inspired', 'Modern, sporty', 'Futuristic', 'Sport', 'Modern, stylish', 'Classic, vintage', 'Cruiser', 'Aggressive', 'Commuter', 'Sharp', 'Simple, reliable', 'Motocross', 'Sporty, aggressive', 'Classic, stylish', 'Practical', 'Sleek', 'Enduro', 'Naked', 'Modern, aggressive', 'Retro-modern', 'Rugged', 'Race-inspired', 'Touring', 'Adventure, sporty', 'Modern, muscular', 'Urban', 'Stylish', 'Dirt', 'Modern, bobber', 'Retro, classic', 'Bold', 'Classy, cruiser', 'Budget', 'Modern, rugged', 'Fun', 'Modern, off-road', 'Bold, aggressive', 'Bobber', 'Adventure, touring', 'Classic, rugged', 'Sporty, beginner-friendly', 'Classic, bobber', 'Classy, aggressive'])

In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    int64  
 7   Drivetrain           362 non-null    int64  
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    int64  
 12  Body Type            362 non-null    object 
 13  Engine Type          362 non-null    object 
 14  Number_of_Cylinders  361 non-null    object 
 15  Price                362 non-null    int

In [120]:
df = df.rename(columns={'Body Type': 'Body_Type',"Engine Type":"Engine_Type"})

### 3.13 Body Type

In [121]:
df["Body_Type"].nunique()

56

In [122]:
df["Body_Type"].value_counts().to_dict()

{'Cruiser': 59,
 'Naked': 46,
 'Scooter': 36,
 'Enduro': 23,
 'Standard': 23,
 'Adventure': 19,
 'Sport': 15,
 'Cafe racer': 13,
 'Scrambler': 10,
 'Street': 9,
 'Roadster': 9,
 'Adventure touring': 8,
 'Sportbike': 6,
 'Street Bike': 6,
 'Sports': 6,
 'Bobber': 5,
 'commuter': 5,
 'Cafe Racer': 5,
 'Supersport': 3,
 'Dual-sport': 3,
 'Streetfighter': 3,
 'Supermoto': 3,
 'Superbike': 3,
 'Retro': 2,
 'Trail Bike': 2,
 'Electric scooter': 2,
 'Naked streetfighter': 2,
 'Motocross': 2,
 'Dirt bike': 2,
 'Maxi-Scooter': 2,
 'Tourer': 2,
 'Naked bike': 2,
 'Adventure bike': 2,
 'Sports bike': 2,
 'Sports Naked': 1,
 'Street scrambler': 1,
 'Adventure tourer': 1,
 'Sport-touring': 1,
 'Bicycle': 1,
 'Classic': 1,
 'Cargo': 1,
 'Off-road': 1,
 'Fighter': 1,
 'Commuter': 1,
 'Naked Bike': 1,
 'Touring': 1,
 'Racing': 1,
 'Sport touring': 1,
 'Mountain Bike': 1,
 'Café racer': 1,
 'Hybrid bike': 1,
 'Mountain bike': 1,
 'Street naked': 1,
 'Trail': 1,
 'Street bike': 1,
 'Tracker': 1}

In [123]:
df["Body_Type"].replace({'Cruiser': 1,
 'Naked': 1,
 'Scooter': 1,
 'Enduro': 1,
 'Standard': 1,
 'Adventure': 2,
 'Sport': 2,
 'Cafe racer': 2,
 'Scrambler': 2,
 'Street': 3,
 'Roadster': 4,
 'Adventure touring': 5,
 'Sportbike': 6,
 'Street Bike': 6,
 'Sports': 6,
 'Bobber': 7,
 'commuter': 7,
 'Cafe Racer': 7,
 'Supersport': 7,
 'Dual-sport': 7,
 'Streetfighter': 7,
 'Supermoto': 7,
 'Superbike': 7,
 'Retro': 8,
 'Trail Bike': 8,
 'Electric scooter': 8,
 'Naked streetfighter': 8,
 'Motocross': 8,
 'Dirt bike': 8,
 'Maxi-Scooter': 8,
 'Tourer': 8,
 'Naked bike': 8,
 'Adventure bike': 8,
 'Sports bike': 8,
 'Sports Naked': 9,
 'Street scrambler': 9,
 'Adventure tourer': 9,
 'Sport-touring': 9,
 'Bicycle': 9,
 'Classic': 9,
 'Cargo': 9,
 'Off-road': 9,
 'Fighter': 9,
 'Commuter': 9,
 'Naked Bike': 9,
 'Touring': 9,
 'Racing': 9,
 'Sport touring': 9,
 'Mountain Bike': 9,
 'Café racer': 9,
 'Hybrid bike': 9,
 'Mountain bike': 9,
 'Street naked': 9,
 'Trail': 9,
 'Street bike': 9,
 'Tracker': 9},inplace = True)

In [124]:
Body_Type_data = {'Cruiser': 1,
 'Naked': 1,
 'Scooter': 1,
 'Enduro': 1,
 'Standard': 1,
 'Adventure': 2,
 'Sport': 2,
 'Cafe racer': 2,
 'Scrambler': 2,
 'Street': 3,
 'Roadster': 4,
 'Adventure touring': 5,
 'Sportbike': 6,
 'Street Bike': 6,
 'Sports': 6,
 'Bobber': 7,
 'commuter': 7,
 'Cafe Racer': 7,
 'Supersport': 7,
 'Dual-sport': 7,
 'Streetfighter': 7,
 'Supermoto': 7,
 'Superbike': 7,
 'Retro': 8,
 'Trail Bike': 8,
 'Electric scooter': 8,
 'Naked streetfighter': 8,
 'Motocross': 8,
 'Dirt bike': 8,
 'Maxi-Scooter': 8,
 'Tourer': 8,
 'Naked bike': 8,
 'Adventure bike': 8,
 'Sports bike': 8,
 'Sports Naked': 9,
 'Street scrambler': 9,
 'Adventure tourer': 9,
 'Sport-touring': 9,
 'Bicycle': 9,
 'Classic': 9,
 'Cargo': 9,
 'Off-road': 9,
 'Fighter': 9,
 'Commuter': 9,
 'Naked Bike': 9,
 'Touring': 9,
 'Racing': 9,
 'Sport touring': 9,
 'Mountain Bike': 9,
 'Café racer': 9,
 'Hybrid bike': 9,
 'Mountain bike': 9,
 'Street naked': 9,
 'Trail': 9,
 'Street bike': 9,
 'Tracker': 9}

In [125]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    int64  
 7   Drivetrain           362 non-null    int64  
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    int64  
 12  Body_Type            362 non-null    int64  
 13  Engine_Type          362 non-null    object 
 14  Number_of_Cylinders  361 non-null    object 
 15  Price                362 non-null    int

In [161]:
Body_Type_data.keys() 

dict_keys(['Cruiser', 'Naked', 'Scooter', 'Enduro', 'Standard', 'Adventure', 'Sport', 'Cafe racer', 'Scrambler', 'Street', 'Roadster', 'Adventure touring', 'Sportbike', 'Street Bike', 'Sports', 'Bobber', 'commuter', 'Cafe Racer', 'Supersport', 'Dual-sport', 'Streetfighter', 'Supermoto', 'Superbike', 'Retro', 'Trail Bike', 'Electric scooter', 'Naked streetfighter', 'Motocross', 'Dirt bike', 'Maxi-Scooter', 'Tourer', 'Naked bike', 'Adventure bike', 'Sports bike', 'Sports Naked', 'Street scrambler', 'Adventure tourer', 'Sport-touring', 'Bicycle', 'Classic', 'Cargo', 'Off-road', 'Fighter', 'Commuter', 'Naked Bike', 'Touring', 'Racing', 'Sport touring', 'Mountain Bike', 'Café racer', 'Hybrid bike', 'Mountain bike', 'Street naked', 'Trail', 'Street bike', 'Tracker'])

### 3.14 Engine_Type

In [126]:
df["Engine_Type"].nunique()

62

In [127]:
df["Engine_Type"].value_counts().to_dict()

{'Single-cylinder': 50,
 'Electric': 32,
 'V-twin': 31,
 'Single-cylinder, air-cooled': 22,
 'Single-cylinder, liquid-cooled': 20,
 'Single': 18,
 'Parallel-twin': 14,
 'Air-Cooled': 12,
 '3-cylinder': 11,
 'Four-stroke': 8,
 'Air Cooled, Single Cylinder, 4-Stroke': 8,
 'Air-cooled, 4-stroke, single-cylinder, DTS-i': 8,
 '4-stroke': 8,
 'Parallel-twin, liquid-cooled': 8,
 '2-stroke': 7,
 'Two-stroke': 7,
 'V-twin, liquid-cooled': 6,
 '1-cylinder': 6,
 'Liquid-cooled, parallel-twin': 6,
 'Air-cooled, single-cylinder': 6,
 'Single-cylinder, air-cooled, OHC': 4,
 'Air-cooled, parallel-twin': 4,
 'Liquid-Cooled': 3,
 '2-cylinder': 3,
 'Air-cooled, 4-stroke, single-cylinder': 3,
 'L-Twin': 3,
 'Air-Cooled, Single-Cylinder': 3,
 'Liquid-Cooled, Single-Cylinder': 3,
 'Single cylinder': 3,
 'Single-cylinder, air-cooled, 4-stroke': 2,
 'Air-cooled, 4-stroke, single-cylinder, BS6': 2,
 'Boxer': 2,
 'Liquid-cooled, 4-stroke, single-cylinder, DTS-i': 2,
 'Single-cylinder, liquid-cooled, 4-stroke':

In [128]:
df["Engine_Type"].replace({'Single-cylinder': 1,
 'Electric': 2,
 'V-twin': 2,
 'Single-cylinder, air-cooled': 2,
 'Single-cylinder, liquid-cooled': 2,
 'Single': 3,
 'Parallel-twin': 4,
 'Air-Cooled': 5,
 '3-cylinder': 5,
 'Four-stroke': 6,
 'Air Cooled, Single Cylinder, 4-Stroke': 6,
 'Air-cooled, 4-stroke, single-cylinder, DTS-i': 6,
 '4-stroke': 6,
 'Parallel-twin, liquid-cooled':6,
 '2-stroke': 7,
 'Two-stroke': 7,
 'V-twin, liquid-cooled': 8,
 '1-cylinder': 8,
 'Liquid-cooled, parallel-twin': 8,
 'Air-cooled, single-cylinder': 8,
 'Single-cylinder, air-cooled, OHC': 8,
 'Air-cooled, parallel-twin':8,
 'Liquid-Cooled':9,
 '2-cylinder': 9,
 'Air-cooled, 4-stroke, single-cylinder': 9,
 'L-Twin': 9,
 'Air-Cooled, Single-Cylinder': 9,
 'Liquid-Cooled, Single-Cylinder': 9,
 'Single cylinder': 9,
 'Single-cylinder, air-cooled, 4-stroke': 10,
 'Air-cooled, 4-stroke, single-cylinder, BS6': 10,
 'Boxer': 10,
 'Liquid-cooled, 4-stroke, single-cylinder, DTS-i': 10,
 'Single-cylinder, liquid-cooled, 4-stroke': 10,
 'Liquid-cooled inline-4': 10,
 'Liquid-cooled, DOHC':10,
 '4-cylinder': 10,
 'Liquid-cooled, 4-stroke, single-cylinder':10,
 'V4': 10,
 'Liquid-cooled, single-cylinder': 10,
 'Liquid Cooled, Single Cylinder, 4-Stroke': 10,
 'Single cylinder, air-cooled, 4-stroke, 4-valves, SOHC': 10,
 'Single Cylinder, 4-Stroke, Air-cooled': 11,
 'V-twin, air-cooled': 11,
 'V-twin, liquid-cooled, 4-stroke': 11,
 'Liquid-cooled, four-cylinder': 11,
 'Supercharged, four-cylinder': 11,
 'Rotax 990 ACE': 11,
 'Rotax 600 ACE': 11,
 'Rotax ACE': 11,
 'In-line-four': 11,
 'Single cylinder, liquid-cooled, 4-stroke, 4-valves, DOHC': 11,
 'In-line-three': 11,
 'Single-cylinder, liquid-cooled, DOHC':11,
 'Liquid-cooled, V4': 11,
 'Liquid-cooled': 11,
 'Parallel twin': 11,
 '2': 11,
 'Single-cylinder, fuel-injected, air-cooled': 11,
 'Triple': 11,
 'Liquid-cooled, 4-cylinder, DOHC': 11,
 'Single cylinder, liquid-cooled, 4-stroke, 4-valves, SOHC': 11},inplace = True)

In [129]:
Engine_Type_data = {'Single-cylinder': 1,
 'Electric': 2,
 'V-twin': 2,
 'Single-cylinder, air-cooled': 2,
 'Single-cylinder, liquid-cooled': 2,
 'Single': 3,
 'Parallel-twin': 4,
 'Air-Cooled': 5,
 '3-cylinder': 5,
 'Four-stroke': 6,
 'Air Cooled, Single Cylinder, 4-Stroke': 6,
 'Air-cooled, 4-stroke, single-cylinder, DTS-i': 6,
 '4-stroke': 6,
 'Parallel-twin, liquid-cooled':6,
 '2-stroke': 7,
 'Two-stroke': 7,
 'V-twin, liquid-cooled': 8,
 '1-cylinder': 8,
 'Liquid-cooled, parallel-twin': 8,
 'Air-cooled, single-cylinder': 8,
 'Single-cylinder, air-cooled, OHC': 8,
 'Air-cooled, parallel-twin':8,
 'Liquid-Cooled':9,
 '2-cylinder': 9,
 'Air-cooled, 4-stroke, single-cylinder': 9,
 'L-Twin': 9,
 'Air-Cooled, Single-Cylinder': 9,
 'Liquid-Cooled, Single-Cylinder': 9,
 'Single cylinder': 9,
 'Single-cylinder, air-cooled, 4-stroke': 10,
 'Air-cooled, 4-stroke, single-cylinder, BS6': 10,
 'Boxer': 10,
 'Liquid-cooled, 4-stroke, single-cylinder, DTS-i': 10,
 'Single-cylinder, liquid-cooled, 4-stroke': 10,
 'Liquid-cooled inline-4': 10,
 'Liquid-cooled, DOHC':10,
 '4-cylinder': 10,
 'Liquid-cooled, 4-stroke, single-cylinder':10,
 'V4': 10,
 'Liquid-cooled, single-cylinder': 10,
 'Liquid Cooled, Single Cylinder, 4-Stroke': 10,
 'Single cylinder, air-cooled, 4-stroke, 4-valves, SOHC': 10,
 'Single Cylinder, 4-Stroke, Air-cooled': 11,
 'V-twin, air-cooled': 11,
 'V-twin, liquid-cooled, 4-stroke': 11,
 'Liquid-cooled, four-cylinder': 11,
 'Supercharged, four-cylinder': 11,
 'Rotax 990 ACE': 11,
 'Rotax 600 ACE': 11,
 'Rotax ACE': 11,
 'In-line-four': 11,
 'Single cylinder, liquid-cooled, 4-stroke, 4-valves, DOHC': 11,
 'In-line-three': 11,
 'Single-cylinder, liquid-cooled, DOHC':11,
 'Liquid-cooled, V4': 11,
 'Liquid-cooled': 11,
 'Parallel twin': 11,
 '2': 11,
 'Single-cylinder, fuel-injected, air-cooled': 11,
 'Triple': 11,
 'Liquid-cooled, 4-cylinder, DOHC': 11,
 'Single cylinder, liquid-cooled, 4-stroke, 4-valves, SOHC': 11}

In [162]:
Engine_Type_data.keys()

dict_keys(['Single-cylinder', 'Electric', 'V-twin', 'Single-cylinder, air-cooled', 'Single-cylinder, liquid-cooled', 'Single', 'Parallel-twin', 'Air-Cooled', '3-cylinder', 'Four-stroke', 'Air Cooled, Single Cylinder, 4-Stroke', 'Air-cooled, 4-stroke, single-cylinder, DTS-i', '4-stroke', 'Parallel-twin, liquid-cooled', '2-stroke', 'Two-stroke', 'V-twin, liquid-cooled', '1-cylinder', 'Liquid-cooled, parallel-twin', 'Air-cooled, single-cylinder', 'Single-cylinder, air-cooled, OHC', 'Air-cooled, parallel-twin', 'Liquid-Cooled', '2-cylinder', 'Air-cooled, 4-stroke, single-cylinder', 'L-Twin', 'Air-Cooled, Single-Cylinder', 'Liquid-Cooled, Single-Cylinder', 'Single cylinder', 'Single-cylinder, air-cooled, 4-stroke', 'Air-cooled, 4-stroke, single-cylinder, BS6', 'Boxer', 'Liquid-cooled, 4-stroke, single-cylinder, DTS-i', 'Single-cylinder, liquid-cooled, 4-stroke', 'Liquid-cooled inline-4', 'Liquid-cooled, DOHC', '4-cylinder', 'Liquid-cooled, 4-stroke, single-cylinder', 'V4', 'Liquid-cooled, s

In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    int64  
 7   Drivetrain           362 non-null    int64  
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    int64  
 12  Body_Type            362 non-null    int64  
 13  Engine_Type          362 non-null    int64  
 14  Number_of_Cylinders  361 non-null    object 
 15  Price                362 non-null    int

### 3.15 Number_of_Cylinders

In [131]:
df["Number_of_Cylinders"].unique()

array(['2', '1', '3', '4', '90°', '0', 'Single', nan, 'One', 'Two',
       'Three', 'Parallel-Twin'], dtype=object)

In [132]:
df["Number_of_Cylinders"].value_counts().to_dict()

{'1': 181,
 '2': 66,
 'Single': 65,
 '3': 13,
 '4': 11,
 'Two': 9,
 'One': 7,
 '90°': 6,
 '0': 1,
 'Three': 1,
 'Parallel-Twin': 1}

In [133]:
Number_of_Cylinders_data = {'1': 1,
 '2': 2,
 'Single': 3,
 '3': 4,
 '4': 5,
 'Two': 6,
 'One': 7,
 '90°': 8,
 '0': 9,
 'Three': 10,
 'Parallel-Twin': 11}

In [134]:
df["Number_of_Cylinders"].replace({'1': 1,
 '2': 2,
 'Single': 3,
 '3': 4,
 '4': 5,
 'Two': 6,
 'One': 7,
 '90°': 8,
 '0': 9,
 'Three': 10,
 'Parallel-Twin': 11},inplace = True)

In [135]:
df["Number_of_Cylinders"].astype(float)

0      2.0
1      2.0
2      1.0
3      2.0
4      2.0
      ... 
357    1.0
358    1.0
359    1.0
360    2.0
361    2.0
Name: Number_of_Cylinders, Length: 362, dtype: float64

In [136]:
df["Number_of_Cylinders"].isna().sum(
)

1

In [137]:
df["Number_of_Cylinders"].replace({np.nan:1})

0      2.0
1      2.0
2      1.0
3      2.0
4      2.0
      ... 
357    1.0
358    1.0
359    1.0
360    2.0
361    2.0
Name: Number_of_Cylinders, Length: 362, dtype: float64

In [138]:
df.info(
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    int64  
 7   Drivetrain           362 non-null    int64  
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    int64  
 12  Body_Type            362 non-null    int64  
 13  Engine_Type          362 non-null    int64  
 14  Number_of_Cylinders  361 non-null    float64
 15  Price                362 non-null    int

In [139]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              362 non-null    int64  
 1   Country_of_Origin    362 non-null    int64  
 2   Model                362 non-null    int64  
 3   Number_of_cc         362 non-null    float64
 4   Horsepower           362 non-null    int32  
 5   Torque               362 non-null    object 
 6   Transmission_Type    362 non-null    int64  
 7   Drivetrain           362 non-null    int64  
 8   Number_of_Seating    362 non-null    int64  
 9   Price (in INR)       362 non-null    object 
 10  Year                 362 non-null    int64  
 11  Looks                362 non-null    int64  
 12  Body_Type            362 non-null    int64  
 13  Engine_Type          362 non-null    int64  
 14  Number_of_Cylinders  361 non-null    float64
 15  Price                362 non-null    int

### Dropping the object dtata type values

In [140]:
df.drop(columns = "Torque",axis = 1,inplace = True)

In [141]:
df.drop(columns = "Number_of_Cylinders",axis = 1,inplace = True)

In [142]:
df.drop(columns = "Price (in INR)",axis = 1,inplace = True)

In [143]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Company            362 non-null    int64  
 1   Country_of_Origin  362 non-null    int64  
 2   Model              362 non-null    int64  
 3   Number_of_cc       362 non-null    float64
 4   Horsepower         362 non-null    int32  
 5   Transmission_Type  362 non-null    int64  
 6   Drivetrain         362 non-null    int64  
 7   Number_of_Seating  362 non-null    int64  
 8   Year               362 non-null    int64  
 9   Looks              362 non-null    int64  
 10  Body_Type          362 non-null    int64  
 11  Engine_Type        362 non-null    int64  
 12  Price              362 non-null    int64  
dtypes: float64(1), int32(1), int64(11)
memory usage: 35.5 KB


In [144]:
Looks_data

{'Modern': 1,
 'Classy': 2,
 'Adventure': 3,
 'Sporty': 4,
 'Retro': 4,
 'Classic': 4,
 'Retro-inspired': 5,
 'Modern, sporty': 5,
 'Futuristic': 6,
 'Sport': 6,
 'Modern, stylish': 7,
 'Classic, vintage': 7,
 'Cruiser': 7,
 'Aggressive': 7,
 'Commuter': 7,
 'Sharp': 8,
 'Simple, reliable': 8,
 'Motocross': 8,
 'Sporty, aggressive': 8,
 'Classic, stylish': 8,
 'Practical': 8,
 'Sleek': 8,
 'Enduro': 9,
 'Naked': 9,
 'Modern, aggressive': 9,
 'Retro-modern': 9,
 'Rugged': 9,
 'Race-inspired': 9,
 'Touring': 9,
 'Adventure, sporty': 10,
 'Modern, muscular': 10,
 'Urban': 10,
 'Stylish': 10,
 'Dirt': 10,
 'Modern, bobber': 10,
 'Retro, classic': 10,
 'Bold': 10,
 'Classy, cruiser': 10,
 'Budget': 10,
 'Modern, rugged': 10,
 'Fun': 10,
 'Modern, off-road': 10,
 'Bold, aggressive': 10,
 'Bobber': 10,
 'Adventure, touring': 10,
 'Classic, rugged': 10,
 'Sporty, beginner-friendly': 10,
 'Classic, bobber': 10,
 'Classy, aggressive': 10}

## Linearity 

In [145]:
df.corr()

Unnamed: 0,Company,Country_of_Origin,Model,Number_of_cc,Horsepower,Transmission_Type,Drivetrain,Number_of_Seating,Year,Looks,Body_Type,Engine_Type,Price
Company,1.0,-0.098619,-0.038637,-0.026805,0.030159,0.088373,0.167863,0.146202,-0.135368,-0.059469,0.112732,0.077394,-0.06944
Country_of_Origin,-0.098619,1.0,0.203207,0.138976,-0.083948,-0.141476,-0.019274,-0.198714,0.179908,0.056183,-0.08189,-0.165659,0.130886
Model,-0.038637,0.203207,1.0,0.102423,-0.023438,-0.251859,0.024528,0.033561,0.097927,0.114477,-0.030716,-0.090514,-0.078599
Number_of_cc,-0.026805,0.138976,0.102423,1.0,-0.021004,-0.190109,0.192611,0.067884,-0.118215,0.070069,0.009685,0.101157,0.504245
Horsepower,0.030159,-0.083948,-0.023438,-0.021004,1.0,0.069979,0.00897,-0.010876,0.025994,-0.072675,-0.030084,0.07464,-0.056754
Transmission_Type,0.088373,-0.141476,-0.251859,-0.190109,0.069979,1.0,-0.094204,0.006984,-0.269129,-0.156569,0.07415,0.048527,-0.064452
Drivetrain,0.167863,-0.019274,0.024528,0.192611,0.00897,-0.094204,1.0,0.05498,0.02587,-0.03,0.038565,0.137023,0.154605
Number_of_Seating,0.146202,-0.198714,0.033561,0.067884,-0.010876,0.006984,0.05498,1.0,0.011492,-0.008244,-0.121032,0.009533,-0.059492
Year,-0.135368,0.179908,0.097927,-0.118215,0.025994,-0.269129,0.02587,0.011492,1.0,0.007033,-0.055814,-0.061974,-0.084258
Looks,-0.059469,0.056183,0.114477,0.070069,-0.072675,-0.156569,-0.03,-0.008244,0.007033,1.0,-0.068882,0.007035,-0.020106


In [146]:
df.isna().sum()

Company              0
Country_of_Origin    0
Model                0
Number_of_cc         0
Horsepower           0
Transmission_Type    0
Drivetrain           0
Number_of_Seating    0
Year                 0
Looks                0
Body_Type            0
Engine_Type          0
Price                0
dtype: int64

## Model Traning 

In [147]:
x = df.drop('Price',axis = 1) # independent Variables
y = df['Price'] # Dependent variable



In [148]:
x

Unnamed: 0,Company,Country_of_Origin,Model,Number_of_cc,Horsepower,Transmission_Type,Drivetrain,Number_of_Seating,Year,Looks,Body_Type,Engine_Type
0,33,1,3,659.0,100,29,1,2,2021,6,1,4
1,33,1,1,659.0,100,29,1,2,2021,6,1,4
2,33,1,1,124.9,15,1,1,2,2022,6,9,1
3,33,1,1,896.0,95,1,9,2,2022,3,1,2
4,33,1,1,1077.0,175,1,9,2,2022,3,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
357,42,1,1,110.0,163,12,6,2,2019,4,1,2
358,42,1,1,126.0,171,12,6,1,2022,6,1,2
359,42,1,1,126.0,171,12,6,2,2022,1,4,2
360,45,1,2,649.0,608,1,12,2,2023,1,5,10


In [149]:
y

0      1099000
1      1199000
2       449000
3      1399000
4      1999000
        ...   
357    2500000
358    2300000
359    2500000
360     720000
361     689000
Name: Price, Length: 362, dtype: int64

In [150]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2,
                                                   random_state = 11)
x_test

Unnamed: 0,Company,Country_of_Origin,Model,Number_of_cc,Horsepower,Transmission_Type,Drivetrain,Number_of_Seating,Year,Looks,Body_Type,Engine_Type
259,45,4,3,125.0,136,1,2,2,2022,1,6,2
151,6,7,3,888.0,105,1,7,2,2023,10,2,5
153,6,7,3,900.0,97,1,2,2,2022,8,1,9
319,31,3,2,125.0,115156,13,3,2,2023,1,1,2
334,1,1,2,500.0,4758500,1,2,2,2019,1,7,6
...,...,...,...,...,...,...,...,...,...,...,...,...
28,37,5,1,1125.0,96,40,10,2,2006,3,5,8
292,5,6,1,170.0,116,12,8,2,2023,1,1,6
251,20,1,1,278.2,184,26,11,1,2023,1,2,9
279,47,16,1,125.0,115,14,1,1,2023,4,9,3


In [151]:
linear_reg = LinearRegression()
linear_reg.fit(x_train, y_train)

### Testing data Evaluation

In [152]:
y_pred = linear_reg.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE :",rmse)

mae = mean_absolute_error(y_test, y_pred)
print("MAE :",mae)

r2_value = r2_score(y_test, y_pred)
print("R-Squared :",r2_value)

RMSE : 596009.4681321944
MAE : 407468.8445996565
R-Squared : 0.13527503496962856


### Traning Data Evaluatin

In [153]:
y_pred_train = linear_reg.predict(x_train)

mse = mean_squared_error(y_train, y_pred_train)
rmse = np.sqrt(mse)

print("RMSE :",rmse)

mae = mean_absolute_error(y_train, y_pred_train)
print("MAE :",mae)

r2_value = r2_score(y_train, y_pred_train )
print("R-Squared :",r2_value)

RMSE : 606550.8025463276
MAE : 417818.850647433
R-Squared : 0.32690635395120804


In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Company            362 non-null    int64  
 1   Country_of_Origin  362 non-null    int64  
 2   Model              362 non-null    int64  
 3   Number_of_cc       362 non-null    float64
 4   Horsepower         362 non-null    int32  
 5   Transmission_Type  362 non-null    int64  
 6   Drivetrain         362 non-null    int64  
 7   Number_of_Seating  362 non-null    int64  
 8   Year               362 non-null    int64  
 9   Looks              362 non-null    int64  
 10  Body_Type          362 non-null    int64  
 11  Engine_Type        362 non-null    int64  
 12  Price              362 non-null    int64  
dtypes: float64(1), int32(1), int64(11)
memory usage: 35.5 KB


In [155]:
df[30:31]

Unnamed: 0,Company,Country_of_Origin,Model,Number_of_cc,Horsepower,Transmission_Type,Drivetrain,Number_of_Seating,Year,Looks,Body_Type,Engine_Type,Price
30,37,5,1,1201.0,105,40,1,1,2005,4,6,8,1200000


In [156]:
Company = "Aprilia"
Country_of_Origin = 'India'
Model = "RS 660"
Number_of_cc = 1200
Horsepower = 100
Transmission_Type = 'Automatic'
Drivetrain = "Chain"
Number_of_Seating = 2
Year = 2023
Looks = "Sport"
Body_Type = "Naked"
Engine_Type = "Single-cylinder"



Company = Company_data[Company]
Country_of_Origin = Country_of_Origin_data[Country_of_Origin]
Model = Model_data[Model]
Transmission_Type = Transmission_Type_data[Transmission_Type]
Drivetrain = Drivetrain_data[Drivetrain]
Looks = Looks_data[Looks]
Body_Type = Body_Type_data[Body_Type]
Engine_Type = Engine_Type_data[Engine_Type]


test_array = np.zeros([1,linear_reg.n_features_in_])
test_array[0,0] = Company
test_array[0,1] = Country_of_Origin
test_array[0,2] = Model
test_array[0,3] = Number_of_cc
test_array[0,4] = Horsepower
test_array[0,5]= Transmission_Type
test_array[0,6] = Drivetrain  
test_array[0,7] = Number_of_Seating
test_array[0,8] = Year
test_array[0,9] = Looks
test_array[0,10] = Body_Type
test_array[0,11] = Engine_Type

predicted_charges = np.around(linear_reg.predict(test_array)[0],3)
predicted_charges  

915205.951

In [157]:
with open('linear_regression.pkl','wb') as f:
    pickle.dump(linear_reg, f)

In [158]:
Looks_data

{'Modern': 1,
 'Classy': 2,
 'Adventure': 3,
 'Sporty': 4,
 'Retro': 4,
 'Classic': 4,
 'Retro-inspired': 5,
 'Modern, sporty': 5,
 'Futuristic': 6,
 'Sport': 6,
 'Modern, stylish': 7,
 'Classic, vintage': 7,
 'Cruiser': 7,
 'Aggressive': 7,
 'Commuter': 7,
 'Sharp': 8,
 'Simple, reliable': 8,
 'Motocross': 8,
 'Sporty, aggressive': 8,
 'Classic, stylish': 8,
 'Practical': 8,
 'Sleek': 8,
 'Enduro': 9,
 'Naked': 9,
 'Modern, aggressive': 9,
 'Retro-modern': 9,
 'Rugged': 9,
 'Race-inspired': 9,
 'Touring': 9,
 'Adventure, sporty': 10,
 'Modern, muscular': 10,
 'Urban': 10,
 'Stylish': 10,
 'Dirt': 10,
 'Modern, bobber': 10,
 'Retro, classic': 10,
 'Bold': 10,
 'Classy, cruiser': 10,
 'Budget': 10,
 'Modern, rugged': 10,
 'Fun': 10,
 'Modern, off-road': 10,
 'Bold, aggressive': 10,
 'Bobber': 10,
 'Adventure, touring': 10,
 'Classic, rugged': 10,
 'Sporty, beginner-friendly': 10,
 'Classic, bobber': 10,
 'Classy, aggressive': 10}

In [159]:
import json
project_data = {"Company" : Company_data, 
                "Country_of_Origin" :Country_of_Origin_data,
                "Model" :Model_data,
                "Transmission_Type":Transmission_Type_data,
                "Drivetrain"  : Drivetrain_data,
                "Looks" : Looks_data,
                "Body_Type" : Body_Type_data,
                "Engine_Type":Engine_Type_data }

with open('proj_data.json','w') as f:
    json.dump(project_data, f)

In [160]:
df.columns

Index(['Company', 'Country_of_Origin', 'Model', 'Number_of_cc', 'Horsepower',
       'Transmission_Type', 'Drivetrain', 'Number_of_Seating', 'Year', 'Looks',
       'Body_Type', 'Engine_Type', 'Price'],
      dtype='object')