In [679]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [680]:
cars_data = pd.read_csv('/content/sample_data/Cardetails.csv')

In [681]:
cars_data.shape

(8128, 13)

In [682]:
cars_data.isnull().sum()


Unnamed: 0,0
name,0
year,0
selling_price,0
km_driven,0
fuel,0
seller_type,0
transmission,0
owner,0
mileage,221
engine,221


In [683]:
cars_data.drop(columns=['torque'], inplace=True)
cars_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,5.0


In [684]:
cars_data.dropna (inplace=True)
cars_data.shape

(7907, 12)

In [685]:
cars_data.duplicated().sum()


1189

In [686]:
cars_data


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.5 kmpl,1197 CC,82.85 bhp,5.0
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.8 kmpl,1493 CC,110 bhp,5.0
8125,Maruti Swift Dzire ZDi,2009,382000,120000,Diesel,Individual,Manual,First Owner,19.3 kmpl,1248 CC,73.9 bhp,5.0
8126,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,First Owner,23.57 kmpl,1396 CC,70 bhp,5.0


In [687]:
for col in cars_data.columns:
  print('Unique values of '+col)
  print(cars_data[col].unique)
  print("=============\n")

Unique values of name
<bound method Series.unique of 0             Maruti Swift Dzire VDI
1       Skoda Rapid 1.5 TDI Ambition
2           Honda City 2017-2020 EXi
3          Hyundai i20 Sportz Diesel
4             Maruti Swift VXI BSIII
                    ...             
8123               Hyundai i20 Magna
8124           Hyundai Verna CRDi SX
8125          Maruti Swift Dzire ZDi
8126                 Tata Indigo CR4
8127                 Tata Indigo CR4
Name: name, Length: 7907, dtype: object>

Unique values of year
<bound method Series.unique of 0       2014
1       2014
2       2006
3       2010
4       2007
        ... 
8123    2013
8124    2007
8125    2009
8126    2013
8127    2013
Name: year, Length: 7907, dtype: int64>

Unique values of selling_price
<bound method Series.unique of 0       450000
1       370000
2       158000
3       225000
4       130000
         ...  
8123    320000
8124    135000
8125    382000
8126    290000
8127    290000
Name: selling_price, Length: 7907,

In [688]:
def get_brand_name(car_name):
    car_name = car_name.split()[0]
    return car_name.strip()

In [689]:
def clean_data(value):
    value = value.split(' ')[0]
    value = value.strip()
    if value == '':
      value = 0
    return float(value)

In [690]:
get_brand_name('Maruti Swift Dzire VDI')

'Maruti'

In [691]:
cars_data['name'] = cars_data['name'].apply(get_brand_name)

In [692]:
cars_data['name'].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz',
       'Mitsubishi', 'Audi', 'Volkswagen', 'BMW', 'Nissan', 'Lexus',
       'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo', 'Kia', 'Fiat', 'Force',
       'Ambassador', 'Ashok', 'Isuzu', 'Opel'], dtype=object)

In [693]:
cars_data['mileage'] = cars_data['mileage'].apply(clean_data)
cars_data['max_power'] = cars_data['max_power'].apply(clean_data)
cars_data['engine'] = cars_data['engine'].apply(clean_data)

In [694]:
for col in cars_data.columns:
  print('Unique values of '+col)
  print(cars_data[col].unique)
  print("=============\n")

Unique values of name
<bound method Series.unique of 0        Maruti
1         Skoda
2         Honda
3       Hyundai
4        Maruti
         ...   
8123    Hyundai
8124    Hyundai
8125     Maruti
8126       Tata
8127       Tata
Name: name, Length: 7907, dtype: object>

Unique values of year
<bound method Series.unique of 0       2014
1       2014
2       2006
3       2010
4       2007
        ... 
8123    2013
8124    2007
8125    2009
8126    2013
8127    2013
Name: year, Length: 7907, dtype: int64>

Unique values of selling_price
<bound method Series.unique of 0       450000
1       370000
2       158000
3       225000
4       130000
         ...  
8123    320000
8124    135000
8125    382000
8126    290000
8127    290000
Name: selling_price, Length: 7907, dtype: int64>

Unique values of km_driven
<bound method Series.unique of 0       145500
1       120000
2       140000
3       127000
4       120000
         ...  
8123    110000
8124    119000
8125    120000
8126     25000
8127   

In [751]:
cars_data['name'].replace([ 'Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault', 'Mahindra', 'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz', 'Mitsubishi', 'Audi', 'Volkswagen', 'BMW', 'Nissan', 'Lexus', 'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo', 'Kia', 'Fiat', 'Force', 'Ambassador', 'Ashok', 'Isuzu', 'Opel'], [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cars_data['name'].replace([ 'Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault', 'Mahindra', 'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz', 'Mitsubishi', 'Audi', 'Volkswagen', 'BMW', 'Nissan', 'Lexus', 'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo', 'Kia', 'Fiat', 'Force', 'Ambassador', 'Ashok', 'Isuzu', 'Opel'], [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] ,inplace=True)


In [749]:
cars_data['transmission'].unique()

array([1, 2])

In [752]:
cars_data['transmission'].replace([ 'Manual', 'Automatic'], [1,2], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cars_data['transmission'].replace([ 'Manual', 'Automatic'], [1,2], inplace=True)


In [753]:
cars_data['seller_type'].unique()


array([1, 2, 3])

In [732]:
cars_data['seller_type'].replace([ 'Individual', 'Dealer', 'Trustmark Dealer'], [1,2,3], inplace=True)

In [754]:
cars_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7907 entries, 0 to 7906
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7907 non-null   int64  
 1   year           7907 non-null   int64  
 2   selling_price  7907 non-null   int64  
 3   km_driven      7907 non-null   int64  
 4   fuel           7907 non-null   int64  
 5   seller_type    7907 non-null   int64  
 6   transmission   7907 non-null   int64  
 7   owner          7907 non-null   object 
 8   mileage        7907 non-null   float64
 9   engine         7907 non-null   float64
 10  max_power      7907 non-null   float64
 11  seats          7907 non-null   float64
dtypes: float64(4), int64(7), object(1)
memory usage: 741.4+ KB


In [755]:
cars_data['fuel'].unique()

array([1, 2, 3, 4])

In [734]:
cars_data['fuel'].replace([ 'Diesel', 'Petrol', 'LPG', 'CNG'],[1,2,3,4], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cars_data['fuel'].replace([ 'Diesel', 'Petrol', 'LPG', 'CNG'],[1,2,3,4], inplace=True)


In [756]:
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7907 entries, 0 to 7906
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7907 non-null   int64  
 1   year           7907 non-null   int64  
 2   selling_price  7907 non-null   int64  
 3   km_driven      7907 non-null   int64  
 4   fuel           7907 non-null   int64  
 5   seller_type    7907 non-null   int64  
 6   transmission   7907 non-null   int64  
 7   owner          7907 non-null   object 
 8   mileage        7907 non-null   float64
 9   engine         7907 non-null   float64
 10  max_power      7907 non-null   float64
 11  seats          7907 non-null   float64
dtypes: float64(4), int64(7), object(1)
memory usage: 741.4+ KB


In [757]:
cars_data.reset_index(inplace=True)

In [758]:
cars_data

Unnamed: 0,index,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,0,1,2014,450000,145500,1,1,1,First Owner,23.40,1248.0,74.00,5.0
1,1,2,2014,370000,120000,1,1,1,2,21.14,1498.0,103.52,5.0
2,2,3,2006,158000,140000,2,1,1,3,17.70,1497.0,78.00,5.0
3,3,4,2010,225000,127000,1,1,1,First Owner,23.00,1396.0,90.00,5.0
4,4,1,2007,130000,120000,2,1,1,First Owner,16.10,1298.0,88.20,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7902,7902,4,2013,320000,110000,2,1,1,First Owner,18.50,1197.0,82.85,5.0
7903,7903,4,2007,135000,119000,1,1,1,4,16.80,1493.0,110.00,5.0
7904,7904,1,2009,382000,120000,1,1,1,First Owner,19.30,1248.0,73.90,5.0
7905,7905,9,2013,290000,25000,1,1,1,First Owner,23.57,1396.0,70.00,5.0


In [763]:
cars_data['owner']. unique()


array([1, 2, 3, 4, 5])

In [762]:
cars_data['owner'].replace(['First Owner', 'Second Owner', 'Third Owner',
          'Fourth & Above Owner', 'Test Drive Car'],
                              [1,2,3,4,5], inplace=True)

  cars_data['owner'].replace(['First Owner', 'Second Owner', 'Third Owner',


In [764]:
cars_data.drop(columns=['index'], inplace=True)

In [765]:
cars_data


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,1,2014,450000,145500,1,1,1,1,23.40,1248.0,74.00,5.0
1,2,2014,370000,120000,1,1,1,2,21.14,1498.0,103.52,5.0
2,3,2006,158000,140000,2,1,1,3,17.70,1497.0,78.00,5.0
3,4,2010,225000,127000,1,1,1,1,23.00,1396.0,90.00,5.0
4,1,2007,130000,120000,2,1,1,1,16.10,1298.0,88.20,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7902,4,2013,320000,110000,2,1,1,1,18.50,1197.0,82.85,5.0
7903,4,2007,135000,119000,1,1,1,4,16.80,1493.0,110.00,5.0
7904,1,2009,382000,120000,1,1,1,1,19.30,1248.0,73.90,5.0
7905,9,2013,290000,25000,1,1,1,1,23.57,1396.0,70.00,5.0


In [766]:
input_data = cars_data.drop(columns=['selling_price'])
output_data =cars_data['selling_price']
x_train, x_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.2)

In [767]:
model=LinearRegression()

In [768]:
model.fit(x_train, y_train)

In [775]:
pridict = model.predict(x_test)


In [776]:
pridict

array([ 814543.51011705,  936579.49921212,  628790.02413055, ...,
       2107610.74523231,  539171.16775312,  280888.01912946])

In [778]:
x_train.head(1)


Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
6686,8,2014,120000,1,1,1,1,12.05,2179.0,120.0,7.0


In [779]:
input_data_model = pd.DataFrame(
    [[5482,1,2020,5000,2,1,1,21.21,1197.0,81.8,5.0]],
    columns=['name','year','km_driven','fuel','seller_type','transmission','owner','mileage','engine','max_power','seats'])

In [780]:
input_data_model

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,5482,1,2020,5000,2,1,1,21.21,1197.0,81.8,5.0


In [782]:
model.predict(input_data_model)

array([1.15359007e+08])

In [783]:
import pickle as pk

In [784]:
pk.dump(model, open('model.pkl', 'wb'))