In [1]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the used-car listings from the CSV that sits next to the notebook,
# then preview the first five rows.
df = pd.read_csv('price.csv')
df.head(5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [3]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [4]:
# Summary statistics of the numeric columns, one row per column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
year,4340.0,2013.090783,4.215344,1992.0,2011.0,2014.0,2016.0,2020.0
selling_price,4340.0,504127.311751,578548.736139,20000.0,208749.75,350000.0,600000.0,8900000.0
km_driven,4340.0,66215.777419,46644.102194,1.0,35000.0,60000.0,90000.0,806599.0


In [5]:
# Categorical columns that will later be integer-encoded.
column = ['fuel', 'seller_type', 'transmission', 'owner']

# Map each categorical column to its distinct values, in first-appearance order.
values = {col_name: df[col_name].unique() for col_name in column}
values

{'fuel': array(['Petrol', 'Diesel', 'CNG', 'LPG', 'Electric'], dtype=object),
 'seller_type': array(['Individual', 'Dealer', 'Trustmark Dealer'], dtype=object),
 'transmission': array(['Manual', 'Automatic'], dtype=object),
 'owner': array(['First Owner', 'Second Owner', 'Fourth & Above Owner',
        'Third Owner', 'Test Drive Car'], dtype=object)}

In [6]:
# Integer-encode the categorical columns listed in `values`.
# Each category receives the position it holds in that column's unique()
# array (first-appearance order), e.g. fuel: Petrol -> 0, Diesel -> 1, ...
# The complete mapping is built up front and applied with a single
# DataFrame.replace call, instead of one in-place whole-frame pass per
# category.
# NOTE: the codes are arbitrary labels, yet the models below will treat them
# as ordered magnitudes.
encoding = {
    key: {category: code for code, category in enumerate(categories)}
    for key, categories in values.items()
}
df = df.replace(encoding)

In [7]:
# Re-collect the distinct values per categorical column to confirm that the
# encoding step left only integer codes behind.
values = {col_name: df[col_name].unique() for col_name in column}
values

{'fuel': array([0, 1, 2, 3, 4], dtype=int64),
 'seller_type': array([0, 1, 2], dtype=int64),
 'transmission': array([0, 1], dtype=int64),
 'owner': array([0, 1, 2, 3, 4], dtype=int64)}

<h5>Splitting the data</h5>

In [8]:
# Features: every column except the free-text car name and the target itself.
X = df.drop(columns=['name', 'selling_price'])
# Target: the listed selling price.
y = df['selling_price']

In [9]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=2023,
)

In [10]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, so no information from the test split leaks into the scaling.
scaler = StandardScaler().fit(X_train)

# Apply the same training-derived scaling to both splits.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Using linear regression

In [11]:
# Fit a linear-regression baseline on the scaled training features.
# fit() returns the estimator itself, so `model` is the fitted regressor.
estimator = LinearRegression()
model = estimator.fit(X_train, y_train)

<h5>Evaluation</h5>

In [12]:
# Predict on the training features to measure in-sample fit.
train_pred =  model.predict(X_train)

In [13]:
# In-sample quality of the fitted model.
# r2_score is the coefficient of determination (a score: closer to 1 is
# better), even though the printed label calls it an "error".
train_r2 = r2_score(y_train, train_pred)
train_mae = mean_absolute_error(y_train, train_pred)

print('r squared error: ', train_r2)
print('mean absolute error: ', train_mae)

r squared error:  0.44456682827893435
mean absolute error:  231402.533155347


In [14]:
# Predict on the held-out features to measure out-of-sample fit.
test_pred =  model.predict(X_test)

In [15]:
# Held-out quality of the fitted model.
# r2_score is the coefficient of determination (a score: closer to 1 is
# better), even though the printed label calls it an "error".
test_r2 = r2_score(y_test, test_pred)
test_mae = mean_absolute_error(y_test, test_pred)

print('r squared error: ', test_r2)
print('mean absolute error: ', test_mae)

r squared error:  0.4051933120177148
mean absolute error:  241568.28287083193


### Using TensorFlow

In [16]:
import tensorflow as tf
import keras
from keras import layers

In [17]:
# Build, compile and train a fully connected regression network.
# NOTE: this rebinds `model`, replacing the estimator fitted earlier in the
# notebook; the evaluation cell that follows expects `model` to be this network.

# Seed Python, NumPy and the backend so weight initialisation and batch
# shuffling are repeatable (as far as the backend allows) on Restart & Run All.
keras.utils.set_random_seed(2023)

# A Normalization layer only learns its mean/variance through adapt(); the
# statistics are taken from the training features alone.
normalizer = layers.Normalization()
normalizer.adapt(X_train)

# Use the standalone `keras` namespace for the container as well, so it comes
# from the same package as `layers`.
model = keras.Sequential([
    # `shape` must be a tuple; the feature count is read from the data
    # instead of being hard-coded.
    layers.Input(shape=(X_train.shape[1],)),
    normalizer,
    layers.Dense(256, activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    # Single linear unit: the predicted selling price.
    layers.Dense(1)
])

# Mean absolute error is the training loss, so the value later returned by
# evaluate() is an MAE in the target's units.
model.compile(loss='mae',
              optimizer="adam")
model.fit(X_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x27b4ffad300>

In [18]:
# Score the network on the held-out split; evaluate() returns the compiled
# loss, which is the mean absolute error ('mae') set in compile().
model.evaluate(X_test, y_test)



160058.015625