In [34]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('quikr_car.csv')

In [3]:
dataset.head(3)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


In [5]:
dataset['fuel_type'].unique()

array(['Petrol', 'Diesel', nan, 'LPG'], dtype=object)

## Data cleaning
- year has non-year values
- year object to int
- price object to int
- price has ask for price
- kms_driven has kms with integers
- kms_driven object to int
- kms_driven has nan values
- keep first three words of name

#Cleaning

In [6]:
backup = dataset.copy()

In [7]:
dataset = dataset[dataset['year'].str.isnumeric()]

In [8]:
dataset['year']  = dataset['year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['year']  = dataset['year'].astype(int)


In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 842 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        842 non-null    object
 1   company     842 non-null    object
 2   year        842 non-null    int64 
 3   Price       842 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: int64(1), object(5)
memory usage: 46.0+ KB


In [10]:
dataset = dataset[dataset['Price'] != 'Ask For Price']

In [11]:
dataset.columns


Index(['name', 'company', 'year', 'Price', 'kms_driven', 'fuel_type'], dtype='object')

In [12]:
dataset['Price'] = dataset['Price'].str.replace(',','').astype(int)

In [13]:
dataset['kms_driven'] = dataset['kms_driven'].str.split(' ').str.get(0).str.replace(',','')

In [14]:
dataset = dataset[dataset['kms_driven'].str.isnumeric()]

In [15]:
dataset['kms_driven'] = dataset['kms_driven'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['kms_driven'] = dataset['kms_driven'].astype(int)


In [16]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 817 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        817 non-null    object
 1   company     817 non-null    object
 2   year        817 non-null    int64 
 3   Price       817 non-null    int64 
 4   kms_driven  817 non-null    int64 
 5   fuel_type   816 non-null    object
dtypes: int64(3), object(3)
memory usage: 44.7+ KB


In [17]:
dataset = dataset[~dataset['fuel_type'].isna()]

In [18]:
dataset['name'] = dataset['name'].str.split(' ').str.slice(0,3).str.join(' ')

In [19]:
dataset = dataset.reset_index(drop=True)

In [20]:
dataset.describe()

Unnamed: 0,year,Price,kms_driven
count,816.0,816.0,816.0
mean,2012.444853,411717.6,46275.531863
std,4.002992,475184.4,34297.428044
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,491250.0,56818.5
max,2019.0,8500003.0,400000.0


In [22]:
dataset = dataset[dataset['Price']<6e6].reset_index(drop=True)

In [23]:
dataset.to_csv('clean_data.csv')

#Model

In [24]:
X = dataset.drop(columns='Price')
y= dataset['Price']

In [25]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)


In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [27]:
ohe = OneHotEncoder()
ohe.fit(X[['name','company','fuel_type']])

In [28]:
column_trans = make_column_transformer((OneHotEncoder(categories=ohe.categories_),['name','company','fuel_type']),
                                    remainder='passthrough')

In [29]:
lr = LinearRegression()

In [30]:
pipe = make_pipeline(column_trans,lr)

In [31]:
pipe.fit(X_train,y_train)

In [33]:
y_pred = pipe.predict(X_test)

In [35]:
r2_score(y_test,y_pred)

0.7009952624721654

In [39]:
scores = []
for i in range(1000):
  X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=i)
  lr = LinearRegression()
  pipe = make_pipeline(column_trans,lr)
  pipe.fit(X_train,y_train)
  y_pred = pipe.predict(X_test)
  scores.append(r2_score(y_test,y_pred))

In [40]:
np.argmax(scores)

433

In [41]:
scores[np.argmax(scores)]

0.8457059012561223

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=np.argmax(scores))
lr = LinearRegression()
pipe = make_pipeline(column_trans,lr)
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)

In [44]:
r2_score(y_test,y_pred)

0.8457059012561223

In [45]:
import pickle

In [46]:
pickle.dump(pipe,open('LinearRegressionModel.pkl','wb'))

In [47]:
dataset.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel
