In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
car = pd.read_csv('quikr_car.csv')
car.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [12]:
backup = car.copy()
car = car[car['year'].astype(str).str.isnumeric()]
car['year'] = car['year'].astype(int)
car = car[car['Price'] != 'Ask For Price']
car['Price'] = car['Price']. astype(str).str.replace(',', '').astype(int)
car['kms_driven'] = car['kms_driven']. astype(str).str.split().str.get(0).str.replace(',', '')
car = car[car['kms_driven'].str.isnumeric()]
car['kms_driven'] = car['kms_driven'].astype(int)
car = car[~car['fuel_type'].isna()]
car['name'] = car['name'].str.split().str.slice(start=0, stop=3).str.join(' ')
car = car[car['Price'] < 6000000]
car = car.reset_index(drop=True)
X = car[['name', 'company', 'year', 'kms_driven', 'fuel_type']]
y = car['Price']
print(X)
print(y)

                       name   company  year  kms_driven fuel_type
0       Hyundai Santro Xing   Hyundai  2007       45000    Petrol
1       Mahindra Jeep CL550  Mahindra  2006          40    Diesel
2         Hyundai Grand i10   Hyundai  2014       28000    Petrol
3    Ford EcoSport Titanium      Ford  2014       36000    Diesel
4                 Ford Figo      Ford  2012       41000    Diesel
..                      ...       ...   ...         ...       ...
810      Maruti Suzuki Ritz    Maruti  2011       50000    Petrol
811          Tata Indica V2      Tata  2009       30000    Diesel
812    Toyota Corolla Altis    Toyota  2009      132000    Petrol
813            Tata Zest XM      Tata  2018       27000    Diesel
814      Mahindra Quanto C8  Mahindra  2013       40000    Diesel

[815 rows x 5 columns]
0       80000
1      425000
2      325000
3      575000
4      175000
        ...  
810    270000
811    110000
812    300000
813    260000
814    390000
Name: Price, Length: 815, dtyp

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
ohe = OneHotEncoder()
ohe.fit(X[['name', 'company', 'fuel_type']])
column_trans = make_column_transformer(
    (OneHotEncoder(categories=ohe.categories_), ['name', 'company', 'fuel_type']),
    remainder='passthrough'
)
lr = LinearRegression()
pipe = make_pipeline(column_trans,lr)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print("R² score:", r2_score(y_test, y_pred))
scores = []
for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=i)
    pipe = make_pipeline(column_trans, LinearRegression())
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    scores.append(r2_score(y_test, y_pred))
print("Best R² score:", max(scores))
best_seed = np.argmax(scores)
print("Best random_state:", best_seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=best_seed)
pipe = make_pipeline(column_trans, LinearRegression())
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print("Final model R²:", r2_score(y_test, y_pred))

R² score: 0.6301904475050143
Best R² score: 0.8991157554877304
Best random_state: 302
Final model R²: 0.8991157554877304


In [25]:
sample_input = pd.DataFrame(columns=X.columns, data=np.array([
    'Maruti Suzuki Swift', 'Maruti', 2019, 15000, 'Petrol'
]).reshape(1, 5))
predicted_price = pipe.predict(sample_input)
print("Predicted Price for sample car:", int(predicted_price[0]))

Predicted Price for sample car: 448937


In [28]:
sample_input = pd.DataFrame(columns=X.columns, data=np.array([
    'Ford EcoSport Titanium', 'Ford', 2014, 25000, 'Diesel'
]).reshape(1, 5))
predicted_price = pipe.predict(sample_input)
print("Predicted Price for sample car:", int(predicted_price[0]))

Predicted Price for sample car: 491377
