# CAR PRICE PREDICTION TOOL USING ML | SHERWIN @INTERNPE |

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Read the raw listings from the CSV that sits next to this notebook
car = pd.read_csv(filepath_or_buffer='quikr_car.csv')

# Preview the top five records to sanity-check the load
car.head(n=5)



Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [2]:
# Report the frame's dimensions as a (rows, columns) tuple
print("Shape of the dataframe:", (len(car.index), len(car.columns)))

# Per-column summary: non-null counts and dtypes (info() writes to stdout itself)
print("Information about the dataframe:")
car.info()


Shape of the dataframe: (892, 6)
Information about the dataframe:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


In [3]:
def _clean_car_listings(raw):
    """Return a cleaned copy of the raw car listings.

    Steps, applied in this order:
      1. keep rows whose 'year' is numeric text and convert 'year' to int;
      2. drop rows whose 'Price' is the placeholder 'Ask For Price', strip
         commas from 'Price' and convert it to int;
      3. reduce 'kms_driven' to its first whitespace-separated token with
         commas removed (e.g. "45,000 kms" -> "45000"), keep rows where that
         token is numeric, and convert it to int;
      4. drop rows whose 'fuel_type' is missing.

    The input frame is not modified. Every filter is followed by an explicit
    ``.copy()`` so the later column assignments operate on an independent
    frame instead of a slice of the previous one (which would raise
    ``SettingWithCopyWarning`` on pandas versions without copy-on-write).

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame with at least the columns 'year', 'Price', 'kms_driven' and
        'fuel_type'.

    Returns
    -------
    pandas.DataFrame
        New frame with integer 'year', 'Price' and 'kms_driven' columns and
        no missing 'fuel_type'. The original index labels are preserved.
    """
    df = raw.copy()

    # 'year': compare as text so non-numeric entries (and NaN -> 'nan') are dropped
    df['year'] = df['year'].astype(str)
    df = df[df['year'].str.isnumeric()].copy()
    df['year'] = df['year'].astype(int)

    # 'Price': remove placeholder rows, then strip thousands separators.
    # regex=False makes the comma a literal regardless of the pandas version's default.
    df = df[df['Price'] != 'Ask For Price'].copy()
    df['Price'] = (
        df['Price'].astype(str).str.replace(',', '', regex=False).astype(int)
    )

    # 'kms_driven': keep only the leading number of values such as "45,000 kms"
    df['kms_driven'] = (
        df['kms_driven']
        .astype(str)
        .str.split()
        .str.get(0)
        .str.replace(',', '', regex=False)
    )
    df = df[df['kms_driven'].str.isnumeric()].copy()
    df['kms_driven'] = df['kms_driven'].astype(int)

    # 'fuel_type': rows without a fuel type are discarded
    df = df[df['fuel_type'].notna()].copy()

    return df


# Replace the working frame with its cleaned version
car = _clean_car_listings(car)

# Display the cleaned dataframe
print(car.head())


                                     name   company  year   Price  kms_driven  \
0    Hyundai Santro Xing XO eRLX Euro III   Hyundai  2007   80000       45000   
1                 Mahindra Jeep CL550 MDI  Mahindra  2006  425000          40   
3  Hyundai Grand i10 Magna 1.2 Kappa VTVT   Hyundai  2014  325000       28000   
4        Ford EcoSport Titanium 1.5L TDCi      Ford  2014  575000       36000   
6                               Ford Figo      Ford  2012  175000       41000   

  fuel_type  
0    Petrol  
1    Diesel  
3    Petrol  
4    Diesel  
6    Diesel  


In [4]:
# Shorten every car name to its first three whitespace-separated words and
# renumber the rows 0..n-1 (earlier filtering left gaps in the index).
# assign() builds a new frame instead of writing a column into `car`, which at
# this point is the result of boolean filtering; assigning into such a frame
# triggers SettingWithCopyWarning on pandas versions without copy-on-write.
car = car.assign(
    name=car['name'].str.split().str.slice(start=0, stop=3).str.join(' ')
).reset_index(drop=True)
car

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel
...,...,...,...,...,...,...
811,Maruti Suzuki Ritz,Maruti,2011,270000,50000,Petrol
812,Tata Indica V2,Tata,2009,110000,30000,Diesel
813,Toyota Corolla Altis,Toyota,2009,300000,132000,Petrol
814,Tata Zest XM,Tata,2018,260000,27000,Diesel


In [5]:
# Feature matrix: the five predictor columns, in this order
X = car.loc[:, ['name', 'company', 'year', 'kms_driven', 'fuel_type']]

# Target vector: the 'Price' column
y = car.loc[:, 'Price']

# Show the feature matrix
X

Unnamed: 0,name,company,year,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,36000,Diesel
4,Ford Figo,Ford,2012,41000,Diesel
...,...,...,...,...,...
811,Maruti Suzuki Ritz,Maruti,2011,50000,Petrol
812,Tata Indica V2,Tata,2009,30000,Diesel
813,Toyota Corolla Altis,Toyota,2009,132000,Petrol
814,Tata Zest XM,Tata,2018,27000,Diesel


TRAIN/TEST SPLIT AND MODEL TRAINING


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

# Assuming 'car' dataframe is already cleaned and available

# Columns handled by each preprocessing branch
numeric_features = ['year', 'kms_driven']
categorical_features = ['name', 'company', 'fuel_type']

# Define features (X) and target (y).
# The feature columns are selected explicitly (rather than "everything except
# Price") so that an unexpected extra column in `car` can never reach the model.
feature_columns = ['name', 'company', 'year', 'kms_driven', 'fuel_type']
assert set(feature_columns) == set(numeric_features + categorical_features), \
    "every feature column must be routed to exactly one transformer"

X = car[feature_columns]
y = car['Price']

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric branch: mean-impute missing values, then standardise
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

# Categorical branch: one-hot encode; categories not seen during fit are
# encoded as all zeros instead of raising at predict time
categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown='ignore')
)

# Combine preprocessing steps.
# remainder='drop' (instead of 'passthrough') guarantees that only the columns
# listed above are used: a passthrough column would be handed to
# LinearRegression unscaled/unencoded, and a text column would make fit() fail.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    remainder='drop'
)

# Create the pipeline with preprocessing and the model
pipe = make_pipeline(preprocessor, LinearRegression())

# Estimate generalisation with 5-fold cross-validation on the training split.
# cross_val_score fits clones, so `pipe` itself is still unfitted afterwards.
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2')
print(f"Cross-validated R2 scores: {cv_scores}")
print(f"Mean cross-validated R2 score: {cv_scores.mean()}")

# Fit the pipeline on the full training split
pipe.fit(X_train, y_train)

# Predict on the held-out test data
y_pred = pipe.predict(X_test)

# Evaluate the model on the held-out test data
# NOTE(review): the recorded run shows a mean CV R2 of about 0.69 but a test R2
# of about 0.20. That gap suggests the fit is unstable for this split; worth
# investigating (possibly price outliers or rarely-seen car names - to be
# confirmed) before relying on the saved model.
r2 = r2_score(y_test, y_pred)
print(f"R2 score on test data: {r2}")


Cross-validated R2 scores: [0.66268987 0.52200726 0.70478071 0.72398873 0.82709236]
Mean cross-validated R2 score: 0.6881117835947603
R2 score on test data: 0.19818539993567064


In [7]:
import pickle

# Persist the fitted pipeline `pipe` (from the training cell) to disk.
# Binary write mode is required by pickle; the context manager closes the handle.
with open('car_price_model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)
