In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, confusion_matrix, mean_squared_error
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

In [2]:
df=pd.read_csv(r"C:\Users\Sachin Kamath\Downloads\Car Details.csv",low_memory=False)
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [3]:
df.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')

In [4]:
df['manufacturer']=df['name'].apply(lambda x:x.split(' ')[0])

In [5]:
df['model']=df['name'].apply(lambda x:x.split(' ')[1])

In [6]:
data=df

In [7]:
#  Data Exploration
print('Number of Rows: ', data.shape[0])
print('Number of Columns: ', data.shape[1], '\n')
print('Columns Names:', data.columns, '\n')
print('SubSet of Data:\n ', data.head().to_string(), '\n')
print('Data Information: '); print(data.info(), '\n')
print('Data Describe:\n ', data.describe(), '\n')

Number of Rows:  4340
Number of Columns:  10 

Columns Names: Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'manufacturer', 'model'],
      dtype='object') 

SubSet of Data:
                         name  year  selling_price  km_driven    fuel seller_type transmission         owner manufacturer   model
0             Maruti 800 AC  2007          60000      70000  Petrol  Individual       Manual   First Owner       Maruti     800
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol  Individual       Manual   First Owner       Maruti   Wagon
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel  Individual       Manual   First Owner      Hyundai   Verna
3    Datsun RediGO T Option  2017         250000      46000  Petrol  Individual       Manual   First Owner       Datsun  RediGO
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel  Individual       Manual  Second Owner        Honda   

In [8]:
df.shape

(4340, 10)

In [9]:
# Top ten columns with missing values %
missing = df.count()/len(df)
missing = (1-missing)*100
missing.sort_values(ascending=False).head(10)

name             0.0
year             0.0
selling_price    0.0
km_driven        0.0
fuel             0.0
seller_type      0.0
transmission     0.0
owner            0.0
manufacturer     0.0
model            0.0
dtype: float64

In [10]:
# Check for duplicates
print(df.duplicated().any())
duplicated = df.duplicated()
print('Number of duplicated data: ', duplicated[duplicated == True].size)

True
Number of duplicated data:  763


In [11]:
print('Categorical Data: ')
print('Fuel unique values: ', df['fuel'].unique())
print('Number of unique values: ', df['fuel'].unique().size)
print('Seller type unique values: ', df['seller_type'].unique())
print('Number of unique values: ', df['seller_type'].unique().size)
print('Transmission unique values: ', df['transmission'].unique())
print('Number of unique values: ', df['transmission'].unique().size)
print('Owner unique values: ', df['owner'].unique())
print('Number of unique values: ', df['owner'].unique().size)

Categorical Data: 
Fuel unique values:  ['Petrol' 'Diesel' 'CNG' 'LPG' 'Electric']
Number of unique values:  5
Seller type unique values:  ['Individual' 'Dealer' 'Trustmark Dealer']
Number of unique values:  3
Transmission unique values:  ['Manual' 'Automatic']
Number of unique values:  2
Owner unique values:  ['First Owner' 'Second Owner' 'Fourth & Above Owner' 'Third Owner'
 'Test Drive Car']
Number of unique values:  5


In [12]:
# Unique name cars
print('Unique name of Cars: ', df['name'].unique().size)

Unique name of Cars:  1491


In [13]:
df

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,manufacturer,model
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner,Maruti,800
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner,Maruti,Wagon
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner,Hyundai,Verna
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner,Datsun,RediGO
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner,Honda,Amaze
...,...,...,...,...,...,...,...,...,...,...
4335,Hyundai i20 Magna 1.4 CRDi (Diesel),2014,409999,80000,Diesel,Individual,Manual,Second Owner,Hyundai,i20
4336,Hyundai i20 Magna 1.4 CRDi,2014,409999,80000,Diesel,Individual,Manual,Second Owner,Hyundai,i20
4337,Maruti 800 AC BSIII,2009,110000,83000,Petrol,Individual,Manual,Second Owner,Maruti,800
4338,Hyundai Creta 1.6 CRDi SX Option,2016,865000,90000,Diesel,Individual,Manual,First Owner,Hyundai,Creta


In [14]:
df['number_of_years'] = 2023 - df['year']
print('**********************************************************************************')
print('Data Types after Replacement:\n ', df.dtypes)
print(df.head().to_string())

**********************************************************************************
Data Types after Replacement:
  name               object
year                int64
selling_price       int64
km_driven           int64
fuel               object
seller_type        object
transmission       object
owner              object
manufacturer       object
model              object
number_of_years     int64
dtype: object
                       name  year  selling_price  km_driven    fuel seller_type transmission         owner manufacturer   model  number_of_years
0             Maruti 800 AC  2007          60000      70000  Petrol  Individual       Manual   First Owner       Maruti     800               16
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol  Individual       Manual   First Owner       Maruti   Wagon               16
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel  Individual       Manual   First Owner      Hyundai   Verna               11
3    

In [15]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,manufacturer,model,number_of_years
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner,Maruti,800,16
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner,Maruti,Wagon,16
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner,Hyundai,Verna,11
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner,Datsun,RediGO,6
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner,Honda,Amaze,9


In [16]:
df.drop(['year'],axis=1,inplace=True)

df.head()

Unnamed: 0,name,selling_price,km_driven,fuel,seller_type,transmission,owner,manufacturer,model,number_of_years
0,Maruti 800 AC,60000,70000,Petrol,Individual,Manual,First Owner,Maruti,800,16
1,Maruti Wagon R LXI Minor,135000,50000,Petrol,Individual,Manual,First Owner,Maruti,Wagon,16
2,Hyundai Verna 1.6 SX,600000,100000,Diesel,Individual,Manual,First Owner,Hyundai,Verna,11
3,Datsun RediGO T Option,250000,46000,Petrol,Individual,Manual,First Owner,Datsun,RediGO,6
4,Honda Amaze VX i-DTEC,450000,141000,Diesel,Individual,Manual,Second Owner,Honda,Amaze,9


In [17]:
df.dtypes

name               object
selling_price       int64
km_driven           int64
fuel               object
seller_type        object
transmission       object
owner              object
manufacturer       object
model              object
number_of_years     int64
dtype: object

In [18]:
df.drop('name',axis=1,inplace=True)

In [19]:
categorical_features = df.select_dtypes(include=['object', 'category'])

In [20]:
categorical_features=list(categorical_features.columns)

In [21]:
categorical_features

['fuel', 'seller_type', 'transmission', 'owner', 'manufacturer', 'model']

In [22]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [23]:
x=df.drop('selling_price',axis=1)
y=df['selling_price']
print(x.shape)
print(type(x))
print(y.shape)
print(type(y))

(4340, 8)
<class 'pandas.core.frame.DataFrame'>
(4340,)
<class 'pandas.core.series.Series'>


In [24]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
x_train

Unnamed: 0,km_driven,fuel,seller_type,transmission,owner,manufacturer,model,number_of_years
159,175000,Diesel,Individual,Manual,First Owner,Chevrolet,Enjoy,8
4334,170000,Diesel,Individual,Manual,First Owner,Toyota,Innova,11
1657,110000,Petrol,Individual,Manual,First Owner,Maruti,Esteem,17
2188,66521,Petrol,Trustmark Dealer,Manual,First Owner,Honda,City,8
581,41000,Petrol,Dealer,Manual,First Owner,Chevrolet,Beat,7
...,...,...,...,...,...,...,...,...
1033,58500,Diesel,Dealer,Automatic,First Owner,Audi,A6,10
3264,110000,Diesel,Individual,Manual,First Owner,Chevrolet,Beat,11
1653,22000,Petrol,Individual,Manual,First Owner,Datsun,RediGO,7
2607,70000,Diesel,Individual,Manual,First Owner,Volkswagen,Polo,8


In [25]:
# Create the column transformer for one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Create the pipeline with preprocessing and linear regression model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),  # Optionally, you can add a StandardScaler for feature scaling
    ('regressor', LinearRegression())
])

# Fit the pipeline on the training data
pipeline.fit(x_train, y_train)

score = pipeline.score(x_train, y_train)
print(score)

# Evaluate the model on the test data (if needed)
score = pipeline.score(x_test, y_test)
print(score)

# Make predictions using the pipeline
ypred = pipeline.predict(x_test)


0.85887327522653
0.7937663017611465


In [26]:
import pickle

# Assuming 'pipeline' is the object you want to save
with open(r"C:\Users\Sachin Kamath\Downloads\pipeline.pkl", 'wb') as file:
    pickle.dump(pipeline, file)


In [27]:
pip show scikit-learn

Name: scikit-learn
Version: 1.2.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: c:\users\sachin kamath\appdata\roaming\python\python39\site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: scikit-learn-intelex
Note: you may need to restart the kernel to use updated packages.


In [28]:
#pip install scikit-learn==1.2.2 --user