In [45]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [46]:
#Reading the Dataset
df = pd.read_csv('Resell_Cars_Data.csv')

In [47]:
# Drop unnecessary columns
df.drop(columns=['Ad ID','Car Name','Car documents','Assembly','Condition','Seller Location','Description','Car Features',"Images URL's",'Car Profile'],inplace=True)

In [48]:
df.shape

(9179, 10)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9179 entries, 0 to 9178
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Make               9179 non-null   object
 1   Model              9179 non-null   object
 2   Year               9179 non-null   int64 
 3   KM's driven        9179 non-null   int64 
 4   Price              9179 non-null   int64 
 5   Fuel               9179 non-null   object
 6   Registration city  9179 non-null   object
 7   Transmission       9179 non-null   object
 8   Age                9179 non-null   int64 
 9   Gender             9179 non-null   object
dtypes: int64(4), object(6)
memory usage: 717.2+ KB


In [50]:
# Clean column names
df.columns = df.columns.str.replace("'",'').str.replace(' ','_')
df.rename(columns={'KMs_driven':'Km_driven'},inplace=True)

In [51]:
# Convert datatypes and fill missing
df['Price'] = pd.to_numeric(df['Price'],errors='coerce').astype(int)
df['Km_driven'] = pd.to_numeric(df['Km_driven'],errors='coerce').astype(int)
df['Year'] = pd.to_numeric(df['Year'],errors='coerce').astype(int)
df['Age'] = pd.to_numeric(df['Age'],errors='coerce').astype(int)

df['Fuel'] = df['Fuel'].fillna('Unknown')
df['Transmission'] = df['Transmission'].fillna('Unknown')
df['Registration_city'] = df['Registration_city'].fillna('Unknown')
df['Gender'] = df['Gender'].fillna('Unknown')

df['Make'] = df['Make'].str.title()
df['Model'] = df['Model'].str.title()

In [52]:
# Check datatypes
df.dtypes

Unnamed: 0,0
Make,object
Model,object
Year,int64
Km_driven,int64
Price,int64
Fuel,object
Registration_city,object
Transmission,object
Age,int64
Gender,object


Checking all the unique values to confirm no anomalies

In [53]:
df['Make'].unique()

array(['Toyota', 'Suzuki', 'Daihatsu', 'Honda', 'Hyundai', 'Mitsubishi',
       'Kia', 'Changan', 'Faw', 'Mercedes', 'Chevrolet'], dtype=object)

In [54]:
df['Model'].unique()

array(['Passo', 'Ravi', 'Bolan', 'Move', 'Swift', 'Wagon R', 'Mira',
       'City Idsi', 'Cultus Vxr', 'Every', 'Corolla Xli', 'Baleno',
       'City Vario', 'Terios Kid', 'Civic Prosmetic', 'City Ivtec',
       'Santro', 'Corrolla Altis', 'City Aspire', 'Mehran Vxr', 'Cuore',
       'Pajero Mini', 'Picanto', 'Alto', 'Karvaan', 'X-Pv', 'Hijet',
       'Ek Wagon', 'Altis Grande', 'E Class', 'Yaris', 'Lancer',
       'Corolla Gli', 'Joy', 'Civic Oriel', 'Classic', 'Spectra', 'V2',
       'Minicab Bravo', 'Corolla Assista', 'Alsvin', 'C Class', 'Minica',
       'Exclusive', 'Civic Vti', 'Cervo', 'Sportage', 'Corolla Axio',
       'Every Wagon', 'Liana', 'Civic Exi', 'Surf', 'Mehran Vx',
       'Civic Vti Oriel', 'Khyber', 'Cultus Vxl', 'Prius', 'Isis'],
      dtype=object)

In [55]:
df['Fuel'].unique()

array(['Petrol', 'CNG', 'Hybrid', 'Diesel'], dtype=object)

In [56]:
df['Registration_city'].unique()

array(['Unregistered', 'Karachi', 'Lahore', 'Sindh', 'Faisalabad',
       'Islamabad', 'Sialkot', 'Gujranwala', 'Punjab', 'Multan',
       'Rawalpindi', 'Bahawalpur', 'Hyderabad', 'Rahimyar Khan',
       'Chishtian Mandi', 'Chiniot', 'Wazirabad', 'Sahiwal', 'Gujrat',
       'Abbottabad', 'Khanewal', 'Peshawar', 'Sargodha', 'Bannu',
       'Nowshera', 'Kasur', 'Arifwala', 'Jhang Sadar', 'Dera Ghazi Khan',
       'Mardan', 'Attock', 'Larkana', 'Unknown', 'Jhelum', 'Khanpur',
       'Wah', 'Okara', 'Bhakkar', 'Sheikhüpura', 'Swat',
       'Dera Ismail Khan', 'Khushab', 'Bhimber', 'Mansehra', 'Mirpur',
       'Quetta', 'Vehari', 'Hafizabad', 'Charsadda', 'Toba Tek singh',
       'Jaranwala', 'Pirmahal', 'Haripur', 'Mirpur Khas', 'Bahawalnagar',
       'Swabi', 'Dadu', 'Mianwali', 'Sukkar', 'Badin', 'Nawabshah'],
      dtype=object)

The unregistered cities can be considered as null values.

In [57]:
df['Transmission'].unique()

array(['Automatic', 'Manual'], dtype=object)

In [58]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [59]:
df.isnull().sum().sum()

np.int64(0)

In [60]:
df.dropna(inplace=True)
df.isnull().sum().sum()

np.int64(0)

In [61]:
# Removing the unregistered and unknown cities since they can be considered null values
cities_to_remove = df[(df['Registration_city'] == 'Unregistered') | (df['Registration_city'] == 'Unknown')].index
df.drop(cities_to_remove,inplace=True)

In [62]:
df['Registration_city'].unique()

array(['Karachi', 'Lahore', 'Sindh', 'Faisalabad', 'Islamabad', 'Sialkot',
       'Gujranwala', 'Punjab', 'Multan', 'Rawalpindi', 'Bahawalpur',
       'Hyderabad', 'Rahimyar Khan', 'Chishtian Mandi', 'Chiniot',
       'Wazirabad', 'Sahiwal', 'Gujrat', 'Abbottabad', 'Khanewal',
       'Peshawar', 'Sargodha', 'Bannu', 'Nowshera', 'Kasur', 'Arifwala',
       'Jhang Sadar', 'Dera Ghazi Khan', 'Mardan', 'Attock', 'Larkana',
       'Jhelum', 'Khanpur', 'Wah', 'Okara', 'Bhakkar', 'Sheikhüpura',
       'Swat', 'Dera Ismail Khan', 'Khushab', 'Bhimber', 'Mansehra',
       'Mirpur', 'Quetta', 'Vehari', 'Hafizabad', 'Charsadda',
       'Toba Tek singh', 'Jaranwala', 'Pirmahal', 'Haripur',
       'Mirpur Khas', 'Bahawalnagar', 'Swabi', 'Dadu', 'Mianwali',
       'Sukkar', 'Badin', 'Nawabshah'], dtype=object)

In [63]:
df.duplicated().sum()

np.int64(2)

In [64]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

np.int64(0)

In [65]:
df.describe()

Unnamed: 0,Year,Km_driven,Price,Age
count,9064.0,9064.0,9064.0,9064.0
mean,2012.179281,97125.149162,2021968.0,42.998676
std,6.017882,61928.266092,1154464.0,14.727178
min,1989.0,1.0,185000.0,18.0
25%,2007.0,55000.0,1015000.0,30.0
50%,2013.0,92437.5,1800000.0,43.0
75%,2017.0,126000.0,2725000.0,56.0
max,2024.0,533528.0,5000000.0,68.0


In [66]:
df.shape

(9064, 10)

In [67]:
# Use df_cleaned from now on
df_cleaned = df.copy()
df_cleaned.reset_index(drop=True , inplace=True)
df_cleaned.to_csv('Cleaned_Data.csv',index=False)

## Encoding the Data

### Label Encoding some columns to maintain anonymity

In [68]:
le = LabelEncoder()

def label_encode(columns):
  for col in columns:
    df_cleaned[col + '_encoded'] = le.fit_transform(df_cleaned[col].astype(str)).astype(int)
    print(f'Mapping for : {col}')
    encoding_map = pd.DataFrame({
        'Original Name' : le.classes_ ,
        'Encoded Number' : np.arange(len(le.classes_))
        })
    print(encoding_map.to_string(index=False))
    print('\n')

In [69]:
label_encode(['Make','Model','Registration_city','Gender'])

Mapping for : Make
Original Name  Encoded Number
      Changan               0
    Chevrolet               1
     Daihatsu               2
          Faw               3
        Honda               4
      Hyundai               5
          Kia               6
     Mercedes               7
   Mitsubishi               8
       Suzuki               9
       Toyota              10


Mapping for : Model
  Original Name  Encoded Number
         Alsvin               0
   Altis Grande               1
           Alto               2
         Baleno               3
          Bolan               4
        C Class               5
          Cervo               6
    City Aspire               7
      City Idsi               8
     City Ivtec               9
     City Vario              10
      Civic Exi              11
    Civic Oriel              12
Civic Prosmetic              13
      Civic Vti              14
Civic Vti Oriel              15
        Classic              16
Corolla Assista        

In [70]:
df_cleaned["Age_group"] = pd.cut(df_cleaned['Age'],
                                 bins = [0,18,35,50,65,80,120],
                                 labels = ['(0-18)', '(19-35)', '(36-50)', '(51-65)', '(66-80)', '(80+)'],
                                 right=True,
                                 include_lowest=True)

In [71]:
df_cleaned.drop(['Make','Model','Registration_city','Gender','Age'],axis=1,inplace=True)

In [72]:
df_cleaned

Unnamed: 0,Year,Km_driven,Price,Fuel,Transmission,Make_encoded,Model_encoded,Registration_city_encoded,Gender_encoded,Age_group
0,2018,95000,1300000,Petrol,Manual,9,47,25,1,(51-65)
1,2015,50000,800000,Petrol,Manual,9,4,25,0,(19-35)
2,2013,94000,2155000,Petrol,Automatic,2,42,30,0,(51-65)
3,2011,126544,1440000,Petrol,Manual,9,52,25,1,(51-65)
4,2020,54000,2830000,Petrol,Automatic,9,55,30,1,(36-50)
...,...,...,...,...,...,...,...,...,...,...
9059,2014,115000,2725000,Petrol,Automatic,9,55,21,1,(19-35)
9060,2017,100000,1850000,Petrol,Manual,9,55,30,1,(19-35)
9061,2001,123456,1100000,Petrol,Automatic,6,49,30,1,(19-35)
9062,2009,89000,970000,Petrol,Manual,9,2,30,0,(19-35)


### One Hot Encoding the rest of the columns

In [73]:
#One Hot Encoding the Fuel, Transmission, Age_group

df_encoded = pd.get_dummies(df_cleaned,columns=['Fuel','Transmission','Age_group'], dtype=int)

In [74]:
df_encoded.dtypes

Unnamed: 0,0
Year,int64
Km_driven,int64
Price,int64
Make_encoded,int64
Model_encoded,int64
Registration_city_encoded,int64
Gender_encoded,int64
Fuel_CNG,int64
Fuel_Diesel,int64
Fuel_Hybrid,int64


In [75]:
df_encoded

Unnamed: 0,Year,Km_driven,Price,Make_encoded,Model_encoded,Registration_city_encoded,Gender_encoded,Fuel_CNG,Fuel_Diesel,Fuel_Hybrid,Fuel_Petrol,Transmission_Automatic,Transmission_Manual,Age_group_(0-18),Age_group_(19-35),Age_group_(36-50),Age_group_(51-65),Age_group_(66-80),Age_group_(80+)
0,2018,95000,1300000,9,47,25,1,0,0,0,1,0,1,0,0,0,1,0,0
1,2015,50000,800000,9,4,25,0,0,0,0,1,0,1,0,1,0,0,0,0
2,2013,94000,2155000,2,42,30,0,0,0,0,1,1,0,0,0,0,1,0,0
3,2011,126544,1440000,9,52,25,1,0,0,0,1,0,1,0,0,0,1,0,0
4,2020,54000,2830000,9,55,30,1,0,0,0,1,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9059,2014,115000,2725000,9,55,21,1,0,0,0,1,1,0,0,1,0,0,0,0
9060,2017,100000,1850000,9,55,30,1,0,0,0,1,0,1,0,1,0,0,0,0
9061,2001,123456,1100000,6,49,30,1,0,0,0,1,1,0,0,1,0,0,0,0
9062,2009,89000,970000,9,2,30,0,0,0,0,1,0,1,0,1,0,0,0,0


### Standardising the data

In [76]:
df_scaled = df_encoded.copy()

In [77]:
df_scaled.dtypes

Unnamed: 0,0
Year,int64
Km_driven,int64
Price,int64
Make_encoded,int64
Model_encoded,int64
Registration_city_encoded,int64
Gender_encoded,int64
Fuel_CNG,int64
Fuel_Diesel,int64
Fuel_Hybrid,int64


In [78]:
numerical_col = ['Year','Km_driven']
categorical_col = ['Age_group']
age_group_backup = df_cleaned[['Age_group']].copy()
age_group_backup.to_csv('Age_Group_Backup.csv', index=False)
df_cleaned.drop('Age_group',axis=1,inplace=True)

In [79]:
# Spliting the data for training and testing
x = pd.DataFrame(df_scaled.drop('Price',axis=1).copy())
y = pd.DataFrame(df_scaled['Price'].copy())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=6969)

In [80]:
scaler = StandardScaler()

In [81]:
x_train[numerical_col] = scaler.fit_transform(x_train[numerical_col])
x_test[numerical_col] = scaler.transform(x_test[numerical_col])

y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)

In [82]:
linear_regression_model = LinearRegression()
linear_regression_model.fit(x_train, y_train)
LinearRegression_pred = linear_regression_model.predict(x_test)

LinearRegression_MSE = mean_squared_error(y_test, LinearRegression_pred)
LinearRegression_RSME = np.sqrt(LinearRegression_MSE)
LinearRegression_MAE = mean_absolute_error(y_test, LinearRegression_pred)
LinearRegression_R2 = r2_score(y_test, LinearRegression_pred)

print(f"RMSE: {LinearRegression_RSME:,.2f}")
print(f"MAE: {LinearRegression_MAE:,.2f}")
print(f"R2: {LinearRegression_R2:,.2f}")
print(f"MSE: {LinearRegression_MSE:,.2f}")

RMSE: 0.63
MAE: 0.51
R2: 0.62
MSE: 0.40


In [83]:
Random_Forest_Regressor_Model = RandomForestRegressor(n_estimators=100, random_state=6969, n_jobs=-1)
Random_Forest_Regressor_Model.fit(x_train, y_train)
Random_Forest_Regressor_Model_pred = Random_Forest_Regressor_Model.predict(x_test)

Random_Forest_Regressor_Model_MSE = mean_squared_error(y_test, Random_Forest_Regressor_Model_pred)
Random_Forest_Regressor_Model_RSME = np.sqrt(Random_Forest_Regressor_Model_MSE)
Random_Forest_Regressor_Model_MAE = mean_absolute_error(y_test, Random_Forest_Regressor_Model_pred)
Random_Forest_Regressor_Model_R2 = r2_score(y_test, Random_Forest_Regressor_Model_pred)

print(f"MSE: {Random_Forest_Regressor_Model_MSE:,.2f}")
print(f"RMSE: {Random_Forest_Regressor_Model_RSME:,.2f}")
print(f"MAE: {Random_Forest_Regressor_Model_MAE:,.2f}")
print(f"R2: {Random_Forest_Regressor_Model_R2:.2f}")

  return fit_method(estimator, *args, **kwargs)


MSE: 0.05
RMSE: 0.22
MAE: 0.15
R2: 0.95


In [85]:
GradientBoostingRegressor_Model = GradientBoostingRegressor(n_estimators=100, random_state=6969)
GradientBoostingRegressor_Model.fit(x_train, y_train)
GradientBoostingRegressor_Model_pred = GradientBoostingRegressor_Model.predict(x_test)

GradientBoostingRegressor_Model_MSE = mean_squared_error(y_test, GradientBoostingRegressor_Model_pred)
GradientBoostingRegressor_Model_RSME = np.sqrt(GradientBoostingRegressor_Model_MSE)
GradientBoostingRegressor_Model_MAE = mean_absolute_error(y_test, GradientBoostingRegressor_Model_pred)
GradientBoostingRegressor_Model_R2 = r2_score(y_test, GradientBoostingRegressor_Model_pred)
print(f"MSE: {GradientBoostingRegressor_Model_MSE:,.2f}")
print(f"RMSE: {GradientBoostingRegressor_Model_RSME:,.2f}")
print(f"MAE: {GradientBoostingRegressor_Model_MAE:,.2f}")
print(f"R2: {GradientBoostingRegressor_Model_R2:,.2f}")


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


MSE: 0.07
RMSE: 0.26
MAE: 0.18
R2: 0.94
