# Used Cars Data Analysis

## Importing Libraries and the Dataset

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, classification_report, confusion_matrix, roc_curve, auc
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import (
    ColumnDataSource, HoverTool, DataTable, TableColumn,
    Select, RangeSlider, Tabs, Panel, ColorBar, LinearColorMapper
)
from bokeh.layouts import row, column
from bokeh.transform import factor_cmap

output_notebook()

In [2]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv("Cars_data.csv")

In [3]:
# Display the Dataset information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9179 entries, 0 to 9178
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Ad ID              9179 non-null   int64 
 1   Car Name           9179 non-null   object
 2   Make               9179 non-null   object
 3   Model              9179 non-null   object
 4   Year               9179 non-null   int64 
 5   KM's driven        9179 non-null   int64 
 6   Price              9179 non-null   int64 
 7   Fuel               9179 non-null   object
 8   Registration city  9179 non-null   object
 9   Car documents      9179 non-null   object
 10  Assembly           9179 non-null   object
 11  Transmission       9179 non-null   object
 12  Condition          9179 non-null   object
 13  Seller Location    9179 non-null   object
 14  Description        9179 non-null   object
 15  Car Features       9179 non-null   object
 16  Images URL's       9179 non-null   object


In [4]:
# Display the Dataset shape
df.shape

(9179, 18)

In [5]:
# Display the Dataset data types
df.dtypes

Ad ID                 int64
Car Name             object
Make                 object
Model                object
Year                  int64
KM's driven           int64
Price                 int64
Fuel                 object
Registration city    object
Car documents        object
Assembly             object
Transmission         object
Condition            object
Seller Location      object
Description          object
Car Features         object
Images URL's         object
Car Profile          object
dtype: object

In [6]:
# Display the first 5 rows
df.head()

Unnamed: 0,Ad ID,Car Name,Make,Model,Year,KM's driven,Price,Fuel,Registration city,Car documents,Assembly,Transmission,Condition,Seller Location,Description,Car Features,Images URL's,Car Profile
0,1079071571,fresh import Passo 2021model,Toyota,Passo,2021,54000,4190000,Petrol,Unregistered,Original,Imported,Automatic,Used,"Airline Avenue, Islamabad","it's 2021 model fresh import, perfect engine s...","ABS, Air Bags, AM/FM Radio, CD Player, Cassett...",['https://images.olx.com.pk/thumbnails/4039460...,https://www.olx.com.pk/item/fresh-import-passo...
1,1080125520,Suzuki ravi,Suzuki,Ravi,2018,95000,1300000,Petrol,Karachi,Original,Local,Manual,Used,"Kahuta, Rawalpindi",Suzuki ravi 2018 col,AM/FM Radio,['https://images.olx.com.pk/thumbnails/4102504...,https://www.olx.com.pk/item/suzuki-ravi-iid-10...
2,1080748789,Suzuki bolan 2015 contact 03112271054,Suzuki,Bolan,2015,50000,800000,Petrol,Karachi,Original,Local,Manual,Used,"Lyari Expressway, Karachi",Suzuki bolan model 2015 reg 2022 ghadi me koi ...,Rear speakers,['https://images.olx.com.pk/thumbnails/4139520...,https://www.olx.com.pk/item/suzuki-bolan-2015-...
3,1076081635,Diahatsu Move 2013,Daihatsu,Move,2013,94000,2155000,Petrol,Lahore,Original,Imported,Automatic,Used,"New Amir Town, Lahore",Move Push start \r\nHome Used car. \r\nModel 2...,"ABS, Air Bags, Air Conditioning, Alloy Rims, A...",['https://images.olx.com.pk/thumbnails/3865337...,https://www.olx.com.pk/item/diahatsu-move-2013...
4,1080812928,Suzuki Swift DLX 2011 miner tuchap,Suzuki,Swift,2011,126544,1440000,Petrol,Karachi,Original,Local,Manual,Used,"Shadman 2, Karachi",Suzuki Swift DLX\r\nAbS Break 100%\r\nengine 1...,"ABS, Air Conditioning, Alloy Rims, AM/FM Radio...",['https://images.olx.com.pk/thumbnails/4143509...,https://www.olx.com.pk/item/suzuki-swift-dlx-2...


## Cleaning Data with Pipeline

In [7]:
# Create the data cleaning pipeline
def cleaning_pipeline(df_in):
    print("Running Car Data Cleaning Pipeline:")
    
    # Create a copy to avoid modifying original
    df = df_in.copy()
    
    # Add synthetic features for age and gender
    df['Age'] = np.random.randint(18, 69, size=len(df))
    df['Gender'] = np.random.choice(['Male', 'Female'], size=len(df))
    
    # Drop unused columns
    columns_to_drop = ['Ad ID','Car Name','Car documents','Assembly','Condition','Seller Location',
                        'Description','Car Features',"Images URL's",'Car Profile']
    df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True)
    
    # Clean and rename columns
    df.columns = df.columns.str.replace("'",'').str.replace(' ','_')
    df.rename(columns={'KMs_driven':'Km_driven'}, inplace=True)
    
    # Convert data types
    for col in ['Price', 'Km_driven', 'Year']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')
    
    # Clean string features
    if 'Make' in df.columns:
        df['Make'] = df['Make'].astype(str).str.title()
    if 'Model' in df.columns:
        df['Model'] = df['Model'].astype(str).str.title()
    
    # Create Age Group feature
    if 'Age' in df.columns:
        bins = [0,18,35,50,65,80,120]
        labels = ['(0-18)', '(19-35)', '(36-50)', '(51-65)', '(66-80)', '(80+)']
        df["Age_group"] = pd.cut(df['Age'], bins, labels=labels, right=True, include_lowest=True)
    
    # Remove unwanted registration cities
    if 'Registration_city' in df.columns:
        mask = (df['Registration_city'] == 'Unregistered') | (df['Registration_city'] == 'Unknown')
        df = df[~mask]
    
    # Final cleanup
    initial_count = len(df)
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True)
    final_count = len(df)
    print(f"Removed {initial_count - final_count} rows with missing values or duplicates")
    
    print("Data cleaning done")
    return df

In [8]:
# Run the data cleaning pipeline
cleaned_df = cleaning_pipeline(df)

Running Car Data Cleaning Pipeline:
Removed 1 rows with missing values or duplicates
Data cleaning done


In [9]:
# Print cleaned data
print(f"\nCleaned data shape: {cleaned_df.shape}")
print("\nCleaned data info:")
cleaned_df.info()


Cleaned data shape: (9065, 11)

Cleaned data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9065 entries, 0 to 9064
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Make               9065 non-null   object  
 1   Model              9065 non-null   object  
 2   Year               9065 non-null   int16   
 3   Km_driven          9065 non-null   int32   
 4   Price              9065 non-null   int32   
 5   Fuel               9065 non-null   object  
 6   Registration_city  9065 non-null   object  
 7   Transmission       9065 non-null   object  
 8   Age                9065 non-null   int32   
 9   Gender             9065 non-null   object  
 10  Age_group          9065 non-null   category
dtypes: category(1), int16(1), int32(3), object(6)
memory usage: 558.1+ KB


In [10]:
# Print missing values
print("\nMissing values after cleaning:")
print(cleaned_df.isna().sum())


Missing values after cleaning:
Make                 0
Model                0
Year                 0
Km_driven            0
Price                0
Fuel                 0
Registration_city    0
Transmission         0
Age                  0
Gender               0
Age_group            0
dtype: int64


In [11]:
# Print first 5 rows
print("First 5 rows of cleaned data:")
print(cleaned_df.head())

First 5 rows of cleaned data:
       Make    Model  Year  Km_driven    Price    Fuel Registration_city  \
0    Suzuki     Ravi  2018      95000  1300000  Petrol           Karachi   
1    Suzuki    Bolan  2015      50000   800000  Petrol           Karachi   
2  Daihatsu     Move  2013      94000  2155000  Petrol            Lahore   
3    Suzuki    Swift  2011     126544  1440000  Petrol           Karachi   
4    Suzuki  Wagon R  2020      54000  2830000  Petrol            Lahore   

  Transmission  Age  Gender Age_group  
0       Manual   45  Female   (36-50)  
1       Manual   68    Male   (66-80)  
2    Automatic   40    Male   (36-50)  
3       Manual   54  Female   (51-65)  
4    Automatic   54  Female   (51-65)  


In [12]:
# Saving the cleaned data
cleaned_df.to_csv('cleaned_Cars_data.csv', index=False)
print("Cleaned data saved to 'cleaned_Cars_data.csv'")

Cleaned data saved to 'cleaned_Cars_data.csv'


## Data Preprocessing with Pipeline

In [13]:
def preprocessing_pipeline():
    print("Running Scikit-learn Preprocessing Pipeline:")
    
    # Define feature lists
    numerical_features = ['Year', 'Km_driven']
    categorical_features = ['Make', 'Model', 'Fuel', 'Transmission', 'Registration_city', 'Gender', 'Age_group']
    
    # Preprocessing for numerical features
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )
    
    # Preprocessing for categorical features
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]
    )
    
    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ("numerical", numeric_transformer, numerical_features),
            ("categorical", categorical_transformer, categorical_features),
        ],
        remainder="drop"
    )
    
    print("Preprocessing pipeline done")
    return preprocessor

In [14]:
# Run the preprocessing pipeline
preprocessing_pipeline = preprocessing_pipeline()

# Prepare data for preprocessing by dropping the target variable
features_df = cleaned_df.drop(columns=['Price'], errors='ignore')

Running Scikit-learn Preprocessing Pipeline:
Preprocessing pipeline done


In [15]:
# Print features shape
print(f"\nFeatures shape for preprocessing: {features_df.shape}")
print("Features columns:", list(features_df.columns))


Features shape for preprocessing: (9065, 10)
Features columns: ['Make', 'Model', 'Year', 'Km_driven', 'Fuel', 'Registration_city', 'Transmission', 'Age', 'Gender', 'Age_group']


In [16]:
# Fit and transform the data using the pipeline
transformed_data = preprocessing_pipeline.fit_transform(features_df)
print(f"Transformed data shape: {transformed_data.shape}")

Transformed data shape: (9065, 143)


In [17]:
def processed_data_to_csv(transformed_array, preprocessor, target_series, filename='processed_Cars_data.csv'):
    print(f"Saving processed data to '{filename}'...")   
    # Get feature names from the preprocessor
    feature_names = []
    
    # Add numerical features (already scaled)
    numerical_features = ['Year', 'Km_driven']
    feature_names.extend(numerical_features)
    
    # Get one-hot encoded categorical feature names
    categorical_transformer = preprocessor.named_transformers_['categorical']
    onehot_encoder = categorical_transformer.named_steps['onehot']
    categorical_cols = ['Make', 'Model', 'Fuel', 'Transmission', 'Registration_city', 'Gender', 'Age_group']
    categorical_feature_names = onehot_encoder.get_feature_names_out(categorical_cols)
    feature_names.extend(categorical_feature_names)
    
    # Create DataFrame from the transformed array
    processed_df = pd.DataFrame(transformed_array, columns=feature_names)
    
    # Add the Price column back
    processed_df['Price'] = target_series.values
    
    # Save to CSV
    processed_df.to_csv(filename, index=False)
    print(f"Processed data saved to '{filename}'")
  
    return processed_df

In [18]:
# Run the data to csv pipeline
processed_data = processed_data_to_csv(
    transformed_array=transformed_data, 
    preprocessor=preprocessing_pipeline, 
    target_series=cleaned_df['Price'],
    filename='processed_Cars_data.csv'
)

# Display first few rows of processed data
print("\nFirst 5 rows of processed data:")
print(processed_data.head())

Saving processed data to 'processed_Cars_data.csv'...
Processed data saved to 'processed_Cars_data.csv'

First 5 rows of processed data:
       Year  Km_driven  Make_Changan  Make_Chevrolet  Make_Daihatsu  Make_Faw  \
0  0.967672  -0.034485           0.0             0.0            0.0       0.0   
1  0.469115  -0.761250           0.0             0.0            0.0       0.0   
2  0.136743  -0.050635           0.0             0.0            1.0       0.0   
3 -0.195628   0.474962           0.0             0.0            0.0       0.0   
4  1.300044  -0.696649           0.0             0.0            0.0       0.0   

   Make_Honda  Make_Hyundai  Make_Kia  Make_Mercedes  ...  \
0         0.0           0.0       0.0            0.0  ...   
1         0.0           0.0       0.0            0.0  ...   
2         0.0           0.0       0.0            0.0  ...   
3         0.0           0.0       0.0            0.0  ...   
4         0.0           0.0       0.0            0.0  ...   

   Regist

## Model Training

In [19]:
# Spliting the data for training and testing
x = pd.DataFrame(processed_data.drop('Price',axis=1).copy())
y = pd.DataFrame(processed_data['Price'].copy())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=6969)

In [20]:
scaler = StandardScaler()
numerical_col = ['Year','Km_driven']
categorical_col = ['Age_group']

In [21]:
x_train[numerical_col] = scaler.fit_transform(x_train[numerical_col])
x_test[numerical_col] = scaler.transform(x_test[numerical_col])

y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)

In [22]:
linear_regression_model = LinearRegression()
linear_regression_model.fit(x_train, y_train.ravel())
LinearRegression_pred = linear_regression_model.predict(x_test)

LinearRegression_MSE = mean_squared_error(y_test, LinearRegression_pred)
LinearRegression_RSME = np.sqrt(LinearRegression_MSE)
LinearRegression_MAE = mean_absolute_error(y_test, LinearRegression_pred)
LinearRegression_R2 = r2_score(y_test, LinearRegression_pred)

print(f"RMSE: {LinearRegression_RSME:,.2f}")
print(f"MAE: {LinearRegression_MAE:,.2f}")
print(f"R2: {LinearRegression_R2:,.2f}")
print(f"MSE: {LinearRegression_MSE:,.2f}")

RMSE: 0.30
MAE: 0.22
R2: 0.91
MSE: 0.09


In [23]:
Random_Forest_Regressor_Model = RandomForestRegressor(n_estimators=100, random_state=6969, n_jobs=-1)
Random_Forest_Regressor_Model.fit(x_train, y_train.ravel())
Random_Forest_Regressor_Model_pred = Random_Forest_Regressor_Model.predict(x_test)

Random_Forest_Regressor_Model_MSE = mean_squared_error(y_test, Random_Forest_Regressor_Model_pred)
Random_Forest_Regressor_Model_RSME = np.sqrt(Random_Forest_Regressor_Model_MSE)
Random_Forest_Regressor_Model_MAE = mean_absolute_error(y_test, Random_Forest_Regressor_Model_pred)
Random_Forest_Regressor_Model_R2 = r2_score(y_test, Random_Forest_Regressor_Model_pred)

print(f"MSE: {Random_Forest_Regressor_Model_MSE:,.2f}")
print(f"RMSE: {Random_Forest_Regressor_Model_RSME:,.2f}")
print(f"MAE: {Random_Forest_Regressor_Model_MAE:,.2f}")
print(f"R2: {Random_Forest_Regressor_Model_R2:.2f}")

MSE: 0.05
RMSE: 0.22
MAE: 0.15
R2: 0.95


In [24]:
GradientBoostingRegressor_Model = GradientBoostingRegressor(n_estimators=100, random_state=6969)
GradientBoostingRegressor_Model.fit(x_train, y_train.ravel())
GradientBoostingRegressor_Model_pred = GradientBoostingRegressor_Model.predict(x_test)

GradientBoostingRegressor_Model_MSE = mean_squared_error(y_test, GradientBoostingRegressor_Model_pred)
GradientBoostingRegressor_Model_RSME = np.sqrt(GradientBoostingRegressor_Model_MSE)
GradientBoostingRegressor_Model_MAE = mean_absolute_error(y_test, GradientBoostingRegressor_Model_pred)
GradientBoostingRegressor_Model_R2 = r2_score(y_test, GradientBoostingRegressor_Model_pred)
print(f"MSE: {GradientBoostingRegressor_Model_MSE:,.2f}")
print(f"RMSE: {GradientBoostingRegressor_Model_RSME:,.2f}")
print(f"MAE: {GradientBoostingRegressor_Model_MAE:,.2f}")
print(f"R2: {GradientBoostingRegressor_Model_R2:,.2f}")


MSE: 0.06
RMSE: 0.25
MAE: 0.18
R2: 0.94
