In [None]:
# Importing libraries for data analysis
import pandas as  pd
import chardet
import re
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Dataset source: https://www.kaggle.com/datasets/abdulmalik1518/cars-datasets-2025/data
# Importing dataset csv file
# Sniff the text encoding from the file's raw bytes, then parse the CSV with it
csv_path = "Cars Datasets 2025.csv"
with open(csv_path, 'rb') as raw_file:
    detection = chardet.detect(raw_file.read())
encoding = detection['encoding']
data = pd.read_csv(csv_path, encoding=encoding)

# Data Preprocessing

In [None]:
#Cleaning the data
def data_cleaning(a):
    """Reduce a raw cell value to a single float, or None if it holds no number.

    Commas and dollar signs are stripped out, every unsigned numeric token
    left in the text is parsed as a float, and the arithmetic mean of those
    tokens is returned (so a range such as "70-85" becomes 77.5).

    Returns None when the value is missing according to ``pd.isna`` or when
    the text contains no digits at all.
    """
    if pd.isna(a):
        return None

    text = str(a)
    for symbol in (",", "$"):
        text = text.replace(symbol, "")
    text = text.strip()

    # One entry per number found; a hyphen between numbers is treated as a separator
    values = list(map(float, re.findall(r"\d+\.?\d*", text)))
    return sum(values) / len(values) if values else None

# Columns whose raw text mixes numbers with units, separators or ranges
columns = [
    "CC/Battery Capacity",
    "HorsePower",
    "Total Speed",
    "Performance(0 - 100 )KM/H",
    "Cars Prices",
    "Torque",
]
for column in columns:
    cleaned = data[column].apply(data_cleaning)
    data[column] = cleaned

# Seat counts: anything non-numeric is coerced to missing in a nullable integer column
seats = pd.to_numeric(data["Seats"], errors="coerce")
data["Seats"] = seats.astype("Int64")

# Text columns: trim surrounding whitespace, then apply one consistent casing per column
text_casing = {
    "Company Names": "upper",
    "Cars Names": "title",
    "Fuel Types": "lower",
    "Engines": "upper",
}
for text_column, casing in text_casing.items():
    trimmed = data[text_column].str.strip()
    data[text_column] = getattr(trimmed.str, casing)()

In [None]:
# Displaying first 5 rows of the dataset
data.head()

In [None]:
# Displaying the last 5 rows of the dataset
data.tail()

In [None]:
# Displaying the shape of the dataset (Rows, Columns)
data.shape

In [None]:
# Describing the dataset
data.describe()

In [None]:
# Displaying the data types
data.dtypes

In [None]:
# Displaying the number of duplicate rows
data.duplicated().sum()

In [None]:
# Displaying the duplicate rows if found
data[data.duplicated(keep=False)]

In [None]:
# Handling duplicate data by dropping the rows with duplicate values if found.
# inplace=True mutates `data` itself, so any output of `data` displayed by an
# earlier cell is stale once this cell has run.
data.drop_duplicates(inplace=True)
# Re-counting the duplicates to confirm that none remain
data.duplicated().sum()

In [None]:
# Displaying the number of missing values
data.isnull().sum()

In [None]:
# Displaying the rows with missing values if found
data[data.isnull().any(axis=1)]

In [None]:
# Handling missing data by dropping the rows with missing values if found.
# This also removes rows where the cleaning step produced a missing value
# (data_cleaning returned None, or "Seats" could not be parsed as a number).
# inplace=True mutates `data` itself.
data.dropna(inplace=True)
# Re-counting the missing values per column to confirm that none remain
data.isnull().sum()

# Exploratory Data Analysis (EDA)

In [None]:
# Displaying HorsePower descriptive statistics
data.HorsePower.describe()

In [None]:
# Displaying HorsePower and the count of cars distribution.
# sns.distplot is deprecated in seaborn; sns.histplot with kde=True is its
# replacement. histplot's y-axis is the number of cars per bin, which is what
# the 'Count of Cars' label describes (distplot drew a density instead).
fig, ax = plt.subplots()
sns.histplot(data['HorsePower'], kde=True, ax=ax)
ax.set_title('Distribution of HorsePower')
ax.set_xlabel('HorsePower')
ax.set_ylabel('Count of Cars')
plt.tight_layout()
plt.show()

# Measure of distortion of symmetric distribution.
# Note: pandas' kurt() reports excess (Fisher) kurtosis, for which a normal
# distribution scores 0 rather than 3.
print("Skewness: %f" % data['HorsePower'].skew())
print("Kurtosis: %f" % data['HorsePower'].kurt())

Explanation -
The distribution of HorsePower is skewed to the right, with a positive skewness of 0.46. pandas' `kurt()` reports excess kurtosis (a normal distribution scores 0), so the value of 3.64 indicates a sharper peak and heavier tails than a normal distribution.

In [None]:
# Displaying the correlation matrix of CC/Battery Capacity, HorsePower, Total Speed,
# Performance(0 - 100 )KM/H and Torque as an annotated heatmap
factors = [
    'CC/Battery Capacity',
    'HorsePower',
    'Total Speed',
    'Performance(0 - 100 )KM/H',
    'Torque',
]
data_clean = data[factors].copy()
corrmat = data_clean.corr()

# Fixed colour limits of -1..1 keep the diverging palette centred on zero correlation
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmin=-1, vmax=1, annot=True, cmap='BrBG', ax=ax)
ax.set_title('Performance Metrics Correlation Matrix\n')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
fig.tight_layout()
plt.show()

Explanation -
The correlation matrix shows a strong positive correlation between HorsePower and Performance (0 - 100 km/h), and a weak positive correlation between HorsePower and Total Speed.

In [None]:
# Scatterplot of Total Speed against HorsePower to show how the two relate
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='HorsePower', y='Total Speed', color='r', ax=ax)
ax.set_title('Relationship between HorsePower and Total Speed', size=18)
ax.set_xlabel('HorsePower', size=14)
ax.set_ylabel('Total Speed', size=14)
plt.show()

Explanation -
The scatterplot shows a strong positive relationship between HorsePower and Total Speed.

In [None]:
# Scaling data to Min-Max range [0, 1] for Histplot
numeric_columns = [
    'HorsePower',
    'CC/Battery Capacity',
    'Total Speed',
    'Performance(0 - 100 )KM/H',
    'Torque',
    'Cars Prices',
]

# Scale into a copy so the unscaled values in `data` stay available for comparison
minmax_scaler = MinMaxScaler()
data_minmax = data.copy()
scaled_values = minmax_scaler.fit_transform(data[numeric_columns])
data_minmax[numeric_columns] = scaled_values

print("\nMinMax Scaling Results:")
print("Ranges after scaling:")
for col in numeric_columns:
    low, high = data_minmax[col].min(), data_minmax[col].max()
    print(f"{col}: {low:.3f} to {high:.3f}")

# Compare distributions before and after scaling: original on top, scaled below
fig, axes = plt.subplots(2, 1, figsize=(12, 8))
panels = [
    (data, 'Original CC/Battery Capacity'),
    (data_minmax, 'MinMaxScaler CC/Battery Capacity'),
]
for ax, (frame, title) in zip(axes, panels):
    sns.histplot(frame['CC/Battery Capacity'], ax=ax, kde=True)
    ax.set_title(title)

plt.tight_layout()
plt.show()

Explanation -
The histogram for the original data shows a skewed distribution with a long tail. Min-max scaling is a linear transformation, so the histogram for the scaled data keeps the same shape (skew and tail included); only the x-axis range changes to [0, 1].

In [None]:
# Scaling data to Min-Max range [0, 1] for Boxplot
numeric_columns = ['HorsePower', 'CC/Battery Capacity', 'Total Speed', 'Performance(0 - 100 )KM/H', 'Torque', 'Cars Prices']

# Scale into a copy so the unscaled values in `data` stay available for comparison
minmax_scaler = MinMaxScaler()
data_minmax = data.copy()
data_minmax[numeric_columns] = minmax_scaler.fit_transform(data[numeric_columns])

print("\nMinMax Scaling Results:")
print("Ranges after scaling:")
for col in numeric_columns:
    print(f"{col}: {data_minmax[col].min():.3f} to {data_minmax[col].max():.3f}")

# Compare distributions before and after scaling, side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Boxplot for original data
sns.boxplot(y=data['CC/Battery Capacity'], ax=ax1, color='lightblue')
ax1.set_title('Original CC/Battery Capacity\n(Boxplot)')
ax1.set_ylabel('CC/Battery Capacity')

# Boxplot for scaled data
sns.boxplot(y=data_minmax['CC/Battery Capacity'], ax=ax2, color='lightcoral')
ax2.set_title('MinMax Scaled CC/Battery Capacity\n(Boxplot)')
ax2.set_ylabel('Scaled CC/Battery Capacity (0-1)')

# Finish the figure explicitly so the cell renders the plot rather than
# echoing the repr of the last expression
plt.tight_layout()
plt.show()


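Explanation -
The boxplot for the original data shows a skewed distribution with a long tail. Min-max scaling is a linear transformation, so the boxplot for the scaled data has the same shape (the same relative quartile positions and outliers); only the axis range changes to [0, 1].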

In [None]:
# Displaying the pairplot
sns.pairplot(data)

Explanation -
The pairplot shows a strong positive correlation between HorsePower and Performance (0 - 100 km/h), and a weak positive correlation between HorsePower and Total Speed.

# Machine Learning Analysis - Linear Regression

In [None]:
# Training and testing regression analysis model: predict 'Cars Prices'
# from the performance metrics with ordinary least squares
factors = ['CC/Battery Capacity', 'HorsePower', 'Total Speed', 'Performance(0 - 100 )KM/H', 'Torque']
X = data[factors].copy()
y = data['Cars Prices'].copy()

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=324)

reg_model = LinearRegression()
reg_model.fit(X_train, y_train)
y_pred = reg_model.predict(X_test)

# r2_score on the held-out rows is the same quantity reg_model.score(X_test, y_test)
# returns, so it is computed once and printed here
print("Mean Absolute Error is : $" , mean_absolute_error(y_test,y_pred))
print("R^2 is : ", r2_score(y_test,y_pred))


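Explanation -
The regression analysis model has a mean absolute error of about $194,307.94 and an R^2 of about -0.2004. A negative R^2 means the model predicts the test-set prices worse than simply predicting their mean would, so a plain linear model on these features does not explain car prices well.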