In [4]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Read the car listings CSV into a DataFrame, decoding it as ISO-8859-1 (Latin-1)
data = pd.read_csv(
    "Cars Datasets 2025.csv",
    encoding="ISO-8859-1",
)

def extract_number(val):
    """Pull the numeric content out of a free-text cell.

    Commas are stripped first so thousands separators do not split a number.
    Every run of digits, optionally followed by a decimal part, counts as one
    number.

    Args:
        val: Raw cell value.

    Returns:
        ``val`` unchanged when it is not a string; ``None`` when the string
        contains no digits; otherwise the arithmetic mean of all numbers
        found, as a float. A lone number is therefore returned as-is, and a
        range such as "70-85 hp" yields its midpoint.
    """
    if not isinstance(val, str):
        return val

    matches = re.findall(r"\d+\.?\d*", val.replace(",", ""))
    if not matches:
        return None

    # The mean of a single value is that value, so one formula covers both
    # the single-number case and the range case.
    values = [float(m) for m in matches]
    return sum(values) / len(values)

# Convert each free-text measurement column to numbers with extract_number,
# one column at a time
for numeric_col in (
    "HorsePower",
    "Total Speed",
    "Performance(0 - 100 )KM/H",
    "Cars Prices",
    "CC/Battery Capacity",
    "Torque",
    "Seats",
):
    data[numeric_col] = data[numeric_col].apply(extract_number)

# Encode categorical features as integer labels.
# Each column gets its own LabelEncoder, kept in `encoders`, so that every
# column's label <-> category mapping stays available afterwards (for
# inverse_transform, or for encoding new rows consistently). Refitting a single
# shared encoder would keep only the mapping of the last column.
encoders = {}
for col in ["Company Names", "Cars Names", "Engines", "Fuel Types"]:
    le = LabelEncoder()
    # astype(str) makes every cell a string, so missing values are encoded as
    # their own category instead of raising on mixed types.
    data[col] = le.fit_transform(data[col].astype(str))
    encoders[col] = le
# After the loop `le` is the encoder fitted on "Fuel Types".

print("After cleaning:\n", data.head())

# Features (X) and target (y)
# Rows with no price are dropped rather than imputed: filling the target with
# the mean would train the model on fabricated prices and, for the test split,
# score predictions against invented ground truth, distorting MSE and R².
labelled = data.dropna(subset=["Cars Prices"])
X = labelled.drop(columns=["Cars Prices"])
y = labelled["Cars Prices"]

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Impute missing values in X.
# The imputer is fitted on the training split only, so test-set statistics
# never leak into the column means used for filling.
imputer_X = SimpleImputer(strategy='mean')
X_train = imputer_X.fit_transform(X_train)
X_test = imputer_X.transform(X_test)

# The targets need no imputation any more; convert them to 1-D float arrays.
y_train = y_train.to_numpy(dtype=float)
y_test = y_test.to_numpy(dtype=float)

# Fit an ordinary least-squares model on the training split
lr = LinearRegression().fit(X_train, y_train)

# Score the held-out split: predict, then compare against the true targets
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics
print("Linear Regression MSE:", mse)
print("Linear Regression R²:", r2)

After cleaning:
    Company Names  Cars Names  Engines  CC/Battery Capacity  HorsePower  \
0              8         889      353               3990.0       963.0   
1             28         741      345               6749.0       563.0   
2              9         611       26               1200.0        77.5   
3             21           0      353               3982.0       630.0   
4              1         139      344               5204.0       602.0   

   Total Speed  Performance(0 - 100 )KM/H  Cars Prices  Fuel Types  Seats  \
0        340.0                        2.5    1100000.0          22    2.0   
1        250.0                        5.3     460000.0          13    5.0   
2        165.0                       10.5      13500.0          13    5.0   
3        250.0                        3.2     161000.0          13    4.0   
4        320.0                        3.6     253290.0          13    2.0   

   Torque  
0   800.0  
1   900.0  
2   120.0  
3   900.0  
4   560.0  
Lin