In [None]:
import os
import pickle
from functools import partial

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.impute import SimpleImputer

In [None]:
# Load the raw car listings, drop every row containing a missing value and
# renumber the remaining rows 0..n-1.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# the notebook only runs where that exact file exists.
df = pd.read_csv("D:\\Projects\\Linear Regression\\Dataset\\cars_dataset.csv")
df = df.dropna().reset_index(drop=True)

# Numeric columns screened for outliers (the scaling cell reuses this list).
num_features = ["year", "mileage", "tax", "mpg", "engineSize"]

# Per-column quartiles and the 1.5 * IQR fences derived from them.
numeric = df[num_features]
Q1, Q3 = numeric.quantile(0.25), numeric.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is kept only when *every* numeric value lies inside its column's
# fences (equivalent to "no column is below the lower or above the upper fence",
# because missing values were already dropped above).
mask = ((numeric >= lower_fence) & (numeric <= upper_fence)).all(axis=1)
df = df[mask].reset_index(drop=True)

In [None]:
# Predictor columns (numeric + categorical) and the column to be predicted.
features = ["year", "mileage", "tax", "mpg", "engineSize", "Make", "model", "transmission", "fuelType"]
target = "price"

# Take an explicit copy of the predictor columns: later cells write columns
# back into X, and writing into a bare slice of df raises pandas'
# SettingWithCopyWarning and leaves it ambiguous whether df is affected.
X = df[features].copy()
y = df[target]

In [None]:
# Work with log(1 + value) for the target (price) and for mileage.
# Both are recomputed from the untouched columns of df rather than from the
# current y / X, so re-running this cell does not apply log1p a second time.
# DataFrame.assign returns a new frame (column order unchanged), so no write
# is made into a slice of df.
y = np.log1p(df[target])  # Log transform target (price)
X = X.assign(mileage=np.log1p(df["mileage"]))


In [None]:
# Standardise the numeric columns with a StandardScaler; the fitted scaler is
# kept in `scaler` because a later cell pickles it.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# done in a later cell, so test-set statistics influence the training features.
scaler = StandardScaler()
scaler.fit(X[num_features])
X[num_features] = scaler.transform(X[num_features])

In [None]:
# Categorical columns that are replaced by one-hot indicator columns.
categorical_features = ['Make', 'model', 'transmission', 'fuelType']

# Renumber X to 0..n-1 *before* building the encoded frame so that both frames
# carry the same row labels. pd.concat(axis=1) aligns on the index; resetting X
# only after categorical_df had captured the old index would misalign rows
# (and introduce NaNs) whenever X did not already have a default index.
X = X.reset_index(drop=True)

# Unknown categories seen at prediction time are encoded as all-zeros
# (handle_unknown='ignore'); dense output so it fits in a DataFrame directly.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
categorical_encoded = encoder.fit_transform(X[categorical_features])
categorical_columns = encoder.get_feature_names_out(categorical_features)
categorical_df = pd.DataFrame(categorical_encoded, columns=categorical_columns, index=X.index)

# Merge Encoded Features with Scaled Numerical Features
X = pd.concat([X.drop(columns=categorical_features), categorical_df], axis=1)

In [None]:
# Fill any NaN left in the feature matrix with that column's mean, then wrap
# the resulting array back into a DataFrame under the original column names
# (the new frame gets a default 0..n-1 index).
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(X)
X = pd.DataFrame(imputed_values, columns=X.columns)

In [None]:
# Keep the (at most) 20 features with the highest estimated mutual information
# with the log-price target.
# mutual_info_regression draws random noise internally, so without a fixed
# random_state the scores, and therefore the selected columns, can change
# from run to run. functools.partial (rather than a lambda) is used to bind
# the seed so that the fitted selector can still be pickled.
# NOTE(review): the selector is fitted on all rows, before the train/test split.
selector = SelectKBest(
    score_func=partial(mutual_info_regression, random_state=42),
    k=min(20, X.shape[1]),
)
X_selected = selector.fit_transform(X, y)


In [None]:
# Expand the selected features with pairwise interaction terms only
# (degree 2, no squared terms). Fit and transform are issued as two calls;
# the result is the same as a single fit_transform.
poly = PolynomialFeatures(degree=2, interaction_only=True)
poly.fit(X_selected)
X_poly = poly.transform(X_selected)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_poly,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Ridge regression on the log-price target.
# NOTE(review): alpha=0.5 is described as fine-tuned, but no tuning step is
# visible in this notebook - confirm where the value came from.
model = Ridge(alpha=0.5)
model.fit(X=X_train, y=y_train)


In [None]:
# Predictions for the held-out rows, still on the log1p scale.
y_pred = model.predict(X_test)

# Undo the log1p transform (expm1 is its inverse) for both the true and the
# predicted values so the errors below are expressed in price units.
y_test_actual, y_pred_actual = (np.expm1(values) for values in (y_test, y_pred))


In [None]:
# Cross-validation for stability check (5 folds, R² scoring).
# NOTE(review): this score is computed on the log-scale target `y`, whereas the
# hold-out R² below is computed on back-transformed prices, so the two numbers
# are not directly comparable. X_poly was also produced by preprocessing fitted
# on all rows, so the folds are not fully independent.
cv_scores = cross_val_score(model, X_poly, y, cv=5, scoring='r2')

# Hold-out errors in original price units.
abs_pct_error = np.abs((y_test_actual - y_pred_actual) / y_test_actual)
mape = np.mean(abs_pct_error) * 100
mae = mean_absolute_error(y_test_actual, y_pred_actual)
r2 = r2_score(y_test_actual, y_pred_actual)
mse = mean_squared_error(y_test_actual, y_pred_actual)
rmse = np.sqrt(mse)

# Print Performance Metrics
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")
print(f"Cross-Validation R² Score: {np.mean(cv_scores)}")

In [None]:
# Figure 1: predicted vs. actual price for the held-out rows.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=y_test_actual, y=y_pred_actual, ax=ax)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual vs Predicted Car Prices")
plt.show()

# Figure 2: distribution of the residuals (actual minus predicted price),
# drawn as a 30-bin histogram with a KDE overlay.
residuals = y_test_actual - y_pred_actual
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(residuals, bins=30, kde=True, ax=ax)
ax.set_xlabel("Residuals")
ax.set_ylabel("Frequency")
ax.set_title("Residuals Distribution")
plt.show()

In [None]:
# Persist the trained model together with every fitted preprocessing object,
# one pickle file per object, inside the "model" directory.
# (os and pickle are imported in the notebook's import cell.)

# Ensure the "model" directory exists
os.makedirs("model", exist_ok=True)

# File name -> fitted object. The imputer is saved as well: it is applied
# between encoding and feature selection, so the preprocessing chain cannot be
# replayed on new data without it.
artifacts = {
    "ridge_model.pkl": model,
    "scaler.pkl": scaler,
    "encoder.pkl": encoder,
    "imputer.pkl": imputer,
    "feature_selector.pkl": selector,
    "poly_transform.pkl": poly,
}

for filename, fitted_object in artifacts.items():
    with open(f"model/{filename}", "wb") as f:
        pickle.dump(fitted_object, f)

print("Model and preprocessing objects saved successfully!")
