In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, r2_score

Basic Data Analysis

In [None]:
# Load the cars dataset (path is relative to the notebook's working directory)
# and preview the first rows.
DATA_FILE = "cars_ridge.csv"
df = pd.read_csv(DATA_FILE)
df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Data type pandas inferred for each column when reading the CSV.
df.dtypes

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated().sum()

Exploratory Data Analysis (EDA)

In [None]:
# Concise summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Column labels of the frame; later cells select columns by these names.
df.columns

In [None]:
# Numeric columns examined throughout the analysis (model features plus price_k).
numeric_cols = [
    'mileage', 'engine_size', 'horsepower', 'torque', 'doors', 'airbags',
    'weight', 'fuel_efficiency', 'brand_score', 'luxury_index', 'price_k',
]
numeric_df = df[numeric_cols]

# Skewness per column: values far from zero indicate an asymmetric distribution.
skewness = numeric_df.skew()
print("Skewness of numeric columns:\n", skewness)

# One histogram per numeric column to inspect the distributions visually.
numeric_df.hist(figsize=(12, 10), color='skyblue')
plt.show()

In [None]:
# Pairwise correlation between the numeric columns, drawn as an annotated heatmap.
numeric_cols = [
    'mileage', 'engine_size', 'horsepower', 'torque', 'doors', 'airbags',
    'weight', 'fuel_efficiency', 'brand_score', 'luxury_index', 'price_k',
]
cor = df[numeric_cols].corr()

# Draw on an explicit Axes so the title is attached to the heatmap itself.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cor, annot=True, fmt='.2f', cmap='coolwarm', cbar=True, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

Data Preprocessing

In [None]:
# Log-transform torque to reduce its skew.
# The raw values are kept in 'torque_raw' and the transform is always computed
# from that column, so re-running this cell does not apply log1p twice.
if 'torque_raw' not in df.columns:
    df['torque_raw'] = df['torque']
df['torque'] = np.log1p(df['torque_raw'])

print("Skewness after log transform:")
print(df['torque'].skew())

# Distribution of the transformed column.
df['torque'].hist(figsize=(6,3), color='skyblue', bins=20)
plt.suptitle("Histograms of Log-Transformed Columns", fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Count IQR-rule outliers in every numeric column, then show them as boxplots.
numeric_cols = [
    'mileage', 'engine_size', 'horsepower', 'torque', 'doors', 'airbags',
    'weight', 'fuel_efficiency', 'brand_score', 'luxury_index', 'price_k',
]

for name in numeric_cols:
    values = df[name]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    # Tukey fences: anything more than 1.5 * IQR beyond a quartile is an outlier.
    too_low = values < q1 - 1.5 * spread
    too_high = values > q3 + 1.5 * spread
    outliers_count = (too_low | too_high).sum()
    print(f"{name}: {outliers_count} outliers")

# All columns share one axis, so columns with large magnitudes dominate the scale.
plt.figure(figsize=(8, 4))
df.boxplot(column=numeric_cols, grid=False)
plt.xticks(rotation=90)
plt.title("Boxplots of Numeric Columns")
plt.show()

In [None]:
discrete_cols = ['doors']
continuous_cols = ['weight', 'fuel_efficiency']


def _iqr_fences(series, k=1.5):
    """Return the (lower, upper) IQR fences of a numeric Series.

    The fences are Q1 - k * IQR and Q3 + k * IQR, where Q1 and Q3 are the
    25th and 75th percentiles and IQR = Q3 - Q1.
    """
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr


# NOTE(review): the fences and the mode are computed on the full dataset, before
# the train/test split later in the notebook, so test rows influence them.
# NOTE(review): when IQR is 0 both fences equal the quartile, so every value that
# differs from it is treated as an outlier.

# Handle discrete columns (replace outliers with mode)
for col in discrete_cols:
    lower, upper = _iqr_fences(df[col])
    is_outlier = (df[col] < lower) | (df[col] > upper)
    # Vectorised replacement; missing values compare False and are left untouched.
    df[col] = df[col].mask(is_outlier, df[col].mode()[0])

# Handle continuous columns (cap outliers)
for col in continuous_cols:
    lower, upper = _iqr_fences(df[col])
    df[col] = df[col].clip(lower=lower, upper=upper)

In [None]:
# Re-check the treated columns: recompute the IQR fences on the cleaned data
# and report how many values still fall outside them.
for name in discrete_cols + continuous_cols:
    values = df[name]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    outside = (values < q1 - 1.5 * spread) | (values > q3 + 1.5 * spread)
    remaining = int(outside.sum())
    print(f"{name}: {remaining} outliers remaining")

In [None]:
# Preview the frame again after the torque log-transform and the outlier handling.
df.head()

Feature Selection

In [None]:
# Predictor columns and the regression target.
feature_cols = [
    'mileage', 'engine_size', 'horsepower', 'torque', 'doors',
    'airbags', 'weight', 'fuel_efficiency', 'brand_score', 'luxury_index',
]
target_col = 'price_k'

X = df[feature_cols]
y = df[target_col]

In [None]:
# Display the names of the predictor columns held in X.
print("Selected Features:")
list(X.columns)

In [None]:
# Announce the target column (the same column that was assigned to y).
print("Target Variable: price_k")

Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training and Testing Data Shapes:")
for label, part in (("X_train:", X_train), ("X_test:", X_test)):
    print(label, part.shape)

Scaling Features

In [None]:
# Standardise the features. The scaler learns its mean and variance from the
# training split only and then applies that same transform to both splits, so
# the test data does not influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Fit Linear Regression

In [None]:
# Fit an ordinary least-squares baseline on the scaled training data.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict on the held-out test split (not the training data) so the predictions
# can be scored against y_test, in the same way as the RidgeCV predictions.
y_pred_lr = lr.predict(X_test)

Fit RidgeCV

In [None]:
# Candidate regularisation strengths: 50 values log-spaced from 1e-3 to 1e3.
alphas = np.logspace(-3, 3, num=50)

# Choose alpha by 5-fold cross-validated R² on the training split.
ridge_cv = RidgeCV(alphas=alphas, cv=5, scoring='r2').fit(X_train, y_train)

# Predictions of the selected Ridge model on the held-out test split.
y_pred = ridge_cv.predict(X_test)

Evaluate the Model

In [None]:
# Score the linear model on its own test-set predictions. `y_pred` holds the
# RidgeCV predictions, so using it here would report Ridge's metrics under the
# Linear Regression heading.
y_pred_lr = lr.predict(X_test)
lr_mse = mean_squared_error(y_test, y_pred_lr)

print("Linear Regression Results:")
print("Intercept:", lr.intercept_)
print("Coefficients:", lr.coef_)
print("R² Score:", r2_score(y_test, y_pred_lr))
print("MSE:", lr_mse)
print("RMSE:", np.sqrt(lr_mse))

In [None]:
# Test-set metrics for the RidgeCV model; the MSE is computed once and reused for the RMSE.
ridge_mse = mean_squared_error(y_test, y_pred)

print("RidgeCV Results:")
print("Best Alpha:", ridge_cv.alpha_)
print("R²:", r2_score(y_test, y_pred))
print("MSE:", ridge_mse)
print("RMSE:", np.sqrt(ridge_mse))