In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the raw car-sales table from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/car_sales_data.csv")

In [None]:
# Print the (rows, columns) pair, then the per-column summary from DataFrame.info().
print(tuple(df.shape))
df.info()

In [None]:
# Display the first five rows as a quick sanity check of the load.
df.head(n=5)

In [None]:
# Summary statistics for the numeric columns, followed by the number of
# missing values in every column.
summary_stats = df.describe()
missing_per_column = df.isna().sum()

print(summary_stats)
print(missing_per_column)

In [None]:
# Frequency of each manufacturer label, most common first.
df.loc[:, 'Manufacturer'].value_counts()

In [None]:
# Number of distinct manufacturer labels (missing values are not counted).
df.loc[:, 'Manufacturer'].nunique(dropna=True)

In [None]:
# Frequency of each model label, most common first.
df.loc[:, 'Model'].value_counts()

In [None]:
# Number of distinct model labels (missing values are not counted).
df.loc[:, 'Model'].nunique(dropna=True)

In [None]:
# Frequency of each engine-size value, most common first.
df.loc[:, 'Engine size'].value_counts()

In [None]:
# Number of distinct engine-size values (missing values are not counted).
df.loc[:, 'Engine size'].nunique(dropna=True)

In [None]:
# Number of rows per manufacture year, most common year first.
df.loc[:, 'Year of manufacture'].value_counts()

In [None]:
# Largest engine size present in the data.
df.loc[:, 'Engine size'].max()

In [None]:
# Number of distinct manufacture years (missing values are not counted).
df.loc[:, 'Year of manufacture'].nunique(dropna=True)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Histogram of Price (40 bins) with a kernel-density curve overlaid.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['Price'], bins=40, kde=True, ax=ax)
ax.set_title("Distribution of Car Price")
plt.show()

In [None]:
# Horizontal boxplot of Price; points beyond the whiskers are outlier candidates.
fig, ax = plt.subplots(figsize=(6, 2))
sns.boxplot(x=df['Price'], ax=ax)
ax.set_title("Boxplot of Price")
plt.show()

In [None]:
# Scatter of Price against Engine size.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df, x='Engine size', y='Price', ax=ax)
ax.set_title("Price vs Engine Size")
plt.show()

In [None]:
# Scatter of Price against Mileage.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df, x='Mileage', y='Price', ax=ax)
ax.set_title("Price vs Mileage")
plt.show()

In [None]:
# Scatter of Price against Year of manufacture, with explicit axis labels.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df, x='Year of manufacture', y='Price', ax=ax)
ax.set(
    title="Price vs Year of Manufacture",
    xlabel="Year of Manufacture",
    ylabel="Price",
)
plt.show()

In [None]:
# Handle outliers with the IQR rule (fences at 1.5 * IQR beyond the quartiles).

# Numeric columns screened for outliers
numeric_cols = ['Engine size', 'Mileage', 'Price']

# Every column's fences are computed from the same, still-unfiltered frame and
# the per-column tests are AND-ed into a single mask, so the rows that survive
# do not depend on the order of `numeric_cols`.
keep_mask = np.ones(len(df), dtype=bool)

for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    # Report the fences used for this column
    print(f"{col} → Lower: {lower_limit:.2f}, Upper: {upper_limit:.2f}")

    # Both bounds are inclusive. NaN compares False, so a row with a missing
    # value in this column is dropped.
    keep_mask &= ((df[col] >= lower_limit) & (df[col] <= upper_limit)).to_numpy()

# .copy() makes the filtered frame independent of the original, so later
# column assignments on `df` do not trigger SettingWithCopyWarning.
df = df[keep_mask].copy()

# Shape after the outlier rows are removed
print("\nAfter removing outliers:")
print(df.shape)

In [None]:
# Side-by-side boxplots of the three screened columns, one panel per column.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

for ax, col in zip(axes, ['Engine size', 'Mileage', 'Price']):
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(col)

fig.tight_layout()
plt.show()


In [None]:
# Reference year used to convert a manufacture year into a vehicle age.
CURRENT_YEAR = 2025

# assign() returns a new frame instead of writing into `df`, which at this
# point is a filtered subset of the loaded data; this avoids
# SettingWithCopyWarning.
df = df.assign(Age=CURRENT_YEAR - df['Year of manufacture'])

# The divisor is floored at 1 so a car built in CURRENT_YEAR (Age 0) gets its
# full mileage as the yearly figure. The previous NaN placeholder would flow
# into the feature matrix, and scikit-learn's linear models reject NaN input.
df = df.assign(Mileage_per_year=df['Mileage'] / df['Age'].clip(lower=1))

print(df[['Year of manufacture', 'Age', 'Mileage', 'Mileage_per_year']].head())

In [None]:
# Categorical handling: inspect, fold rare labels into 'Other', then one-hot encode.

categorical_cols = ['Manufacturer', 'Model', 'Fuel type']

# Ten most frequent labels and the cardinality of each categorical column
for col in categorical_cols:
    label_frequencies = df[col].value_counts()
    print(f"\nColumn: {col}")
    print(label_frequencies.head(10))
    print(f"Unique values in {col}: {df[col].nunique()}")

# Labels seen fewer than 50 times are merged into a single 'Other' bucket
for col in categorical_cols:
    counts = df[col].value_counts()
    rare_labels = counts.loc[counts.lt(50)].index
    df[col] = df[col].mask(df[col].isin(rare_labels), 'Other')

# One-hot encode; drop_first removes one level per column to avoid a
# redundant indicator.
df_encoded = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
)

print("\n✅ After encoding:", df_encoded.shape)
df_encoded.head()

In [None]:
# Target is the sale price; every other encoded column is a predictor.
y = df_encoded['Price']
X = df_encoded.drop(columns=['Price'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the (rows, columns) shape of each split's feature matrix.
for split_name, split_features in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} data shape:", split_features.shape)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Fit ordinary least squares on the training split (fit() returns the estimator).
lr = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = lr.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(test_mse))

In [None]:
# Actual vs predicted price on the test split; a perfect model lies on y = x.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set(
    xlabel="Actual Price",
    ylabel="Predicted Price",
    title="Actual vs Predicted Car Prices",
)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Same three test-set metrics as before, kept in variables and printed rounded.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

for metric_label, metric_value, number_format in (
    ("R2 Score", r2, ".4f"),
    ("MAE", mae, ".2f"),
    ("RMSE", rmse, ".2f"),
):
    print(f"{metric_label}: {metric_value:{number_format}}")

In [None]:
# Residuals of the linear model on the test split (actual minus predicted)
residuals = y_test - y_pred

# 1) Histogram with KDE: a quick look at the symmetry / skew of the errors
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(residuals, kde=True, ax=ax)
ax.set(title="Residuals Distribution (y_test - y_pred)", xlabel="Residual")
plt.show()

In [None]:
# 2) Residuals against predictions; the dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_pred, residuals, alpha=0.5)
ax.axhline(0, color='red', linestyle='--')
ax.set(
    xlabel="Predicted Price",
    ylabel="Residual (Actual - Predicted)",
    title="Residuals vs Predicted",
)
plt.show()

In [None]:
#3)Q-Q plot for residuals 
import scipy.stats as stats
# Draw the probability plot of the residuals against a normal distribution on
# an explicit Axes, then replace the default title.
fig, ax = plt.subplots(figsize=(6, 4))
stats.probplot(residuals, dist="norm", plot=ax)
ax.set_title("Q-Q plot of residuals")
plt.show()

In [None]:
# Natural log of the target for both splits
y_train_log, y_test_log = np.log(y_train), np.log(y_test)

# Linear regression refit on the log-scale target
lr_log = LinearRegression().fit(X_train, y_train_log)

# Predict in log space, then exponentiate back to the price scale
y_pred_log = lr_log.predict(X_test)
y_pred_exp = np.exp(y_pred_log)

print("After Log Transformation:")
print(f"R2 Score: {r2_score(y_test, y_pred_exp):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred_exp):.2f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_exp)):.2f}")


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Actual vs back-transformed predictions of the log-target model.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred_exp, alpha=0.5)
ax.set(
    xlabel="Actual Price",
    ylabel="Predicted Price (After Log Transform)",
    title="Actual vs Predicted Car Prices (Log Model)",
)
plt.show()

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression (alpha=1.0) fitted on the log-scale target
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train_log)

# Predict in log space and map back to the price scale
y_pred_ridge_log = ridge.predict(X_test)
y_pred_ridge_exp = np.exp(y_pred_ridge_log)

ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge_exp))
print("🔹 Ridge Regression Results:")
print(f"R2 Score: {r2_score(y_test, y_pred_ridge_exp):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred_ridge_exp):.2f}")
print(f"RMSE: {ridge_rmse:.2f}")

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso regression (alpha=0.001) fitted on the log-scale target
lasso = Lasso(alpha=0.001)
lasso.fit(X_train, y_train_log)

# Predict in log space and map back to the price scale
y_pred_lasso_log = lasso.predict(X_test)
y_pred_lasso_exp = np.exp(y_pred_lasso_log)

lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lasso_exp))
print("🔹 Lasso Regression Results:")
print(f"R2 Score: {r2_score(y_test, y_pred_lasso_exp):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred_lasso_exp):.2f}")
print(f"RMSE: {lasso_rmse:.2f}")

In [None]:
import joblib

# Persist the fitted ridge model together with the training column order,
# which is needed to line up new inputs with the model's features.
for artifact, target_path in (
    (ridge, "car_price_model.pkl"),
    (list(X_train.columns), "model_columns.pkl"),
):
    joblib.dump(artifact, target_path)

print("✅ Model and columns saved!")


In [None]:
# Reload the saved model from disk
loaded_model = joblib.load("car_price_model.pkl")

# Score the first five test rows; the model outputs log-prices
sample_pred = loaded_model.predict(X_test.iloc[:5])

# Back to the price scale
sample_pred_exp = np.exp(sample_pred)

print("Predicted Prices:", sample_pred_exp)
print("Actual Prices:", y_test.iloc[:5].to_numpy())

In [None]:
# Example: one unseen car described with the raw (pre-encoding) columns.
example_values = (
    ('Manufacturer', 'Toyota'),
    ('Model', 'Yaris'),
    ('Engine size', 1.5),
    ('Fuel type', 'Petrol'),
    ('Year of manufacture', 2018),
    ('Mileage', 45000),
)

# Each value is wrapped in a one-element list so the frame has exactly one row.
new_car = {feature: [value] for feature, value in example_values}

# Single-row DataFrame, columns in the order listed above
new_car_df = pd.DataFrame(new_car)


In [None]:
# Rebuild the engineered features the model was trained on. Without them the
# reindex() below would silently fill Age and Mileage_per_year with 0.
new_car_features = new_car_df.assign(
    Age=CURRENT_YEAR - new_car_df['Year of manufacture']
)
# The divisor is floored at 1 so a car built in CURRENT_YEAR does not divide by zero.
new_car_features = new_car_features.assign(
    Mileage_per_year=new_car_features['Mileage'] / new_car_features['Age'].clip(lower=1)
)

# One-hot encode WITHOUT drop_first. On a single row every categorical column
# has exactly one level, so drop_first=True would remove every indicator and
# the car would be scored as the baseline manufacturer / model / fuel type.
new_car_encoded = pd.get_dummies(new_car_features, columns=categorical_cols)

# Align with the training columns: indicators the model never saw (including
# the baseline levels dropped during training) are discarded, and training
# indicators absent from this row are filled with 0.
# NOTE(review): a label that was folded into 'Other' during training has no
# matching column here and is therefore scored as the baseline level - confirm
# whether the rare-label mapping should also be applied to new inputs.
new_car_encoded = new_car_encoded.reindex(columns=X_train.columns, fill_value=0)


In [None]:
# The loaded model predicts log-price; exponentiate to get back to a price.
predicted_log_price = loaded_model.predict(X=new_car_encoded)
predicted_price = np.exp(predicted_log_price)

# Element 0 of the prediction array is the estimate that gets reported.
print(f"Estimated Price for this car: ₹{predicted_price[0]:,.2f}")
