In [2]:
import kagglehub
import os
import pandas as pd
import shutil

# Download the dataset; the returned value is used below as the directory holding the files.
path = kagglehub.dataset_download("sujay1844/used-car-prices")

# Get filename.
# os.listdir() returns entries in arbitrary order and may include non-CSV entries,
# so only CSV files are considered instead of taking whatever is listed first.
# NOTE(review): if the dataset ships more than one CSV, name the intended file
# explicitly here rather than relying on listing order.
csv_files = [entry for entry in os.listdir(path) if entry.lower().endswith(".csv")]
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in downloaded dataset folder: {path}")
filename = csv_files[0]
full_path = os.path.join(path, filename)

# Load DataFrame
df = pd.read_csv(full_path)

# Optionally delete the dataset folder so the code is reusable.
# NOTE(review): deleting the folder presumably forces a fresh download (and a network
# round trip) on every run -- set the flag to False to keep the files on disk.
DELETE_DOWNLOAD_AFTER_LOAD = True
if DELETE_DOWNLOAD_AFTER_LOAD:
    shutil.rmtree(path)

ReadTimeout: HTTPSConnectionPool(host='www.kaggle.com', port=443): Read timed out. (read timeout=5)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

In [None]:
# Drop the "Unnamed: 0" column (presumably an index column written into the CSV).
# Re-assigning with errors="ignore" instead of inplace=True keeps the cell re-runnable:
# a second execution no longer raises KeyError once the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Preview the first five rows again to confirm the remaining columns.
df.head(n=5)

## Exploring Hyundai vehicles

### We focus on Hyundai vehicles because:
+ Hyundai is one of the most popular car brands in India (source: Google)
+ There are enough Hyundai cars in our dataset for analysis and model building

In [None]:
# Restrict the analysis to Hyundai cars.
# A case-insensitive match replaces capitalize() + contains("Hyundai"): capitalize()
# lower-cases everything after the first character, so the old test only matched names
# that *start* with the brand. na=False maps missing names to False so the mask stays
# purely boolean and can be used for indexing.
hyundai_mask = df["Name"].str.contains("hyundai", case=False, na=False)
df2 = df[hyundai_mask]

# Count listings per model name and keep only models with more than 5 listings.
hyundais = df2["Name"].value_counts()
hyundais_greater_than_5 = hyundais[hyundais > 5].index

# .copy() gives df3 its own data, so the column assignments in later cells operate on
# an independent frame instead of a slice of df (avoids SettingWithCopyWarning).
df3 = df2[df2["Name"].isin(hyundais_greater_than_5)].copy()
len(df3)

### Data Preprocessing

In [None]:
def _leading_number(column):
    """Return the first space-separated token of each entry as a float (NaN if not numeric)."""
    # astype(str) makes the cell re-runnable: once the column is already numeric the
    # .str accessor would otherwise fail. Non-numeric tokens become NaN instead of raising.
    return pd.to_numeric(column.astype(str).str.split(" ").str[0], errors="coerce")


# "Mileage" and "Power" are parsed by keeping the token before the first space.
# assign() replaces the columns outright so they become float-typed; on recent pandas
# `df3.loc[:, col] = values` writes into the existing object column and keeps dtype
# object, which breaks describe() and the histograms that follow.
df3 = df3.assign(
    Mileage=_leading_number(df3["Mileage"]),
    Power=_leading_number(df3["Power"]),
)

In [None]:
# Summary statistics for the columns pandas treats as numeric.
summary_stats = df3.describe()
summary_stats

In [None]:
# Histogram of the "Year" column. Title and x-label let the figure stand on its own;
# the trailing ";" suppresses the text repr of the returned object in the cell output.
df3["Year"].plot(kind="hist", title="Distribution of Year").set_xlabel("Year");

In [None]:
# Histogram of the "Kilometers_Driven" column. Title and x-label let the figure stand on
# its own; the trailing ";" suppresses the text repr of the returned object.
df3["Kilometers_Driven"].plot(
    kind="hist", title="Distribution of Kilometers_Driven"
).set_xlabel("Kilometers_Driven");

In [None]:
# Histogram of the "Mileage" column. Title and x-label let the figure stand on its own;
# the trailing ";" suppresses the text repr of the returned object.
df3["Mileage"].plot(kind="hist", title="Distribution of Mileage").set_xlabel("Mileage");

### Removing Outliers

In [None]:
# Keep only rows whose Year is strictly greater than 2010.
year_ok = df3["Year"].gt(2010)
df4 = df3.loc[year_ok]
df4.head()

In [None]:
# Keep only rows with strictly fewer than 120000 kilometers driven.
kms_ok = df4["Kilometers_Driven"].lt(120000)
df5 = df4.loc[kms_ok]
print(df5.shape)
df5.head()

In [None]:
# Keep only rows whose Mileage is strictly greater than 12.
mileage_ok = df5["Mileage"].gt(12)
df6 = df5.loc[mileage_ok]
print(df6.shape)
df6.head()

## Modelling

### Data Preparation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Cars with the same name have the same HorsePower so the Power Column is redundant
# df6 was produced by boolean filtering, so an in-place drop mutates a derived frame
# (SettingWithCopyWarning) and raises KeyError when the cell is re-run. Re-assigning
# with errors="ignore" makes the cell idempotent.
df6 = df6.drop(columns=["Seats", "Engine", "New_Price", "Power"], errors="ignore")

In [None]:
# Separate the target ("Price") from the features, then hold out 20% of the rows
# for testing with a fixed seed.
y = df6["Price"]
X = df6.drop("Price", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Model Building and Evaluation

In [None]:
# Naive baseline: predict the mean training price for every training row.
y_train_pred_baseline = [y_train.mean()] * len(y_train)
# scikit-learn metrics are called as (y_true, y_pred). MAE happens to be symmetric, but
# keeping the documented order prevents silent errors if the metric is ever swapped.
base_training_mae = mean_absolute_error(y_train, y_train_pred_baseline)
print("Baseline MAE on Training Set: ",base_training_mae)

In [None]:
# Columns that are one-hot encoded; every other column is passed through unchanged.
categorical_cols = ["Name", "Location", "Fuel_Type", "Transmission", "Owner_Type"]

# Categories not seen during fit are ignored at prediction time (handle_unknown='ignore').
one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot, categorical_cols)],
    remainder='passthrough',
)

# Preprocessing followed by an ordinary least-squares regression, fitted on the training split.
model = make_pipeline(preprocessor, LinearRegression())
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)

In [None]:
# Training-set error of the fitted model.
# Arguments follow scikit-learn's documented (y_true, y_pred) order.
model_training_mae = mean_absolute_error(y_train, y_train_pred)
print("Model MAE on In-House Predictions: ",model_training_mae)

In [None]:
# Training residuals (actual minus predicted) and their distribution.
resid = y_train.sub(y_train_pred)
resid.plot.hist()

In [None]:
# Baseline for the test set: the constant prediction must come from the *training*
# data. Using y_test.mean() leaks the test labels into the baseline and makes it look
# better than anything obtainable without seeing the test set.
y_test_pred_baseline = [y_train.mean()] * len(y_test)
# Arguments follow scikit-learn's documented (y_true, y_pred) order.
base_test_mae = mean_absolute_error(y_test, y_test_pred_baseline)
print("Baseline MAE on Test Set: ",base_test_mae)

In [None]:
# Predict on the held-out rows and measure the out-of-sample error.
y_test_pred = model.predict(X_test)

# Arguments follow scikit-learn's documented (y_true, y_pred) order.
model_test_mae = mean_absolute_error(y_test, y_test_pred)
print("Model MAE on Out of Sample Data: ",model_test_mae)

### Iterate

In [None]:
# Second iteration: same data without the "Location" feature.
df7 = df6.drop("Location", axis=1)

In [None]:
# Same target/feature separation and 80/20 split (same seed) as the first iteration.
y2 = df7["Price"]
X2 = df7.drop("Price", axis=1)

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Same pipeline as the first model, minus the "Location" column.
categorical_cols = ["Name", "Fuel_Type", "Transmission", "Owner_Type"]

preprocessor = ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)], remainder='passthrough')

model2 = make_pipeline(preprocessor, LinearRegression())
model2.fit(X_train2, y_train2)

y_train_pred2 = model2.predict(X_train2)

# Arguments follow scikit-learn's documented (y_true, y_pred) order.
model_training_mae2 = mean_absolute_error(y_train2, y_train_pred2)
print("Model(Without Location Column) MAE on In-House Predictions: ",model_training_mae2)

In [None]:
# Training residuals of the second model (actual minus predicted).
resid2 = y_train2 - y_train_pred2
# Plot resid2: the original plotted `resid`, i.e. the first model's residuals again.
resid2.plot(kind="hist")

In [None]:
# Out-of-sample error of the model trained without "Location".
y_test_pred2 = model2.predict(X_test2)

# Arguments follow scikit-learn's documented (y_true, y_pred) order.
model_test_mae2 = mean_absolute_error(y_test2, y_test_pred2)
print("Model(Without Location Column) MAE on Out of Sample Data: ",model_test_mae2)

In [None]:
from ipywidgets import Dropdown, FloatSlider, IntSlider, interact

In [None]:
# Show the feature columns (and their order) that the first model was trained on.
X_train.head(n=5)

In [None]:
def make_prediction(year, kms_driven, mile, name, loc, fuel, trans, owners):
    """Predict the price of a single car and return it as a formatted message.

    The arguments are assembled into a one-row DataFrame whose columns are
    Name, Location, Year, Kilometers_Driven, Fuel_Type, Transmission,
    Owner_Type and Mileage, which is passed to the notebook-level ``model``.

    Returns a string with the prediction rounded to two decimals, followed in
    parentheses by that value multiplied by 1167.49.
    NOTE(review): 1167.49 is presumably a rupee-to-dollar conversion factor --
    confirm the unit of the target column and the exchange rate used.
    """
    features = {
        "Name": name,
        "Location": loc,
        "Year": year,
        "Kilometers_Driven": kms_driven,
        "Fuel_Type": fuel,
        "Transmission": trans,
        "Owner_Type": owners,
        "Mileage": mile,
    }
    # index=[0] is required because every value is a scalar.
    single_car = pd.DataFrame(features, index=[0])

    prediction = model.predict(single_car).round(2)[0]
    converted = round(prediction * 1167.49, 2)
    return f"Predicted car price: ₹{prediction}(${converted})"

In [None]:
# One widget per make_prediction argument. Slider ranges and starting values come from
# the training features; dropdowns list the sorted distinct training values.
dropdown_columns = {
    "name": "Name",
    "loc": "Location",
    "fuel": "Fuel_Type",
    "trans": "Transmission",
    "owners": "Owner_Type",
}

prediction_widgets = {
    "year": IntSlider(
        value=X_train["Year"].mean(),
        min=X_train["Year"].min(),
        max=X_train["Year"].max(),
    ),
    "kms_driven": FloatSlider(
        value=X_train["Kilometers_Driven"].mean(),
        min=X_train["Kilometers_Driven"].min(),
        max=X_train["Kilometers_Driven"].max(),
        step=0.01,
    ),
    "mile": FloatSlider(
        value=X_train["Mileage"].mean(),
        min=X_train["Mileage"].min(),
        max=X_train["Mileage"].max(),
        step=0.01,
    ),
}
for arg_name, column in dropdown_columns.items():
    prediction_widgets[arg_name] = Dropdown(options=sorted(X_train[column].unique()))

interact(make_prediction, **prediction_widgets);