In [1]:
#1. Imports

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objs as go



In [2]:
# Step 2: Load & Preprocess Data
df = pd.read_csv('../data/car_data.csv')
df4=pd.read_csv('../data/car_data4.csv')
df = pd.concat([df4, df], ignore_index=True)

# Extract model name (exclude brand)
df['model'] = df['name'].apply(lambda x: ' '.join(x.split()[1:]))

# Car age instead of year
df['car_age'] = 2025 - df['year']

# Drop unnecessary columns
df.drop(['Unnamed: 0', 'name', 'year'], axis=1, inplace=True)

# Convert to category and store category codes
for col in ['company', 'fuel_type', 'model']:
    df[col] = df[col].astype('category')
    df[col + '_cat'] = df[col].cat.codes


In [3]:
# Step 3: Define Features & Targets

# Categorical columns for embedding
cat_cols = ['company_cat', 'fuel_type_cat', 'model_cat']
cat_dims = [df[col].nunique() for col in cat_cols]  # e.g., [14, 3, 187]
emb_dims = [(n, min(50, (n + 1) // 2)) for n in cat_dims]  # embedding size rule

# Numerical columns
num_cols = ['car_age', 'kms_driven']

# Target
target = 'Price'


In [4]:
# Step 4: Train-Test Split

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Separate features
X_cat = df[cat_cols].values
X_num = df[num_cols].values
y = df[target].values

# Scale numeric features
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)
scaler = StandardScaler()


# Train-test split
X_cat_train, X_cat_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    X_cat, X_num_scaled, y, test_size=0.2, random_state=42
)
X_num_train = scaler.fit_transform(X_num_train)
X_num_test = scaler.transform(X_num_test)


In [5]:
class CarDataset(Dataset):
    def __init__(self, X_cat, X_num, y):
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)  # for embedding
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).view(-1, 1)

    def __len__(self):
        return len(self.X_cat)

    def __getitem__(self, idx):
        return self.X_cat[idx], self.X_num[idx], self.y[idx]


# Datasets
train_ds = CarDataset(X_cat_train, X_num_train, y_train)
test_ds = CarDataset(X_cat_test, X_num_test, y_test)

train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=32)


In [6]:
class CarPriceModel(nn.Module):
    def __init__(self, cat_dims, emb_dims, num_input_dim):
        super().__init__()
        self.emb_layers = nn.ModuleList([
            nn.Embedding(input_dim, emb_dim) 
            for input_dim, emb_dim in zip(cat_dims, emb_dims)
        ])
        total_emb_dim = sum(emb_dims)
        self.fc = nn.Sequential(
            nn.Linear(total_emb_dim + num_input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )


    def forward(self, x_cat, x_num):
        embeddings = [emb(x_cat[:, i]) for i, emb in enumerate(self.emb_layers)]
        x = torch.cat(embeddings + [x_num], dim=1)
        return self.fc(x)


In [7]:
cat_dims = [len(df[col].unique()) for col in cat_cols]
emb_dims = [(dim, min(50, (dim+1)//2)) for dim in cat_dims]  # common rule
num_input_dim = X_num_train.shape[1]

# Recompile model after adding dropout
model = CarPriceModel(cat_dims, [e[1] for e in emb_dims], num_input_dim)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)


# Training loop
for epoch in range(100):
    model.train()
    epoch_loss = 0
    for cat_x, num_x, y in train_dl:
        optimizer.zero_grad()
        preds = model(cat_x, num_x)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'Epoch {epoch+1} Loss: {epoch_loss/len(train_dl):.2f}')


Epoch 1 Loss: 495736722721.07
Epoch 2 Loss: 230740544428.71
Epoch 3 Loss: 115149671909.05
Epoch 4 Loss: 80548094637.93
Epoch 5 Loss: 63735716447.54
Epoch 6 Loss: 54140933668.75
Epoch 7 Loss: 47789863279.46
Epoch 8 Loss: 43128200299.79
Epoch 9 Loss: 39761866957.78
Epoch 10 Loss: 36918098015.54
Epoch 11 Loss: 35549488392.57
Epoch 12 Loss: 33060821579.94
Epoch 13 Loss: 31685145470.16
Epoch 14 Loss: 30628522941.86
Epoch 15 Loss: 29470667631.46
Epoch 16 Loss: 28739907995.56
Epoch 17 Loss: 27982522051.98
Epoch 18 Loss: 27286708285.24
Epoch 19 Loss: 26676931100.17
Epoch 20 Loss: 26108815413.89
Epoch 21 Loss: 25618597552.38
Epoch 22 Loss: 25109746864.38
Epoch 23 Loss: 24684107834.79
Epoch 24 Loss: 24234467945.34
Epoch 25 Loss: 23908827741.09
Epoch 26 Loss: 23322516952.80
Epoch 27 Loss: 23003284587.79
Epoch 28 Loss: 22707236241.76
Epoch 29 Loss: 22360885137.76
Epoch 30 Loss: 22067838946.60
Epoch 31 Loss: 21745764557.78
Epoch 32 Loss: 21447910595.98
Epoch 33 Loss: 21243456921.11
Epoch 34 Loss: 2

In [8]:
model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for cat_x, num_x, y in test_dl:
        preds = model(cat_x, num_x)
        all_preds.append(preds.numpy())
        all_targets.append(y.numpy())

# Combine all batches
y_pred = np.vstack(all_preds).flatten()
y_true = np.vstack(all_targets).flatten()

# Metrics
mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f'MAE: {mae:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R² Score: {r2:.4f}')


MAE: 101482.95
RMSE: 192940.75
R² Score: 0.7477


In [10]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_true, y=y_pred, mode='markers', name='Predicted vs Actual'))
fig.add_trace(go.Scatter(x=y_true, y=y_true, mode='lines', name='Ideal Line'))
fig.update_layout(title='Actual vs Predicted Prices',
                  xaxis_title='Actual Price',
                  yaxis_title='Predicted Price',
                  height=500)
fig.show()
