In [None]:
# Import libraries
import math
import os
import pickle

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, MinMaxScaler

: 

In [None]:
# Define paths
# Both locations are resolved against the directory the kernel was started in.
_cwd = os.getcwd()

USED_CAR = os.path.join(_cwd, 'malaysia_used_cars.csv')  # input dataset (CSV)
MODELS = os.path.join(_cwd, 'models')                    # folder for pickled artifacts

In [None]:
# Step 1: Data Loading
# Read the CSV at USED_CAR into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=USED_CAR)
df.head(n=5)

In [None]:
# Print dtypes / non-null counts, then show summary statistics with one row per column.
df.info()
df.describe().transpose()

In [None]:
# Columns handled as categorical features by the plotting and encoding cells.
categ = [
    "make", "model", "trim", "car_type",
    "transmission", "fuel_type", "engine_cc", "is_turbo",
    "origin_country", "location", "condition",
]

# Columns handled as continuous; the last entry is the prediction target.
conti = [
    "year", "mileage", "battery_kWh",
    "retail_price(RM)", "current_price(RM)",
]

In [None]:
# Missing data inspection
# A notebook cell only renders its *last* expression, so the per-column
# missing counts are printed explicitly instead of being silently discarded.
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)

# Duplicate inspection: number of fully duplicated rows.
duplicate_rows = int(df.duplicated().sum())
print(f"\nDuplicated rows: {duplicate_rows}")

In [None]:
# Listing counts per make, split by trim; legend is moved outside the plot area.
ax = sns.countplot(data=df, y='make', hue='trim')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Step 2: Data inspection
# Draw one horizontal bar chart of value frequencies per categorical column,
# arranged on a grid that is n_cols wide.
n_cols = 2
n_rows = math.ceil(len(categ) / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 5))
axes = axes.flatten()

for ax, col in zip(axes, categ):
    counts = df[col].value_counts()

    counts.plot(kind='barh', ax=ax, color='steelblue')
    ax.set_title(f'{col}', fontweight='bold', fontsize=12)
    ax.set_xlabel('Count')
    ax.set_ylabel('')

    # Annotate each bar with its count, just to the right of the bar end.
    for bar_pos, count in enumerate(counts):
        ax.text(count + 0.5, bar_pos, str(count), va='center')

# Any grid slots beyond the number of categorical columns stay empty; hide them.
for unused_ax in axes[len(categ):]:
    unused_ax.set_visible(False)

plt.suptitle('Categorical Variables Distribution', fontsize=16, fontweight='bold')
plt.show()

In [None]:
# For every continuous column draw a boxplot stacked on top of a histogram
# (with KDE), laid out on a grid n_cols wide. n_cols is defined in the
# categorical-distribution cell above.
n_rows = math.ceil(len(conti) / n_cols)

fig = plt.figure(figsize=(n_cols * 5, n_rows * 5))

# Adjust these values to control spacing
horizontal_gap = 0.15  # Gap between columns (increase for more space)
vertical_gap = 0.12    # Gap between rows (increase for more space)

for idx, col in enumerate(conti):
    # Position of this variable's panel in the outer grid
    row, col_pos = divmod(idx, n_cols)

    # Each panel is its own 2x1 GridSpec (boxplot above, histogram below),
    # placed in figure-fraction coordinates with half a gap on every side.
    gs = gridspec.GridSpec(2, 1, height_ratios=[1, 4],
                           hspace=0.05,
                           left=col_pos/n_cols + horizontal_gap/(2*n_cols),
                           right=(col_pos+1)/n_cols - horizontal_gap/(2*n_cols),
                           bottom=1 - (row+1)/n_rows + vertical_gap/(2*n_rows),
                           top=1 - row/n_rows - vertical_gap/(2*n_rows))

    # Boxplot
    ax_box = fig.add_subplot(gs[0])
    sns.boxplot(x=df[col], ax=ax_box, color='lightblue')
    ax_box.set_xlabel('')
    # Hide tick labels on the boxplot only. set_xticklabels([]) would replace
    # the tick formatter, which is shared with the histogram through sharex
    # and would therefore blank the histogram's x tick labels as well.
    ax_box.tick_params(axis='x', labelbottom=False)

    # Calculate statistics
    mean_val = df[col].mean()
    median_val = df[col].median()

    ax_box.set_title(f'{col} (Mean: {mean_val:.2f})', fontweight='bold', fontsize=12)

    # Histogram (shares the x axis with the boxplot so both use the same range)
    ax_hist = fig.add_subplot(gs[1], sharex=ax_box)
    sns.histplot(df[col], kde=True, bins=30, ax=ax_hist)

    # Add mean and median lines
    ax_hist.axvline(mean_val, color='red', linestyle='--', linewidth=2, alpha=0.7, label=f'Mean: {mean_val:.2f}')
    ax_hist.axvline(median_val, color='green', linestyle='--', linewidth=2, alpha=0.7, label=f'Median: {median_val:.2f}')
    ax_hist.legend(fontsize=9)
    ax_hist.set_xlabel(col)

plt.suptitle('Distribution of Continuous Variables', fontsize=16, fontweight='bold', y=1.01)
plt.show()

In [None]:
# Joint histogram of each continuous column against the current price.
for feature in conti:
    sns.jointplot(x=feature, y="current_price(RM)", data=df, kind="hist")
    plt.show()

In [None]:
# One figure per categorical column: distribution of current price per category level.
for cat in categ:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df, x=cat, y="current_price(RM)", ax=ax)
    ax.set_title(f'Box Plot: {cat} vs current_price(RM)', fontweight='bold')
    ax.set_xlabel(cat)
    ax.set_ylabel('current_price(RM)')
    plt.xticks(rotation=45, ha='right')  # slanted labels stay readable when long
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    plt.show()

In [None]:
# Step 3: Data Cleaning

# Label encode categorical features
# 'engine_cc' is excluded from encoding. The list is rebuilt rather than
# mutated with list.remove(): remove() returns None (so bin_categ was None)
# and raises ValueError when this cell is run a second time.
categ = [name for name in categ if name != 'engine_cc']
bin_categ = categ

# Work on a copy so the raw frame `df` stays untouched.
dft = df.copy()

# Encoders are pickled below; make sure the target folder exists once up front.
os.makedirs(MODELS, exist_ok=True)

for col in categ:

    # Fill missing categories with the most frequent value. Assign the result
    # back instead of calling fillna(inplace=True) on `dft[col]`, which is a
    # chained assignment and has no effect under pandas Copy-on-Write.
    if dft[col].isna().any():
        dft[col] = dft[col].fillna(dft[col].mode()[0])

    # LabelEncoder maps every distinct value to one integer code. The previous
    # LabelBinarizer yields a 2-D indicator matrix for columns with more than
    # two classes, which cannot be stored in a single column.
    le = LabelEncoder()
    dft[col] = le.fit_transform(dft[col].astype(str))  # cast to text for a consistent key type

    # Persist the fitted encoder so the same mapping can be reused later.
    with open(os.path.join(MODELS, f'{col}_encoder.pkl'), 'wb') as f:
        pickle.dump(le, f)

In [None]:
# Fill missing values in every continuous column with that column's mean.
# The result is assigned back instead of using fillna(inplace=True) on
# `dft[con]`: that is a chained assignment, which pandas warns about and which
# does not modify `dft` under Copy-on-Write.
# NOTE(review): `conti` includes the target "current_price(RM)", so missing
# targets are mean-imputed too — confirm this is intended rather than dropping
# those rows.
for con in conti:
    dft[con] = dft[con].fillna(dft[con].mean())

In [None]:
# Separate predictors from the target.
X = dft.drop(["current_price(RM)"], axis=1)
y = dft["current_price(RM)"]

# --- Continuous vs Continuous (Correlation with Target) ---
print("\n--- Continuous vs Continuous Correlation ---")
df_conti = dft[conti].drop(["current_price(RM)"], axis=1)
cor = df_conti.corrwith(y).sort_values(ascending=False)  # correlation with target, strongest positive first
print(cor)

# Visualize correlation with target
plt.figure(figsize=(8, 6))
sns.barplot(x=cor.values, y=cor.index)
plt.title('Feature Correlation with Target (Continuous)')
plt.xlabel('Correlation Coefficient')
plt.show()

# Select continuous features whose absolute correlation exceeds the threshold
threshold = 0.3
sel_features = cor[cor.abs() > threshold].index.tolist()

# --- Continuous vs Categorical (via Linear Regression) ---
# Each encoded categorical column is scored by the R² of a one-feature
# linear regression against the target.
print("\n--- Continuous vs Categorical ---")
for i in categ:
    single_feature = np.expand_dims(dft[i], axis=-1)  # column vector expected by scikit-learn
    lr = LinearRegression()
    lr.fit(single_feature, y)
    score = lr.score(single_feature, y)
    print(f"{i} : {score:.3f}")
    if score > 0.05:  # low threshold for categorical influence
        sel_features.append(i)

# Remove duplicates while keeping insertion order. list(set(...)) would order
# the names by string hash, which changes between kernel sessions and so
# changes the column order the model is trained on.
sel_features = list(dict.fromkeys(sel_features))

print("\n✅ Selected Features:")
print(sel_features)

In [None]:
# Step 4: Features selection

# Keep only the columns chosen in the feature-relevance cell.
X = X[sel_features]

# Split data first, so the scaler below only ever sees training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Initialize scaler
scaler = MinMaxScaler()

# Fit on the training split and apply the same transform to both splits.
# Previously the scaler was applied to `dft` after `X` had already been copied
# out of it, so the scaled values never reached X_train / X_test, and it was
# fit on all rows (including the future test rows).
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# The model is now trained on scaled inputs, so persist the fitted scaler next
# to the encoders for reuse at prediction time.
os.makedirs(MODELS, exist_ok=True)
with open(os.path.join(MODELS, 'scaler.pkl'), 'wb') as f:
    pickle.dump(scaler, f)

In [None]:
# Display the names of the feature columns currently held in X.
X.columns

In [None]:
# Step 5: Training & Prediction

# Fit a random forest regressor (fixed seed) on the training split.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out split.
final_pred = model.predict(X_test)

# Hold-out error metrics.
mae = mean_absolute_error(y_test, final_pred)
mse = mean_squared_error(y_test, final_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, final_pred)

print(f"MAE: {mae:.2f}\nRMSE: {rmse:.2f}\nR²: {r2:.3f}")

In [None]:
def _regression_metrics(actual, predicted):
    """Return (MAE, RMSE, R²) computed from actual and predicted values."""
    return (
        mean_absolute_error(actual, predicted),
        np.sqrt(mean_squared_error(actual, predicted)),
        r2_score(actual, predicted),
    )


banner = "=" * 70
print("\n" + banner)
print("MODEL PERFORMANCE METRICS")
print(banner)

# Predictions for both splits
train_pred = model.predict(X_train)
final_pred = model.predict(X_test)

# Metrics on the training and test splits
train_mae, train_rmse, train_r2 = _regression_metrics(y_train, train_pred)
test_mae, test_rmse, test_r2 = _regression_metrics(y_test, final_pred)

# Additional test-set quantities: raw residuals and mean absolute percentage error
residuals = y_test - final_pred
mape = np.mean(np.abs((y_test - final_pred) / y_test)) * 100

print("\nTRAINING SET:")
print(f"  MAE:  {train_mae:.2f}")
print(f"  RMSE: {train_rmse:.2f}")
print(f"  R²:   {train_r2:.3f}")

print("\nTEST SET:")
print(f"  MAE:  {test_mae:.2f}")
print(f"  RMSE: {test_rmse:.2f}")
print(f"  R²:   {test_r2:.3f}")
print(f"  MAPE: {mape:.2f}%")

# A train R² more than 0.1 above the test R² is reported as possible overfitting.
print("\nOVERFITTING CHECK:")
print(f"R² Difference: {train_r2 - test_r2:.3f}")
verdict = (
    "Possible overfitting detected!"
    if (train_r2 - test_r2) > 0.1
    else "Model generalizes well"
)
print(verdict)

In [None]:
# Save model
# Pickle the fitted regressor into the MODELS folder, creating the folder if needed.
os.makedirs(MODELS, exist_ok=True)
model_path = os.path.join(MODELS, 'RF_regression.pkl')
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Step 6: Visualization

# Scatter of test-set actual vs predicted prices; the dashed diagonal marks
# where predictions equal the actual values.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=final_pred, alpha=0.6, ax=ax)

price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range,
        color='red', linestyle='--', label='Perfect Prediction')

ax.set_xlabel("Actual Price (RM)")
ax.set_ylabel("Predicted Price (RM)")
ax.set_title("Actual vs Predicted Prices")
ax.legend()
ax.grid(True)
plt.show()