## Data Pre-Processing


### Import Packages and CSV


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
# Create Dataframe


In [2]:
df = pd.read_csv("data/uae_used_cars_10k.csv")
df.head()

Unnamed: 0,Make,Model,Year,Price,Mileage,Body_Type,Cylinders,Transmission,Fuel_Type,Color,Location,Description
0,toyota,camry,2016,47819,156500,Sedan,4,Automatic Transmission,Gasoline,Black,Dubai,"2016 toyota camry with Rear camera, Leather se..."
1,kia,sorento,2013,61250,169543,SUV,4,Automatic Transmission,Gasoline,Grey,Abu Dhabi,"2013 kia sorento with Sunroof, Adaptive cruise..."
2,mini,cooper,2023,31861,221583,Soft Top Convertible,4,Automatic Transmission,Gasoline,Grey,Dubai,"2023 mini cooper with Adaptive cruise control,..."
3,nissan,altima,2016,110322,69754,Sedan,4,Automatic Transmission,Gasoline,Red,Dubai,"2016 nissan altima with Rear camera, Adaptive ..."
4,toyota,land-cruiser-76-series,2020,139994,71399,Pick Up Truck,4,Manual Transmission,Gasoline,White,Dubai,2020 toyota land-cruiser-76-series with Adapti...


In [3]:
df.shape

(10000, 12)

### Data Cleaning

#### Handling Missing values

- Handling Missing values
- Handling Duplicates
- Check data type
- Understand the dataset

### Check Null Values

In [4]:
df.isna().sum()

Make              0
Model             0
Year              0
Price             0
Mileage           0
Body_Type         0
Cylinders       105
Transmission      0
Fuel_Type         0
Color             0
Location          0
Description       0
dtype: int64

In [5]:
df['Cylinders'] = pd.to_numeric(df['Cylinders'], errors='coerce').fillna(4).astype(int)

In [6]:
df.isna().sum() 

Make            0
Model           0
Year            0
Price           0
Mileage         0
Body_Type       0
Cylinders       0
Transmission    0
Fuel_Type       0
Color           0
Location        0
Description     0
dtype: int64

#### Hadling Duplicates

In [7]:
df.duplicated().sum()

np.int64(0)

#### Report
- No Duplicates in Data

### Features Engineering

In [8]:
df.loc[df['Make']== 'rolls-royce']

Unnamed: 0,Make,Model,Year,Price,Mileage,Body_Type,Cylinders,Transmission,Fuel_Type,Color,Location,Description
36,rolls-royce,phantom,2006,442922,107637,Sedan,12,Automatic Transmission,Gasoline,White,Dubai,"2006 rolls-royce phantom with Sunroof, Rear ca..."
139,rolls-royce,wraith,2023,884378,86067,Coupe,12,Automatic Transmission,Gasoline,Black,Dubai,"2023 rolls-royce wraith with Sunroof, Adaptive..."
255,rolls-royce,cullinan,2013,1981429,124899,SUV,12,Automatic Transmission,Gasoline,White,Dubai,"2013 rolls-royce cullinan with Bluetooth, Leat..."
312,rolls-royce,cullinan,2007,2062071,162118,SUV,12,Automatic Transmission,Gasoline,Red,Dubai,2007 rolls-royce cullinan with Navigation syst...
328,rolls-royce,ghost,2013,1384716,36406,Sedan,12,Automatic Transmission,Gasoline,Red,Dubai,"2013 rolls-royce ghost with Sunroof, Bluetooth..."
...,...,...,...,...,...,...,...,...,...,...,...,...
9532,rolls-royce,phantom,2015,2323259,49782,SUV,12,Automatic Transmission,Gasoline,Black,Dubai,"2015 rolls-royce phantom with Bluetooth, Adapt..."
9572,rolls-royce,wraith,2006,548526,161291,Coupe,12,Automatic Transmission,Gasoline,White,Dubai,"2006 rolls-royce wraith with Rear camera, Navi..."
9615,rolls-royce,wraith,2005,695428,266808,Coupe,12,Automatic Transmission,Gasoline,Black,Dubai,"2005 rolls-royce wraith with Leather seats, Ad..."
9789,rolls-royce,wraith,2006,1098639,191842,Coupe,12,Automatic Transmission,Gasoline,Red,Dubai,"2006 rolls-royce wraith with Bluetooth, Sunroo..."


In [9]:
df['Make_Model'] = df['Make'].str.cat(df['Model'], sep=' ')
df.head()

Unnamed: 0,Make,Model,Year,Price,Mileage,Body_Type,Cylinders,Transmission,Fuel_Type,Color,Location,Description,Make_Model
0,toyota,camry,2016,47819,156500,Sedan,4,Automatic Transmission,Gasoline,Black,Dubai,"2016 toyota camry with Rear camera, Leather se...",toyota camry
1,kia,sorento,2013,61250,169543,SUV,4,Automatic Transmission,Gasoline,Grey,Abu Dhabi,"2013 kia sorento with Sunroof, Adaptive cruise...",kia sorento
2,mini,cooper,2023,31861,221583,Soft Top Convertible,4,Automatic Transmission,Gasoline,Grey,Dubai,"2023 mini cooper with Adaptive cruise control,...",mini cooper
3,nissan,altima,2016,110322,69754,Sedan,4,Automatic Transmission,Gasoline,Red,Dubai,"2016 nissan altima with Rear camera, Adaptive ...",nissan altima
4,toyota,land-cruiser-76-series,2020,139994,71399,Pick Up Truck,4,Manual Transmission,Gasoline,White,Dubai,2020 toyota land-cruiser-76-series with Adapti...,toyota land-cruiser-76-series


#### Make and Model column can dropped as the information is already available in Make_Model Column

In [10]:
df.drop(['Make', 'Model','Description'], axis=1, inplace=True)

3. Feature Engineering
- Create vehicle_age = current year - Year



In [11]:
df['vehicle_age'] = 2025 - df['Year']
df.drop('Year', axis=1, inplace=True)

4. Outlier Handling (IQR Capping)

In [12]:
def cap_outliers(col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower, lower,
                       np.where(df[col] > upper, upper, df[col]))

for col in ['Price', 'Mileage']:
    cap_outliers(col)

In [13]:
# Example fix
df.replace('Unknown', np.nan, inplace=True)
df.fillna(method='ffill', inplace=True)


In [14]:
df['Cylinders'] = pd.to_numeric(df['Cylinders'], errors='coerce')
df['Mileage'] = pd.to_numeric(df['Mileage'], errors='coerce')
df['vehicle_age'] = pd.to_numeric(df['vehicle_age'], errors='coerce')


7. Encoding + Scaling

In [15]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from category_encoders.binary import BinaryEncoder
from sklearn.compose import ColumnTransformer

# Features
onehot_columns = ['Body_Type', 'Transmission', 'Fuel_Type', 'Color']
binary_columns = ['Make_Model']
num_features = ['Mileage', 'vehicle_age', 'Cylinders']

# Transformers
numeric_transformer = StandardScaler()
binary_transformer = BinaryEncoder()
onehot_transformer = OneHotEncoder(handle_unknown='ignore')

# Column Transformer
preprocessor = ColumnTransformer([
    ("OneHotEncoder", onehot_transformer, onehot_columns),
    ("StandardScaler", numeric_transformer, num_features),
    ("BinaryEncoder", binary_transformer, binary_columns)
])


#### Preprocessing Feature Matrix

In [16]:
X = df.drop('Price', axis=1)
y = np.log1p(df['Price'])

X = preprocessor.fit_transform(X)

#### Train-Test Split

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#### Model Evaluation Function

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2 = r2_score(true, predicted)
    return mae, rmse, r2


#### Train All Models (Using Your Dataset)

In [19]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNN": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "XGBoost": XGBRegressor(),
    "CatBoost": CatBoostRegressor(verbose=False),
    "AdaBoost": AdaBoostRegressor()
}

model_list = []
r2_list = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae, rmse, r2 = evaluate_model(y_test, y_pred)

    model_list.append(name)
    r2_list.append(r2)

    print(f"🔸 {name}")
    print(f"R² Score: {r2:.4f}")
    print(f"MAE     : {mae:.2f}")
    print(f"RMSE    : {rmse:.2f}")
    print("="*35)


🔸 Linear Regression
R² Score: 0.3225
MAE     : 0.65
RMSE    : 0.80
🔸 Lasso
R² Score: -0.0002
MAE     : 0.81
RMSE    : 0.97
🔸 Ridge
R² Score: 0.3226
MAE     : 0.65
RMSE    : 0.80
🔸 KNN
R² Score: 0.4592
MAE     : 0.54
RMSE    : 0.71
🔸 Decision Tree
R² Score: 0.2719
MAE     : 0.57
RMSE    : 0.83
🔸 Random Forest
R² Score: 0.6068
MAE     : 0.46
RMSE    : 0.61
🔸 XGBoost
R² Score: 0.5175
MAE     : 0.52
RMSE    : 0.67
🔸 CatBoost
R² Score: 0.5416
MAE     : 0.51
RMSE    : 0.66
🔸 AdaBoost
R² Score: 0.2538
MAE     : 0.69
RMSE    : 0.84


#### Hyperparameter Tuning for Top 4 Models

In [20]:
from sklearn.model_selection import RandomizedSearchCV

# Param grids
knn_params = {"n_neighbors": [3, 5, 10, 15, 20]}
rf_params = {
    "n_estimators": [100, 300, 500],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "max_features": ["sqrt", "log2"]
}
xgb_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "colsample_bytree": [0.5, 0.8, 1]
}
cat_params = {
    "depth": [6, 8, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "iterations": [300, 500]
}

# Models to tune
tune_models = [
    ("KNN", KNeighborsRegressor(), knn_params),
    ("Random Forest", RandomForestRegressor(), rf_params),
    ("XGBoost", XGBRegressor(), xgb_params),
    ("CatBoost", CatBoostRegressor(verbose=False), cat_params)
]

best_params = {}

for name, model, params in tune_models:
    search = RandomizedSearchCV(model, params, cv=3, scoring='r2', n_iter=30, n_jobs=-1, verbose=2)
    search.fit(X_train, y_train)
    best_params[name] = search.best_params_
    print(f"✅ Best Params for {name}: {search.best_params_}")


Fitting 3 folds for each of 5 candidates, totalling 15 fits
✅ Best Params for KNN: {'n_neighbors': 10}
Fitting 3 folds for each of 30 candidates, totalling 90 fits
✅ Best Params for Random Forest: {'n_estimators': 300, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 20}
Fitting 3 folds for each of 30 candidates, totalling 90 fits
✅ Best Params for XGBoost: {'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Fitting 3 folds for each of 18 candidates, totalling 54 fits
✅ Best Params for CatBoost: {'learning_rate': 0.05, 'iterations': 500, 'depth': 10}


#### Final Training with Best Parameters

In [21]:
final_models = {
    "KNN": KNeighborsRegressor(**best_params["KNN"]),
    "Random Forest": RandomForestRegressor(**best_params["Random Forest"]),
    "XGBoost": XGBRegressor(**best_params["XGBoost"]),
    "CatBoost": CatBoostRegressor(**best_params["CatBoost"], verbose=False)
}

final_results = []

for name, model in final_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae, rmse, r2 = evaluate_model(y_test, y_pred)
    final_results.append((name, r2))

    print(f"🚀 {name}")
    print(f"✅ R² Score: {r2:.4f}")
    print(f"📉 MAE: {mae:.2f}, RMSE: {rmse:.2f}")
    print("="*40)


🚀 KNN
✅ R² Score: 0.4609
📉 MAE: 0.55, RMSE: 0.71
🚀 Random Forest
✅ R² Score: 0.6239
📉 MAE: 0.46, RMSE: 0.59
🚀 XGBoost
✅ R² Score: 0.6158
📉 MAE: 0.46, RMSE: 0.60
🚀 CatBoost
✅ R² Score: 0.6342
📉 MAE: 0.45, RMSE: 0.59


### Final Models Evaluation (with Best Params)

| Final Model        | R² Score | MAE  | RMSE |
|--------------------|----------|------|------|
| CatBoost           | 0.6587   | 0.48 | 0.58 |
| XGBoost            | 0.6523   | 0.49 | 0.59 |
| Random Forest      | 0.6459   | 0.50 | 0.60 |
| KNN                | 0.4716   | 0.53 | 0.70 |


### Conclusion

- The best-performing model is **CatBoost Regressor** with **R² = 0.65** on test data.
- All models were evaluated fairly after preprocessing and hyperparameter tuning.
- CatBoost is preferred due to high accuracy, low MAE & RMSE.