In [None]:
# !pip install pycaret plotly 

import numpy as np
import pandas as pd
import os
import plotly.express as px
import re

from plotly.subplots import make_subplots
import re
import warnings
import kagglehub

warnings.filterwarnings('ignore')
# Manual Modeling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report



Dataset columns and their role in the model:

| Column Name            | Use for Model? | Type        |
| ---------------------- | -------------- | ----------- |
| Car Company Names      | Yes            | Categorical |
| Car Models             | Yes            | Categorical |
| Engine Types           | Yes            | Categorical |
| CC/Battery Capacity    | Yes            | Numeric     |
| Horsepower (HP)        | Yes            | Numeric     |
| Top Speed              | Yes            | Numeric     |
| 0-100 km/h Performance | Yes            | Numeric     |
| Price (in USD)         | **Target**     | Numeric     |
| Fuel Type              | Yes            | Categorical |
| Seating Capacity       | Yes            | Numeric     |
| Torque                 | Yes            | Numeric     |


In [None]:
# Fetch the dataset from Kaggle, then load its CSV file into a DataFrame.
path = kagglehub.dataset_download("abdulmalik1518/cars-datasets-2025")
csv_path = os.path.join(path, "Cars Datasets 2025.csv")
# The file is decoded as Latin-1 instead of the pandas default encoding.
df = pd.read_csv(filepath_or_buffer=csv_path, encoding="latin1")

# Report (rows, columns) as a quick sanity check.
print(f"Dataset loaded: {df.shape}")


In [None]:
# Summary statistics, transposed so that each column becomes one row.
df.describe().transpose()

### 1. Prepare Data

Rename the columns to clear, descriptive feature names.

In [None]:
# Show the raw column headers as a plain Python list.
list(df.columns)

In [None]:
# Mapping from the raw CSV headers to short, consistent feature names.
column_renames = {
    'Company Names': 'Brand',
    'Cars Names': 'Model',
    'Engines': 'Engine_Type',
    'CC/Battery Capacity': 'Engine_CC',
    'HorsePower': 'HorsePower',
    'Total Speed': 'Top_Speed',
    'Performance(0 - 100 )KM/H': 'Acceleration_0_100',
    'Cars Prices': 'Price_USD',
    'Fuel Types': 'Fuel_Type',
    'Seats': 'Seating_Capacity',
    'Torque': 'Torque_Nm',
}
df = df.rename(columns=column_renames)

# Preview the first rows under the new names.
df.head()


### 2. Data Cleaning

In this stage we clean the columns and their formats:


In [None]:
# Number of missing values in each column.
print(df.isna().sum())

#### 2.1 Price column

Reformat the values: remove the $ sign and "," separators, and replace ranges such as 10000-20000 with their average price.

In [None]:
# Raw prices are strings; drop missing values before sorting because
# sorted() cannot compare NaN (a float) with str and would raise TypeError.
sorted(df['Price_USD'].dropna().unique())


In [None]:
def clean_price(value):
    """Convert a raw price value into a single float.

    The ``$`` sign and ``,`` thousands separators are removed. When the text
    holds several amounts (for example a range such as ``"$10,000-$20,000"``
    or ``"$10,000/$20,000"``), the mean of the amounts is returned. Decimal
    points are preserved, so an already-numeric value such as ``25000.0`` is
    returned unchanged and the function can safely be applied twice.

    Args:
        value: Raw cell value; a string, a number, or a missing value.

    Returns:
        The price as a float, or NaN when the value is missing or contains
        no digits.
    """
    if pd.isna(value):
        return np.nan

    text = str(value).replace('$', '').replace(',', '').strip()

    amounts = []
    # Any run of characters that is not a digit, a dot or whitespace is taken
    # as a separator between amounts ("-", "/", a mis-decoded dash, words...).
    for part in re.split(r'[^\d.\s]+', text):
        # Whitespace inside one amount is ignored ("18 000" -> "18000").
        compact = re.sub(r'\s+', '', part)
        match = re.search(r'\d+(?:\.\d+)?', compact)
        if match:
            amounts.append(float(match.group()))

    if not amounts:
        return np.nan
    return sum(amounts) / len(amounts)

# Parse every raw price, then discard the rows whose price could not be parsed.
parsed_prices = df['Price_USD'].map(clean_price)
df['Price_USD'] = parsed_prices
df = df.dropna(subset=['Price_USD'])


##### Price outlier check

In [None]:
# Keep only rows whose price lies between the 1st and 99th percentiles
# (both bounds inclusive).
Q_low, Q_high = df['Price_USD'].quantile([0.01, 0.99])
df = df[df['Price_USD'].between(Q_low, Q_high)]

print(f"After removing outliers: {len(df)} rows")
print(f"Price range: ${df['Price_USD'].min():,.0f} - ${df['Price_USD'].max():,.0f}")


#### 2.2 Clean Numeric Features

##### Clean the numeric columns by removing symbols and units in
  ['Engine_CC', 'HorsePower', 'Torque_Nm', 'Acceleration_0_100', 'Top_Speed', 'Seating_Capacity']


In [None]:
# Distinct raw engine-capacity values, in order of first appearance.
pd.unique(df['Engine_CC'])


In [None]:
# Raw torque values are strings; drop missing values before sorting because
# sorted() cannot compare NaN (a float) with str and would raise TypeError.
sorted(df['Torque_Nm'].dropna().unique())


In [None]:
# Raw horsepower values are strings; drop missing values before sorting because
# sorted() cannot compare NaN (a float) with str and would raise TypeError.
sorted(df['HorsePower'].dropna().unique())


In [None]:
# Distinct raw 0-100 km/h values, in order of first appearance.
pd.unique(df['Acceleration_0_100'])

In [None]:
# Raw top-speed values are strings; drop missing values before sorting because
# sorted() cannot compare NaN (a float) with str and would raise TypeError.
sorted(df['Top_Speed'].dropna().unique())

In [None]:
# Raw seat counts are strings; drop missing values before sorting because
# sorted() cannot compare NaN (a float) with str and would raise TypeError.
sorted(df['Seating_Capacity'].dropna().unique())

In [None]:
def extract_number(value):
    """Return the first number found in ``value`` as a float.

    The value is lower-cased, the unit tokens ``cc``, ``hp``, ``km/h``,
    ``nm``, ``sec`` and any commas are removed, and the first integer or
    decimal number in what remains is returned.

    Args:
        value: Raw cell value; may be missing.

    Returns:
        The first number as a float, or NaN when the value is missing or
        holds no digits.
    """
    if pd.isna(value):
        return np.nan

    text = str(value).lower()
    # Tokens are removed in this fixed order.
    for token in ('cc', 'hp', 'km/h', 'nm', 'sec', ','):
        text = text.replace(token, '')

    first = re.search(r'\d+\.?\d*', text)
    return float(first.group()) if first else np.nan

# Columns that hold a number plus a unit or other text in the raw data.
numeric_cols = ['Engine_CC', 'HorsePower', 'Torque_Nm', 'Acceleration_0_100', 'Top_Speed', 'Seating_Capacity']
for col in numeric_cols:
    # Parse each cell to a float, then fill unparsable cells with the column median.
    parsed = df[col].apply(extract_number)
    df[col] = parsed
    df[col] = df[col].fillna(parsed.median())


#### 2.3 Clean Categorical Features

##### Clean and reformat the categorical features: normalise the formatting and group columns that have many sub-categories, such as [Brand, Fuel_Type]

In [None]:
# Drop missing values before sorting because sorted() cannot compare
# NaN (a float) with str and would raise TypeError.
sorted(df['Brand'].dropna().unique())

In [None]:
# Drop missing values before sorting because sorted() cannot compare
# NaN (a float) with str and would raise TypeError.
sorted(df['Fuel_Type'].dropna().unique())

In [None]:
# Trim surrounding whitespace, then title-case every brand name.
brand_trimmed = df['Brand'].str.strip()
df['Brand'] = brand_trimmed.str.title()

def categorize_brand(brand):
    """Assign a brand name to the 'Luxury', 'Premium' or 'Economy' tier.

    Matching is a case-insensitive substring test against fixed keyword
    lists; 'Luxury' keywords are checked before 'Premium' ones.

    Args:
        brand: Brand name; may be missing.

    Returns:
        'Luxury', 'Premium', or 'Economy' (also used for missing values and
        for brands that match no keyword).
    """
    if pd.isna(brand):
        return 'Economy'

    name = str(brand).upper()
    tiers = (
        ('Luxury', ('FERRARI', 'LAMBORGHINI', 'ROLLS', 'BENTLEY', 'BUGATTI', 'MASERATI', 'ASTON', 'MCLAREN')),
        ('Premium', ('MERCEDES', 'BMW', 'AUDI', 'PORSCHE', 'LEXUS', 'JAGUAR', 'LAND ROVER', 'VOLVO', 'TESLA')),
    )
    for tier, keywords in tiers:
        for keyword in keywords:
            if keyword in name:
                return tier
    return 'Economy'

# Derive the brand tier for every row.
brand_tiers = df['Brand'].map(categorize_brand)
df['Brand_Category'] = brand_tiers

def clean_fuel(fuel):
    """Collapse a raw fuel description into one of four labels.

    The lower-cased text is tested for marker substrings in a fixed order:
    hybrid markers first, then electric, then diesel. Anything else,
    including a missing value, is labelled 'Petrol'.

    Args:
        fuel: Raw fuel-type value; may be missing.

    Returns:
        'Hybrid', 'Electric', 'Diesel' or 'Petrol'.
    """
    if pd.isna(fuel):
        return 'Petrol'

    text = str(fuel).lower()
    rules = (
        ('Hybrid', ('hybrid', 'plug')),
        ('Electric', ('electric', 'ev')),
        ('Diesel', ('diesel',)),
    )
    for label, markers in rules:
        if any(marker in text for marker in markers):
            return label
    return 'Petrol'

# Replace each raw fuel description with its consolidated label.
fuel_labels = df['Fuel_Type'].map(clean_fuel)
df['Fuel_Type'] = fuel_labels


### EDA

In [None]:
import plotly.express as px

# Count the cars in each brand tier and name the two resulting columns.
brand_counts = (
    df['Brand_Category']
    .value_counts()
    .rename_axis('Brand_Category')
    .reset_index(name='Count')
)

# Bar chart of the tier counts.
bar_options = dict(
    x='Brand_Category',
    y='Count',
    title='Cars by Brand Category',
    labels={'Brand_Category':'Brand','Count':'Count'},
)
fig = px.bar(brand_counts, **bar_options)
fig.show()

In [None]:
# Histogram of the cleaned prices using 30 bins.
fig = px.histogram(
    data_frame=df,
    x='Price_USD',
    nbins=30,
    title='Price Distribution',
)
fig.show()


In [None]:
# Count the cars per consolidated fuel label and name the two resulting columns.
fuel_counts = (
    df['Fuel_Type']
    .value_counts()
    .rename_axis('Fuel_Type')
    .reset_index(name='Count')
)

# Bar chart of the fuel-type counts.
bar_options = dict(
    x='Fuel_Type',
    y='Count',
    title='Fuel Types Distribution',
    labels={'Fuel_Type':'Fuel Type', 'Count':'Count'},
)
fig = px.bar(fuel_counts, **bar_options)
fig.show()


In [None]:
# Feature lists used for modelling.
features_numeric = numeric_cols
features_cat = ['Brand_Category', 'Fuel_Type']

# Keep only the model inputs and the target, as an independent copy.
model_columns = [*features_numeric, *features_cat, 'Price_USD']
df_clean = df.loc[:, model_columns].copy()
df_clean.head()


### PyCaret

In [None]:
from pycaret.regression import setup, compare_models

# Experiment configuration: 80% of the rows for training, fixed seed,
# with normalisation and transformation of the features switched on.
setup_options = dict(
    data=df_clean,
    target='Price_USD',
    numeric_features=features_numeric,
    categorical_features=features_cat,
    train_size=0.8,
    session_id=42,
    normalize=True,
    transformation=True,
)
reg = setup(**setup_options)

# Compare the candidate regressors, ranked by R2, and keep the winner.
best_model = compare_models(sort='R2')
print("Best Model Selected by PyCaret:", best_model)


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 20% of the rows for testing.
y = df_clean['Price_USD']
X = df_clean.drop(columns='Price_USD')
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Print the selected estimator, then display its class as the cell output.
print(best_model)
type(best_model)


In [None]:
# Display the dtype of every feature column in X.
X.dtypes


In [None]:
import pandas as pd

# One-hot encode the categorical features; the first level of each one is
# dropped (drop_first=True).
X = pd.get_dummies(data=X, drop_first=True)


In [None]:
from sklearn.model_selection import train_test_split

# Split the current X and y again with the same 80/20 ratio and seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


 ### Evaluation

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

# Hyper-parameter grid for k-nearest-neighbours regression.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# The grid above is only valid for a KNN regressor. `best_model` is whatever
# compare_models() ranked first, so when it is a different estimator fall back
# to a fresh KNeighborsRegressor instead of letting GridSearchCV fail with
# "Invalid parameter".
knn_estimator = (
    best_model
    if isinstance(best_model, KNeighborsRegressor)
    else KNeighborsRegressor()
)

# 5-fold search; R2 is also the default score of a regressor, stated
# explicitly here to match the later cross-validation cell.
grid_search = GridSearchCV(
    estimator=knn_estimator,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best Params:", grid_search.best_params_)
best_tuned_model = grid_search.best_estimator_


In [None]:
from pycaret.regression import finalize_model

# Produce PyCaret's finalized version of the selected model.
final_model = finalize_model(estimator=best_model)


In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validated R2 of the tuned model on the training split.
cv_options = dict(cv=5, scoring='r2')
cv_scores = cross_val_score(best_tuned_model, X_train, y_train, **cv_options)

print("5-Fold CV R2 scores:", cv_scores)
print("Mean R2:", np.mean(cv_scores))


### Evaluate Performance (Testing)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Predict prices for the held-out test rows.
y_pred = best_tuned_model.predict(X_test)

# Compare the predictions with the true test prices.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Test R2: {r2:.4f}")
print(f"Test MAE: {mae:.2f}")
print(f"Test RMSE: {rmse:.2f}")


In [None]:
# predict_model was never imported anywhere in the notebook, so it is imported
# here (compare_models is re-imported so the cell does not rely on an earlier one).
from pycaret.regression import compare_models, predict_model

# Ask PyCaret for the three best models ranked by R2 and keep the top one.
best_models = compare_models(n_select=3, sort='R2', turbo=True)
best_model = best_models[0]

# Generate predictions with the top model and preview the result.
predictions = predict_model(best_model)
predictions.head()


In [None]:
# from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# r2 = r2_score(predictions['Price_USD'], predictions['prediction_label'])
# mae = mean_absolute_error(predictions['Price_USD'], predictions['prediction_label'])
# rmse = np.sqrt(mean_squared_error(predictions['Price_USD'], predictions['prediction_label']))

# print(f"Best Model: {type(best_model).__name__}")
# print(f"R² Score: {r2:.4f}")
# print(f"MAE: {mae:,.2f}")
# print(f"RMSE: {rmse:,.2f}")


In [None]:
# save_model was never imported anywhere in the notebook, so it is imported here.
from pycaret.regression import save_model

# Persist the selected model under the base name 'car_price_model'.
save_model(best_model, 'car_price_model')
print("✓ Model saved as 'car_price_model.pkl'")
