In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [None]:
# 1. Load Dataset
df = pd.read_csv("/content/Cars_Datasets_2025[2].csv", encoding='latin1')
df.head()

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque
0,FERRARI,SF90 STRADALE,V8,3990 cc,963 hp,340 km/h,2.5 sec,"$1,100,000",plug in hyrbrid,2,800 Nm
1,ROLLS ROYCE,PHANTOM,V12,6749 cc,563 hp,250 km/h,5.3 sec,"$460,000",Petrol,5,900 Nm
2,Ford,KA+,1.2L Petrol,"1,200 cc",70-85 hp,165 km/h,10.5 sec,"$12,000-$15,000",Petrol,5,100 - 140 Nm
3,MERCEDES,GT 63 S,V8,"3,982 cc",630 hp,250 km/h,3.2 sec,"$161,000",Petrol,4,900 Nm
4,AUDI,AUDI R8 Gt,V10,"5,204 cc",602 hp,320 km/h,3.6 sec,"$253,290",Petrol,2,560 Nm


In [None]:
df.isnull().sum()

Unnamed: 0,0
Company Names,0
Cars Names,0
Engines,0
CC/Battery Capacity,3
HorsePower,0
Total Speed,0
Performance(0 - 100 )KM/H,6
Cars Prices,0
Fuel Types,0
Seats,0


In [None]:
# 2. Drop Missing Values
df.dropna(inplace=True)


In [None]:
# For numeric columns → fill with median (better for skewed data)
for col in ["CC/Battery Capacity", "Performance(0 - 100 )KM/H", "Torque"]:
    # Ensure the column is treated as string before applying string methods
    df[col] = df[col].astype(str).str.replace(r'[^\d.-]+', '', regex=True)
    df[col] = pd.to_numeric(df[col], errors='coerce')
    # Fill missing values with the median
    df[col] = df[col].fillna(df[col].median())

In [None]:
# 5. Fill Missing Categorical Columns with Mode
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])


In [None]:
# 4. Clean and Convert Torque Column
# Extract numeric part from strings like '250Nm@5000rpm'
df['Torque'] = df['Torque'].astype(str).str.extract(r'(\d+\.?\d*)')[0]
df['Torque'] = pd.to_numeric(df['Torque'], errors='coerce')

In [None]:
# Extract the first numeric part from Torque (handles commas and decimals)
df['Torque'] = df['Torque'].astype(str).str.extract(r'(\d+(?:,\d{3})*(?:\.\d+)?)')[0]

# Remove commas and convert to numeric
df['Torque'] = df['Torque'].str.replace(',', '', regex=False).astype(float)

In [None]:
df["Torque"].head()

Unnamed: 0,Torque
0,800.0
1,900.0
2,380.0
3,900.0
4,560.0


In [None]:
print("Data type:", df['Torque'].dtype)


Data type: float64


In [None]:
print("Any NaN in Torque after cleaning?", df['Torque'].isna().sum())


Any NaN in Torque after cleaning? 0


In [None]:
# 3. Encode Non-Numeric Columns
from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

In [None]:

# 4. Features & Target
target_column = "Torque"
X = df.drop(columns=[target_column])
y = df[target_column]

In [None]:
# 2. Check for NaN or infinite values in features
print("Any NaN in features?:", X.isna().sum().sum())
print("Any infinite values in features?:", np.isinf(X).sum().sum())

Any NaN in features?: 0
Any infinite values in features?: 0


In [None]:
print("Any NaN in target?:", y.isna().sum())
print("Any infinite values in target?:", np.isinf(y).sum())

Any NaN in target?: 0
Any infinite values in target?: 0


In [None]:
# 8. Set hyperparameter grid
from scipy.stats import randint

param_dist = {
    'n_estimators': randint(100, 301),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

In [None]:
# 9. Grid Search with Cross Validation
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=30,  # number of random combos to try
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=2,
    random_state=42
)

In [None]:
# 10. Fit grid search
random_search.fit(X, y)


Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [None]:
# 11. Best params and best score
print("Best hyperparameters:", random_search.best_params_)
print("Best RMSE score:", -random_search.best_score_)  # negate because greater_is_better=False


Best hyperparameters: {'max_depth': 20, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 222}
Best RMSE score: 14712.931032556085


In [None]:
from sklearn.ensemble import RandomForestRegressor
# 2. Extract best parameters from random_search
best_params = random_search.best_params_

In [None]:
# 3. Create model with best params
best_model = RandomForestRegressor(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_leaf=best_params['min_samples_leaf'],
    min_samples_split=best_params['min_samples_split'],
    random_state=42
)

In [None]:
# 4. Fit the model on full data
best_model.fit(X, y)

In [None]:
# 5. Predict target values on same dataset (or new data X_new)
y_pred = best_model.predict(X)

In [None]:
# 6. Print or inspect predictions
print("First 5 predictions:", y_pred[:5])

First 5 predictions: [925.98135278 896.6454106  268.13409481 903.30599383 580.39561103]
