In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Location of the raw CSV inside the Colab runtime
DATA_PATH = "/content/Cars_Datasets_2025[2].csv"

# The file is read as latin-1; preview the first rows to confirm the load worked
df = pd.read_csv(DATA_PATH, encoding='latin-1')
df.head()

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque
0,FERRARI,SF90 STRADALE,V8,3990 cc,963 hp,340 km/h,2.5 sec,"$1,100,000",plug in hyrbrid,2,800 Nm
1,ROLLS ROYCE,PHANTOM,V12,6749 cc,563 hp,250 km/h,5.3 sec,"$460,000",Petrol,5,900 Nm
2,Ford,KA+,1.2L Petrol,"1,200 cc",70-85 hp,165 km/h,10.5 sec,"$12,000-$15,000",Petrol,5,100 - 140 Nm
3,MERCEDES,GT 63 S,V8,"3,982 cc",630 hp,250 km/h,3.2 sec,"$161,000",Petrol,4,900 Nm
4,AUDI,AUDI R8 Gt,V10,"5,204 cc",602 hp,320 km/h,3.6 sec,"$253,290",Petrol,2,560 Nm


In [None]:
# Count missing values per column (isna is the canonical name for isnull)
df.isna().sum()

Unnamed: 0,0
Company Names,0
Cars Names,0
Engines,0
CC/Battery Capacity,3
HorsePower,0
Total Speed,0
Performance(0 - 100 )KM/H,6
Cars Prices,0
Fuel Types,0
Seats,0


In [None]:
# Preview the first five rows again before cleaning
df.head(n=5)

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque
0,FERRARI,SF90 STRADALE,V8,3990 cc,963 hp,340 km/h,2.5 sec,"$1,100,000",plug in hyrbrid,2,800 Nm
1,ROLLS ROYCE,PHANTOM,V12,6749 cc,563 hp,250 km/h,5.3 sec,"$460,000",Petrol,5,900 Nm
2,Ford,KA+,1.2L Petrol,"1,200 cc",70-85 hp,165 km/h,10.5 sec,"$12,000-$15,000",Petrol,5,100 - 140 Nm
3,MERCEDES,GT 63 S,V8,"3,982 cc",630 hp,250 km/h,3.2 sec,"$161,000",Petrol,4,900 Nm
4,AUDI,AUDI R8 Gt,V10,"5,204 cc",602 hp,320 km/h,3.6 sec,"$253,290",Petrol,2,560 Nm


In [None]:
# Show the whole frame; the notebook elides the middle rows, so this mainly
# exposes the tail of the data, where several fields hold two values or extra text.
df

Unnamed: 0,Company Names,Cars Names,Engines,CC/Battery Capacity,HorsePower,Total Speed,Performance(0 - 100 )KM/H,Cars Prices,Fuel Types,Seats,Torque
0,FERRARI,SF90 STRADALE,V8,3990 cc,963 hp,340 km/h,2.5 sec,"$1,100,000",plug in hyrbrid,2,800 Nm
1,ROLLS ROYCE,PHANTOM,V12,6749 cc,563 hp,250 km/h,5.3 sec,"$460,000",Petrol,5,900 Nm
2,Ford,KA+,1.2L Petrol,"1,200 cc",70-85 hp,165 km/h,10.5 sec,"$12,000-$15,000",Petrol,5,100 - 140 Nm
3,MERCEDES,GT 63 S,V8,"3,982 cc",630 hp,250 km/h,3.2 sec,"$161,000",Petrol,4,900 Nm
4,AUDI,AUDI R8 Gt,V10,"5,204 cc",602 hp,320 km/h,3.6 sec,"$253,290",Petrol,2,560 Nm
...,...,...,...,...,...,...,...,...,...,...,...
1213,Toyota,Crown Signia,2.5L Hybrid I4,2487 cc,240 hp,180 km/h,7.6 sec,"$43,590  $48,000",Hybrid (Gas + Electric),5,239 Nm
1214,Toyota,4Runner (6th Gen),2.4L Turbo I4 (i-FORCE MAX Hybrid),2393 cc + Battery,326 hp,180 km/h,6.8 sec,"$50,000",Hybrid,7,630 Nm
1215,Toyota,Corolla Cross,2.0L Gas / 2.0L Hybrid,1987 cc / Hybrid batt,169  196 hp,190 km/h,8.0  9.2 sec,"$25,210  $29,135",Gas / Hybrid,5,190  210 Nm
1216,Toyota,C-HR+,1.8L / 2.0L Hybrid,1798 / 1987 cc + batt,140  198 hp,180 km/h,7.9  10.5 sec," 33,000",Hybrid,5,190  205 Nm


In [None]:
# Discard every row that has at least one missing value
df = df.dropna()

In [None]:
# Convert the unit-suffixed text columns ("3990 cc", "2.5 sec", "800 Nm") to floats.
#
# The FIRST numeric token of each value is extracted instead of stripping every
# non-numeric character. Stripping glued multi-number values together
# ("1798 / 1987 cc + batt" became 17981987) and turned hyphenated ranges such as
# "100 - 140 Nm" into the unparseable "100-140", which was then silently
# replaced by the median.
FIRST_NUMBER_PATTERN = r'(\d+(?:,\d{3})*(?:\.\d+)?)'

for col in ["CC/Battery Capacity", "Performance(0 - 100 )KM/H", "Torque"]:
    # astype(str) keeps this cell re-runnable once the column is already numeric
    first_number = df[col].astype(str).str.extract(FIRST_NUMBER_PATTERN)[0]
    # Remove thousands separators ("1,200" -> "1200") before converting
    first_number = first_number.str.replace(',', '', regex=False)
    df[col] = pd.to_numeric(first_number, errors='coerce')
    # Values containing no number at all become NaN; fill them with the median,
    # which is less sensitive to skew than the mean.
    # NOTE(review): "Torque" is used as the prediction target further down;
    # consider dropping rows with an unparseable target instead of imputing them.
    df[col] = df[col].fillna(df[col].median())

In [None]:
# 4. Clean and convert the Torque column
# Torque has already been made numeric by the loop above, so this pass works on
# the text form of those floats: take the leading number and convert it back.
torque_text = df['Torque'].astype(str)
torque_leading = torque_text.str.extract(r'(\d+\.?\d*)')[0]
df['Torque'] = pd.to_numeric(torque_leading, errors='coerce')

In [None]:
# Pull out the first number in Torque, allowing thousands separators and decimals
torque_as_text = df['Torque'].astype(str)
torque_token = torque_as_text.str.extract(r'(\d+(?:,\d{3})*(?:\.\d+)?)')[0]

# Strip the separators, then store the result as floats
torque_without_commas = torque_token.str.replace(',', '', regex=False)
df['Torque'] = torque_without_commas.astype(float)

In [None]:
# Spot-check the first five cleaned Torque values
df.loc[:, "Torque"].head(5)

Unnamed: 0,Torque
0,800.0
1,900.0
2,380.0
3,900.0
4,560.0


In [None]:
# Confirm that Torque is now stored as a numeric dtype
torque_dtype = df['Torque'].dtype
print("Data type:", torque_dtype)


Data type: float64


In [None]:
# Count the Torque values that are still missing after cleaning
torque_missing = df['Torque'].isna().sum()
print("Any NaN in Torque after cleaning?", torque_missing)


Any NaN in Torque after cleaning? 0


In [None]:
# 5. Fill missing values in text columns with each column's most frequent value
text_columns = df.select_dtypes(include=['object']).columns
for name in text_columns:
    most_frequent = df[name].mode()[0]
    df[name] = df[name].fillna(most_frequent)


In [None]:
# 6. Encode non-numeric columns as integer category ids
from sklearn.preprocessing import LabelEncoder

# Columns still stored as text at this point.
# NOTE(review): HorsePower, Total Speed and Cars Prices are never parsed into
# numbers above, so they appear to be encoded here as arbitrary category ids
# rather than as quantities - confirm this is intended.
object_columns = [name for name in df.columns if df[name].dtype == 'object']

# Keep each fitted encoder so the integer codes can be mapped back to labels
label_encoders = {}
for name in object_columns:
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])
    label_encoders[name] = encoder

In [None]:
# 7. Split the frame into the target (Torque) and the feature matrix (all other columns)
target_column = "Torque"
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Sanity-check the feature matrix: total count of NaN and of infinite entries
feature_nan_total = X.isna().sum().sum()
print("Any NaN in features?:", feature_nan_total)
feature_inf_total = np.isinf(X).sum().sum()
print("Any infinite values in features?:", feature_inf_total)

Any NaN in features?: 0
Any infinite values in features?: 0


In [None]:
# Same sanity checks for the target vector
target_nan_total = y.isna().sum()
print("Any NaN in target?:", target_nan_total)
target_inf_total = np.isinf(y).sum()
print("Any infinite values in target?:", target_inf_total)

Any NaN in target?: 0
Any infinite values in target?: 0


In [None]:
# 8. Cross-validation splitter: five shuffled folds, seeded so the splits repeat
kf = KFold(
    random_state=42,
    shuffle=True,
    n_splits=5,
)


In [None]:
# Estimator evaluated in every fold; the seed makes the forest reproducible
model = RandomForestRegressor(random_state=42)
# Collects one score per cross-validation fold
scores = []



In [None]:
# Cross-validate: for each split, fit on the training rows and score the
# held-out rows with the root-mean-squared error.
# The list is reset here so re-running this cell does not append to old results.
scores = []
for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # mean_squared_error returns the MSE (squared target units); take the square
    # root so the value stored and reported as "RMSE" is in the target's own units.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    scores.append(rmse)

In [None]:
# Report the per-fold scores and their mean
fold_scores = scores
print("RMSE scores:", fold_scores)
mean_fold_score = np.mean(fold_scores)
print("Average RMSE:", mean_fold_score)

RMSE scores: [19922109.323895045, 394475851.3496334, 653659170.1642013, 130708565.91275829, 258605661.38735166]
Average RMSE: 291474271.62756795


In [None]:
# Final training: refit the same estimator on the full feature matrix and target
# (all rows, no hold-out), now that cross-validation has been run.

model.fit(X, y)


In [None]:
# Final predictions.
# NOTE: X is the same data the model was fitted on in the previous cell, so these
# are in-sample predictions, not predictions for unseen cars. To score new data,
# pass a feature frame prepared with the same cleaning and encoding steps.
final_predictions = model.predict(X)

In [None]:
# Peek at the first five predicted torque values
final_predictions[0:5]

array([818.6 , 899.7 , 328.13, 903.2 , 577.51])