<a href="https://colab.research.google.com/github/Sabir123556/Batch22_1506/blob/main/project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
"""
Car Price Prediction - Regression + Classification (binned price)
Loads: /mnt/data/a52be7b2-b050-473c-b69e-56bd3e3f992d.csv

Outputs:
 - Regression metrics for multiple regressors
 - Classification reports (precision/recall/f1) after binning price into categories
"""

import re
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, KBinsDiscretizer, LabelEncoder

# Regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, confusion_matrix

# ---------- Utilities to parse messy columns ----------
def extract_first_number(s):
    """Extract first float/integer from a string, return np.nan if none"""
    if pd.isnull(s):
        return np.nan
    s = str(s)
    m = re.search(r'[\d]+(?:\.\d+)?', s.replace(',', ''))
    return float(m.group()) if m else np.nan

def parse_engine_capacity(val):
    """Handle values like '1998 cc', '2.0L', '75 kWh' -> numeric (we'll keep kWh as numeric too)"""
    if pd.isnull(val):
        return np.nan
    s = str(val).lower()
    # if contains 'kwh' treat as numeric battery size
    if 'kwh' in s:
        return extract_first_number(s)  # battery size in kWh
    # liters -> convert to cc if necessary
    if 'l' in s and not 'cc' in s:
        n = extract_first_number(s)
        # if value < 30 assume liters
        if n is not None and n < 30:
            return n * 1000.0  # liters -> cc
    # default: extract number (often in cc)
    return extract_first_number(s)

def parse_horsepower(val):
    """Extract horsepower numeric (hp)"""
    return extract_first_number(val)

def parse_torque(val):
    """Extract torque numeric (assumes Nm)"""
    return extract_first_number(val)

def parse_top_speed(val):
    """Extract top speed numeric (km/h)"""
    return extract_first_number(val)

def parse_acceleration(val):
    """Extract 0-100 km/h seconds (numeric)"""
    return extract_first_number(val)

def find_price_col(df):
    """Attempt to find price column by name containing 'price' or '₹' or 'inr' etc."""
    for c in df.columns:
        if 'price' in c.lower() or '₹' in c or 'inr' in c.lower() or 'rs' in c.lower():
            return c
    # fallback: last column
    return df.columns[-1]

# ---------- Load dataset ----------
csv_path = "/content/Cars Datasets 2025.csv"
df = pd.read_csv(csv_path, encoding='latin-1')

print("Dataset loaded. Shape:", df.shape)
print("Columns:", list(df.columns))

# ---------- Normalize column names (helpful) ----------
df.columns = [c.strip() for c in df.columns]

# Find likely column names (flexible)
price_col = find_price_col(df)
print("Detected price column:", price_col)

# Example possible column name patterns
# company name, car name, engine type/config, engine capacity, horsepower, torque, top speed, acceleration, fuel type, seating capacity
# We'll attempt to map automatically where possible:
col_map = {}
cols_lower = {c.lower(): c for c in df.columns}

def find_col_by_keywords(keywords):
    keys = keywords if isinstance(keywords, list) else [keywords]
    for k in keys:
        for col_lower, orig in cols_lower.items():
            if k in col_lower:
                return orig
    return None

col_map['company'] = find_col_by_keywords(['company', 'maker', 'manufacturer', 'brand'])
col_map['car_name'] = find_col_by_keywords(['car name', 'model', 'car'])
col_map['engine_type'] = find_col_by_keywords(['engine type', 'engine config', 'configuration', 'engine'])
col_map['capacity'] = find_col_by_keywords(['capacity', 'cc', 'kwh', 'battery', 'displacement'])
col_map['horsepower'] = find_col_by_keywords(['horsepower', 'hp', 'bhp'])
col_map['torque'] = find_col_by_keywords(['torque', 'nm'])
col_map['top_speed'] = find_col_by_keywords(['top speed', 'top_speed', 'topspeed', 'km/h', 'kmh'])
col_map['acceleration'] = find_col_by_keywords(['0-100', '0–100', '0 to 100', 'acceleration', '0-60', '0–60'])
col_map['fuel_type'] = find_col_by_keywords(['fuel', 'fuel type', 'fuel_type'])
col_map['seating'] = find_col_by_keywords(['seat', 'seating', 'capacity'])

# Print mapping for user awareness
print("\nAuto-detected column mapping:")
for k, v in col_map.items():
    print(f"  {k}: {v}")

# ---------- Create working dataframe with selected columns ----------
# Keep company, engine_type, capacity, horsepower, torque, top_speed, acceleration, fuel_type, seating, price
working_cols = []
for key in ['company', 'engine_type', 'capacity', 'horsepower', 'torque', 'top_speed', 'acceleration', 'fuel_type', 'seating', 'car_name']:
    if col_map.get(key):
        working_cols.append(col_map[key])
# ensure price included
if price_col not in working_cols:
    working_cols.append(price_col)

# If nothing detected, use all columns as fallback
if len(working_cols) < 3:
    print("Few columns auto-detected. Using all original columns as fallback.")
    working_df = df.copy()
else:
    working_df = df[working_cols].copy()

# Rename columns to friendly names
rename_map = {}
for key in ['company','car_name','engine_type','capacity','horsepower','torque','top_speed','acceleration','fuel_type','seating']:
    if col_map.get(key):
        rename_map[col_map[key]] = key
rename_map[price_col] = 'price'
working_df = working_df.rename(columns=rename_map)
print("\nWorking dataframe columns after rename:", working_df.columns.tolist())

# ---------- Feature parsing ----------
# Parse numeric fields
if 'capacity' in working_df.columns:
    working_df['capacity_num'] = working_df['capacity'].apply(parse_engine_capacity)
else:
    working_df['capacity_num'] = np.nan

if 'horsepower' in working_df.columns:
    working_df['hp_num'] = working_df['horsepower'].apply(parse_horsepower)
else:
    working_df['hp_num'] = np.nan

if 'torque' in working_df.columns:
    working_df['torque_num'] = working_df['torque'].apply(parse_torque)
else:
    working_df['torque_num'] = np.nan

if 'top_speed' in working_df.columns:
    working_df['topspeed_num'] = working_df['top_speed'].apply(parse_top_speed)
else:
    working_df['topspeed_num'] = np.nan

if 'acceleration' in working_df.columns:
    working_df['accel_num'] = working_df['acceleration'].apply(parse_acceleration)
else:
    working_df['accel_num'] = np.nan

# seating numeric
if 'seating' in working_df.columns:
    working_df['seating_num'] = working_df['seating'].apply(extract_first_number)
else:
    working_df['seating_num'] = np.nan

# Price: extract numeric (if ranges like '20-25L' or '₹ 20,00,000' handle heuristically)
def parse_price(val):
    if pd.isnull(val):
        return np.nan
    s = str(val).lower().replace(',', '').replace('inr', '').replace('rs.', '').replace('₹','').strip()
    # handle ranges like '20-25 lakh' or '20-25 lakh' or '20-25L'
    # try to extract numbers and multipliers
    nums = re.findall(r'[\d]+(?:\.\d+)?', s)
    if not nums:
        return np.nan
    nums = [float(x) for x in nums]
    # infer multiplier words
    multiplier = 1.0
    if 'lakh' in s or 'lac' in s or 'l' in s and 'k' not in s:
        # interpret as lakhs -> convert to rupees (but keeping numeric unit is fine)
        # We'll convert everything to a common currency unit: assume numbers are in lakhs or in thousands?
        # Simpler approach: if numbers < 100 and 'lakh' in s assume it's lakhs -> multiply by 100000
        if any(x < 100 for x in nums):
            multiplier = 100000.0
    if 'crore' in s:
        multiplier = 10000000.0
    if 'k' in s and 'km' not in s:
        multiplier = 1000.0
    # take average if range
    val = np.mean(nums) * multiplier
    return val

working_df['price_num'] = working_df['price'].apply(parse_price)

# If price is still NaN (maybe already numeric), attempt direct cast
if working_df['price_num'].isna().sum() > 0:
    try:
        working_df['price_num'] = working_df['price_num'].fillna(pd.to_numeric(working_df['price'], errors='coerce'))
    except Exception:
        pass

# Drop rows without price
before = working_df.shape[0]
working_df = working_df.dropna(subset=['price_num']).reset_index(drop=True)
after = working_df.shape[0]
print(f"\nDropped {before-after} rows without parsable price. Remaining: {after}")

# ---------- Select features for modeling ----------
# Choose categorical features: company, engine_type, fuel_type, maybe car_name (but can be high-cardinality)
cat_features = [c for c in ['company', 'engine_type', 'fuel_type', 'car_name'] if c in working_df.columns]
num_features = ['capacity_num', 'hp_num', 'torque_num', 'topspeed_num', 'accel_num', 'seating_num']

# Ensure numeric columns exist
num_features = [c for c in num_features if c in working_df.columns]

print("\nCategorical features:", cat_features)
print("Numeric features:", num_features)

# Create final feature DataFrame
X = working_df[cat_features + num_features].copy()
y_reg = working_df['price_num'].astype(float).copy()

# Simple stats
print("\nTarget price stats (num):")
print(y_reg.describe().apply(lambda x: f"{x:.2f}" if isinstance(x, (int, float)) else x))

# ---------- Split data ----------
X_train, X_test, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.2, random_state=42)

# ---------- Preprocessing pipelines ----------
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features)
], remainder='drop')

# ---------- Regression models ----------
regressors = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.01, max_iter=5000),
    'SVR': SVR(kernel='rbf', C=1.0, gamma='scale'),
    'KNNRegressor': KNeighborsRegressor(n_neighbors=5),
    'DecisionTreeRegressor': DecisionTreeRegressor(max_depth=6, random_state=42),
    'RandomForestRegressor': RandomForestRegressor(n_estimators=200, max_depth=12, random_state=42, n_jobs=-1),
    'GradientBoostingRegressor': GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
}

def eval_regressor(name, model, X_train, X_test, y_train, y_test):
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    rmse = mean_squared_error(y_test, preds, squared=False)
    r2 = r2_score(y_test, preds)
    print(f"\n{name} Results:")
    print(f"  MAE  : {mae:,.2f}")
    print(f"  RMSE : {rmse:,.2f}")
    print(f"  R^2  : {r2:.4f}")
    return pipe, preds

print("\n=== REGRESSION: Training and evaluating models ===")
regression_results = {}
for name, model in regressors.items():
    try:
        pipe, preds = eval_regressor(name, model, X_train, X_test, y_train_reg, y_test_reg)
        regression_results[name] = {'pipeline': pipe, 'preds': preds}
    except Exception as e:
        print(f"  ERROR training {name}: {e}")

# ---------- Classification: bin price into categories and run classifiers ----------
# We'll create price categories (e.g., 4 bins: Low / Mid / High / Luxury) using quantiles by default
n_bins = 4
# KBinsDiscretizer can create bins but we'll use quantiles manually for labeling clarity
bins = np.quantile(y_reg, np.linspace(0, 1, n_bins + 1))
# Ensure unique edges
bins = np.unique(bins)
print("\nPrice bin edges (numeric):", bins)

# Create category labels
labels = [f"Bin_{i+1}" for i in range(len(bins)-1)]
working_df['price_bin'] = pd.cut(working_df['price_num'], bins=bins, labels=labels, include_lowest=True)
print("\nPrice bin counts:")
print(working_df['price_bin'].value_counts())

# Use same X but align y_class with X rows (we already had X aligned earlier)
y_class_all = working_df['price_bin'].astype(str)
# Rebuild X_all to ensure alignment
X_all = working_df[cat_features + num_features].copy()

# Split for classification (stratify by bins to preserve distribution)
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_all, y_class_all, test_size=0.2, random_state=42, stratify=y_class_all)

# Define classifiers
classifiers = {
    'LogisticRegression': LogisticRegression(max_iter=2000),
    'DecisionTree': DecisionTreeClassifier(max_depth=8, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=200, max_depth=12, random_state=42, n_jobs=-1),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42),
    'SVC': SVC(kernel='rbf', probability=True),
    'KNeighbors': KNeighborsClassifier(n_neighbors=5),
    'GaussianNB': GaussianNB()
}

def eval_classifier(name, clf, X_train, X_test, y_train, y_test):
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('clf', clf)])
    pipe.fit(X_train, y_train)
    ypred = pipe.predict(X_test)
    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, ypred, digits=4))
    cm = confusion_matrix(y_test, ypred, labels=np.unique(y_test))
    print(f"Confusion Matrix (rows=truth, cols=pred) shape {cm.shape}")
    return pipe, ypred

print("\n=== CLASSIFICATION: Training and printing classification reports ===")
classification_results = {}
for name, clf in classifiers.items():
    try:
        pipe, ypred = eval_classifier(name, clf, Xc_train, Xc_test, yc_train, yc_test)
        classification_results[name] = {'pipeline': pipe, 'ypred': ypred}
    except Exception as e:
        print(f"  ERROR training {name}: {e}")

# ---------- Quick cross-validation summary (optional) ----------
print("\n=== OPTIONAL: 5-fold CV scores (R^2) for top regression models ===")
from sklearn.model_selection import cross_val_score
for name in ['RandomForestRegressor', 'GradientBoostingRegressor', 'Ridge']:
    if name in regression_results:
        model_pipeline = regression_results[name]['pipeline']
        scores = cross_val_score(model_pipeline, X, y_reg, cv=5, scoring='r2', n_jobs=-1)
        print(f"{name} CV R^2 mean: {scores.mean():.4f}, std: {scores.std():.4f}")

print("\nScript finished. You can extend this script to do hyperparameter tuning (GridSearchCV), feature importance plotting, or saving models with joblib.")

Dataset loaded. Shape: (1218, 11)
Columns: ['Company Names', 'Cars Names', 'Engines', 'CC/Battery Capacity', 'HorsePower', 'Total Speed', 'Performance(0 - 100 )KM/H', 'Cars Prices', 'Fuel Types', 'Seats', 'Torque']
Detected price column: Cars Names

Auto-detected column mapping:
  company: Company Names
  car_name: Cars Names
  engine_type: Engines
  capacity: CC/Battery Capacity
  horsepower: HorsePower
  torque: Torque
  top_speed: Performance(0 - 100 )KM/H
  acceleration: None
  fuel_type: Fuel Types
  seating: Seats

Working dataframe columns after rename: ['company', 'engine_type', 'capacity', 'horsepower', 'torque', 'top_speed', 'fuel_type', 'seating', 'price']

Dropped 793 rows without parsable price. Remaining: 425

Categorical features: ['company', 'engine_type', 'fuel_type']
Numeric features: ['capacity_num', 'hp_num', 'torque_num', 'topspeed_num', 'accel_num', 'seating_num']

Target price stats (num):
count         425.00
mean       622775.81
std       4212965.42
min        