In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("Cardetails.csv")
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500.0,Diesel,Individual,Manual,First Owner,,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000.0,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000.0,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000.0,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000.0,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [3]:
if 'engine' in df.columns:
    df['engine'] = df['engine'].str.replace('CC', '', regex=False).str.strip()
    df['engine'] = pd.to_numeric(df['engine'], errors='coerce')

if 'max_power' in df.columns:
    df['max_power'] = df['max_power'].str.replace('bhp', '', regex=False).str.strip()
    df['max_power'] = pd.to_numeric(df['max_power'], errors='coerce')

if 'mileage' in df.columns:
    df['mileage'] = df['mileage'].str.replace('kmpl', '', regex=False).str.strip()
    df['mileage'] = pd.to_numeric(df['mileage'], errors='coerce')

In [4]:
df['engine'] = df['engine'].fillna(df['engine'].median())
df['max_power'] = df['max_power'].fillna(df['max_power'].median())
df['mileage'] = df['mileage'].fillna(df['mileage'].median())

In [5]:
import pandas as pd
import re

def extract_torque_and_mean_rpm(torque_str):

    if pd.isna(torque_str) or torque_str == "":
        return None, None
    
    torque_str = str(torque_str)
    torque_nm = None
    mean_rpm = None
    
    # Extract torque value and unit
    torque_match = re.search(r'(\d+\.?\d*)\s*(Nm|kgm|KGM)', torque_str, re.IGNORECASE)
    if torque_match:
        torque_value = float(torque_match.group(1))
        unit = torque_match.group(2).lower()
        
        # Convert to Nm (1 kgm = 9.80665 Nm)
        if unit in ['kgm', 'kgm']:
            torque_nm = round(torque_value * 9.80665, 1)
        else:  # Nm
            torque_nm = torque_value
    
    # Extract RPM and calculate mean
    # Case 1: RPM range "1500-2500rpm"
    range_match = re.search(r'(\d+)\s*-\s*(\d+)\s*rpm', torque_str, re.IGNORECASE)
    if range_match:
        rpm_low = int(range_match.group(1))
        rpm_high = int(range_match.group(2))
        mean_rpm = (rpm_low + rpm_high) / 2
    else:
        # Case 2: Single RPM value "2000rpm"
        single_match = re.search(r'(\d+)\s*rpm', torque_str, re.IGNORECASE)
        if single_match:
            mean_rpm = float(single_match.group(1))
    
    return torque_nm, mean_rpm

# Apply to your dataframe
def apply_simple_torque_extraction(df):
    """Apply the extraction function to the dataframe"""
    results = df['torque'].apply(extract_torque_and_mean_rpm)
    
    # Create new columns
    df['torque_nm'] = results.apply(lambda x: x[0] if x else None)
    df['mean_rpm'] = results.apply(lambda x: x[1] if x else None)
    
    return df

# Apply the function
df = apply_simple_torque_extraction(df)

In [6]:
def preprocess_car_data(df):
    """Preprocess categorical features for car dataset"""
    
    # Create a copy
    df_processed = df.copy()
    
    # 1. LABEL ENCODING (ordinal/high cardinality)
    # Transmission
    df_processed['transmission_encoded'] = df_processed['transmission'].map({'Manual': 0, 'Automatic': 1})
    
    # Owner (with logical order)
    owner_mapping = {
        'First Owner': 0,
        'Second Owner': 1, 
        'Third Owner': 2,
        'Fourth & Above Owner': 3
    }
    df_processed['owner_encoded'] = df_processed['owner'].map(owner_mapping)
    
    # Seller Type
    seller_mapping = {
        'Individual': 0,
        'Dealer': 1,
        'Trustmark Dealer': 2
    }
    df_processed['seller_type_encoded'] = df_processed['seller_type'].map(seller_mapping)
    
    # 2. ONE-HOT ENCODING (nominal) - Convert to 0/1
    fuel_dummies = pd.get_dummies(df_processed['fuel'], prefix='fuel')
    fuel_dummies = fuel_dummies.astype(int)  # Convert True/False to 1/0
    
    # Concatenate with main dataframe
    df_processed = pd.concat([df_processed, fuel_dummies], axis=1)
    
    # Drop original categorical columns
    df_processed = df_processed.drop(['transmission', 'owner', 'seller_type', 'fuel'], axis=1)
    
    return df_processed

# Apply preprocessing
df_processed = preprocess_car_data(df)

In [7]:
Y=df_processed['selling_price']
X=df_processed.drop(['name','torque','selling_price'],axis=1)

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [9]:
# from sklearn.linear_model import LinearRegression

# # First, let's check for NaN values in our data
# print("NaN values before cleaning:")
# print(f"X_train: {X_train.isna().sum().sum()} NaN values")
# print(f"X_test: {X_test.isna().sum().sum()} NaN values")
# print(f"Y_train: {Y_train.isna().sum()} NaN values")
# print(f"Y_test: {Y_test.isna().sum()} NaN values")

# # Check which columns have NaN values
# nan_columns = X_train.columns[X_train.isna().any()].tolist()
# print(f"Columns with NaN values: {nan_columns}")

# # Fill NaN values with median for numerical columns
# for column in X_train.columns:
#     if X_train[column].dtype in ['float64', 'int64']:
#         median_val = X_train[column].median()
#         X_train[column] = X_train[column].fillna(median_val)
#         X_test[column] = X_test[column].fillna(median_val)

# # Verify no more NaN values
# print("\nNaN values after cleaning:")
# print(f"X_train: {X_train.isna().sum().sum()} NaN values")
# print(f"X_test: {X_test.isna().sum().sum()} NaN values")

# # Now proceed with Linear Regression
# linear_model = LinearRegression()

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# linear_model.fit(
#     X_train_scaled, Y_train
# )

# # Make predictions
# y_pred = linear_model.predict(X_test_scaled)

# # Print model coefficients
# print(f"\n=== Linear Regression Results ===")
# print(f"Intercept: {linear_model.intercept_:.4f}")
# print(f"Number of coefficients: {len(linear_model.coef_)}")

# # Show feature importance (absolute coefficient values)
# feature_importance = pd.DataFrame({
#     'feature': X_train.columns,
#     'coefficient': linear_model.coef_,
#     'abs_coefficient': np.abs(linear_model.coef_)
# }).sort_values('abs_coefficient', ascending=False)

# print("\nTop 10 most important features:")
# print(feature_importance.head(10))

# # Calculate metrics
# from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# r2 = r2_score(Y_test, y_pred)
# mae = mean_absolute_error(Y_test, y_pred)
# mse = mean_squared_error(Y_test, y_pred)
# rmse = np.sqrt(mse)

# print(f"\nModel Performance:")
# print(f"R² Score: {r2:.4f}")
# print(f"MAE: {mae:.2f}")
# print(f"MSE: {mse:.2f}")
# print(f"RMSE: {rmse:.2f}")

NaN values before cleaning:
X_train: 1854 NaN values
X_test: 478 NaN values
Y_train: 0 NaN values
Y_test: 0 NaN values
Columns with NaN values: ['km_driven', 'seats', 'torque_nm', 'mean_rpm', 'owner_encoded']

NaN values after cleaning:
X_train: 0 NaN values
X_test: 0 NaN values

=== Linear Regression Results ===
Intercept: 638656.8528
Number of coefficients: 15

Top 10 most important features:
                 feature    coefficient  abs_coefficient
4              max_power  424757.006309    424757.006309
8   transmission_encoded  159298.256975    159298.256975
0                   year  153336.065579    153336.065579
7               mean_rpm -124984.186515    124984.186515
6              torque_nm   72571.948926     72571.948926
1              km_driven  -51091.584570     51091.584570
10   seller_type_encoded   50395.847761     50395.847761
12           fuel_Diesel  -46736.502894     46736.502894
5                  seats  -45715.658382     45715.658382
14           fuel_Petrol   42100.779991     42100.779991

Model Performance:
R² Score: 0.6786
MAE: 270096.17
MSE: 210657854339.71
RMSE: 458974.79


In [10]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Create polynomial features pipeline WITH IMPUTER
poly_linear_model = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Add this line to handle NaN values
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('linear', LinearRegression())
])

In [11]:
# Fit the model
poly_linear_model.fit(X_train, Y_train)

# Make predictions
y_pred_poly = poly_linear_model.predict(X_test)

In [12]:
# Calculate metrics
r2_poly = r2_score(Y_test, y_pred_poly)
mae_poly = mean_absolute_error(Y_test, y_pred_poly)
rmse_poly = np.sqrt(mean_squared_error(Y_test, y_pred_poly))

print("=== Polynomial Regression (Degree 2) Results ===")
print(f"R² Score: {r2_poly:.4f}")
print(f"MAE: {mae_poly:.2f}")
print(f"RMSE: {rmse_poly:.2f}")

# Check how many features were created
poly_features = poly_linear_model.named_steps['poly']
print(f"Original features: {X_train.shape[1]}")
print(f"Polynomial features: {poly_features.n_output_features_}")

=== Polynomial Regression (Degree 2) Results ===
R² Score: 0.9086
MAE: 143899.95
RMSE: 244713.28
Original features: 15
Polynomial features: 135
