In [31]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


In [32]:
# Raw listings file, read from the notebook's working directory.
DATA_PATH = 'Car details.csv'
data = pd.read_csv(DATA_PATH)

In [33]:
# Preview the first rows to check the column names and the raw value formats
# (mileage, engine and max_power are strings that carry a unit suffix).
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [34]:
# Brand = first whitespace-separated token of the listing name (e.g. "Maruti").
# The vectorized .str accessor yields NaN for a missing or blank name, whereas a
# Python-level x.split()[0] raises AttributeError on NaN and IndexError on "".
data['brand'] = data['name'].str.split().str[0]

In [35]:
def clean_number(x):
    """Extract the leading numeric token of a value and return it as a float.

    The value is converted with ``str`` and split on whitespace; the first
    token is parsed as a float, so ``"23.4 kmpl"`` becomes ``23.4``.

    Parameters
    ----------
    x : object
        Raw cell value, e.g. ``"1248 CC"``, ``74`` or a missing value.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the value has no tokens
        (empty or whitespace-only text) or the first token is not numeric.
    """
    try:
        return float(str(x).split()[0])
    # ValueError: first token is not a number (e.g. "bhp", "None").
    # IndexError: nothing to split (empty / whitespace-only string).
    # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
    except (ValueError, IndexError):
        return np.nan


In [36]:
# These columns arrive as text with a unit suffix ("23.4 kmpl", "1248 CC",
# "74 bhp"); convert each one to a float column with clean_number.
for unit_column in ('mileage', 'engine', 'max_power'):
    data[unit_column] = data[unit_column].apply(clean_number)

In [37]:
# 'name' is no longer needed once 'brand' has been derived from it, and
# 'torque' is not part of the feature set used below.
unused_columns = ['name', 'torque']
data = data.drop(columns=unused_columns)

In [38]:
# Keep only rows where every one of these numeric specs is present.
required_columns = ['mileage', 'engine', 'max_power', 'seats']
data = data.dropna(axis=0, how='any', subset=required_columns)

In [39]:
# Model inputs (column order kept as originally written) and regression target.
feature_columns = [
    'year', 'km_driven', 'fuel', 'seller_type', 'transmission',
    'owner', 'mileage', 'engine', 'max_power', 'seats', 'brand',
]
target_column = 'selling_price'

X = data[feature_columns]
y = data[target_column]

In [40]:
# Columns sent through the numeric branch of the preprocessor.
numeric_features = [
    'year',
    'km_driven',
    'mileage',
    'engine',
    'max_power',
    'seats',
]

# Columns sent through the one-hot encoding branch.
categorical_features = [
    'fuel',
    'seller_type',
    'transmission',
    'owner',
    'brand',
]

In [41]:
# Numeric branch: expand to degree-2 polynomial terms (no constant column),
# then standardize every generated term.
numeric_steps = [
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: one-hot encode; handle_unknown='ignore' means categories
# not seen during fit do not raise at transform time.
categorical_steps = [
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [42]:
# Route each column group through its matching transformer.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [43]:
# End-to-end estimator: preprocessing followed by ordinary least squares, so
# fit/predict accept the raw feature frame directly.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
]
pipe = Pipeline(steps=pipeline_steps)


In [44]:
# 80/20 hold-out split; the fixed seed keeps the partition reproducible.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [45]:
# Fit the preprocessing steps and the linear model on the training split only;
# the held-out rows are not seen here.
pipe.fit(X_train, y_train)


In [46]:
# Score the fitted pipeline on the held-out rows.
y_pred = pipe.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root puts the error back in selling_price units
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [47]:
# Emit the evaluation summary, one metric per line.
report_lines = [
    "✅ Model trained successfully!",
    f"R² Score: {r2:.3f}",
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
]
print("\n".join(report_lines))

✅ Model trained successfully!
R² Score: 0.905
MAE: 130394.51
RMSE: 257062.02


In [48]:
# Persist the whole fitted pipeline (preprocessing + model) in one file.
# NOTE: pickle files execute code on load; only unpickle this from a trusted source.
with open('pipe.pkl', mode='wb') as model_file:
    pickle.dump(pipe, model_file)

In [49]:
# Confirmation line for the notebook output.
saved_message = "✅ Saved trained pipeline as 'pipe.pkl'"
print(saved_message)

✅ Saved trained pipeline as 'pipe.pkl'
