In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:
df=pd.read_csv('C:\\Users\\vedan\\Downloads\\cardekho_dataset.csv\\cardekho_dataset.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [4]:
df.columns

Index(['Unnamed: 0', 'car_name', 'brand', 'model', 'vehicle_age', 'km_driven',
       'seller_type', 'fuel_type', 'transmission_type', 'mileage', 'engine',
       'max_power', 'seats', 'selling_price'],
      dtype='object')

In [5]:
def clean_car_name(car_name):
    return car_name.split(" ")[0].lower()
df['car_name'] = df['car_name'].apply(clean_car_name)
# see the unique values after fixing
df['car_name'].unique()

array(['maruti', 'hyundai', 'ford', 'renault', 'mini', 'mercedes-benz',
       'toyota', 'volkswagen', 'honda', 'mahindra', 'datsun', 'tata',
       'kia', 'bmw', 'audi', 'land', 'jaguar', 'mg', 'isuzu', 'porsche',
       'skoda', 'volvo', 'lexus', 'jeep', 'maserati', 'bentley', 'nissan',
       'ferrari', 'mercedes-amg', 'rolls-royce', 'force'], dtype=object)

In [6]:
X = df[["brand", "model", "vehicle_age", "km_driven", "seller_type",
             "fuel_type", "transmission_type", "mileage", "engine", "max_power", "seats"]]
y = df["selling_price"]


numeric_features = ["vehicle_age", "km_driven", "mileage", "engine", "max_power", "seats"]
categorical_features = ["brand", "model", "seller_type", "fuel_type", "transmission_type"]


numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Transform features
X_transformed = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)



In [7]:
mean_values = {col: df[col].mean() for col in numeric_features}
most_frequent_values = {col: df[col].mode()[0] for col in categorical_features}

def preprocess_input(data, preprocessor):
    input_df = pd.DataFrame([data])  
    required_columns = ['brand', 'model', 'vehicle_age', 'km_driven', 'seller_type',
                    'fuel_type', 'transmission_type', 'mileage', 'engine', 'max_power', 'seats']
    for col in required_columns:
        if col not in data:
            if col in numeric_features:
                data[col] = mean_values[col]  
            else:
                data[col] = most_frequent_values[col] 

    print("Input DataFrame Columns:", input_df.columns) 
    print("Input DataFrame:", input_df)  
    print("Expected Columns:", preprocessor.transformers_)
    print("Input DataFrame Columns:", input_df.columns)


    # Apply the preprocessor
    transformed_data = preprocessor.transform(input_df)
    if hasattr(transformed_data, "toarray"): 
        transformed_data = transformed_data.toarray()

    return transformed_data


In [41]:

model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)  
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [42]:
# Convert sparse matrix to dense array
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Train the model
history = model.fit(X_train_dense, y_train, epochs=50, batch_size=32, validation_split=0.2)



Epoch 1/50
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 1290997858304.0000 - mae: 757312.0000 - val_loss: 1362355421184.0000 - val_mae: 730629.5000
Epoch 2/50
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 982102114304.0000 - mae: 616470.5000 - val_loss: 612897128448.0000 - val_mae: 285285.9062
Epoch 3/50
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 787951976448.0000 - mae: 283085.3750 - val_loss: 445558325248.0000 - val_mae: 265968.5938
Epoch 4/50
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 370445156352.0000 - mae: 259930.8438 - val_loss: 369856348160.0000 - val_mae: 253745.5000
Epoch 5/50
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 397399687168.0000 - mae: 245253.4688 - val_loss: 334114291712.0000 - val_mae: 248733.0938
Epoch 6/50
[1m309/309[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

In [47]:
text = {
    "brand": "Maruti",
    "model": "Alto",
    "vehicle_age": 5,
    "km_driven": 45000,
    "seller_type": "Individual",
    "fuel_type": np.nan,
    "transmission_type": "Manual",
    "mileage":19.7,
    "engine": np.nan,
    "max_power": 46.0,
    "seats": 5
}

processed_input = preprocess_input(text, preprocessor)
predicted_price = model.predict(processed_input)
print("Predicted Price:", predicted_price[0][0])

Input DataFrame Columns: Index(['brand', 'model', 'vehicle_age', 'km_driven', 'seller_type',
       'fuel_type', 'transmission_type', 'mileage', 'engine', 'max_power',
       'seats'],
      dtype='object')
Input DataFrame:     brand model  vehicle_age  km_driven seller_type  fuel_type  \
0  Maruti  Alto            5      45000  Individual        NaN   

  transmission_type  mileage  engine  max_power  seats  
0            Manual     19.7     NaN       46.0      5  
Expected Columns: [('num', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())]), ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']), ('cat', Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['brand', 'model', 'seller_type', 'fuel_type', 'transmission_type'])]
Input DataFrame Columns: Index(['brand', 'model', 'vehicle_age', 'km_driven', 'seller_type',
       'fuel_type', 'transmission_type'



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Predicted Price: 270723.84


In [37]:
from tensorflow.keras.models import load_model

# Load the model
model.save('model.h5')



from tensorflow.keras.models import load_model

# Load the model and map 'mse' to tf.keras.losses.mean_squared_error
model = load_model('model.h5', compile=False)  # Use compile=False to avoid loss dependency

# Recompile the model with recognized metrics and loss functions
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [46]:
model.summary()
