**code number1**

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import joblib  # For saving the model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Read the cleaned car listings from disk
file_path = "cleaned_car_dataset.csv"  # Change this to your dataset path
df_cleaned = pd.read_csv(file_path)

# Target is the listing price; every other column is used as a feature.
# NOTE(review): the recommendation output further down shows
# estimated_resale_price equal to 0.85 * price for every listed row, so some
# feature columns may be derived from the target — confirm before trusting
# the near-perfect R² reported by this cell.
y = df_cleaned["price"]
X = df_cleaned.drop(columns=["price"])

# Integer-encode every text column with its own LabelEncoder
text_columns = X.select_dtypes(include=["object"]).columns
for col in text_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise features using statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost: configure with the tuned parameters, fit, then predict the hold-out rows
best_xgb = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=6)
best_xgb.fit(X_train_scaled, y_train)
y_pred_xgb = best_xgb.predict(X_test_scaled)

# LightGBM: same three steps
best_lgbm = LGBMRegressor(n_estimators=200, learning_rate=0.1, num_leaves=100)
best_lgbm.fit(X_train_scaled, y_train)
y_pred_lgbm = best_lgbm.predict(X_test_scaled)
# Evaluate models
def evaluate_model(y_test, y_pred, model_name):
    """Print MAE, MSE, RMSE and R² for one model's predictions.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.
        model_name: Label printed in the report header.

    Returns:
        None. The metrics are only printed.
    """
    squared_error = mean_squared_error(y_test, y_pred)
    report_lines = (
        f"\n🔹 Model: {model_name}",
        f"✅ MAE: {mean_absolute_error(y_test, y_pred):.2f}",
        f"✅ MSE: {squared_error:.2f}",
        # RMSE is the square root of the MSE computed above
        f"✅ RMSE: {np.sqrt(squared_error):.2f}",
        f"✅ R² Score: {r2_score(y_test, y_pred):.5f}",
    )
    for line in report_lines:
        print(line)

# Report hold-out metrics for each fitted model
for model_name, y_pred in (("XGBoost", y_pred_xgb), ("LightGBM", y_pred_lgbm)):
    evaluate_model(y_test, y_pred, model_name)

# Persist both fitted regressors next to the notebook
for fitted_model, model_path in (
    (best_xgb, "best_xgb_model.pkl"),
    (best_lgbm, "best_lgbm_model.pkl"),
):
    joblib.dump(fitted_model, model_path)
print("\n✅ Models saved successfully as 'best_xgb_model.pkl' and 'best_lgbm_model.pkl'")



  if is_sparse(data):


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001515 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2392
[LightGBM] [Info] Number of data points in the train set: 11797, number of used features: 24
[LightGBM] [Info] Start training from score 17538.921166

🔹 Model: XGBoost
✅ MAE: 49.25
✅ MSE: 6867.48
✅ RMSE: 82.87
✅ R² Score: 0.99982

🔹 Model: LightGBM
✅ MAE: 53.27
✅ MSE: 15655.69
✅ RMSE: 125.12
✅ R² Score: 0.99959

✅ Models saved successfully as 'best_xgb_model.pkl' and 'best_lgbm_model.pkl'




In [2]:
# Save the label encoders, the scaler, and the retrained models
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Load dataset
file_path = "cleaned_car_dataset.csv"
df_cleaned = pd.read_csv(file_path)

# Split features and target
# NOTE(review): the recommendation output further down shows
# estimated_resale_price equal to 0.85 * price for every listed row, so some
# feature columns may be derived from the target — confirm, and drop them here
# if so, otherwise the model cannot be used without already knowing the price.
X = df_cleaned.drop(columns=["price"])
y = df_cleaned["price"]

# Encode categorical columns and keep one fitted LabelEncoder per column so
# the same mapping can be applied to user input at prediction time
encoders = {}
for col in X.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le  # Store the encoder

# Save encoders
joblib.dump(encoders, "label_encoders.pkl")

# Train-test split BEFORE scaling: the scaler must only see training rows,
# otherwise the hold-out rows influence the mean/std that get saved to disk.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training DataFrame (so the column names are recorded
# on the scaler), then apply it to both partitions, and save it
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
joblib.dump(scaler, "scaler.pkl")

# Kept so any later cell that expects the fully scaled matrix still finds it
X_scaled = scaler.transform(X)

# Train models
best_xgb = XGBRegressor(n_estimators=200, learning_rate=0.05, max_depth=6)
best_lgbm = LGBMRegressor(n_estimators=200, learning_rate=0.1, num_leaves=100)

best_xgb.fit(X_train, y_train)
best_lgbm.fit(X_train, y_train)

# Save trained models (same file names as the first training cell, so they overwrite it)
joblib.dump(best_xgb, "best_xgb_model.pkl")
joblib.dump(best_lgbm, "best_lgbm_model.pkl")

print("✅ Models, encoders, and scaler saved successfully!")


  if is_sparse(data):


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000979 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2392
[LightGBM] [Info] Number of data points in the train set: 11797, number of used features: 24
[LightGBM] [Info] Start training from score 17538.921166
✅ Models, encoders, and scaler saved successfully!


In [3]:
# Load the saved models, encoders, and scaler; define prediction helpers
import joblib
import pandas as pd
import numpy as np

# Load the saved models
# joblib.load unpickles the file, so only load artifacts you produced yourself.
best_xgb = joblib.load("best_xgb_model.pkl")  # used by predict_price below
best_lgbm = joblib.load("best_lgbm_model.pkl")  # loaded, but not used by the helpers in this cell

# Load preprocessing objects
# encoders: dict mapping a categorical column name to its fitted LabelEncoder
encoders = joblib.load("label_encoders.pkl")  # Load saved LabelEncoders
scaler = joblib.load("scaler.pkl")  # Load saved StandardScaler

def preprocess_user_input(user_input):
    """Turn one car description into the scaled feature row the model expects.

    The saved scaler was fitted on every feature column of the training frame,
    so it has to be given all of those columns, in the same order. Scaling only
    a hand-picked subset raises a feature-mismatch error in scikit-learn.

    Args:
        user_input: Mapping of feature name to raw value for a single car.
            Categorical values must be labels seen during training.

    Returns:
        A one-row DataFrame whose columns are ``scaler.feature_names_in_``
        (training order) and whose values are label-encoded and standardised.

    Raises:
        ValueError: If a feature the scaler was fitted on is missing from
            ``user_input``, or (from ``LabelEncoder.transform``) if a
            categorical value was not seen during training.
    """
    user_df = pd.DataFrame([user_input])

    # Column order recorded by the scaler when it was fitted on the training frame
    feature_order = list(scaler.feature_names_in_)

    missing = [col for col in feature_order if col not in user_df.columns]
    if missing:
        raise ValueError(f"Missing features for prediction: {missing}")

    # Encode every categorical column that has a saved encoder
    for col, le in encoders.items():
        if col in user_df.columns:
            user_df[col] = le.transform(user_df[col])

    # Scale the full feature row at once and hand it back in training order
    scaled = scaler.transform(user_df[feature_order])
    return pd.DataFrame(scaled, columns=feature_order, index=user_df.index)

def predict_price(user_input):
    """Return the XGBoost price estimate for a single car description.

    Args:
        user_input: Mapping of feature name to raw value, as accepted by
            ``preprocess_user_input``.

    Returns:
        The first (only) element of the model's prediction array.
    """
    features = preprocess_user_input(user_input)
    return best_xgb.predict(features)[0]

# Confirmation only: this cell loads artifacts and defines helpers; no prediction is run here.
print("✅ Prediction script ready to use!")


✅ Prediction script ready to use!


In [13]:
#script3
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load trained models
# joblib.load unpickles the file, so only load artifacts you produced yourself.
best_xgb = joblib.load("best_xgb_model.pkl")

# Load preprocessing objects (LabelEncoders & Scaler)
# encoders: dict mapping a categorical column name to its fitted LabelEncoder
encoders = joblib.load("label_encoders.pkl")
scaler = joblib.load("scaler.pkl")

# Load dataset
# Module-level copy; main() reads the CSV again into its own local `dataset`.
dataset = pd.read_csv("cleaned_car_dataset.csv")

# Expected columns
# Columns preprocess_user_input builds for a prediction row (missing numeric
# values default to 0, missing categorical values to "Unknown").
# NOTE(review): estimated_resale_price / price_per_km look derived from the
# target `price` — confirm; a user asking for a price cannot supply them.
expected_numerical_cols = [
    "km", "age", "Previous_Owners", "hp_kW", "Displacement_cc", 
    "Weight_kg", "cons_comb", "estimated_resale_price", "price_per_km", "age_mileage_score"
]
expected_categorical_cols = [
    "make_model", "body_type", "Type", "Fuel", "Gearing_Type", 
    "Drive_chain", "Paint_Type", "Upholstery_type"
]

# Hints for user input
# Only used to build the input() prompts in main(); they are not validated against.
car_models = ["Audi A1", "Audi A2", "Audi A3", "Opel Astra", "Opel Corsa", "Opel Insignia",
              "Renault Clio", "Renault Duster", "Renault Espace"]
fuel_types = ["Diesel", "Benzine", "LPG/CNG", "Electric"]
body_types = ["Sedans", "Station wagon", "Compact", "Coupe", "Van", "Off-Road", "Convertible", "Transporter"]

def preprocess_user_input(user_input):
    """Preprocess user input to match the trained model format.

    Builds a one-row frame containing every feature the saved scaler was fitted
    on, in that exact order. The model was trained on a plain scaled array, so
    column position (not name) decides which feature a value is read as.

    Missing numerical features default to 0, missing categorical features to
    "Unknown". Categorical labels not seen during training are encoded as -1.

    Args:
        user_input: Mapping of feature name to raw value for a single car.
            Keys that are not model features are ignored.

    Returns:
        A one-row DataFrame with columns ``scaler.feature_names_in_`` holding
        the encoded and standardised values.
    """
    user_df = pd.DataFrame([user_input])

    # Column order recorded by the scaler when it was fitted on the training frame
    feature_order = list(scaler.feature_names_in_)

    # Treat any scaler feature that has a saved encoder as categorical too,
    # even if it is absent from expected_categorical_cols
    categorical_cols = list(expected_categorical_cols) + [
        col for col in feature_order
        if col in encoders and col not in expected_categorical_cols
    ]

    # Ensure missing categorical columns are filled with "Unknown"
    for col in categorical_cols:
        if col not in user_df:
            user_df[col] = "Unknown"

    # Ensure missing numerical columns are filled with default values (0);
    # this also covers scaler features missing from expected_numerical_cols,
    # which previously caused a KeyError when selecting the scaler's columns
    for col in list(expected_numerical_cols) + feature_order:
        if col not in user_df:
            user_df[col] = 0

    # Encode categorical columns: known label -> its index in classes_
    # (what LabelEncoder.transform returns), anything else -> -1
    for col in categorical_cols:
        if col in encoders:
            codes = {label: code for code, label in enumerate(encoders[col].classes_)}
            user_df[col] = user_df[col].map(codes).fillna(-1).astype(int)

    # Keep only the model's features, in training order; coerce leftovers
    # (e.g. a categorical column without an encoder) to numbers, NaN -> 0
    model_df = user_df[feature_order].apply(pd.to_numeric, errors="coerce").fillna(0)

    scaled = scaler.transform(model_df)
    return pd.DataFrame(scaled, columns=feature_order, index=user_df.index)

def predict_price(user_input):
    """Estimate a car's price from the user's specifications.

    Args:
        user_input: Mapping of feature name to raw value, as accepted by
            ``preprocess_user_input``.

    Returns:
        The first (only) element of the XGBoost model's prediction array.
    """
    model_ready = preprocess_user_input(user_input)
    return best_xgb.predict(model_ready)[0]

def recommend_cars(user_input, dataset):
    """Recommend up to five cars from ``dataset`` matching the user's preferences.

    Args:
        user_input: Dict with keys ``make_model``, ``Fuel``, ``body_type``
            (strings) and ``min_mileage``, ``max_mileage``, ``budget``
            (numbers). It is not modified.
        dataset: DataFrame with at least the columns ``make_model``, ``km``,
            ``price``, ``Fuel``, ``body_type`` and ``estimated_resale_price``.

    Returns:
        At most five rows of ``dataset``, sorted by ``estimated_resale_price``
        (descending) and then ``price`` (ascending). If nothing matches all
        criteria, the fallback keeps only a relaxed mileage cap (x1.5) and
        budget cap (x1.2); model, fuel, body type and minimum mileage are
        dropped in that case.

    Raises:
        KeyError: If a required key or column is missing.
    """
    # Work on a copy so the caller's dict is left untouched
    prefs = dict(user_input)

    # Ensure min_mileage is not greater than max_mileage
    if prefs["min_mileage"] > prefs["max_mileage"]:
        print("⚠️ Warning: Min mileage is greater than Max mileage. Swapping values.")
        prefs["min_mileage"], prefs["max_mileage"] = prefs["max_mileage"], prefs["min_mileage"]

    # Normalize input case (title case is only used for the warning message;
    # every comparison below is done on lower-cased text)
    for key in ("make_model", "Fuel", "body_type"):
        prefs[key] = prefs[key].strip().title()
    model_key = prefs["make_model"].lower()
    fuel_key = prefs["Fuel"].lower()
    body_key = prefs["body_type"].lower()

    # Check if model exists in dataset, case-insensitively like the filter
    # below, so e.g. "BMW X5" (title-cased to "Bmw X5") is not reported missing
    model_lower = dataset["make_model"].str.lower()
    if not (model_lower == model_key).any():
        print(f"⚠️ Model '{prefs['make_model']}' not found. Showing closest matches.")

    # Filtering based on user input
    filtered_cars = dataset[
        (model_lower == model_key)
        & (dataset["km"] >= prefs["min_mileage"])
        & (dataset["km"] <= prefs["max_mileage"])
        & (dataset["price"] <= prefs["budget"])
        # regex=False: match the text literally, so "(" or "+" in a fuel
        # name neither changes the meaning nor raises a regex error
        & dataset["Fuel"].str.lower().str.contains(fuel_key, na=False, regex=False)
        & (dataset["body_type"].str.lower() == body_key)
    ]

    # If no matches, expand search criteria
    if filtered_cars.empty:
        print("⚠️ No exact matches found. Expanding search criteria...")
        filtered_cars = dataset[
            (dataset["km"] <= prefs["max_mileage"] * 1.5) &  # Allow 50% higher mileage
            (dataset["price"] <= prefs["budget"] * 1.2)      # Increase budget flexibility
        ]

    # Sort results based on best resale value and price
    filtered_cars = filtered_cars.sort_values(
        by=["estimated_resale_price", "price"], ascending=[False, True]
    )

    return filtered_cars.head(5)



def main():
    """Interactive entry point: collect preferences, then print matching cars.

    Reads the cleaned dataset from disk, prompts for model, mileage range,
    budget, fuel and body type (blank numeric answers fall back to 0 or
    infinity), and prints the rows returned by ``recommend_cars``.
    """

    def _ask_float(prompt, default):
        # Empty answer -> default; anything else must parse as a float
        return float(input(prompt) or default)

    dataset = pd.read_csv("cleaned_car_dataset.csv")

    print("\n **Car Recommendation System** \n")

    # Prompts are issued in the same order as the keys of the resulting dict
    make_model = input(f"Enter car model (e.g., {', '.join(car_models[:3])}, ...): ")
    min_mileage = _ask_float("Enter min mileage (e.g., 20000): ", 0)
    max_mileage = _ask_float("Enter max mileage (e.g., 60000): ", float('inf'))
    budget = _ask_float("Enter budget (e.g., 25000): ", float('inf'))
    fuel = input(f"Enter fuel type ({', '.join(fuel_types)}): ")
    body_type = input(f"Enter body type ({', '.join(body_types)}): ")

    user_input = {
        "make_model": make_model,
        "min_mileage": min_mileage,
        "max_mileage": max_mileage,
        "budget": budget,
        "Fuel": fuel,
        "body_type": body_type,
    }

    # Debugging: echo the collected input and a sample of the result
    print("User Input:", user_input)

    recommendations = recommend_cars(user_input, dataset)
    display_columns = ["make_model", "km", "price", "estimated_resale_price"]

    print("Filtered Cars Count:", len(recommendations))
    print("Filtered Cars Sample:")
    print(recommendations[display_columns].head())

    # Final report
    if recommendations.empty:
        print("\n No cars found matching your criteria. Try adjusting your preferences.")
    else:
        print("\n **Recommended Cars:**")
        print(recommendations[display_columns])


if __name__ == "__main__":
    main()



 **Car Recommendation System** 

User Input: {'make_model': 'audi a1', 'min_mileage': 10000.0, 'max_mileage': 12000.0, 'budget': 250000.0, 'Fuel': 'diesel', 'body_type': 'sedans'}
Filtered Cars Count: 5
Filtered Cars Sample:
     make_model       km    price  estimated_resale_price
729     Audi A1  10500.0  22500.0                19125.00
1661    Audi A1  11432.0  21660.0                18411.00
1077    Audi A1  10775.0  19977.0                16980.45
1020    Audi A1  10771.0  19950.0                16957.50
1024    Audi A1  11380.0  19900.0                16915.00

 **Recommended Cars:**
     make_model       km    price  estimated_resale_price
729     Audi A1  10500.0  22500.0                19125.00
1661    Audi A1  11432.0  21660.0                18411.00
1077    Audi A1  10775.0  19977.0                16980.45
1020    Audi A1  10771.0  19950.0                16957.50
1024    Audi A1  11380.0  19900.0                16915.00
