In [6]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df_cars_final = pd.read_csv('All_cities_cleaned_data.csv')

# Limit the dataset to the first 10000 rows (for example)
df_cars_final_small = df_cars_final.head(10000)

# Remove duplicate rows based on all columns to keep only unique rows
df_cars_final_small_unique = df_cars_final_small.drop_duplicates()

# Identify categorical and numerical features
categorical_features = df_cars_final_small_unique.select_dtypes(include=['object']).columns
numerical_features = df_cars_final_small_unique.select_dtypes(include=['number']).columns

# Separate the target variable
X = df_cars_final_small_unique.drop('price', axis=1)
y = df_cars_final_small_unique['price']

# Initialize a LabelEncoder
label_encoder = LabelEncoder()

# Apply Label Encoding to certain categorical features (optional: choose those that have an ordinal relationship)
for feature in categorical_features:
    if X[feature].nunique() <= 10:
        X[feature] = label_encoder.fit_transform(X[feature])

# Apply One-Hot Encoding to other categorical features
X_encoded = pd.get_dummies(X, columns=[feature for feature in categorical_features if X[feature].nunique() > 10], drop_first=True)

# Split the data into training and testing sets without stratification
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.1, random_state=42)

# Define the models to use with RandomizedSearchCV
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'XGBoost': XGBRegressor(),
    'Random Forest': RandomForestRegressor(n_jobs=-1)
}

# RandomizedSearchCV parameters for each model
param_distributions = {
    'Linear Regression': {},
    'Decision Tree': {
        'max_depth': [5, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 10],
        'learning_rate': [0.01, 0.05, 0.1],
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 20],
        'min_samples_split': [2, 5, 10],
    },
}

# Store the best models
best_models = {}

# Perform RandomizedSearchCV and save the best models
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Set up RandomizedSearchCV for the current model
    random_search = RandomizedSearchCV(model, param_distributions[model_name], n_iter=10, cv=3, random_state=42, n_jobs=-1)
    
    # Fit the model
    random_search.fit(X_train, y_train)

    # Save the best model for each type
    best_models[model_name] = random_search.best_estimator_

    # Save the best model using joblib
    joblib.dump(random_search.best_estimator_, f'best_{model_name.lower().replace(" ", "_")}_model.pkl')
    print(f"Best {model_name} model saved as 'best_{model_name.lower().replace(' ', '_')}_model.pkl'")

# Evaluate the best models
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Calculate metrics
    results = {
        'MSE_train': mean_squared_error(y_train, y_train_pred),
        'MAE_train': mean_absolute_error(y_train, y_train_pred),
        'R2_train': r2_score(y_train, y_train_pred),
        'MAPE_train': mean_absolute_percentage_error(y_train, y_train_pred) * 100,
        'MSE_test': mean_squared_error(y_test, y_test_pred),
        'MAE_test': mean_absolute_error(y_test, y_test_pred),
        'R2_test': r2_score(y_test, y_test_pred),
        'MAPE_test': mean_absolute_percentage_error(y_test, y_test_pred) * 100
    }

    print(f"Results for {model_name}:")
    print(pd.DataFrame(results, index=[0]))

# Evaluate all the best models
for model_name, model in best_models.items():
    evaluate_model(model, X_train, X_test, y_train, y_test, model_name)


  df_cars_final = pd.read_csv('All_cities_cleaned_data.csv')


Training Linear Regression...




Best Linear Regression model saved as 'best_linear_regression_model.pkl'
Training Decision Tree...
Best Decision Tree model saved as 'best_decision_tree_model.pkl'
Training XGBoost...
Best XGBoost model saved as 'best_xgboost_model.pkl'
Training Random Forest...
Best Random Forest model saved as 'best_random_forest_model.pkl'
Results for Linear Regression:
      MSE_train     MAE_train  R2_train  MAPE_train      MSE_test  \
0  5.613861e+09  43649.603862  0.994285    6.486219  3.144243e+13   

       MAE_test    R2_test   MAPE_test  
0  1.749446e+06 -50.200308  444.532998  
Results for Decision Tree:
      MSE_train     MAE_train  R2_train  MAPE_train      MSE_test  \
0  1.548877e+10  46726.542768  0.984231     4.72422  1.315525e+11   

      MAE_test   R2_test  MAPE_test  
0  172043.0839  0.785782  23.213467  
Results for XGBoost:
      MSE_train    MAE_train  R2_train  MAPE_train      MSE_test  \
0  1.898090e+10  93399.83489  0.980676   14.764995  1.305698e+11   

        MAE_test   R

In [7]:
import pandas as pd

# Optionally, you can save the unique rows to a new CSV file
df_cars_final_small_unique.to_csv('All_cities_unique_data.csv', index=False)


In [8]:
df_cars_final_small_unique['url_model'] = ""

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cars_final_small_unique['url_model'] = ""


In [9]:
df_cars_final_small_unique['Original Equipment Manufacturer'].unique()

array(['Maruti', 'Ford', 'Tata', 'Hyundai', 'Jeep', 'Datsun', 'Honda',
       'Mahindra', 'Mercedes-Benz', 'BMW', 'Renault', 'Audi', 'Toyota',
       'Mini', 'Kia', 'Skoda', 'Volkswagen', 'Volvo', 'MG', 'Nissan',
       'Fiat', 'Mahindra Ssangyong', 'Mitsubishi', 'Jaguar', 'Land Rover',
       'Chevrolet', 'Citroen', 'Opel', 'Mahindra Renault'], dtype=object)

In [10]:
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Maruti', 'url_model'] = 'https://logos-world.net/wp-content/uploads/2022/12/Maruti-Suzuki-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Ford', 'url_model'] = 'https://logos-world.net/wp-content/uploads/2021/05/Ford-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Tata','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/11/Tata-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Hyundai','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/03/Hyundai-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Jeep','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/09/Jeep-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Honda','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/03/Honda-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Mahindra','url_model'] = 'https://logos-world.net/wp-content/uploads/2022/12/Mahindra-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Renault','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/04/Renault-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Mercedes-Benz','url_model'] = 'https://logos-world.net/wp-content/uploads/2020/05/Mercedes-Benz-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Kia','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/03/Kia-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Skoda','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/06/Skoda-logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Volkswagen','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/04/Volkswagen-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'MG','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/09/MG-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Toyota','url_model'] = 'https://logos-world.net/wp-content/uploads/2020/04/Toyota-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Audi','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/03/Audi-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'BMW','url_model'] = 'https://logos-world.net/wp-content/uploads/2020/04/BMW-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Datsun','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/08/Datsun-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Volvo','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/06/Volvo-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Nissan','url_model'] = 'https://logos-world.net/wp-content/uploads/2020/04/Nissan-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Fiat','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/03/Fiat-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Chevrolet','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/03/Chevrolet-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Citroen','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/09/Citroen-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Jaguar','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/06/Jaguar-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Land Rover','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/10/Land-Rover-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Lexus','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/10/Lexus-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Hindustan Motors','url_model'] = 'https://logos-world.net/wp-content/uploads/2023/08/Hindustan-Motors-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Isuzu','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/09/Isuzu-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Mahindra SsangYong','url_model'] = 'https://logos-world.net/wp-content/uploads/2022/12/Mahindra-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Mitsubishi','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/09/Mitsubishi-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Mahindra Renault','url_model'] = 'https://logos-world.net/wp-content/uploads/2022/12/Mahindra-Logo.png'
df_cars_final_small_unique.loc[df_cars_final_small_unique['Original Equipment Manufacturer'] == 'Mini','url_model'] = 'https://logos-world.net/wp-content/uploads/2021/04/Mini-Logo.png'

In [11]:
df_cars_final_small_unique.to_excel('all_cities_cars_with_url.xlsx', index=False)
