In [17]:
# Data Preparation

import pandas as pd

# Load the data
df = pd.read_csv('All_cities_cleaned_data.csv', low_memory= False)


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552269 entries, 0 to 552268
Data columns (total 22 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ft                   552269 non-null  object 
 1   bt                   552269 non-null  object 
 2   km                   552269 non-null  int64  
 3   transmission         552269 non-null  object 
 4   oem                  552269 non-null  object 
 5   model                552269 non-null  object 
 6   modelYear            552269 non-null  float64
 7   price                552269 non-null  float64
 8   Fuel Type            552269 non-null  object 
 9   Seats                552269 non-null  int64  
 10  Kms Driven           552269 non-null  int64  
 11  Ownership            552269 non-null  object 
 12  Year of Manufacture  552269 non-null  float64
 13  Mileage              552269 non-null  float64
 14  Engine               552269 non-null  int64  
 15  Color            

In [19]:
df.rename(columns = {
    'ft': 'Fuel type', 
    'bt': 'Body type', 
    'km': 'Kilometers driven', 
    'oem': 'Original Equipment Manufacturer'}, inplace =True)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552269 entries, 0 to 552268
Data columns (total 22 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Fuel type                        552269 non-null  object 
 1   Body type                        552269 non-null  object 
 2   Kilometers driven                552269 non-null  int64  
 3   transmission                     552269 non-null  object 
 4   Original Equipment Manufacturer  552269 non-null  object 
 5   model                            552269 non-null  object 
 6   modelYear                        552269 non-null  float64
 7   price                            552269 non-null  float64
 8   Fuel Type                        552269 non-null  object 
 9   Seats                            552269 non-null  int64  
 10  Kms Driven                       552269 non-null  int64  
 11  Ownership                        552269 non-null  object 
 12  Ye

In [21]:
df_unique = df.drop_duplicates()

In [22]:
print(len(df))
print(len(df_unique))

552269
8258


In [7]:
# Save the cleaned DataFrame to a CSV file

df_unique.to_csv('Final_data1.csv', index=False)

In [23]:
# Define feature lists
numeric_features = ['Kilometers driven', 'modelYear', 'price', 'Seats', 'Kms Driven', 'Year of Manufacture', 'Mileage', 'Engine', 'Displacement', 'Length', 'Width', 'Height', 'Wheel Base']
categorical_features = ['Fuel type', 'Body type', 'transmission', 'Original Equipment Manufacturer', 'model', 'Fuel Type', 'Ownership', 'Color', 'City']

# Create DataFrames for numeric and categorical features
df_numeric = df_unique[numeric_features]
df_categorical = df_unique[categorical_features]

Regularization

In [24]:
import numpy as np

# Define the pattern
pattern = np.array([1, 0, 1, 0, 1, 0])

# Create the repeating pattern array
repeated_pattern = np.tile(pattern, len(df_categorical) // len(pattern))
remaining_length = len(df_categorical) % len(pattern)
if remaining_length > 0:
    repeated_pattern = np.concatenate([repeated_pattern, pattern[:remaining_length]])

# Adding the new column
df_categorical['Bought'] = repeated_pattern

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical['Bought'] = repeated_pattern


Merged Code

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


# ----------------------- Regression Models -----------------------

# Define features and target variable for regression
X = df_numeric.drop('price', axis=1)  # Features
y = df_numeric['price']               # Target variable

# Define the numerical features
numerical_features = [
    'Kilometers driven', 'modelYear', 'Seats', 'Kms Driven', 'Year of Manufacture', 
    'Mileage', 'Engine', 'Displacement', 'Length', 'Width', 'Height', 'Wheel Base'
]

# Filter X to keep only the numerical features
X = X[numerical_features]

# Create a preprocessing pipeline for numerical data
preprocessor = StandardScaler()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
pipeline_lr.fit(X_train, y_train)
y_pred_lr = pipeline_lr.predict(X_test)
print()  # Blank line for spacing
print("Linear Regressor with Numeric Features:")
print()  # Blank line for spacing
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred_lr)}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred_lr)}")
print(f"R-squared: {r2_score(y_test, y_pred_lr)}")

# Ridge Regression
# Define the parameter grid for Ridge
param_grid_ridge = {'regressor__alpha': [1.0]}

# Set up the pipeline with Ridge regression
pipeline_ridge = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge(solver='auto', max_iter=10))  # Increase max_iter to allow for convergence
])

# Perform Grid Search with 20-fold cross-validation
grid_search_ridge = GridSearchCV(pipeline_ridge, param_grid_ridge, cv=20, n_jobs=-1)
grid_search_ridge.fit(X_train, y_train)

# Output the best parameters
print()  # Blank line for spacing
print("Best parameters for Ridge:", grid_search_ridge.best_params_)

# Make predictions
y_pred_ridge = grid_search_ridge.best_estimator_.predict(X_test)

# Output results
print()  # Blank line for spacing
print("Ridge Regression with Numeric Features:")
print()  # Blank line for spacing
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred_ridge)}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred_ridge)}")
print(f"R-squared: {r2_score(y_test, y_pred_ridge)}")


# Lasso Regression
# Define the parameter grid for Lasso
param_grid_lasso = {'regressor__alpha': [0.1, 1, 10]}

# Set up the pipeline
pipeline_lasso = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Lasso())
])

# Perform Grid Search
grid_search_lasso = GridSearchCV(estimator=pipeline_lasso, param_grid=param_grid_lasso, cv=5)
grid_search_lasso.fit(X_train, y_train)

# Make predictions
y_pred_lasso = grid_search_lasso.best_estimator_.predict(X_test)

# Output results
print()  # Blank line for spacing
print("Lasso Regression with Numeric Features:")
print()  # Blank line for spacing
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred_lasso)}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred_lasso)}")
print(f"R-squared: {r2_score(y_test, y_pred_lasso)}")


# Elastic Net Regression
# Define the parameter grid for Elastic Net
param_grid_en = {
    'regressor__alpha': [0.1, 1, 10],
    'regressor__l1_ratio': [0.1, 0.5, 0.9]
}

# Set up the pipeline
pipeline_en = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet())
])

# Perform Grid Search
grid_search_en = GridSearchCV(estimator=pipeline_en, param_grid=param_grid_en, cv=5)
grid_search_en.fit(X_train, y_train)

# Make predictions
y_pred_elastic_net = grid_search_en.best_estimator_.predict(X_test)

# Output results
print()  # Blank line for spacing
print("Elastic Net Regression with Numeric Features:")
print()  # Blank line for spacing
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred_elastic_net)}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred_elastic_net)}")
print(f"R-squared: {r2_score(y_test, y_pred_elastic_net)}")


# Random Forest Regressor
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])
pipeline_rf.fit(X_train, y_train)
y_pred_rf = pipeline_rf.predict(X_test)
print()  # Blank line for spacing
print("Random Forest Regressor with Numeric Features:")
print()  # Blank line for spacing
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred_rf)}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred_rf)}")
print(f"R-squared: {r2_score(y_test, y_pred_rf)}")

# Decision Tree Regressor
pipeline_dt = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])
pipeline_dt.fit(X_train, y_train)
y_pred_dt = pipeline_dt.predict(X_test)
print()  # Blank line for spacing
print("Decision Tree Regressor with Numeric Features:")
print()  # Blank line for spacing
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred_dt)}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred_dt)}")
print(f"R-squared: {r2_score(y_test, y_pred_dt)}")

# Gradient Boosting Regressor
pipeline_gb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42))
])
pipeline_gb.fit(X_train, y_train)
y_pred_gb = pipeline_gb.predict(X_test)
print()  # Blank line for spacing
print("Gradient Boosting Regressor with Numeric Features:")
print()  # Blank line for spacing
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred_gb)}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred_gb)}")
print(f"R-squared: {r2_score(y_test, y_pred_gb)}")

# XGBoost Regressor
pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(n_estimators=100, random_state=42))
])
pipeline_xgb.fit(X_train, y_train)
y_pred_xgb = pipeline_xgb.predict(X_test)
print()  # Blank line for spacing
print("XGBoost Regressor with Numeric Features:")
print()  # Blank line for spacing
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred_xgb)}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred_xgb)}")
print(f"R-squared: {r2_score(y_test, y_pred_xgb)}")

# ----------------------- Classification Models -----------------------

# Define categorical features and target variable for classification
categorical_features = [
    'Fuel type', 'Body type', 'transmission', 'Original Equipment Manufacturer',
    'model', 'Fuel Type', 'Ownership', 'Color', 'City', 'Bought'
]

# Extract features and target variable for classification
X_categorical = df_categorical[categorical_features]
y_categorical = df_categorical['Bought']

# Create a preprocessing pipeline for categorical data
preprocessor_cat = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Use dense array
        ]), categorical_features)
    ]
)

# Split the data into training and testing sets
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(X_categorical, y_categorical, test_size=0.2, random_state=42)

# KNN Classifier
pipeline_knn = Pipeline(steps=[
    ('preprocessor', preprocessor_cat),
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])
param_grid_knn = {
    'knn__n_neighbors': [5],
    'knn__weights': ['distance'],
    'knn__p': [1]
}
grid_search_knn = GridSearchCV(pipeline_knn, param_grid_knn, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_knn.fit(X_train_cat, y_train_cat)
print()  # Blank line for spacing
print("Best Parameters for KNN:", grid_search_knn.best_params_)
print()  # Blank line for spacing
y_pred_knn = grid_search_knn.best_estimator_.predict(X_test_cat)
print()  # Blank line for spacing
print("KNN Classifier with Hyperparameter Tuning Evaluation:")
print()  # Blank line for spacing
print(f"Accuracy: {accuracy_score(y_test_cat, y_pred_knn)}")
print(f"Precision: {precision_score(y_test_cat, y_pred_knn, average='weighted')}")
print(f"Recall: {recall_score(y_test_cat, y_pred_knn, average='weighted')}")
print(f"F1 Score: {f1_score(y_test_cat, y_pred_knn, average='weighted')}")
print("Confusion Matrix:\n", confusion_matrix(y_test_cat, y_pred_knn))

# Random Forest Classifier
pipeline_rf_clf = Pipeline(steps=[
    ('preprocessor', preprocessor_cat),
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])
pipeline_rf_clf.fit(X_train_cat, y_train_cat)
y_pred_rf_clf = pipeline_rf_clf.predict(X_test_cat)
print()  # Blank line for spacing
print("Random Forest Classifier with Categorical Features:")
print()  # Blank line for spacing
print(f"Accuracy: {accuracy_score(y_test_cat, y_pred_rf_clf)}")
print(f"Precision: {precision_score(y_test_cat, y_pred_rf_clf, average='weighted')}")
print(f"Recall: {recall_score(y_test_cat, y_pred_rf_clf, average='weighted')}")
print(f"F1 Score: {f1_score(y_test_cat, y_pred_rf_clf, average='weighted')}")
print("Confusion Matrix:\n", confusion_matrix(y_test_cat, y_pred_rf_clf))

# Gradient Boosting Classifier
pipeline_gb_clf = Pipeline(steps=[
    ('preprocessor', preprocessor_cat),
    ('scaler', StandardScaler()),
    ('clf', GradientBoostingClassifier(n_estimators=100, random_state=42))
])
pipeline_gb_clf.fit(X_train_cat, y_train_cat)
y_pred_gb_clf = pipeline_gb_clf.predict(X_test_cat)
print()  # Blank line for spacing
print("Gradient Boosting Classifier with Categorical Features:")
print()  # Blank line for spacing
print(f"Accuracy: {accuracy_score(y_test_cat, y_pred_gb_clf)}")
print(f"Precision: {precision_score(y_test_cat, y_pred_gb_clf, average='weighted')}")
print(f"Recall: {recall_score(y_test_cat, y_pred_gb_clf, average='weighted')}")
print(f"F1 Score: {f1_score(y_test_cat, y_pred_gb_clf, average='weighted')}")
print("Confusion Matrix:\n", confusion_matrix(y_test_cat, y_pred_gb_clf))

# XGBoost Classifier
pipeline_xgb_clf = Pipeline(steps=[
    ('preprocessor', preprocessor_cat),
    ('scaler', StandardScaler()),
    ('clf', XGBClassifier(eval_metric='mlogloss'))
])
pipeline_xgb_clf.fit(X_train_cat, y_train_cat)
y_pred_xgb_clf = pipeline_xgb_clf.predict(X_test_cat)
print()  # Blank line for spacing
print("XGBoost Classifier with Categorical Features:")
print()  # Blank line for spacing
print(f"Accuracy: {accuracy_score(y_test_cat, y_pred_xgb_clf)}")
print(f"Precision: {precision_score(y_test_cat, y_pred_xgb_clf, average='weighted')}")
print(f"Recall: {recall_score(y_test_cat, y_pred_xgb_clf, average='weighted')}")
print(f"F1 Score: {f1_score(y_test_cat, y_pred_xgb_clf, average='weighted')}")
print("Confusion Matrix:\n", confusion_matrix(y_test_cat, y_pred_xgb_clf))

# CatBoost Classifier
pipeline_catboost_clf = Pipeline(steps=[
    ('preprocessor', preprocessor_cat),
    ('scaler', StandardScaler()),
    ('clf', CatBoostClassifier(silent=True))
])
pipeline_catboost_clf.fit(X_train_cat, y_train_cat)
y_pred_catboost_clf = pipeline_catboost_clf.predict(X_test_cat)
print()  # Blank line for spacing
print("CatBoost Classifier with Categorical Features:")
print()  # Blank line for spacing
print(f"Accuracy: {accuracy_score(y_test_cat, y_pred_catboost_clf)}")
print(f"Precision: {precision_score(y_test_cat, y_pred_catboost_clf, average='weighted')}")
print(f"Recall: {recall_score(y_test_cat, y_pred_catboost_clf, average='weighted')}")
print(f"F1 Score: {f1_score(y_test_cat, y_pred_catboost_clf, average='weighted')}")
print("Confusion Matrix:\n", confusion_matrix(y_test_cat, y_pred_catboost_clf))



Linear Regressor with Numeric Features:

Mean Absolute Error (MAE): 476132.1350134356
Mean Squared Error (MSE): 557106151070.4108
R-squared: 0.6111832981721214

Best parameters for Ridge: {'regressor__alpha': 1.0}

Ridge Regression with Numeric Features:

Mean Absolute Error (MAE): 472555.2507111026
Mean Squared Error (MSE): 553865022468.844
R-squared: 0.61344535349253


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



Lasso Regression with Numeric Features:

Mean Absolute Error (MAE): 472636.97735188075
Mean Squared Error (MSE): 553995851588.613
R-squared: 0.6133540449568875

Elastic Net Regression with Numeric Features:

Mean Absolute Error (MAE): 468238.4119907266
Mean Squared Error (MSE): 548249536400.45245
R-squared: 0.6173645253919557

Random Forest Regressor with Numeric Features:

Mean Absolute Error (MAE): 149440.06698484372
Mean Squared Error (MSE): 178678298404.85098
R-squared: 0.8752964645238477

Decision Tree Regressor with Numeric Features:

Mean Absolute Error (MAE): 189696.5433817595
Mean Squared Error (MSE): 253417100544.32922
R-squared: 0.8231346018508118

Gradient Boosting Regressor with Numeric Features:

Mean Absolute Error (MAE): 194069.68592407947
Mean Squared Error (MSE): 298679420765.5149
R-squared: 0.7915450277065219

XGBoost Regressor with Numeric Features:

Mean Absolute Error (MAE): 157108.52823282385
Mean Squared Error (MSE): 219277440590.89648
R-squared: 0.846961425444

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Define features and target variable for regression
X = df_numeric.drop('price', axis=1)  # Features
y = df_numeric['price']               # Target variable

# Define the numerical features
numerical_features = [
    'Kilometers driven', 'modelYear', 'Seats', 'Kms Driven', 'Year of Manufacture', 
    'Mileage', 'Engine', 'Displacement', 'Length', 'Width', 'Height', 'Wheel Base'
]

# Filter X to keep only the numerical features
X = X[numerical_features]

# Feature selection with RFE
model = LinearRegression()
rfe = RFE(model, n_features_to_select=12)
X_reduced = rfe.fit_transform(X, y)

# Create a preprocessing pipeline
preprocessor = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create the full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_test, y_pred)}")
print(f"Mean Squared Error (MSE): {mean_squared_error(y_test, y_pred)}")
print(f"R-squared: {r2_score(y_test, y_pred)}")


Mean Absolute Error (MAE): 475480.9426774719
Mean Squared Error (MSE): 556563938636.6627
R-squared: 0.611561720219293


In [27]:
from sklearn.preprocessing import StandardScaler
import joblib

# Assuming you already have `X_train` and `X_test` defined

# Fit the preprocessor on the training data
preprocessor = StandardScaler()
X_train_transformed = preprocessor.fit_transform(X_train)

# Save the preprocessor
joblib.dump(preprocessor, 'preprocessor_Reg.pkl')

# Transform the test data
X_test_transformed = preprocessor.transform(X_test)


In [28]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import joblib

# Load the preprocessor
preprocessor = joblib.load('preprocessor_Reg.pkl')

# Apply the preprocessor to the training data
X_train_transformed = preprocessor.transform(X_train)

# Create a pipeline with the preprocessor and RandomForestRegressor
model_pipeline_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

# Fit the pipeline on the transformed training data
model_pipeline_reg.fit(X_train_transformed, y_train)

# Save the entire pipeline including the model
joblib.dump(model_pipeline_reg, 'model_pipeline_reg.pkl')


['model_pipeline_reg.pkl']

In [29]:
# Discretize continuous target into bins
y_train_binned = pd.qcut(y_train, q=5, labels=False)


In [30]:
# Fit the preprocessor on training data and save it
preprocessor.fit(X_train)
joblib.dump(preprocessor, 'preprocessor_Cls.pkl')

# Later, when you want to use this preprocessor with RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Load the preprocessor
preprocessor = joblib.load('preprocessor_Cls.pkl')

# Create a pipeline with the preprocessor and RandomForestClassifier
model_pipeline_cls = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier())
])

# Fit the pipeline on training data
model_pipeline_cls.fit(X_train, y_train_binned)

# Save the entire pipeline including the model
joblib.dump(model_pipeline_cls, 'model_pipeline_cls.pkl')

['model_pipeline_cls.pkl']