#  Classification Models - Machine Learning

In [None]:
# Importing necessary libraries
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib


In [None]:
# Read the cleaned dataset from disk into a DataFrame
data_path = "cleaned_final_data.csv"  # relative path: must exist in the working directory
data = pd.read_csv(filepath_or_buffer=data_path)
print("Data Loaded Successfully!")

In [None]:
# Separate the feature columns from the target and integer-encode the target labels
target = "price_category"  # Modify based on dataset
X = data.drop(columns=[target])

# Keep the fitted encoder instead of discarding it: y holds integer codes, and
# label_encoder.classes_ / label_encoder.inverse_transform(...) are the only
# way to map predicted codes back to the original category names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data[target])

In [None]:
# Train-test split (80% train / 20% test, fixed seed for reproducibility).
# stratify=y keeps the class proportions of the target the same in both
# partitions, so the test metrics are not skewed by an unlucky random draw.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Standardize the features: learn the scaling statistics from the training
# split only, then apply that same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

##  Step 1: Training Logistic Regression Model


In [None]:
# Train Logistic Regression Model
# max_iter is raised above scikit-learn's default of 100: warnings are silenced
# at the top of this file, so a solver that stops before converging would
# otherwise go unnoticed and yield a silently under-fitted model.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Predict class codes for the scaled held-out test features
y_pred = log_reg.predict(X_test_scaled)

# Model Evaluation: per-class precision/recall/F1 report, then overall accuracy
print("\nLogistic Regression Evaluation:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Logistic Regression is used for classification problems. It predicts a probability for each class
# and assigns the class with the highest probability. Here, we evaluate it with a classification report
# (precision, recall, and F1-score per class) and the overall accuracy.

##  Step 2: Training the Decision Tree


In [None]:
# Train Decision Tree
# max_depth=5 caps how deep the tree may grow. random_state pins the tree's
# internal randomness so repeated runs build the same tree; the value matches
# the seed used for the train-test split.
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train, y_train)  # fitted on the unscaled features
y_pred = dt.predict(X_test)

# Model Evaluation: per-class precision/recall/F1 report
print("\nDecision Tree Evaluation:")
print(classification_report(y_test, y_pred))

In [None]:
# Decision Trees classify data points by repeatedly splitting the data on feature conditions.
# We cap the tree at a maximum depth of 5 to limit overfitting and improve generalization.

##  Step 3: Training the Random Forest


In [None]:
# Train Random Forest with Hyperparameter Tuning
# Candidate values tried for the number of trees and the maximum tree depth.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
}

# random_state fixes the forest's bootstrap/feature sampling so the grid-search
# scores, the selected hyperparameters, and the model saved later are the same
# on every run; the value matches the seed used for the train-test split.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
)
rf_grid.fit(X_train, y_train)  # fitted on the unscaled features

# Estimator refit on the full training split with the best-scoring parameters
best_rf = rf_grid.best_estimator_

# Predictions
y_pred = best_rf.predict(X_test)

# Model Evaluation: per-class precision/recall/F1 report
print("\nRandom Forest Evaluation:")
print(classification_report(y_test, y_pred))

In [None]:
# Random Forest is an ensemble model that combines multiple Decision Trees.
# It reduces overfitting by aggregating the predictions of many different trees.
# We use GridSearchCV (5-fold cross-validation, scored by accuracy) to tune the number of trees
# (n_estimators) and the maximum depth (max_depth), and keep the best-scoring model.

In [None]:
# Persist the tuned Random Forest estimator to disk with joblib
joblib.dump(value=best_rf, filename="random_forest.pkl")

In [None]:
# Summary:
# - Trained Logistic Regression, Decision Tree, and Random Forest models.
# - Applied hyperparameter tuning (GridSearchCV) to the Random Forest.
# - Saved the tuned Random Forest model to "random_forest.pkl" for later use.