In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import classification_report, confusion_matrix, RocCurveDisplay, mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Read the training table from Train_cleaned.csv (path is relative to the working directory).
df = pd.read_csv('Train_cleaned.csv')

In [None]:
# Separate the label column from the predictor columns.
target_col = 'cluster_category'
X = df.drop(columns=[target_col])
y = df[target_col]

In [None]:
# Display the names of the feature columns that remain after dropping the target.
X.columns

In [None]:
# Stratified hold-out split: 70% of the rows for training, 30% for testing.
# The fixed seed keeps the partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn import preprocessing

# Columns that are replaced by integer codes before modelling.
# NOTE(review): Customer_ID looks like an identifier and the *_sales columns look
# numeric — confirm that integer-coding them (rather than dropping / keeping the raw
# values) is intended.
columns_to_encode = ['Customer_ID', 'outlet_city', 'luxury_sales', 'fresh_sales', 'dry_sales', 'Area']

# Keep one fitted encoder per column. A single shared LabelEncoder is re-fitted on
# every iteration, so only the last column's classes_ would survive and the other
# columns could never be mapped back with inverse_transform.
label_encoders = {}

# The encoding is applied to the whole DataFrame (in place) before splitting.
for column in columns_to_encode:
    label_encoder = preprocessing.LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder
# `label_encoder` stays bound to the encoder fitted on the last column, as before.

# Rebuild target and features from the encoded DataFrame.
target_col = 'cluster_category'
y = df[target_col]
X = df.drop(columns=[target_col])

# (70% train, 30% test), stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


# Baseline Random Forest on the encoded data (small, shallow forest).
# y_pred_rf holds this baseline's test predictions until a later cell overwrites it.
rf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE (seeded); the test split is not touched.
# NOTE(review): resampling happens before cross-validation, so synthetic rows can end
# up in validation folds — the CV scores may be optimistic.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Candidate values sampled by the randomized search.
param_dist = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
    ccp_alpha=[0.0, 0.01, 0.02],
)

# Shuffled, seeded 5-fold splitter that keeps class proportions in every fold.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Forest template whose hyperparameters the search fills in.
base_forest = RandomForestClassifier(class_weight='balanced', random_state=42)

# 30 sampled configurations x 5 folds, ranked by balanced accuracy.
rf_rscv = RandomizedSearchCV(
    estimator=base_forest,
    param_distributions=param_dist,
    n_iter=30,
    scoring='balanced_accuracy',
    cv=cv_strategy,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the resampled training data.
rf_rscv.fit(X_train_resampled, y_train_resampled)


Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [None]:
# rf_rscv has already been fitted on the SMOTE-resampled training data in the cell
# that builds the search. Re-fitting it here on the raw X_train / y_train repeated all
# 150 fits and silently replaced that result, so the resampling step had no effect on
# the final model. Report the parameters of the existing fit instead.
print("Best Parameters:", rf_rscv.best_params_)

In [None]:
# Take the refit winner of the randomized search and predict the hold-out split with it.
best_model = rf_rscv.best_estimator_
y_pred_rf = best_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Plain accuracy of the tuned model's test predictions, printed as a percentage.
accuracy_best = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Best Model Accuracy: {accuracy_best * 100:.2f}%")

In [None]:
# Per-class precision / recall / F1 of the tuned model on the hold-out split.
# Predictions are recomputed here so the report does not depend on an earlier cell's y_pred_rf.
y_pred_rf = best_model.predict(X_test)
optimized_report = classification_report(y_test, y_pred_rf)
print("Optimized Classification Report:\n", optimized_report)

In [None]:
# Second handle on the search's refit best estimator (the same attribute that best_model reads).
rf_best = rf_rscv.best_estimator_

In [None]:
# Show the winning hyperparameter combination found by the randomized search.
print(rf_rscv.best_params_)

In [None]:
# Predict labels for the hold-out test split with the tuned forest (overwrites y_pred_rf).
y_pred_rf = rf_best.predict(X_test)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
print("confusion_matrix for RF")
# Pin the row/column order to the model's own class list. Without `labels`,
# confusion_matrix only uses the labels that occur in y_test / y_pred_rf, so a class
# missing from both would shrink the matrix and misalign it with display_labels.
rf_cm = confusion_matrix(y_test, y_pred_rf, labels=rf_best.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=rf_cm, display_labels=rf_best.classes_)
disp.plot()
# Render the figure explicitly instead of leaving the display object's repr as cell output.
plt.show()

In [None]:
from sklearn.model_selection import learning_curve

# Cross-validated learning curve for the tuned forest: accuracy at five training-set
# fractions between 10% and 100% of X_train, with 5-fold CV at each size.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=best_model,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Collapse the score matrices along axis 1 into a mean and a spread per training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

In [None]:
# Learning-curve figure: mean score per training size with a +/- one-std band,
# training scores first (blue), then validation scores (red).
fig, ax = plt.subplots()
for curve_label, curve_mean, curve_std, curve_color in (
    ("Training Score", train_mean, train_std, "blue"),
    ("Validation Score", test_mean, test_std, "red"),
):
    ax.plot(train_sizes, curve_mean, 'o-', color=curve_color, label=curve_label)
    ax.fill_between(
        train_sizes,
        curve_mean - curve_std,
        curve_mean + curve_std,
        alpha=0.1,
        color=curve_color,
    )
ax.set_xlabel("Training Examples")
ax.set_ylabel("Accuracy")
ax.set_title("Learning Curve for Random Forest")
ax.legend(loc="lower right")
ax.grid()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the training split vs. the hold-out split; a large gap points to overfitting.
train_pred = rf_best.predict(X_train)
train_accuracy = accuracy_score(y_train, train_pred)
print(f"Train Accuracy: {train_accuracy:.4f}")

test_pred = rf_best.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Pair every feature column with the tuned forest's importance score and rank them,
# most important first.
feature_names = X.columns
importances = rf_best.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)
print("Feature Importance:\n", feature_importance_df)

In [None]:
# Horizontal bar chart of the ranked importances; the y-axis is inverted so the first
# (highest-ranked) row of feature_importance_df is drawn at the top.
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance in Random Forest')
ax.invert_yaxis()
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# One-vs-rest ROC curves: one indicator column per class for the true labels, matched
# against the corresponding predict_proba column of the tuned model.
y_test_bin = label_binarize(y_test, classes=rf_rscv.classes_)
y_pred_prob = rf_rscv.best_estimator_.predict_proba(X_test)

# For a two-class target label_binarize returns a single column (indicator of the
# second class), while predict_proba still has one column per class. Pairing that
# column with y_pred_prob[:, 0] would score the wrong class and give an inverted
# curve, so expand the indicator to two columns (first class, second class).
if len(rf_rscv.classes_) == 2 and y_test_bin.shape[1] == 1:
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

# Calculate and plot the ROC curve for each class.
plt.figure(figsize=(8, 6))
for i in range(y_test_bin.shape[1]):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_prob[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'Class {rf_rscv.classes_[i]} (AUC = {roc_auc:.2f})')

# Diagonal reference line: the chance level of a random classifier.
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Side-by-side preview of true vs. predicted labels for the first ten test rows.
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_rf})
first_rows = comparison_df.head(10)
print(first_rows)