<a href="https://colab.research.google.com/github/Pankajjoshi11/walmart_retail_analysis/blob/main/random_forest_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install pandas
import pandas as pd
df=pd.read_csv("/walmart_updated.csv")



In [4]:
# Import ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


**Final code to apply pipeline**

In [44]:
from sklearn.preprocessing import LabelEncoder

# Encode y
le = LabelEncoder()
y = le.fit_transform(df['rating_level'])

# Drop target leakage columns
X = df.drop(['rating', 'rating_level', 'invoice_id', 'date', 'time'], axis=1)


In [45]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_cols = X.select_dtypes(include='object').columns.tolist()
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder='passthrough')


In [46]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Fit & transform preprocessing for SMOTE
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Apply SMOTE only to training
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)


In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Final pipeline (Preprocessing already applied before SMOTE)
model_pipeline = Pipeline([
    ('classifier', RandomForestClassifier(
        n_estimators=100, max_depth=10, min_samples_split=5,
        min_samples_leaf=2, random_state=42
    ))
])

# Fit on resampled data
model_pipeline.fit(X_train_resampled, y_train_resampled)


In [48]:
from sklearn.metrics import classification_report

y_pred = model_pipeline.predict(X_test_processed)
print(classification_report(y_test, y_pred, target_names=le.classes_))


              precision    recall  f1-score   support

        High       0.99      0.85      0.92       161
         Low       0.97      1.00      0.98       844
      Medium       1.00      1.00      1.00       837

    accuracy                           0.99      1842
   macro avg       0.99      0.95      0.97      1842
weighted avg       0.99      0.99      0.99      1842



In [50]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Preprocess entire X
X_processed = preprocessor.fit_transform(X)

# Resample the entire dataset using SMOTE
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_processed, y)

# Cross-validation scores
cv_scores = cross_val_score(
    model_pipeline,
    X_resampled,
    y_resampled,
    cv=5,
    scoring='f1_macro',  # or 'accuracy'
    n_jobs=-1
)

# Print results
print(" 5-Fold Cross-Validation F1 Scores:", np.round(cv_scores, 4))
print(" Mean F1 Score:", round(cv_scores.mean(), 4))
print(" Std Dev:", round(cv_scores.std(), 4))


 5-Fold Cross-Validation F1 Scores: [0.928  0.998  0.9957 0.9957 0.9972]
 Mean F1 Score: 0.9829
 Std Dev: 0.0275


**Experimenting with pipelines**

In [51]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Full pipeline with SMOTE inside
smote_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(
        n_estimators=100, max_depth=10, min_samples_split=5,
        min_samples_leaf=2, random_state=42
    ))
])

# Fit on original (non-resampled) data
smote_pipeline.fit(X_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [52]:
y_pred = smote_pipeline.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))


              precision    recall  f1-score   support

        High       0.99      0.85      0.92       161
         Low       0.97      1.00      0.98       844
      Medium       1.00      1.00      1.00       837

    accuracy                           0.99      1842
   macro avg       0.99      0.95      0.97      1842
weighted avg       0.99      0.99      0.99      1842



In [53]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(smote_pipeline, X, y, cv=5, scoring='f1_macro')
print("CV F1 Macro:", cv_scores.mean())


CV F1 Macro: 0.9142565529408664
