In [3]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [4]:
# Read the delivery-analytics export into a DataFrame and preview the first rows.
csv_path = 'Ecommerce_Delivery_Analytics_New.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Order ID,Customer ID,Platform,Order Date & Time,Delivery Time (Minutes),Product Category,Order Value (INR),Customer Feedback,Service Rating,Delivery Delay,Refund Requested
0,ORD000001,CUST2824,JioMart,19:29.5,30,Fruits & Vegetables,382,"Fast delivery, great service!",5,No,No
1,ORD000002,CUST1409,Blinkit,54:29.5,16,Dairy,279,Quick and reliable!,5,No,No
2,ORD000003,CUST5506,JioMart,21:29.5,25,Beverages,599,Items missing from order.,2,No,Yes
3,ORD000004,CUST5012,JioMart,19:29.5,42,Beverages,946,Items missing from order.,2,Yes,Yes
4,ORD000005,CUST4657,Blinkit,49:29.5,30,Beverages,334,"Fast delivery, great service!",5,No,No


In [5]:
# Remove exact duplicate rows, then renumber the index 0..n-1.
# The feature frames built in later cells are created from NumPy arrays and so
# get a fresh RangeIndex; resetting here keeps `df` (and the label Series taken
# from it) aligned with them by index label as well as by position.
df = df.drop_duplicates().reset_index(drop=True)

In [6]:
# One-hot encode the categorical columns into a dense indicator matrix.
# NOTE(review): the encoder is fitted on every row of `df`, i.e. before the
# train/test split performed further down — confirm this is intended.
categorical_cols = ['Platform', 'Product Category']
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
categorical_encoded = encoder.fit_transform(df[categorical_cols])
categorical_feature_names = encoder.get_feature_names_out(categorical_cols)
categorical_df = pd.DataFrame(
    data=categorical_encoded,
    columns=categorical_feature_names,
)

In [11]:
# Min-max scale the numeric columns (scaler fitted on all rows of `df`).
numerical_cols = ['Order Value (INR)', 'Delivery Time (Minutes)', 'Service Rating']
scaler = MinMaxScaler()
numerical_scaled = scaler.fit_transform(df[numerical_cols])
numerical_df = pd.DataFrame(
    data=numerical_scaled,
    columns=numerical_cols,
)

In [13]:
# Turn the free-text feedback into TF-IDF features (vocabulary capped at 50 terms).
# Missing feedback is replaced by an empty string before vectorising.
tfidf = TfidfVectorizer(max_features=50)
feedback_text = df['Customer Feedback'].fillna("")
text_features = tfidf.fit_transform(feedback_text)
tfidf_df = pd.DataFrame(
    data=text_features.toarray(),
    columns=tfidf.get_feature_names_out(),
)

In [14]:
# Assemble the feature matrix side by side and derive the two binary targets
# (1 where the column equals 'Yes', 0 for anything else).
feature_frames = [categorical_df, numerical_df, tfidf_df]
X = pd.concat(feature_frames, axis=1)
y_delay = df['Delivery Delay'].eq('Yes').astype('int64')
y_refund = df['Refund Requested'].eq('Yes').astype('int64')

In [16]:
# Hold out 20% of the rows for testing. The split is seeded and stratified on
# the delay label; features and both targets share the same row partition.
(
    X_train, X_test,
    y_delay_train, y_delay_test,
    y_refund_train, y_refund_test,
) = train_test_split(
    X,
    y_delay,
    y_refund,
    test_size=0.2,
    stratify=y_delay,
    random_state=42,
)

In [19]:
# Project the features onto the principal components that together explain
# 95% of the variance. The projection is learned from the training rows only
# and then applied unchanged to the test rows.
# NOTE(review): X_train / X_test are overwritten in place, so re-running this
# cell would apply PCA to data that has already been projected.
pca = PCA(n_components=0.95)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)


In [21]:
# Candidate classifiers, keyed by the display name used in the training loop.
# Every estimator is seeded (random_state=42) and limited to 50 estimators.
models = {
    "RandomForest": RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=50, learning_rate=0.05, max_depth=3, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=50, learning_rate=0.3, random_state=42)
}

# Fitted estimators per target; populated by the training loop.
trained_models_delay = {}
trained_models_refund = {}

# Stand-alone AdaBoost instance with library defaults (not referenced by any
# cell shown here). `algorithm="SAMME.R"` is intentionally not passed:
# scikit-learn deprecated that option in 1.4 and removed it in 1.6, so relying
# on the default keeps this cell working across versions.
adaboost = AdaBoostClassifier()


In [23]:
# Fit and evaluate every candidate on both targets.
# Each target gets its own clone of the configured estimator: fitting the same
# object twice would leave both result dicts pointing at a single model that
# was last trained on the refund target.
# NOTE(review): the recorded run reports 1.0 accuracy on the refund target,
# which may indicate a feature that encodes the label — worth verifying.
for name, model in models.items():
    print(f"Training {name} for Delivery Delay Prediction...")
    delay_model = clone(model).fit(X_train, y_delay_train)
    y_delay_pred = delay_model.predict(X_test)
    print(f"{name} Accuracy for Delivery Delay:", accuracy_score(y_delay_test, y_delay_pred))
    print(classification_report(y_delay_test, y_delay_pred))
    trained_models_delay[name] = delay_model

    print(f"Training {name} for Refund Request Prediction...")
    refund_model = clone(model).fit(X_train, y_refund_train)
    y_refund_pred = refund_model.predict(X_test)
    print(f"{name} Accuracy for Refund Request:", accuracy_score(y_refund_test, y_refund_pred))
    print(classification_report(y_refund_test, y_refund_pred))
    trained_models_refund[name] = refund_model

    # 5-fold cross-validation on the training split. cross_val_score clones
    # the estimator for every fold, so the unfitted template is passed as-is.
    cv_scores = cross_val_score(model, X_train, y_delay_train, cv=5)
    print(f"{name} Cross-validation accuracy for Delivery Delay: {cv_scores.mean()}")

    cv_scores = cross_val_score(model, X_train, y_refund_train, cv=5)
    print(f"{name} Cross-validation accuracy for Refund Request: {cv_scores.mean()}")


Training RandomForest for Delivery Delay Prediction...
RandomForest Accuracy for Delivery Delay: 0.8672
              precision    recall  f1-score   support

           0       0.87      1.00      0.93     17266
           1       1.00      0.03      0.06      2734

    accuracy                           0.87     20000
   macro avg       0.93      0.51      0.49     20000
weighted avg       0.88      0.87      0.81     20000

Training RandomForest for Refund Request Prediction...
RandomForest Accuracy for Refund Request: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10988
           1       1.00      1.00      1.00      9012

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

RandomForest Cross-validation accuracy for Delivery Delay: 0.8667374999999999
RandomForest Cross-validation accuracy for Refund Request: 1.0




AdaBoost Accuracy for Delivery Delay: 0.86545
              precision    recall  f1-score   support

           0       0.87      1.00      0.93     17266
           1       1.00      0.02      0.03      2734

    accuracy                           0.87     20000
   macro avg       0.93      0.51      0.48     20000
weighted avg       0.88      0.87      0.81     20000

Training AdaBoost for Refund Request Prediction...




AdaBoost Accuracy for Refund Request: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10988
           1       1.00      1.00      1.00      9012

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000





AdaBoost Cross-validation accuracy for Delivery Delay: 0.8648000000000001




AdaBoost Cross-validation accuracy for Refund Request: 1.0


## After Tuning

In [25]:
# Hyper-parameter search space: each key maps to its list of candidate values.
# NOTE(review): no cell shown here consumes this dict, and keys such as
# min_child_weight / colsample_bytree / gamma look like XGBoost-style
# parameters — confirm which estimator it is meant for.
param_dist = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2, 0.3],
    max_depth=list(range(3, 8)),
    min_child_weight=list(range(1, 4)),
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.2, 0.3],
)

In [34]:
# Randomized hyper-parameter search for AdaBoost.
# Search space: 15 evenly spaced estimator counts between 1 and 50, and
# learning rates from 0.97 upwards in steps of 0.01 (8 values).
params_abc = {
    'n_estimators': [int(x) for x in np.linspace(start=1, stop=50, num=15)],
    'learning_rate': [(0.97 + x / 100) for x in range(0, 8)],
}

# `estimator` must be an estimator *instance*; passing the AdaBoostClassifier
# class itself makes the search fail as soon as it tries to clone it in fit().
# The instance is seeded like the AdaBoost entry in `models`.
abc_random = RandomizedSearchCV(
    estimator=AdaBoostClassifier(random_state=42),
    param_distributions=params_abc,
    n_iter=50,       # samples 50 of the 15 * 8 = 120 combinations
    cv=5,
    n_jobs=-1,       # use all available cores
    random_state=49,
)


In [36]:
# Display the AdaBoost search space defined in the previous cell.
params_abc

{'n_estimators': [1, 4, 8, 11, 15, 18, 22, 25, 29, 32, 36, 39, 43, 46, 50],
 'learning_rate': [0.97, 0.98, 0.99, 1.0, 1.01, 1.02, 1.03, 1.04]}

In [42]:
# Report test-set accuracy of the tuned Gradient Boosting predictions on the delay target.
# NOTE(review): `pred_grad_tuned` is not defined in any cell shown here —
# presumably produced by a tuning cell; confirm it exists on a fresh run.
tuned_gb_accuracy = accuracy_score(y_delay_test, pred_grad_tuned)
print('Accuracy of GradientBoosting(tuned)=', tuned_gb_accuracy)


Accuracy of GradientBoosting(tuned)= 0.8798


In [44]:
# Save the tuned model to disk (joblib is already imported in the first cell).
# NOTE(review): `grad_tuned` is not defined in any cell shown here —
# presumably created by a tuning cell; confirm it exists on a fresh run.
joblib.dump(grad_tuned, 'gradient_boosting_tuned.pkl')
print("Tuned Gradient Boosting model saved successfully!")


Tuned Gradient Boosting model saved successfully!


In [46]:
# Reload the persisted model to verify the save/load round trip.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
grad_tuned_loaded = joblib.load('gradient_boosting_tuned.pkl')
print("Tuned Gradient Boosting model loaded successfully!")

# Sanity check: score the reloaded model on the held-out delay labels.
predictions = grad_tuned_loaded.predict(X_test)
loaded_accuracy = accuracy_score(y_delay_test, predictions)
print('Accuracy of Loaded Model:', loaded_accuracy)


Tuned Gradient Boosting model loaded successfully!
Accuracy of Loaded Model: 0.8798
