In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [3]:
df = pd.read_csv('Ecommerce_Delivery_Analytics_New.csv')
df.head()

Unnamed: 0,Order ID,Customer ID,Platform,Order Date & Time,Delivery Time (Minutes),Product Category,Order Value (INR),Customer Feedback,Service Rating,Delivery Delay,Refund Requested
0,ORD000001,CUST2824,JioMart,19:29.5,30,Fruits & Vegetables,382,"Fast delivery, great service!",5,No,No
1,ORD000002,CUST1409,Blinkit,54:29.5,16,Dairy,279,Quick and reliable!,5,No,No
2,ORD000003,CUST5506,JioMart,21:29.5,25,Beverages,599,Items missing from order.,2,No,Yes
3,ORD000004,CUST5012,JioMart,19:29.5,42,Beverages,946,Items missing from order.,2,Yes,Yes
4,ORD000005,CUST4657,Blinkit,49:29.5,30,Beverages,334,"Fast delivery, great service!",5,No,No


In [5]:
df.tail()

Unnamed: 0,Order ID,Customer ID,Platform,Order Date & Time,Delivery Time (Minutes),Product Category,Order Value (INR),Customer Feedback,Service Rating,Delivery Delay,Refund Requested
99995,ORD099996,CUST5324,JioMart,49:29.5,24,Dairy,289,Packaging could be better.,3,No,No
99996,ORD099997,CUST1677,JioMart,18:29.5,19,Snacks,322,Good quality products.,4,No,No
99997,ORD099998,CUST8198,JioMart,27:29.5,41,Dairy,135,"Fast delivery, great service!",5,Yes,No
99998,ORD099999,CUST9975,JioMart,14:29.5,31,Grocery,973,Quick and reliable!,5,No,No
99999,ORD100000,CUST3748,JioMart,41:29.5,34,Fruits & Vegetables,453,Packaging could be better.,3,No,No


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Order ID                 100000 non-null  object
 1   Customer ID              100000 non-null  object
 2   Platform                 100000 non-null  object
 3   Order Date & Time        100000 non-null  object
 4   Delivery Time (Minutes)  100000 non-null  int64 
 5   Product Category         100000 non-null  object
 6   Order Value (INR)        100000 non-null  int64 
 7   Customer Feedback        100000 non-null  object
 8   Service Rating           100000 non-null  int64 
 9   Delivery Delay           100000 non-null  object
 10  Refund Requested         100000 non-null  object
dtypes: int64(3), object(8)
memory usage: 8.4+ MB


In [29]:
# Encode Categorical Features
encoder = OneHotEncoder(sparse_output=False)
categorical_cols = ['Platform', 'Product Category']
categorical_encoded = encoder.fit_transform(df[categorical_cols])
categorical_feature_names = encoder.get_feature_names_out(categorical_cols)
categorical_df = pd.DataFrame(categorical_encoded, columns=categorical_feature_names)
categorical_df

Unnamed: 0,Platform_Blinkit,Platform_JioMart,Platform_Swiggy Instamart,Product Category_Beverages,Product Category_Dairy,Product Category_Fruits & Vegetables,Product Category_Grocery,Product Category_Personal Care,Product Category_Snacks
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
99995,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
99996,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
99997,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
99998,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [31]:
# Scale Numerical Features
scaler = MinMaxScaler()
numerical_cols = ['Order Value (INR)', 'Delivery Time (Minutes)', 'Service Rating']
numerical_scaled = scaler.fit_transform(df[numerical_cols])
numerical_df = pd.DataFrame(numerical_scaled, columns=numerical_cols)
numerical_df

Unnamed: 0,Order Value (INR),Delivery Time (Minutes),Service Rating
0,0.170256,0.352113,1.00
1,0.117436,0.154930,1.00
2,0.281538,0.281690,0.25
3,0.459487,0.521127,0.25
4,0.145641,0.352113,1.00
...,...,...,...
99995,0.122564,0.267606,0.50
99996,0.139487,0.197183,0.75
99997,0.043590,0.507042,1.00
99998,0.473333,0.366197,1.00


In [33]:
# Process Text Data using NLP (TF-IDF)
tfidf = TfidfVectorizer(max_features=50)
text_features = tfidf.fit_transform(df['Customer Feedback'].fillna(""))
tfidf_df = pd.DataFrame(text_features.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df

Unnamed: 0,again,and,be,better,could,delivered,delivery,disappointed,easy,excellent,...,reliable,rude,satisfied,service,the,to,very,was,with,wrong
0,0.0,0.00000,0.0,0.0,0.0,0.0,0.39228,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.455024,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.57735,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.57735,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.00000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.00000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.00000,0.0,0.0,0.0,0.0,0.39228,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.455024,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,0.00000,0.5,0.5,0.5,0.0,0.00000,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99996,0.0,0.00000,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99997,0.0,0.00000,0.0,0.0,0.0,0.0,0.39228,0.0,0.0,0.0,...,0.00000,0.0,0.0,0.455024,0.0,0.0,0.0,0.0,0.0,0.0
99998,0.0,0.57735,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,...,0.57735,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Combine Processed Data
X = pd.concat([categorical_df, numerical_df, tfidf_df], axis=1)
y_delay = df['Delivery Delay'].apply(lambda x: 1 if x == 'Yes' else 0)
y_refund = df['Refund Requested'].apply(lambda x: 1 if x == 'Yes' else 0)
y_refund

0        0
1        0
2        1
3        1
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: Refund Requested, Length: 100000, dtype: int64

In [55]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(X, y_delay):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_delay_train, y_delay_test = y_delay.iloc[train_index], y_delay.iloc[test_index]


In [57]:
RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)


In [59]:
GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.05, random_state=42)


In [61]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(RandomForestClassifier(), X, y_delay, cv=5, scoring='accuracy')
print("Cross-validation accuracy:", scores.mean())


Cross-validation accuracy: 1.0


In [53]:
for name, model in models.items():
    print(f"Training {name} for Delivery Delay Prediction...")
    model.fit(X_train, y_delay_train)
    y_delay_pred = model.predict(X_test)
    print(f"{name} Accuracy for Delivery Delay:", accuracy_score(y_delay_test, y_delay_pred))
    print(classification_report(y_delay_test, y_delay_pred))
    trained_models_delay[name] = model
    
    print(f"Training {name} for Refund Request Prediction...")
    model.fit(X_train, y_refund_train)
    y_refund_pred = model.predict(X_test)
    print(f"{name} Accuracy for Refund Request:", accuracy_score(y_refund_test, y_refund_pred))
    print(classification_report(y_refund_test, y_refund_pred))
    trained_models_refund[name] = model

Training RandomForest for Delivery Delay Prediction...
RandomForest Accuracy for Delivery Delay: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     17330
           1       1.00      1.00      1.00      2670

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Training RandomForest for Refund Request Prediction...
RandomForest Accuracy for Refund Request: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10904
           1       1.00      1.00      1.00      9096

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Training GradientBoosting for Delivery Delay Prediction...
GradientBoosting Accuracy for Delivery Delay: 1.0
              precision    recall  



AdaBoost Accuracy for Delivery Delay: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     17330
           1       1.00      1.00      1.00      2670

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

Training AdaBoost for Refund Request Prediction...




AdaBoost Accuracy for Refund Request: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10904
           1       1.00      1.00      1.00      9096

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



In [63]:
numerical_cols = ['Order Value (INR)']  # Removed Delivery Time & Service Rating


In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, df[['Delivery Delay', 'Refund Requested']],
                                                    test_size=0.2, random_state=42)
y_delay_train, y_delay_test = y_train['Delivery Delay'], y_test['Delivery Delay']
y_refund_train, y_refund_test = y_train['Refund Requested'], y_test['Refund Requested']


In [67]:
from sklearn.model_selection import cross_val_score

for name, model in models.items():
    scores = cross_val_score(model, X, y_delay, cv=5, scoring='accuracy')
    print(f"{name} Cross-validation accuracy for Delivery Delay:", scores.mean())

    scores = cross_val_score(model, X, y_refund, cv=5, scoring='accuracy')
    print(f"{name} Cross-validation accuracy for Refund Request:", scores.mean())


RandomForest Cross-validation accuracy for Delivery Delay: 1.0
RandomForest Cross-validation accuracy for Refund Request: 1.0
GradientBoosting Cross-validation accuracy for Delivery Delay: 1.0
GradientBoosting Cross-validation accuracy for Refund Request: 1.0




AdaBoost Cross-validation accuracy for Delivery Delay: 1.0




AdaBoost Cross-validation accuracy for Refund Request: 1.0




In [73]:
X_train, X_test, y_delay_train, y_delay_test = train_test_split(X, y_delay, test_size=0.2, random_state=42)
X_train, X_test, y_refund_train, y_refund_test = train_test_split(X, y_refund, test_size=0.2, random_state=42)


In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, df[['Delivery Delay', 'Refund Requested']], test_size=0.2, random_state=42)
y_delay_train, y_delay_test = y_train['Delivery Delay'], y_test['Delivery Delay']
y_refund_train, y_refund_test = y_train['Refund Requested'], y_test['Refund Requested']


In [77]:
numerical_cols = ['Order Value (INR)']  # Removed Delivery Time & Service Rating


In [79]:
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, algorithm="SAMME", random_state=42)
}


In [81]:
print(df['Delivery Delay'].value_counts(normalize=True))
print(df['Refund Requested'].value_counts(normalize=True))


Delivery Delay
No     0.86328
Yes    0.13672
Name: proportion, dtype: float64
Refund Requested
No     0.54181
Yes    0.45819
Name: proportion, dtype: float64


In [83]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_delay_train)


In [85]:
from sklearn.model_selection import cross_val_score

for name, model in models.items():
    scores = cross_val_score(model, X, y_delay, cv=5, scoring='accuracy')
    print(f"{name} Cross-validation accuracy for Delivery Delay:", scores.mean())

    scores = cross_val_score(model, X, y_refund, cv=5, scoring='accuracy')
    print(f"{name} Cross-validation accuracy for Refund Request:", scores.mean())


RandomForest Cross-validation accuracy for Delivery Delay: 1.0
RandomForest Cross-validation accuracy for Refund Request: 1.0
GradientBoosting Cross-validation accuracy for Delivery Delay: 1.0
GradientBoosting Cross-validation accuracy for Refund Request: 1.0
AdaBoost Cross-validation accuracy for Delivery Delay: 1.0
AdaBoost Cross-validation accuracy for Refund Request: 1.0


In [None]:
# Save Models
for name, model in trained_models_delay.items():
    joblib.dump(model, f"model_delay_{name}.pkl")
for name, model in trained_models_refund.items():
    joblib.dump(model, f"model_refund_{name}.pkl")

joblib.dump(encoder, "encoder.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(tfidf, "tfidf.pkl")

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, df[['Delivery Delay', 'Refund Requested']], test_size=0.2, random_state=42)

y_delay_train, y_delay_test = y_train['Delivery Delay'], y_test['Delivery Delay']
y_refund_train, y_refund_test = y_train['Refund Requested'], y_test['Refund Requested']


In [89]:
numerical_cols = ['Order Value (INR)']  # Removed Delivery Time & Service Rating


In [91]:
print(df['Delivery Delay'].value_counts(normalize=True))
print(df['Refund Requested'].value_counts(normalize=True))


Delivery Delay
No     0.86328
Yes    0.13672
Name: proportion, dtype: float64
Refund Requested
No     0.54181
Yes    0.45819
Name: proportion, dtype: float64


In [93]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=99)
X_train_resampled, y_delay_train_resampled = smote.fit_resample(X_train, y_delay_train)
X_train_resampled, y_refund_train_resampled = smote.fit_resample(X_train, y_refund_train)


In [95]:
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, algorithm="SAMME", random_state=42)
}


In [97]:
from sklearn.model_selection import cross_val_score

for name, model in models.items():
    scores = cross_val_score(model, X, y_delay, cv=5, scoring='accuracy')
    print(f"{name} Cross-validation accuracy for Delivery Delay:", scores.mean())

    scores = cross_val_score(model, X, y_refund, cv=5, scoring='accuracy')
    print(f"{name} Cross-validation accuracy for Refund Request:", scores.mean())


RandomForest Cross-validation accuracy for Delivery Delay: 1.0
RandomForest Cross-validation accuracy for Refund Request: 1.0
GradientBoosting Cross-validation accuracy for Delivery Delay: 1.0
GradientBoosting Cross-validation accuracy for Refund Request: 1.0
AdaBoost Cross-validation accuracy for Delivery Delay: 1.0
AdaBoost Cross-validation accuracy for Refund Request: 1.0
