In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
df = pd.read_csv("final_new_cleaned_orders.csv")

print("Data loaded successfully. Shape:", df.shape)
df.head()

Data loaded successfully. Shape: (100000, 18)


Unnamed: 0,order_id,customer_id,platform_name,product_category_id,order_datetime,delivery_time_min,order_value_inr,delivery_delay,refund_requested,service_rating,customer_feedback,product_category_name,sla_delay,Segment,hour,weekday,date,order_hour
0,ORD000001,CUST2824,Blinkit,4,16-05-2025 14:31,15,382,No,0,5,"Fast delivery, great service!",Fruits & Vegetables,No,Price-only,8,Monday,16-05-2025,14
1,ORD000002,CUST1409,Blinkit,1,02-05-2025 18:45,6,90,No,0,5,Quick and reliable!,Dairy,No,Price-only,5,Saturday,02-05-2025,18
2,ORD000003,CUST5506,Blinkit,5,09-05-2025 22:28,14,599,No,1,4,Items missing from order.,Beverages,No,Loyalist,23,Monday,09-05-2025,22
3,ORD000004,CUST5012,Blinkit,5,07-05-2025 09:42,13,40,Yes,0,4,Items missing from order.,Beverages,No,Price-only,20,Monday,07-05-2025,9
4,ORD000005,CUST4657,Blinkit,5,13-05-2025 21:37,8,110,No,0,5,"Fast delivery, great service!",Beverages,No,Loyalist,19,Monday,13-05-2025,21


In [3]:
# Target variable
y = df['refund_requested']

# features
features = [
    'sla_delay',
    'platform_name',
    'product_category_name',
    'Segment',
    'order_value_inr',
    'delivery_time_min',
    'order_hour'
]

X = df[features]

# Convert categorical columns to dummy/one-hot encoding
X = pd.get_dummies(X, drop_first=True)

print("Final feature matrix shape:", X.shape)
X.head()


Final feature matrix shape: (100000, 14)


Unnamed: 0,order_value_inr,delivery_time_min,order_hour,sla_delay_Yes,platform_name_JioMart,platform_name_Swiggy Instamart,product_category_name_Dairy,product_category_name_Fruits & Vegetables,product_category_name_Grocery,product_category_name_Personal Care,product_category_name_Snacks,Segment_Loyalist,Segment_Price-only,Segment_Promisable
0,382,15,14,False,False,False,False,True,False,False,False,False,True,False
1,90,6,18,False,False,False,True,False,False,False,False,False,True,False
2,599,14,22,False,False,False,False,False,False,False,False,True,False,False
3,40,13,9,False,False,False,False,False,False,False,False,False,True,False
4,110,8,21,False,False,False,False,False,False,False,False,True,False,False


In [5]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X_train, y_train)

print("Model training completed.")


Model training completed.


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# Predict on test data
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report (precision, recall, f1)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

Accuracy: 0.9286

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96     18226
           1       0.64      0.44      0.52      1774

    accuracy                           0.93     20000
   macro avg       0.80      0.71      0.74     20000
weighted avg       0.92      0.93      0.92     20000


Confusion Matrix:
[[17799   427]
 [ 1001   773]]


In [7]:
importance = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model.coef_[0]
}).sort_values(by="coefficient", ascending=False)

importance


Unnamed: 0,feature,coefficient
3,sla_delay_Yes,2.802675
4,platform_name_JioMart,0.175486
8,product_category_name_Grocery,0.07881
9,product_category_name_Personal Care,0.077972
1,delivery_time_min,0.072046
2,order_hour,0.003996
0,order_value_inr,0.000124
5,platform_name_Swiggy Instamart,-0.025048
7,product_category_name_Fruits & Vegetables,-0.138328
6,product_category_name_Dairy,-0.154337


In [8]:
# Retrain logistic regression with class balancing
model_balanced = LogisticRegression(max_iter=2000, class_weight='balanced')

model_balanced.fit(X_train, y_train)

y_pred_balanced = model_balanced.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_balanced))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_balanced))


Accuracy: 0.85675

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.88      0.92     18226
           1       0.34      0.64      0.44      1774

    accuracy                           0.86     20000
   macro avg       0.65      0.76      0.68     20000
weighted avg       0.91      0.86      0.88     20000



In [9]:
new_order = {
    'sla_delay': "Yes",
    'platform_name': "JioMart",
    'product_category_name': "Grocery",
    'Segment': "At-Risk",
    'order_value_inr': 450,
    'delivery_time_min': 18,
    'order_hour': 22
}

In [10]:
new_df = pd.DataFrame([new_order])

# one-hot encode EXACTLY like training data
new_df = pd.get_dummies(new_df)

# match training columns (X.columns)
new_df = new_df.reindex(columns=X.columns, fill_value=0)


In [11]:
prob = model_balanced.predict_proba(new_df)[0][1]
pred = model_balanced.predict(new_df)[0]

print("Refund Probability:", prob)
print("Refund Predicted? (1=yes):", pred)


Refund Probability: 0.9811028633836116
Refund Predicted? (1=yes): 1
