In [1]:
import pandas as pd
!pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score





[notice] A new release of pip is available: 25.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# Step 1: Load data
df = pd.read_csv("ecommerce_data_with_returns.csv")

In [8]:
# Step 2: Select features for modeling
features = ["category", "price_rs.", "discount_%", "final_pricers.", "payment_method", "purchase_month", "purchase_dayofweek"]
target = "is_returned"
df_model = df[features + [target]]

In [9]:
# Step 3: One-hot encode categorical columns
categorical_cols = ["category", "payment_method"]
encoder = OneHotEncoder(drop='first', sparse_output=False)  # use sparse_output for scikit-learn >= 1.2
encoded = encoder.fit_transform(df_model[categorical_cols])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols))



In [10]:
# Combine encoded and numeric features
X = pd.concat([df_model.drop(columns=categorical_cols + [target]), encoded_df], axis=1)
y = df_model[target]


In [11]:
# Step 4: Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
# Step 5: Train logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [14]:
import numpy as np
unique, counts = np.unique(y_pred, return_counts=True)
print(dict(zip(unique, counts)))


{np.int64(0): np.int64(732)}


In [15]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("📊 Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))
print("🧮 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("🎯 ROC AUC Score:", round(roc_auc_score(y_test, y_proba), 3))


📊 Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92       625
           1       0.00      0.00      0.00       107

    accuracy                           0.85       732
   macro avg       0.43      0.50      0.46       732
weighted avg       0.73      0.85      0.79       732

🧮 Confusion Matrix:
 [[625   0]
 [107   0]]
🎯 ROC AUC Score: 0.561


In [17]:
# Step 7: Add return probability to full dataset
df["return_probability"] = model.predict_proba(X)[:, 1]

# Step 8: Save high-risk products (prob > 0.5)
high_risk = df[df["return_probability"] > 0.5]
# Optional: re-save to ensure the latest version
df.to_csv("ecommerce_data_with_return_scores.csv", index=False)


In [18]:
import os
print(os.path.exists("ecommerce_data_with_return_scores.csv"))


True


In [19]:
# Load the CSV back into a new DataFrame
df_check = pd.read_csv("ecommerce_data_with_return_scores.csv")

# Show the first 5 rows
print(df_check.head())

# Confirm it includes the return_probability column
print("🧪 Columns:", df_check.columns.tolist())


    user_id  product_id  category  price_rs.  discount_%  final_pricers.  \
0  337c166f  f414122f-e    Sports      36.53          15           31.05   
1  d38a19bf  fde50f9c-5  Clothing     232.79          20          186.23   
2  d7f5f0b0  0d96fc90-3    Sports     317.02          25          237.76   
3  395d4994  964fc44b-d      Toys     173.19          25          129.89   
4  a83c145c  d70e2fc6-e    Beauty     244.80          20          195.84   

  payment_method purchase_date  purchase_month  purchase_dayofweek  \
0    Net Banking    2024-11-12              11                   1   
1    Net Banking    2024-02-09               2                   4   
2    Credit Card    2024-09-01               9                   6   
3            UPI    2024-04-01               4                   0   
4    Net Banking    2024-09-27               9                   4   

   is_returned  return_probability  
0            0            0.154170  
1            1            0.124545  
2          

In [20]:
print(df_check["return_probability"].describe())


count    3660.000000
mean        0.150077
std         0.029825
min         0.078157
25%         0.129057
50%         0.148490
75%         0.167471
max         0.312319
Name: return_probability, dtype: float64


In [None]:
SSS