In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('cleaned_ecommerce_data.csv')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

In [4]:
le = LabelEncoder()
df['category_encoded'] = le.fit_transform(df['Product_Category'])
df['location_encoded'] = le.fit_transform(df['User_Location'])
df['payment_encoded'] = le.fit_transform(df['Payment_Method'])
df['shipping_encoded'] = le.fit_transform(df['Shipping_Method'])
df['gender_encoded'] = le.fit_transform(df['User_Gender'])

In [5]:
X = df[['category_encoded', 'location_encoded', 'payment_encoded', 'shipping_encoded', 'gender_encoded', 'User_Age', 'Product_Price', 'Order_Quantity', 'Discount_Applied', 'Days_to_Return']]
y = df['Returned']

In [6]:
print("Class distribution in full dataset:")
print(y.value_counts())
if y.nunique() < 2:
    print("Error: Target has only one class. Check 'Returned' column or dataset.")
    # Exit or handle (e.g., if all are 1, perhaps no model needed)
else:
    # Stratified split to ensure both classes in train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # Check train distribution
    print("Class distribution in y_train:")
    print(y_train.value_counts())

Class distribution in full dataset:
Returned
0    4948
1    2539
Name: count, dtype: int64
Class distribution in y_train:
Returned
0    3958
1    2031
Name: count, dtype: int64


In [7]:
  if y_train.value_counts().min() / y_train.value_counts().max() < 0.1:  # If minority < 10% of majority
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        print("Applied SMOTE. New y_train distribution:")
        print(y_train.value_counts())

 # Train model with class weighting (helps with imbalance)

In [9]:

model = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# Predict and evaluate

In [10]:

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
print("Classification Report:\n", classification_report(y_test, y_pred))
print("AUC Score:", roc_auc_score(y_test, y_pred_proba))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       990
           1       1.00      0.99      1.00       508

    accuracy                           1.00      1498
   macro avg       1.00      1.00      1.00      1498
weighted avg       1.00      1.00      1.00      1498

AUC Score: 0.9985385349558578


# Predict probabilities for all data

In [11]:

    df['return_probability'] = model.predict_proba(X)[:, 1]

# Save predictions

In [14]:
# Save predictions
df.to_csv('data_with_predictions.csv', index=False)