In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load your labeled dataset
df = pd.read_csv("ecommerce_data_with_returns.csv")

# Define input features and target
features = ["category", "price_rs.", "discount_%", "final_pricers.", 
            "payment_method", "purchase_month", "purchase_dayofweek"]
target = "is_returned"

# Prepare feature dataframe
df_model = df[features + [target]]

# One-hot encode categorical columns
categorical_cols = ["category", "payment_method"]
encoder = OneHotEncoder(drop="first", sparse_output=False)
X_cat = encoder.fit_transform(df_model[categorical_cols])
X_cat = pd.DataFrame(X_cat, columns=encoder.get_feature_names_out(categorical_cols))

# Combine encoded categorical and numeric features
X_num = df_model.drop(columns=categorical_cols + [target])
X = pd.concat([X_num.reset_index(drop=True), X_cat.reset_index(drop=True)], axis=1)
y = df_model[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("✅ Feature engineering and split completed.")
print(f"Shape of X_train: {X_train.shape}, y_train: {y_train.shape}")


✅ Feature engineering and split completed.
Shape of X_train: (2928, 15), y_train: (2928,)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Reuse the X_train and y_train already created
model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Optional: evaluate to confirm it's working
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("🎯 Model trained.")


🎯 Model trained.


In [8]:
# Now predict for the full dataset
df['return_probability'] = model.predict_proba(X)[:, 1]

# Save outputs
df.to_csv("ecommerce_data_with_return_scores.csv", index=False)
df[df['return_probability'] > 0.5].to_csv("high_risk_products.csv", index=False)

print("✅ Return scores added and files saved.")


✅ Return scores added and files saved.
