In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Step 1: Load data and save original copy
df_original = pd.read_csv("ecommerce_returns_synthetic_data.csv")
df = df_original.copy()

# Step 2: Encode target variable
df['Return_Status'] = df['Return_Status'].map({'Returned': 1, 'Not Returned': 0})

# Step 3: Drop non-numeric or ID-based features for training
df.drop(columns=[
    'Order_ID', 'Product_ID', 'User_ID', 'Order_Date',
    'Return_Date', 'Return_Reason', 'User_Location'
], inplace=True)

# Step 4: Handle missing values and one-hot encode
df['Days_to_Return'].fillna(0, inplace=True)
df = pd.get_dummies(df, columns=['Product_Category', 'User_Gender', 'Payment_Method', 'Shipping_Method'], drop_first=True)

# Step 5: Define features and target
X = df.drop('Return_Status', axis=1)
y = df['Return_Status']

# Step 6: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 7: Train-test split with index tracking
X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(
    X_scaled, y, df.index, test_size=0.2, random_state=42
)

# Step 8: Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 9: Predict probabilities for test set
return_probs = rf_model.predict_proba(X_test)[:, 1]

# Step 10: Fetch original rows for test set
df_test = df_original.loc[test_idx].copy()
df_test['return_probability'] = return_probs

# Step 11: Filter high-risk items
high_risk = df_test[df_test['return_probability'] > 0.7]

# Step 12: Export selected columns
high_risk[['Product_ID', 'Product_Category', 'User_Gender', 'Return_Status', 'return_probability']] \
    .to_csv('high_risk_products.csv', index=False)

print("✅ High-risk product list exported to 'high_risk_products.csv'")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Days_to_Return'].fillna(0, inplace=True)


✅ High-risk product list exported to 'high_risk_products.csv'
