In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE


# 1. Load the raw e-commerce sales export
df = pd.read_csv("/content/ecommerce_sales_34500.csv")

# 2. Discard every column whose name mentions "date" (case-insensitive match)
date_cols = list(filter(lambda name: "date" in name.lower(), df.columns))
df = df.drop(date_cols, axis=1)

# 3. Replace the "returned" target with integer class codes. The values are
#    cast to text first, and the fitted encoder is kept in `le_returned` so
#    the codes can be mapped back to the original labels via `classes_`.
le_returned = LabelEncoder().fit(df["returned"].astype(str))
df["returned"] = le_returned.transform(df["returned"].astype(str))

# 4. Pick the text columns to encode: object-dtype columns (IDs and the
#    target excluded) in which at least one value contains a letter.
# NOTE(review): the values are cast with astype(str) before the regex check,
# so missing entries are presumably rendered as text (e.g. "nan") and
# `na=False` never applies -- confirm this is the intended treatment.
skip_id_cols = ["order_id", "customer_id", "product_id"]
alpha_cols = [
    col
    for col in df.columns
    if col not in skip_id_cols
    and col != "returned"
    and df[col].dtype == "object"
    and df[col].astype(str).str.contains(r"[A-Za-z]", regex=True, na=False).any()
]

# 5. Give each selected column its own LabelEncoder and overwrite the column
#    with the resulting integer codes; the fitted encoders stay available in
#    `label_encoders`, keyed by column name.
label_encoders = {col: LabelEncoder() for col in alpha_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col].astype(str))

# 6. Split features/label
label_col = "returned"
y = df[label_col]
X = df.drop(columns=[label_col])

# Remove ID columns BEFORE SMOTE so everything is numeric
X_model = X.drop(columns=skip_id_cols, errors="ignore")

# 7. Train-test split (stratified on the target, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X_model, y, test_size=0.2, random_state=42, stratify=y
)

# 8. Fill missing numeric values AFTER the split, using medians learned from
#    the training rows only. Computing the medians on the full frame before
#    splitting would let test-set statistics leak into the training data.
#    The same training medians are applied to the test rows.
#    Note: `df`, `X` and `X_model` are intentionally left un-imputed; only the
#    two splits below are filled.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# 9. Apply SMOTE on TRAIN
# NOTE(review): the label-encoded categorical columns are oversampled as if
# they were continuous, so synthetic rows can carry fractional category
# codes -- consider imblearn's SMOTENC if that matters downstream.
sm_train = SMOTE(random_state=42)
X_train_res, y_train_res = sm_train.fit_resample(X_train, y_train)

# 10. Apply SMOTE on TEST (as you requested)
# ⚠️ Not recommended for real evaluation, but done here as per your requirement:
# metrics computed on a resampled test set do not reflect real-world class balance.
sm_test = SMOTE(random_state=42)
X_test_res, y_test_res = sm_test.fit_resample(X_test, y_test)

# 11. Write the oversampled TRAIN split to CSV: feature columns in the order
#     of X_model, with the target appended as the final column
train_smote = pd.DataFrame(X_train_res, columns=X_model.columns).assign(
    **{label_col: y_train_res.values}
)
train_smote.to_csv("ecommerce_train_smote.csv", index=False)

# 12. Write the oversampled TEST split the same way (this replaces a plain,
#     un-resampled test CSV)
test_smote = pd.DataFrame(X_test_res, columns=X_model.columns).assign(
    **{label_col: y_test_res.values}
)
test_smote.to_csv("ecommerce_test_smote.csv", index=False)

print("✔ Saved: ecommerce_train_smote.csv and ecommerce_test_smote.csv")


✔ Saved: ecommerce_train_smote.csv and ecommerce_test_smote.csv
