<a href="https://colab.research.google.com/github/Wajiha-ui/updated-size-recommendation/blob/main/updated_size_recommendation_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
import joblib
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# Generate synthetic dataset
def generate_synthetic_data(num_entries=5000):
    data = []
    genders = ["Male", "Female"]
    age_groups = ["18-24", "25-34", "35-44", "45-54", "55+"]
    body_shapes = ["Slim", "Athletic", "Regular", "Plus-size"]
    fit_preferences = ["Slim", "Regular", "Loose"]
    clothing_types = ["T-Shirt", "Hoodie", "Dress", "Jacket", "Pants"]
    materials = ["Cotton", "Wool", "Silk", "Polyester", "Linen"]
    sizes = ["XS", "S", "M", "L", "XL", "XXL"]
    regions = ["Europe", "USA", "Asia"]

    for _ in range(num_entries):
        data.append({
            "Gender": random.choice(genders),
            "Age Group": random.choice(age_groups),
            "Height (cm)": random.randint(150, 200),
            "Weight (kg)": random.randint(45, 120),
            "Chest (cm)": random.randint(75, 130),
            "Waist (cm)": random.randint(60, 110),
            "Hips (cm)": random.randint(80, 130),
            "Body Shape": random.choice(body_shapes),
            "Preferred Fit": random.choice(fit_preferences),
            "Clothing Type": random.choice(clothing_types),
            "Brand Size": random.choice(sizes),
            "Material Preference": random.choice(materials),
            "Country/Region": random.choice(regions)
        })
    return pd.DataFrame(data)

# Generate and preprocess dataset
df = generate_synthetic_data(5000)
df["BMI"] = df["Weight (kg)"] / ((df["Height (cm)"] / 100) ** 2)
df["Chest_Waist_Ratio"] = df["Chest (cm)"] / df["Waist (cm)"]
df["Waist_Hips_Ratio"] = df["Waist (cm)"] / df["Hips (cm)"]

# Encode categorical variables
df = pd.get_dummies(df, drop_first=True)

# Identify the one-hot encoded 'Brand Size' columns
brand_size_cols = [col for col in df.columns if col.startswith("Brand Size_")]
X = df.drop(columns=brand_size_cols)
y = df[brand_size_cols].idxmax(axis=1).str.replace("Brand Size_", "")

# Encode target labels numerically
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Balance the dataset with SMOTE
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Standardize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
drf_model = RandomForestClassifier(n_estimators=500, max_depth=30, random_state=42)
xgb_model = xgb.XGBClassifier(objective="multi:softmax", num_class=len(np.unique(y)), learning_rate=0.05, max_depth=10, n_estimators=300, random_state=42)
lgb_model = lgb.LGBMClassifier(num_leaves=31, learning_rate=0.05, n_estimators=300, random_state=42)
nn_model = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', max_iter=500, random_state=42)

# Train models
models = {"RandomForest": drf_model, "XGBoost": xgb_model, "LightGBM": lgb_model, "NeuralNetwork": nn_model}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"📊 {name} Accuracy: {acc * 100:.2f}%")

# Stacking Model
stacking_clf = StackingClassifier(
    estimators=[("RandomForest", drf_model), ("XGBoost", xgb_model), ("LightGBM", lgb_model)],
    final_estimator=nn_model
)

# Train and evaluate stacking model
stacking_clf.fit(X_train, y_train)
y_pred_stack = stacking_clf.predict(X_test)
stacking_accuracy = accuracy_score(y_test, y_pred_stack)
print(f"🔗 Stacking Ensemble Accuracy: {stacking_accuracy * 100:.2f}%")

# Save best model
best_model = max(models.items(), key=lambda x: accuracy_score(y_test, x[1].predict(X_test)))[1]
joblib.dump(best_model, "best_size_model.pkl")
print("✅ Best model saved successfully!")

📊 RandomForest Accuracy: 60.14%
📊 XGBoost Accuracy: 56.27%
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1115
[LightGBM] [Info] Number of data points in the train set: 6512, number of used features: 28
[LightGBM] [Info] Start training from score -1.607444
[LightGBM] [Info] Start training from score -1.613593
[LightGBM] [Info] Start training from score -1.612052
[LightGBM] [Info] Start training from score -1.619780
[LightGBM] [Info] Start training from score -1.594501




📊 LightGBM Accuracy: 51.54%
📊 NeuralNetwork Accuracy: 37.65%




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001952 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1115
[LightGBM] [Info] Number of data points in the train set: 6512, number of used features: 28
[LightGBM] [Info] Start training from score -1.607444
[LightGBM] [Info] Start training from score -1.613593
[LightGBM] [Info] Start training from score -1.612052
[LightGBM] [Info] Start training from score -1.619780
[LightGBM] [Info] Start training from score -1.594501




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001621 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1115
[LightGBM] [Info] Number of data points in the train set: 5209, number of used features: 28
[LightGBM] [Info] Start training from score -1.607328
[LightGBM] [Info] Start training from score -1.614056
[LightGBM] [Info] Start training from score -1.611167
[LightGBM] [Info] Start training from score -1.619859
[LightGBM] [Info] Start training from score -1.594953




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001550 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1115
[LightGBM] [Info] Number of data points in the train set: 5209, number of used features: 28
[LightGBM] [Info] Start training from score -1.607328
[LightGBM] [Info] Start training from score -1.614056
[LightGBM] [Info] Start training from score -1.612129
[LightGBM] [Info] Start training from score -1.618889
[LightGBM] [Info] Start training from score -1.594953




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1115
[LightGBM] [Info] Number of data points in the train set: 5210, number of used features: 28
[LightGBM] [Info] Start training from score -1.607520
[LightGBM] [Info] Start training from score -1.613284
[LightGBM] [Info] Start training from score -1.612321
[LightGBM] [Info] Start training from score -1.620051
[LightGBM] [Info] Start training from score -1.594200




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001573 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1115
[LightGBM] [Info] Number of data points in the train set: 5210, number of used features: 28
[LightGBM] [Info] Start training from score -1.607520
[LightGBM] [Info] Start training from score -1.613284
[LightGBM] [Info] Start training from score -1.612321
[LightGBM] [Info] Start training from score -1.620051
[LightGBM] [Info] Start training from score -1.594200




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1115
[LightGBM] [Info] Number of data points in the train set: 5210, number of used features: 28
[LightGBM] [Info] Start training from score -1.607520
[LightGBM] [Info] Start training from score -1.613284
[LightGBM] [Info] Start training from score -1.612321
[LightGBM] [Info] Start training from score -1.620051
[LightGBM] [Info] Start training from score -1.594200




🔗 Stacking Ensemble Accuracy: 61.12%




✅ Best model saved successfully!


In [2]:
joblib.dump(best_model, "best_size_model.pkl")


['best_size_model.pkl']

In [3]:
from google.colab import files
files.download("best_size_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>