MR SN Ngobese 22356737

import libraries

In [40]:
import pandas as pd
import numpy as np

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

Load data

In [42]:
# Location of the Excel workbook.
# NOTE(review): the path looks like a Colab Google Drive mount; the drive must
# already be mounted before this cell runs (the mount step is not shown here).
file_path = "/content/drive/MyDrive/dataset_1/FashionDatasets/DataPenjualanFashion.xlsx"

# Ask for both sheets in a single read_excel call so the workbook is opened and
# parsed once instead of once per sheet. Passing a list of sheet names returns
# a dict of DataFrames keyed by sheet name.
sheets = pd.read_excel(file_path, sheet_name=['SalesItems', 'ProductItems'])
sales_df = sheets['SalesItems']
product_df = sheets['ProductItems']

Merge the two sheets side by side by row position (SalesItems has no product ID to join on)

In [43]:
# Put the sales rows and the product rows side by side, matching them purely
# by row position (both indexes are reset to 0..n-1 first).
# NOTE(review): this pairs sale i with product i without any shared key --
# confirm the two sheets really are row-aligned. If their lengths differ, the
# shorter side is padded with NaN by the outer alignment of concat.
frames = [frame.reset_index(drop=True) for frame in (sales_df, product_df)]
merged = pd.concat(frames, axis='columns')


Keep only relevant channels

In [44]:
# Keep only rows sold through the two channels being compared; .copy() detaches
# the result from the unfiltered frame so later edits do not touch it.
kept_channels = ['App Mobile', 'E-commerce']
in_kept_channel = merged['channel'].isin(kept_channels)
merged = merged.loc[in_kept_channel].copy()

Target variable

In [45]:
# Binary target: 1 = App Mobile, 0 = E-commerce (the only other channel left
# after the filter above).
y = merged['channel'].eq('App Mobile').astype(int)

features

In [46]:
# Columns offered to the models. The order is kept as-is because it determines
# the column order of X and therefore of the transformed feature matrix.
candidate_features = [
    'quantity',
    'original_price',
    'unit_price',
    'item_total',
    'category',
    'brand',
    'color',
    'size',
    'gender',
    'catalog_price',
    'cost_price',
]

# Independent copy so the imputation below does not write back into `merged`.
X = merged.loc[:, candidate_features].copy()

Fill in the missing values

In [47]:
# Numeric columns: replace missing values with that column's median.
# NOTE(review): the median is taken over every row, i.e. before the train/test
# split, so held-out rows influence the fill value (mild leakage). Consider
# imputing inside the model pipeline instead.
numeric_cols = X.select_dtypes(include=[np.number]).columns
for name in numeric_cols:
    column_median = X[name].median()
    X[name] = X[name].fillna(column_median)

# Non-numeric columns: mark missing values with an explicit 'missing' label so
# the one-hot encoder treats them as their own category.
non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns
for name in non_numeric_cols:
    X[name] = X[name].fillna('missing')

Train/test split

In [48]:
# Hold out 25% of the rows for evaluation. Stratifying on y keeps the
# App Mobile / E-commerce ratio the same in both parts; the fixed seed makes
# the split repeatable.
split_options = {
    'test_size': 0.25,
    'random_state': 42,
    'stratify': y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


Preprocessing

In [49]:
# Split the feature columns by dtype; each group gets its own transformer.
numeric_features = list(X.select_dtypes(include=[np.number]).columns)
cat_features = list(X.select_dtypes(exclude=[np.number]).columns)

# Numeric columns are standardised; categorical columns are one-hot encoded
# into a dense array. Categories that appear only at predict time are encoded
# as all zeros (handle_unknown='ignore') instead of raising.
numeric_scaler = StandardScaler()
categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_scaler, numeric_features),
        ('cat', categorical_encoder, cat_features),
    ]
)

Logistic Regression

In [50]:
# Logistic-regression baseline: preprocessing and classifier are chained so the
# scaler/encoder are fitted on the training rows only.
log_steps = [
    ('pre', preprocessor),
    ('clf', LogisticRegression(max_iter=1000)),
]
log_pipe = Pipeline(steps=log_steps)
log_pipe.fit(X_train, y_train)

# Hard labels for accuracy/report/confusion matrix; column 1 of predict_proba
# is the score for class 1 (App Mobile), used for ROC AUC.
y_pred_log = log_pipe.predict(X_test)
y_proba_log = log_pipe.predict_proba(X_test)[:, 1]

log_accuracy = accuracy_score(y_test, y_pred_log)
log_auc = roc_auc_score(y_test, y_proba_log)
log_report = classification_report(y_test, y_pred_log)
log_confusion = confusion_matrix(y_test, y_pred_log)

print("Logistic Regression Accuracy:", log_accuracy)
print("Logistic Regression ROC AUC:", log_auc)
print("Logistic Regression Classification Report:\n", log_report)
print("Logistic Regression Confusion Matrix:\n", log_confusion)


Logistic Regression Accuracy: 0.5797872340425532
Logistic Regression ROC AUC: 0.5471581678274121
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.91      0.69       293
           1       0.69      0.23      0.34       271

    accuracy                           0.58       564
   macro avg       0.63      0.57      0.52       564
weighted avg       0.62      0.58      0.52       564

Logistic Regression Confusion Matrix:
 [[266  27]
 [210  61]]


Random forest

In [51]:
# Random-forest model on the same preprocessing; 200 trees, fixed seed so the
# fitted forest is repeatable.
# NOTE(review): `preprocessor` is the same object already used by the
# logistic-regression pipeline, so fitting here refits it in place. Both fits
# use X_train, so the outcome is the same, but sklearn.base.clone() would
# decouple the two pipelines.
rf_steps = [
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42)),
]
rf_pipe = Pipeline(steps=rf_steps)
rf_pipe.fit(X_train, y_train)

# Hard labels for accuracy/report/confusion matrix; column 1 of predict_proba
# is the score for class 1 (App Mobile), used for ROC AUC.
y_pred_rf = rf_pipe.predict(X_test)
y_proba_rf = rf_pipe.predict_proba(X_test)[:, 1]

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, y_proba_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("\nRandom Forest Accuracy:", rf_accuracy)
print("Random Forest ROC AUC:", rf_auc)
print("Random Forest Classification Report:\n", rf_report)
print("Random Forest Confusion Matrix:\n", rf_confusion)


Random Forest Accuracy: 0.5336879432624113
Random Forest ROC AUC: 0.5542800649849502
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.57      0.56       293
           1       0.52      0.49      0.50       271

    accuracy                           0.53       564
   macro avg       0.53      0.53      0.53       564
weighted avg       0.53      0.53      0.53       564

Random Forest Confusion Matrix:
 [[168 125]
 [138 133]]


Feature importance from Random Forest

In [52]:
# Rebuild the names of the transformed columns so each importance can be
# labelled. The ColumnTransformer emits the 'num' block first and the 'cat'
# block second, so the names are concatenated in that order.
fitted_preprocessor = rf_pipe.named_steps['pre']
ohe = fitted_preprocessor.named_transformers_['cat']
ohe_features = list(ohe.get_feature_names_out(cat_features))
all_features = [*numeric_features, *ohe_features]

# One importance value per transformed column, taken from the fitted forest.
importances = rf_pipe.named_steps['clf'].feature_importances_

feature_importance_df = pd.DataFrame(
    {'feature': all_features, 'importance': importances}
).sort_values(by='importance', ascending=False)

print("\nTop 10 Important Features in Random Forest:")
print(feature_importance_df.head(10))


Top 10 Important Features in Random Forest:
              feature  importance
1      original_price    0.282625
3          item_total    0.272561
2          unit_price    0.267729
4       catalog_price    0.036898
5          cost_price    0.034990
0            quantity    0.023102
17          color_Red    0.005431
16        color_Green    0.005403
10  category_T-Shirts    0.005311
15         color_Blue    0.005238
