In [8]:
import pandas as pd
from sqlalchemy import create_engine

# Connexion au Data Warehouse
user = "root"
password = "root"
host = "127.0.0.1"
port = 3307
db = "fashion_data_dw_project"

engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{db}")


In [9]:
sales = pd.read_sql("SELECT * FROM fact_sales", engine)
dim_orders_status = pd.read_sql("SELECT * FROM dim_orders_status", engine)
customers = pd.read_sql("SELECT * FROM dim_customers", engine)

print(sales.head())
print(dim_orders_status.head())
print(customers.head())


  code_sale  code_customer order_code        SKU code_ship_type  \
0   S244495           2060  ORD405664  118458034       SHIP1209   
1   S279244            125  ORD226537  118458034       SHIP4965   
2   S519880           1989  ORD231649  144993001       SHIP4965   
3   S395741            839  ORD613094  126589012       SHIP3159   
4   S558471           1179  ORD435118  118458004       SHIP4965   

  code_pricing_strategy code_payment_method  quantity               prod_name  \
0            TOPBAN0041             PAY1961         1    Jerry jogger bottoms   
1            BOTH&M0012             PAY0503         3    Jerry jogger bottoms   
2            OUTZAR0031             PAY1985         1  Mama 100 den 1p Tights   
3            OUTMAN0029             PAY4543         1                 2p Claw   
4            SHOANN0032             PAY5626         2    Jerry jogger bottoms   

   Estimated_Unit_Price  sale_date  is_discounted  
0                 24.99 2023-02-02              0  
1     

In [10]:
sales_status = sales.merge(
    dim_orders_status[['order_code', 'status_label']],
    on='order_code',
    how='left'
)


In [11]:
client_agg = sales_status.groupby('code_customer').agg(
    total_orders=('order_code', 'count'),
    total_cancelled=('status_label', lambda x: (x=='cancelled').sum()),
    avg_quantity=('quantity', 'mean'),
    avg_unit_price=('Estimated_Unit_Price', 'mean'),
    unique_products=('SKU', pd.Series.nunique)
).reset_index()

# Calculer total_not_cancelled
client_agg['total_not_cancelled'] = client_agg['total_orders'] - client_agg['total_cancelled']

# Calculer les taux
client_agg['cancel_rate'] = client_agg['total_cancelled'] / client_agg['total_orders']
client_agg['not_cancelled_rate'] = client_agg['total_not_cancelled'] / client_agg['total_orders']


In [12]:
client_df = client_agg.merge(
    customers[['code_customer', 'Age', 'Gender']],
    on='code_customer',
    how='left'
)


In [13]:
client_df.rename(columns=lambda x: x.strip().lower().replace(' ', '_'), inplace=True)


In [18]:
client_df['high_risk_cancelling_customer'] = 0

num_nonzero = (client_df['cancel_rate'] > 0).sum()

if num_nonzero > 0:
    nonzero_cancel = client_df[client_df['cancel_rate'] > 0]
    percentile_threshold = nonzero_cancel['cancel_rate'].quantile(0.80)
    client_df.loc[nonzero_cancel.index, 'high_risk_cancelling_customer'] = \
        nonzero_cancel['cancel_rate'].apply(lambda x: 1 if x >= percentile_threshold else 0)

num_high_risk_forced = max(1, int(0.05 * len(client_df)))  # 5% des clients
high_risk_indices = client_df.sample(num_high_risk_forced, random_state=42).index
client_df.loc[high_risk_indices, 'high_risk_cancelling_customer'] = 1

# Vérifier la répartition
print("Répartition high_risk_cancelling_customer :")
print(client_df['high_risk_cancelling_customer'].value_counts())


Répartition high_risk_cancelling_customer :
high_risk_cancelling_customer
0    3705
1     195
Name: count, dtype: int64


In [19]:
X = client_df.drop(['code_customer', 'cancel_rate', 'not_cancelled_rate', 'high_risk_cancelling_customer', 'gender'], axis=1)
X['gender_male'] = (client_df['gender'] == 'Male').astype(int)
y = client_df['high_risk_cancelling_customer']

# Standardisation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("Classes après SMOTE :", pd.Series(y_train_res).value_counts())


Classes après SMOTE : high_risk_cancelling_customer
0    2964
1    2964
Name: count, dtype: int64


In [23]:
models = {
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42)
}

for name, model in models.items():
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    
    print(f"\n===== {name} =====")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



===== XGBoost =====
Accuracy: 0.9256
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96       741
           1       0.09      0.05      0.06        39

    accuracy                           0.93       780
   macro avg       0.52      0.51      0.51       780
weighted avg       0.91      0.93      0.92       780

Confusion Matrix:
 [[720  21]
 [ 37   2]]

===== LogisticRegression =====
Accuracy: 0.5551
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.57      0.71       741
           1       0.04      0.33      0.07        39

    accuracy                           0.56       780
   macro avg       0.49      0.45      0.39       780
weighted avg       0.90      0.56      0.68       780

Confusion Matrix:
 [[420 321]
 [ 26  13]]

===== SVM =====
Accuracy: 0.6308
Classification Report:
               precision    recall  f1-score   support

           0  

In [24]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [26]:
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# scale_pos_weight
scale = len(y_train_res[y_train_res==0]) / len(y_train_res[y_train_res==1])
xgb = XGBClassifier(eval_metric='logloss', random_state=42, use_label_encoder=False, scale_pos_weight=scale)

# pour hyperparamètres
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3,5],
    'learning_rate': [0.01,0.1]
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_xgb = GridSearchCV(xgb, param_grid_xgb, scoring='f1', cv=cv, n_jobs=-1)
grid_xgb.fit(X_train_res, y_train_res)

# 4️ Évaluation
y_pred_xgb = grid_xgb.predict(X_test)
print("Best params:", grid_xgb.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Accuracy: 0.9153846153846154
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96       741
           1       0.03      0.03      0.03        39

    accuracy                           0.92       780
   macro avg       0.49      0.49      0.49       780
weighted avg       0.90      0.92      0.91       780

Confusion Matrix:
 [[713  28]
 [ 38   1]]


In [28]:
from sklearn.linear_model import LogisticRegression

# Oversampling avec SMOTE
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# class_weight='balanced'
lr = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')

# 3️ GridSearchCV
param_grid_lr = {'C':[0.1,1,10], 'penalty':['l2'], 'solver':['lbfgs']}
grid_lr = GridSearchCV(lr, param_grid_lr, scoring='f1', cv=cv, n_jobs=-1)
grid_lr.fit(X_train_res, y_train_res)

# 4️ Évaluation
y_pred_lr = grid_lr.predict(X_test)
print("Best params:", grid_lr.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))


Best params: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Accuracy: 0.5551282051282052
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.57      0.71       741
           1       0.04      0.33      0.07        39

    accuracy                           0.56       780
   macro avg       0.49      0.45      0.39       780
weighted avg       0.90      0.56      0.68       780

Confusion Matrix:
 [[420 321]
 [ 26  13]]


In [30]:
from sklearn.svm import SVC

#  Oversampling avec SMOTE
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# class_weight='balanced'
svm = SVC(kernel='rbf', probability=True, random_state=42, class_weight='balanced')

# GridSearchCV
param_grid_svm = {'C':[0.1,1,10], 'gamma':[0.01,0.1,1], 'kernel':['rbf']}
grid_svm = GridSearchCV(svm, param_grid_svm, scoring='f1', cv=cv, n_jobs=-1)
grid_svm.fit(X_train_res, y_train_res)

# 4️ Évaluation
y_pred_svm = grid_svm.predict(X_test)
print("Best params:", grid_svm.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


Best params: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Accuracy: 0.8474358974358974
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.89      0.92       741
           1       0.05      0.10      0.06        39

    accuracy                           0.85       780
   macro avg       0.50      0.49      0.49       780
weighted avg       0.90      0.85      0.87       780

Confusion Matrix:
 [[657  84]
 [ 35   4]]
