In [1]:
from src.setup.mlflow_setup import init_mlflow

MLflow initialization module imported.


In [25]:
import os
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
# Apply the "seaborn-v0_8" matplotlib style sheet to every figure created after this point.
plt.style.use("seaborn-v0_8")

In [26]:
# Run the project's MLflow setup helper imported from src.setup.mlflow_setup.
# NOTE(review): presumably this configures the tracking server used by the
# logging cell further down — confirm against the helper's source.
init_mlflow()

In [27]:
# Load the raw telecom dataset. The path is relative, so it is resolved
# against the kernel's current working directory.
raw_csv_path = '../data/raw/telecom.csv'
df = pd.read_csv(raw_csv_path)

In [28]:
# Engineered feature: day-call count divided by night-call count, rounded to 2 decimals.
# Division by zero yields +/-inf (x / 0) or NaN (0 / 0). Both are mapped to 0 so the
# column stays finite; a leftover NaN would make estimators that reject missing values
# (e.g. LogisticRegression) fail at fit time.
day_night_ratio = (df['total_day_calls'] / df['total_night_calls']).round(2)
df['ratio_day_night_calls'] = day_night_ratio.replace([np.inf, -np.inf], 0).fillna(0)

In [29]:
# Sanity check of the new feature: how often each rounded ratio value occurs
# (value_counts sorts by frequency, most common first).
df['ratio_day_night_calls'].value_counts()

ratio_day_night_calls
0.92    74
0.89    72
0.94    71
1.05    68
0.97    67
        ..
0.36     1
2.14     1
2.29     1
2.48     1
2.53     1
Name: count, Length: 206, dtype: int64

In [30]:
# Preview the first rows of the frame, including the ratio_day_night_calls column.
df.head()

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,...,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn,ratio_day_night_calls
0,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,...,16.62,254.4,103,11.45,13.7,3,3.7,1,no,1.19
1,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,...,16.62,254.4,103,11.45,13.7,3,3.7,1,no,1.19
2,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,...,10.3,162.6,104,7.32,12.2,5,3.29,0,no,1.1
3,OH,84,area_code_408,yes,no,0,299.4,71,50.9,61.9,...,5.26,196.9,89,8.86,6.6,7,1.78,2,no,0.8
4,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,...,12.61,186.9,121,8.41,10.1,3,2.73,3,no,0.93


In [31]:
# Print column dtypes and non-null counts to check for missing values
# and to see which columns are numeric vs. object (categorical).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4251 entries, 0 to 4250
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          4251 non-null   object 
 1   account_length                 4251 non-null   int64  
 2   area_code                      4251 non-null   object 
 3   international_plan             4251 non-null   object 
 4   voice_mail_plan                4251 non-null   object 
 5   number_vmail_messages          4251 non-null   int64  
 6   total_day_minutes              4251 non-null   float64
 7   total_day_calls                4251 non-null   int64  
 8   total_day_charge               4251 non-null   float64
 9   total_eve_minutes              4251 non-null   float64
 10  total_eve_calls                4251 non-null   int64  
 11  total_eve_charge               4251 non-null   float64
 12  total_night_minutes            4251 non-null   f

## Обучение модели

In [None]:
# Feature / target preparation and definition of the logistic-regression pipeline.
# The target and the four *_charge columns are excluded from the feature matrix.
# NOTE(review): presumably each charge is a deterministic function of the matching
# minutes column and therefore redundant — confirm.
drop_cols = ['churn', 'total_day_charge', 'total_eve_charge',
             'total_night_charge', 'total_intl_charge']

# The membership filter keeps the cell runnable if some of the columns are already absent.
X = df.drop(columns=[c for c in drop_cols if c in df.columns])

# Target mapping: accepts both string ('no'/'yes') and boolean encodings of churn.
y = df['churn'].map({'no': 0, 'yes': 1, False: 0, True: 1})

# Series.map turns every label that is missing from the dict into NaN. Fail loudly
# here instead of letting NaN targets reach train_test_split / the estimator.
if y.isna().any():
    unexpected_labels = df.loc[y.isna(), 'churn'].unique().tolist()
    raise ValueError(f"Unexpected values in 'churn' column: {unexpected_labels}")

# Split feature names by dtype so each group gets its own transformer.
numeric_features = X.select_dtypes(include=['number']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['number']).columns.tolist()

print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Stratified hold-out split (33% test) with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# Preprocessing
numerical_transformer = StandardScaler()

# handle_unknown='ignore' is critical if a new state or area code shows up in the test set.
# Combined with drop='first', an unknown category is encoded as all zeros, i.e. it is
# indistinguishable from the dropped (first) category.
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    verbose_feature_names_out=False
)

# Model parameters live in a dict so that the exact same values can be both
# passed to the estimator and logged.
params = {
    "solver": "liblinear",
    "class_weight": "balanced",
    "random_state": 42,
    "C": 1.0
}

# Unfitted pipeline: preprocessing followed by class-weighted logistic regression.
pipeline_weighted = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(**params))
])

Numeric features: 12
Categorical features: 4


In [41]:
import joblib
import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# --- 1. Random Forest hyper-parameters ---
params_rf = {
    "n_estimators": 100,
    "max_depth": 10,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1
}

# --- 2. Build the pipeline ---
# Pipeline does not copy its steps, so passing `preprocessor` directly would make this
# pipeline and pipeline_weighted share one ColumnTransformer instance: fitting either
# pipeline would silently refit the other's preprocessing. clone() returns an unfitted
# copy with identical settings, so each pipeline owns its own state.
pipeline_rf = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(**params_rf))
])

# --- 3. Training ---
pipeline_rf.fit(X_train, y_train)

# --- 4. Prediction with a custom decision THRESHOLD ---
# NOTE(review): how this value was chosen is not shown in this notebook. If it was
# tuned on X_test, the metrics reported below are optimistic — confirm.
threshold = 0.4621

# Predicted probability of churn (class 1)
y_proba = pipeline_rf.predict_proba(X_test)[:, 1]

# Apply the threshold: probability >= threshold -> 1, otherwise 0
y_pred_custom = (y_proba >= threshold).astype(int)

# --- 5. Metrics ---
acc_rf = accuracy_score(y_test, y_pred_custom)
f1_rf = f1_score(y_test, y_pred_custom)

print(f"RANDOM FOREST (–ø–æ—Ä–æ–≥: {threshold})")
print(f"Accuracy: {acc_rf:.4f}")
print(f"F1 Score: {f1_rf:.4f}")

print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred_custom))

print("\nCONFUSION MATRIX:")
cm = confusion_matrix(y_test, y_pred_custom)
print(cm)

# Persist the fitted pipeline. The threshold is NOT stored in the pickle, so any
# consumer has to apply it to predict_proba output itself.
joblib.dump(pipeline_rf, 'churn_rf_model_optimized.pkl')
print(f"\nModel saved. Remember to use threshold {threshold} during inference!")

RANDOM FOREST (–ø–æ—Ä–æ–≥: 0.4621)
Accuracy: 0.9266
F1 Score: 0.7431

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      1206
           1       0.73      0.76      0.74       197

    accuracy                           0.93      1403
   macro avg       0.85      0.86      0.85      1403
weighted avg       0.93      0.93      0.93      1403


CONFUSION MATRIX:
[[1151   55]
 [  48  149]]

Model saved. Remember to use threshold 0.4621 during inference!


In [None]:
mlflow.set_experiment("churn_classification")      # experiment name

with mlflow.start_run():

    # --- 1. Log model parameters ---
    # Values are taken from the `params` dict that configured the estimator, so the
    # logged parameters cannot drift from the ones actually used.
    mlflow.log_param("model", "LogisticRegression")
    for param_name, param_value in params.items():
        mlflow.log_param(param_name, param_value)

    # --- 2. Fit and predict ---
    # The pipeline is fitted here so the cell works on a fresh kernel; without this
    # call predict() depends on a fit performed in some other (hidden) cell.
    pipeline_weighted.fit(X_train, y_train)
    y_pred = pipeline_weighted.predict(X_test)

    # --- 3. Metrics ---
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)

    # Log the plain-text classification report as a run artifact
    report = classification_report(y_test, y_pred)
    with open("cls_report.txt", "w") as f:
        f.write(report)
    mlflow.log_artifact("cls_report.txt")

    # --- 4. Log the whole PIPELINE (preprocessing + classifier) ---
    mlflow.sklearn.log_model(
        sk_model=pipeline_weighted,
        name="model",
        registered_model_name="ChurnPipeline"  # optional
    )

    print("Model logged to MLflow!")

Registered model 'ChurnPipeline' already exists. Creating a new version of this model...
2025/11/16 22:14:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ChurnPipeline, version 12
Created version '12' of model 'ChurnPipeline'.


Model logged to MLflow!
üèÉ View run persistent-snail-80 at: http://94.228.117.198:5000/#/experiments/1/runs/3a5151f6410e4dc48bdbf865bf6b7c32
üß™ View experiment at: http://94.228.117.198:5000/#/experiments/1
