In [1]:

# 1. Import Libraries

import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier


# 2. Read and Prepare Data

# Read the dataset
df = pd.read_csv('processed_data_final.csv')  # Replace with your filename

# Drop 'year' and 'customer_id'
df = df.drop(columns=['year', 'customer_id'])

# Separate features (X) and target (y)
# After this line
X = df.drop(columns=['churn'])
y = df['churn']

# Identify categorical columns
categorical_cols = ['gender', 'multi_screen', 'mail_subscribed']


# 3. Encode categorical features BEFORE SMOTE

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # save encoder if needed later

# Replace inf, -inf with NaN
X.replace([float('inf'), float('-inf')], pd.NA, inplace=True)

# Drop or fill NaN values
X = X.fillna(0).infer_objects(copy=False)


# 4. Train-Test Split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Apply SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


# 6. CatBoost Model Training

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    eval_metric='F1',
    early_stopping_rounds=50,
    verbose=100,
)

# Fit the model
model.fit(X_train_resampled, y_train_resampled)


# 7. Predict Probabilities
y_pred_proba = model.predict_proba(X_test)[:, 1]  # probability of class '1'



0:	learn: 0.6962516	total: 168ms	remaining: 2m 47s
100:	learn: 0.9062845	total: 824ms	remaining: 7.34s
200:	learn: 0.9622709	total: 1.44s	remaining: 5.72s
300:	learn: 0.9856322	total: 2.12s	remaining: 4.91s
400:	learn: 0.9938871	total: 2.77s	remaining: 4.13s
500:	learn: 0.9985612	total: 3.44s	remaining: 3.42s
600:	learn: 0.9996402	total: 4.1s	remaining: 2.72s
700:	learn: 1.0000000	total: 4.77s	remaining: 2.03s
800:	learn: 1.0000000	total: 5.46s	remaining: 1.36s
900:	learn: 1.0000000	total: 6.14s	remaining: 675ms
999:	learn: 1.0000000	total: 6.81s	remaining: 0us


In [3]:
model.save_model("catboost_churn_model_final1.cbm")

In [5]:
# ------------------------------
# 8. Threshold Tuning (with Accuracy and ROC-AUC)
# ------------------------------
from sklearn.metrics import accuracy_score, roc_auc_score

thresholds = [0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85]

print("Threshold | Precision | Recall | F1 Score | Accuracy | ROC-AUC")
print("------------------------------------------------------------------")
for thresh in thresholds:
    y_pred_thresh = (y_pred_proba >= thresh).astype(int)
    precision = precision_score(y_test, y_pred_thresh, zero_division=0)
    recall = recall_score(y_test, y_pred_thresh, zero_division=0)
    f1 = f1_score(y_test, y_pred_thresh, zero_division=0)
    accuracy = accuracy_score(y_test, y_pred_thresh)
    roc_auc = roc_auc_score(y_test, y_pred_thresh)
    print(f"{thresh:<10} | {precision:.2f}      | {recall:.2f}   | {f1:.2f}    | {accuracy:.2f}    | {roc_auc:.2f}")


Threshold | Precision | Recall | F1 Score | Accuracy | ROC-AUC
------------------------------------------------------------------
0.1        | 0.33      | 0.81   | 0.47    | 0.76    | 0.78
0.15       | 0.36      | 0.79   | 0.49    | 0.79    | 0.79
0.2        | 0.40      | 0.79   | 0.53    | 0.82    | 0.81
0.25       | 0.43      | 0.77   | 0.55    | 0.84    | 0.81
0.3        | 0.46      | 0.77   | 0.58    | 0.85    | 0.82
0.35       | 0.46      | 0.73   | 0.57    | 0.85    | 0.80
0.4        | 0.47      | 0.71   | 0.57    | 0.86    | 0.80
0.45       | 0.50      | 0.65   | 0.57    | 0.87    | 0.78
0.5        | 0.53      | 0.62   | 0.57    | 0.88    | 0.77
0.55       | 0.55      | 0.62   | 0.58    | 0.89    | 0.77
0.6        | 0.60      | 0.60   | 0.60    | 0.90    | 0.77
0.65       | 0.67      | 0.58   | 0.62    | 0.91    | 0.77
0.7        | 0.68      | 0.54   | 0.60    | 0.91    | 0.75
0.75       | 0.71      | 0.48   | 0.57    | 0.91    | 0.73
0.8        | 0.70      | 0.40   | 0.51    | 

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Step 1: Load your CSV
df = pd.read_csv('processed_data_final.csv')

# Step 2: Split features and target
X = df.drop('churn', axis=1)
y = df['churn']

# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Display the outputs
print("X_train:")
print(X_train.head())

print("\nX_test:")
print(X_test.head())

print("\ny_train:")
print(y_train.head())

print("\ny_test:")
print(y_test.head())

X_train:
     year  customer_id  gender  age  no_of_days_subscribed multi_screen  \
968  2020       553858    Male   29                     91           no   
240  2020       205057    Male   45                    155           no   
819  2020       484485  Female   32                    113           no   
692  2020       423786    Male   31                    149          yes   
420  2020       285996  Female   32                    157           no   

    mail_subscribed  weekly_mins_watched  minimum_daily_mins  \
968             yes               254.70                10.9   
240              no               305.10                 8.9   
819              no               290.70                10.6   
692             yes               190.35                 8.8   
420             yes               291.45                 7.1   

     maximum_daily_mins  ...  videos_watched  maximum_days_inactive  \
968               28.87  ...               4                      3   
240          