# Placeholder model generator for churn probability

uses moe random data from [RandomDataGenerator](../RandomDataGenerator.ipynb)

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

## getting data

In [2]:
clients = pd.read_csv('train_clients.csv')
policies = pd.read_csv('train_policies.csv')

In [3]:
clients.head(3)

Unnamed: 0,client_name,client_id,account_type,address,phone_number,email,branch_id,servicing_broker_id,status,issued_on,updated_on,avg_days_between_logins,avg_user_sentiment_score
0,Christian Hartman,C001611,Personal,"7112 James Gardens Apt. 690, Kennedyborough, M...",+1-553-830-9960x5930,christian.hartman@cervantes-rosario.info,B_US00695,S002088,Prospect,2005-05-31,2025-11-28,9.935357,7.094186
1,Elizabeth Chen,C002570,Commercial,"PSC 6588, Box 2008, APO AP 81074",560.265.0928,elizabeth.chen@sanchez.com,B_US00225,S001110,Inactive,2005-09-19,2025-11-28,3.926767,5.830084
2,Jennifer Payne,C003734,Personal,"945 Alexandria Motorway, Mirandabury, AS 74673",831-287-0777,jennifer.payne@torres.com,B_US00223,S006084,Inactive,2005-09-24,2025-11-28,18.233584,8.13792


In [4]:
policies.head(3)

Unnamed: 0,policy_id,client_id,line_of_business,issuing_carrier,issued_on,expiry_date,premium_amount,premium_bill_to,renewal_status,last_updated,broker_commision_pct
0,P007860,C001611,Exercise physiologist,Wagner Inc Insurance Inc.,2005-05-31,2025-05-31,1070.79796,Premium Finance Company,New,2025-11-28,10.779305
1,P005156,C002570,"Nurse, mental health","Perkins, Hicks and Mendoza Insurance Inc.",2005-09-19,2025-09-19,450.371676,Payroll Vendor,Renewal Quoted,2025-11-28,7.931847
2,P009640,C003734,Press sub,Rivera-Delacruz Insurance Inc.,2005-09-24,2025-09-24,661.982877,Subsidiary,Renewal Quoted,2025-11-28,7.888466


In [5]:
clients.issued_on = pd.to_datetime(clients.issued_on)
clients.updated_on = pd.to_datetime(clients.updated_on)

policies.expiry_date = pd.to_datetime(policies.expiry_date)
policies.last_updated = pd.to_datetime(policies.last_updated)
policies.issued_on = pd.to_datetime(policies.issued_on)

In [6]:
merged = pd.merge(
    left=clients,
    right=policies,
    how='right',
    on='client_id'
)
merged.head(3)

Unnamed: 0,client_name,client_id,account_type,address,phone_number,email,branch_id,servicing_broker_id,status,issued_on_x,updated_on,avg_days_between_logins,avg_user_sentiment_score,policy_id,line_of_business,issuing_carrier,issued_on_y,expiry_date,premium_amount,premium_bill_to,renewal_status,last_updated,broker_commision_pct
0,Christian Hartman,C001611,Personal,"7112 James Gardens Apt. 690, Kennedyborough, M...",+1-553-830-9960x5930,christian.hartman@cervantes-rosario.info,B_US00695,S002088,Prospect,2005-05-31,2025-11-28,9.935357,7.094186,P007860,Exercise physiologist,Wagner Inc Insurance Inc.,2005-05-31,2025-05-31,1070.79796,Premium Finance Company,New,2025-11-28,10.779305
1,Elizabeth Chen,C002570,Commercial,"PSC 6588, Box 2008, APO AP 81074",560.265.0928,elizabeth.chen@sanchez.com,B_US00225,S001110,Inactive,2005-09-19,2025-11-28,3.926767,5.830084,P005156,"Nurse, mental health","Perkins, Hicks and Mendoza Insurance Inc.",2005-09-19,2025-09-19,450.371676,Payroll Vendor,Renewal Quoted,2025-11-28,7.931847
2,Jennifer Payne,C003734,Personal,"945 Alexandria Motorway, Mirandabury, AS 74673",831-287-0777,jennifer.payne@torres.com,B_US00223,S006084,Inactive,2005-09-24,2025-11-28,18.233584,8.13792,P009640,Press sub,Rivera-Delacruz Insurance Inc.,2005-09-24,2025-09-24,661.982877,Subsidiary,Renewal Quoted,2025-11-28,7.888466


## Generating fake churn status

features: `avg_days_between_logins`, `avg_user_sentiment_score`, `premium_amount`, 

**methodology behind generating the fake churn (0/1) labels**
1. more the `premium_amount` more the likelihood of `churn_status` being 1
2. more the `avg_user_sentiment_score` less the likelohood of `churn_status` being 1
3. more the `avg_days_between_logins` more the likelihood of `churn_status` being 1

> probForChurn = (premium_amount * avg_days_between_logins)/(avg_user_sentiment_score)

Each qualtity is normalized to equate magnitude.

In [7]:
from random import choices
import numpy as np

def normalize(x: pd.Series) -> pd.Series:
    return (x - x.mean())/x.std()

def sigmoid(x: pd.Series) -> pd.Series:
    return 1/(1+np.exp(-x))

def get_status(row) -> int:
    w1, w2 = row
    return choices(['churn', 'not_churn'], [w1, w2])[0]

def churn_status(premium_amount: pd.Series, avg_user_sentiment_score:pd.Series, avg_days_between_logins:pd.Series) -> tuple:

    premium_amount = normalize(premium_amount)
    avg_user_sentiment_score = normalize(avg_user_sentiment_score)
    avg_days_between_logins = normalize(avg_days_between_logins)

    w_churn = pd.Series([0 for _ in range(premium_amount.shape[0])], dtype=np.float32)
    w_not_churn = pd.Series([0 for _ in range(premium_amount.shape[0])], dtype=np.float32)

    w_churn[premium_amount > 0] += 2*premium_amount[premium_amount > 0]
    w_churn[avg_days_between_logins > 0] += 2*avg_days_between_logins[avg_days_between_logins > 0]
    w_churn[avg_user_sentiment_score < 0] -= 2*avg_user_sentiment_score[avg_user_sentiment_score < 0]

    w_not_churn[premium_amount < 0] += premium_amount[premium_amount < 0]
    w_not_churn[avg_days_between_logins < 0] += avg_days_between_logins[avg_days_between_logins < 0]
    w_not_churn[avg_user_sentiment_score > 0] -= avg_user_sentiment_score[avg_user_sentiment_score > 0]

    return pd.concat((sigmoid(w_churn), sigmoid(w_not_churn)), axis=1).apply(get_status, axis=1)


In [8]:
merged['_churn_status'] = churn_status(merged.premium_amount, merged.avg_user_sentiment_score, merged.avg_days_between_logins)
merged.head(5)

 4.88391928e-01 9.08410286e-01 2.70200632e+00 3.33784109e-01
 1.70290792e-01 4.49511130e+00 3.39677432e-01 3.21678375e+00
 2.57833200e+00 1.59586644e+00 3.88855679e+00 1.46215367e-01
 6.71487611e+00 8.32552531e-01 2.08296789e+00 6.00742965e+00
 3.20926748e-02 3.49839292e+00 1.54181495e-01 3.02857557e+00
 2.26551972e-01 3.34501587e+00 2.23356504e+00 1.85683508e+00
 2.22933115e-01 1.27378960e+00 7.64556203e-02 3.07134715e+00
 2.57286241e+00 3.72220317e-01 2.79426228e-01 2.30806070e-01
 8.38465215e-01 1.37123156e+00 1.33318826e+00 2.99828700e+00
 4.64846064e+00 7.24949593e-01 1.22355857e-04 1.46852283e+00
 1.65477054e+00 9.21559789e-01 3.38948729e-01 6.16625385e+00
 1.22800360e+00 8.16355841e+00 5.13632393e-01 9.48840695e-01
 1.72537795e+00 4.49488429e+00 4.98619223e+00 1.28183620e+00
 1.35778297e+00 1.12449350e+00 8.57869130e-01 7.05173116e+00
 3.88027182e-01 3.29354232e+00 1.21855952e+00 3.75784681e+00
 1.80601222e+00 4.57875153e-01 1.07650967e+00 1.36496224e+00
 1.15748323e+00 1.263984

Unnamed: 0,client_name,client_id,account_type,address,phone_number,email,branch_id,servicing_broker_id,status,issued_on_x,updated_on,avg_days_between_logins,avg_user_sentiment_score,policy_id,line_of_business,issuing_carrier,issued_on_y,expiry_date,premium_amount,premium_bill_to,renewal_status,last_updated,broker_commision_pct,_churn_status
0,Christian Hartman,C001611,Personal,"7112 James Gardens Apt. 690, Kennedyborough, M...",+1-553-830-9960x5930,christian.hartman@cervantes-rosario.info,B_US00695,S002088,Prospect,2005-05-31,2025-11-28,9.935357,7.094186,P007860,Exercise physiologist,Wagner Inc Insurance Inc.,2005-05-31,2025-05-31,1070.79796,Premium Finance Company,New,2025-11-28,10.779305,churn
1,Elizabeth Chen,C002570,Commercial,"PSC 6588, Box 2008, APO AP 81074",560.265.0928,elizabeth.chen@sanchez.com,B_US00225,S001110,Inactive,2005-09-19,2025-11-28,3.926767,5.830084,P005156,"Nurse, mental health","Perkins, Hicks and Mendoza Insurance Inc.",2005-09-19,2025-09-19,450.371676,Payroll Vendor,Renewal Quoted,2025-11-28,7.931847,churn
2,Jennifer Payne,C003734,Personal,"945 Alexandria Motorway, Mirandabury, AS 74673",831-287-0777,jennifer.payne@torres.com,B_US00223,S006084,Inactive,2005-09-24,2025-11-28,18.233584,8.13792,P009640,Press sub,Rivera-Delacruz Insurance Inc.,2005-09-24,2025-09-24,661.982877,Subsidiary,Renewal Quoted,2025-11-28,7.888466,churn
3,Jasmine Baker,C002092,Commercial,"274 Steven Lock Suite 362, West Claudiatown, I...",001-599-416-3402x661,jasmine.baker@ramirez.net,B_US00256,S008898,Prospect,2005-10-02,2025-11-28,27.16364,5.790689,P009660,Police officer,Russell Group Insurance Inc.,2005-10-02,2025-10-02,216.988832,Insured,Canceled,2025-11-28,8.70198,churn
4,Phillip Wall,C006646,Commercial,"46444 Jeffery Tunnel, Ashleyview, MH 44727",982.970.9859x35037,phillip.wall@sharp.com,B_US00695,S006383,Prospect,2005-06-14,2025-11-28,8.474849,7.216465,P007267,Press sub,"Young, Good and Scott Insurance Inc.",2005-06-14,2025-06-14,344.057155,Third-Party Billing,Renewal Quoted,2025-11-28,7.532532,churn


In [9]:
choices(['churn', 'not_churn'], [1, 2])

['not_churn']

## Training the Churn Predictor Model

In [10]:
merged.columns

Index(['client_name', 'client_id', 'account_type', 'address', 'phone_number',
       'email', 'branch_id', 'servicing_broker_id', 'status', 'issued_on_x',
       'updated_on', 'avg_days_between_logins', 'avg_user_sentiment_score',
       'policy_id', 'line_of_business', 'issuing_carrier', 'issued_on_y',
       'expiry_date', 'premium_amount', 'premium_bill_to', 'renewal_status',
       'last_updated', 'broker_commision_pct', '_churn_status'],
      dtype='object')

In [26]:
X = merged[[
    # 'client_name', 'client_id', 'account_type', 'address', 'phone_number',
    #    'email', 'branch_id', 'servicing_broker_id', 'status', 'issued_on_x',
    #    'updated_on', 
       'avg_days_between_logins', 'avg_user_sentiment_score',
    #    'policy_id', 'line_of_business', 'issuing_carrier', 'issued_on_y',
    #    'expiry_date', 
       'premium_amount', 
    #    'premium_bill_to', 'renewal_status',
    #    'last_updated', 'broker_commision_pct', '_churn_status'
       ]]
X.head(3)

Unnamed: 0,avg_days_between_logins,avg_user_sentiment_score,premium_amount
0,9.935357,7.094186,1070.79796
1,3.926767,5.830084,450.371676
2,18.233584,8.13792,661.982877


In [27]:
y = merged['_churn_status']
y.head(3)

0    churn
1    churn
2    churn
Name: _churn_status, dtype: object

In [28]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
XTrain = scaler.fit_transform(X)
yTrain = pd.get_dummies(y, dtype=float)

In [14]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()

XTrain, XTest, yTrain, yTest = train_test_split(XTrain, yTrain, train_size=0.8)
XTrain, yTrain = rus.fit_resample(XTrain, yTrain)
yTrain = pd.get_dummies(yTrain.reshape(-1), dtype=float)

In [15]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(XTrain, yTrain)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [16]:
def predict(X: np.ndarray) -> pd.Series:
    pred = model.predict(X)
    classes = ['churn', 'not_churn']
    op = []
    for rowInd in range(X.shape[0]):
        op.append(classes[pred[rowInd, :].argmax()])
    return pd.Series(op)

def classify(y: np.ndarray) -> pd.Series:
    op = []
    classes = ['churn', 'not_churn']
    for rowInd in range(y.shape[0]):
        op.append(classes[y[rowInd, :].argmax()])
    return pd.Series(op)

In [17]:
from sklearn.metrics import f1_score
f1_score(yTest.argmax(axis=1), model.predict(XTest).argmax(axis=1))

0.3973509933774834

In [18]:
import joblib 
joblib.dump(model, 'churn_predictor.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [33]:
model.predict_proba(XTest)[:, 1] * 10

array([1.7838560e-01, 9.4745731e+00, 5.3619266e+00, 8.0652943e+00,
       7.9614258e-01, 9.7221365e+00, 2.8148561e+00, 4.6889591e-01,
       8.9978046e+00, 7.9369345e+00, 9.2423630e+00, 6.4359179e+00,
       6.0759687e+00, 9.2814865e+00, 8.8178263e+00, 9.8050470e+00,
       8.0850563e+00, 9.1263609e+00, 9.9046421e+00, 7.2605824e+00,
       1.9680122e-01, 4.4296722e+00, 1.4608705e+00, 6.2157321e+00,
       5.3179188e+00, 8.9981537e+00, 1.3860642e+00, 8.8434095e+00,
       1.6250536e+00, 1.7512046e+00, 8.9732399e+00, 4.3919058e+00,
       4.5014140e-01, 2.4516611e+00, 9.8093433e+00, 1.5991926e+00,
       2.3494742e+00, 7.4181151e+00, 7.0667095e+00, 9.2058744e+00,
       5.8664818e+00, 5.7394414e+00, 3.0201342e+00, 7.5619650e+00,
       4.6287756e+00, 2.4676883e-01, 7.5864702e-01, 1.8946315e+00,
       9.3773532e-01, 9.6860905e+00, 3.9230943e+00, 1.4457142e+00,
       3.5881104e+00, 9.5446110e+00, 8.4673328e+00, 8.3141251e+00,
       9.0448961e+00, 6.2874050e+00, 9.5418234e+00, 2.6413140e

## Getting Feature Importances

In [36]:
from datetime import datetime, timedelta
today = datetime(
    year=2025,
    month=11,
    day=28
)

In [37]:
merged['_time_to_expiry_days'] = (today - merged.expiry_date).apply(lambda x: x.days)
merged['_time_to_expiry_days']

0      181
1       70
2       65
3       57
4      167
      ... 
995    281
996     87
997    172
998     68
999     69
Name: _time_to_expiry_days, Length: 1000, dtype: int64

In [42]:
import os
os.chdir('../')

In [43]:
from prioritization import not_churn_prob_score
merged['not_churn_prob'] = not_churn_prob_score(merged.premium_amount,
                                            merged.avg_days_between_logins,
                                            merged.avg_user_sentiment_score)



In [44]:
merged.columns

Index(['client_name', 'client_id', 'account_type', 'address', 'phone_number',
       'email', 'branch_id', 'servicing_broker_id', 'status', 'issued_on_x',
       'updated_on', 'avg_days_between_logins', 'avg_user_sentiment_score',
       'policy_id', 'line_of_business', 'issuing_carrier', 'issued_on_y',
       'expiry_date', 'premium_amount', 'premium_bill_to', 'renewal_status',
       'last_updated', 'broker_commision_pct', '_churn_status',
       '_time_to_expiry_days', 'not_churn_prob'],
      dtype='object')

In [62]:
X = merged[[
      # 'client_name', 'client_id', 'account_type', 'address', 'phone_number',
      #  'email', 'branch_id', 'servicing_broker_id', 'status', 'issued_on_x',
      #  'updated_on', 
       'avg_days_between_logins', 'avg_user_sentiment_score',
      #  'policy_id', 'line_of_business', 'issuing_carrier', 'issued_on_y',
      #  'expiry_date', 
       'premium_amount', 
      #  'premium_bill_to', 'renewal_status',
      #  'last_updated', 'broker_commision_pct', '_churn_status',
       '_time_to_expiry_days', 'not_churn_prob'
       ]]

In [63]:
y = merged['_churn_status']

In [64]:
rf_scaler = StandardScaler()
XTrain = rf_scaler.fit_transform(X)
yTrain = pd.get_dummies(y, dtype=float).to_numpy()

In [65]:
rf_rus = RandomUnderSampler()
XTrain, yTrain = rf_rus.fit_resample(XTrain, yTrain)
yTrain = pd.get_dummies(yTrain.reshape(-1), dtype=float).to_numpy()

In [66]:
XTrain, XTest, yTrain, yTest = train_test_split(XTrain, yTrain, train_size=0.8)

In [67]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()

model.fit(XTrain, yTrain)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [68]:
f1_score(yTest.argmax(axis=1), model.predict(XTest).argmax(axis=1))

0.574468085106383

## Feature Weights for GPA

In [71]:
pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

avg_days_between_logins     0.228554
premium_amount              0.210162
avg_user_sentiment_score    0.205210
not_churn_prob              0.201903
_time_to_expiry_days        0.154170
dtype: float32