# XGB Model for predicting Churn 

## Importing Libraries

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

## Getting the data

In [2]:
import os 
os.chdir('../fake_CRM_data/')
data = pd.read_csv('placements.csv')
os.chdir('../models/')

In [3]:
data.head()

Unnamed: 0,Client,Placement Client Local ID,Placement Name,Coverage,Product Line,Carrier Group,Placement Created Date/Time,Placement Created By,Placement Created By (ID),Response Received Date,Placement Specialist,Placement Renewing Status,Placement Status,Declination Reason,Placement Id,Placement Effective Date,Placement Expiry Date,Incumbent Indicator,Participation Status Code,Placement Client Segment Code,Placement Renewing Status Code,Limit,Coverage Premium Amount,Tria Premium,Total Premium,Comission %,Comission Amount,Participation Percentage,Carrier Group Local ID,Production Code,Submission Sent Date,Program Product Local Code Text,Approach Non Admitted Market Indicator,Carrier Integration
0,Global Technologies,SCR-0b810b6f4c20,SCR-8d9f15ee3a3c,General Liability,Energy and Power,Eastern Risk Management,2025-04-24T06:37:09.314837765,Kimberly Jackson,SCR-c54656cdfecb,29/07/25,Mary Jackson,In progress,Quote,-,SCR-76fd0b40a1cb,30/09/25,30/09/26,N,QUOTATION_STATUS_QUOTED,CLIENT_SEGMENT_RISK_MGMT,RENEWAL_STATUS_IN_PROGRESS,3558700.0,65304.28,1881.62,67311.79,8,5760.41,100,498,PRODUCTION_TYPE_NEW,-,SCR-262eac00ad8f,N,Not Applicable
1,Apex Enterprises,SCR-cfcd65ae0f7a,SCR-6b2e08c3f11d,General Liability,Casualty,Liberty Insurance Group,2025-04-25T15:21:40.575782415,Matthew Johnson,SCR-d47a7dd7c8ff,29/07/25,Donald Martin,In progress,Quote,-,SCR-07c81694713d,15/09/25,15/09/26,Y,QUOTATION_STATUS_QUOTED,CLIENT_SEGMENT_MIDDLE_MKT,RENEWAL_STATUS_IN_PROGRESS,82254400.0,1441557.71,55830.17,1581997.56,15,282637.79,100,815,PRODUCTION_TYPE_RENEWAL,-,SCR-af66d3bd1f7b,Y,Not Applicable
2,Omega Global,SCR-51dd14cf0f45,SCR-6d2b42de057e,General Liability,Casualty,United Coverage Corp,2025-03-13T18:03:34.574937777,Robert Young,SCR-0b51d24e469e,-,Michelle Anderson,In progress,Submitted,-,SCR-b422393095aa,15/09/25,15/09/26,Y,QUOTATION_STATUS_SUBMITTED,CLIENT_SEGMENT_MIDDLE_MKT,RENEWAL_STATUS_IN_PROGRESS,225000.0,3279.01,0.0,3256.21,17,645.63,100,3093,PRODUCTION_TYPE_RENEWAL,29/07/25,SCR-af66d3bd1f7b,N,Not Applicable
3,Omega Global,SCR-51dd14cf0f45,SCR-6d2b42de057e,General Liability,Casualty,United Coverage Corp,2025-03-13T18:03:34.574937777,Robert Young,SCR-0b51d24e469e,-,Michelle Anderson,In progress,Submitted,-,SCR-b422393095aa,15/09/25,15/09/26,Y,QUOTATION_STATUS_SUBMITTED,CLIENT_SEGMENT_MIDDLE_MKT,RENEWAL_STATUS_IN_PROGRESS,225000.0,0.0,0.0,0.0,17,0.0,100,3093,PRODUCTION_TYPE_RENEWAL,29/07/25,SCR-af66d3bd1f7b,N,Not Applicable
4,Omega Global,SCR-51dd14cf0f45,SCR-6d2b42de057e,General Liability,Casualty,Northern Trust Insurance,2025-03-13T18:03:34.574937777,Robert Young,SCR-0b51d24e469e,-,Michelle Anderson,In progress,Submitted,-,SCR-b422393095aa,15/09/25,15/09/26,N,QUOTATION_STATUS_SUBMITTED,CLIENT_SEGMENT_MIDDLE_MKT,RENEWAL_STATUS_IN_PROGRESS,225000.0,0.0,0.0,0.0,17,0.0,100,4584,PRODUCTION_TYPE_RENEWAL,30/07/25,SCR-af66d3bd1f7b,N,Not Applicable


## Preprocessing

### to camel case

In [4]:
def camelCase(names: list) -> list:
  return [name\
          .strip() \
          .replace('_', ' ') \
        #   .title() \
          .replace(' ', '') \
          .replace('(', '_') \
          .replace(')', '') \
          for name in names]

data.columns = camelCase(data.columns)
data.rename({'PlacementCreatedDate/Time': 'PlacemenCreatedDatetime'}, inplace=True)

### casting '-' with NA

In [5]:
for col in data.columns:
    data[col] = data[col].replace('-', pd.NA)

### to datetime 

In [6]:
def setColsToDatetime(df: pd.DataFrame, cols: list) -> None:
    for col in cols:
        df[col] = pd.to_datetime(df[col], dayfirst=True, errors='ignore')

In [7]:
setColsToDatetime(
    df=data, 
    cols=\
        ['ResponseReceivedDate', 'PlacementEffectiveDate',
       'PlacementExpiryDate', 'SubmissionSentDate']
)

  df[col] = pd.to_datetime(df[col], dayfirst=True, errors='ignore')
  df[col] = pd.to_datetime(df[col], dayfirst=True, errors='ignore')
  df[col] = pd.to_datetime(df[col], dayfirst=True, errors='ignore')
  df[col] = pd.to_datetime(df[col], dayfirst=True, errors='ignore')
  df[col] = pd.to_datetime(df[col], dayfirst=True, errors='ignore')
  df[col] = pd.to_datetime(df[col], dayfirst=True, errors='ignore')
  df[col] = pd.to_datetime(df[col], dayfirst=True, errors='ignore')
  df[col] = pd.to_datetime(df[col], dayfirst=True, errors='ignore')


## Adding Columns

In [8]:
data['_DaysToExpiry'] = (data.PlacementExpiryDate - data.PlacementEffectiveDate).apply(lambda x: x.days)
data['_CarrierResponseTime'] = (data.ResponseReceivedDate - data.SubmissionSentDate).apply(lambda x: x.days)

### Churn Status

If at least one Bound placement for that client in that renewal year → churn = 0
Else → churn = 1

In [9]:
clients = data.PlacementClientLocalID.unique()
data['_ChurnStatus'] = pd.Series([pd.NA for _ in range(data.shape[0])], dtype='str')
for client in clients:
    clientMask = (data.PlacementClientLocalID == client)
    years = data[clientMask].PlacementExpiryDate.apply(lambda x: x.year).unique()
    status = ''
    
    for year in years:
        yearMask = (data.PlacementExpiryDate.apply(lambda x: x.year) == year)
        count = data[clientMask & yearMask].shape[0]
        if 'QUOTATION_STATUS_BOUND' in data[clientMask & yearMask].ParticipationStatusCode.to_list():            
            data.loc[clientMask & yearMask, '_ChurnStatus'] = 'N'
        else:
            data.loc[clientMask & yearMask, '_ChurnStatus'] = 'C'

### Client Past Performance

In [10]:
def fracRenewed(group):
    x = group[group == 'N'].size
    y = group.size
    return x/(x+y)

fracPlacementsRenewedByClient = data.groupby(['PlacementClientLocalID'])['_ChurnStatus'].agg(fracRenewed)
# fracPlacementsRenewedByClient = fracPlacementsRenewedByClient.rename({'PlacementRenewingStatusCode':'FracPlacementsRenewed'})

In [11]:
clients = fracPlacementsRenewedByClient.index.unique()
data['_FracPlacementsRenewedByClient'] = pd.Series([0. for _ in range(data.shape[0])], dtype='float')
for client in clients:
    clientMask = (data.PlacementClientLocalID == client)
    data.loc[clientMask, '_FracPlacementsRenewedByClient'] = fracPlacementsRenewedByClient[client]

### Carrier Past Performance

In [12]:
def fracRenewed(group):
    x = group[group == 'N'].size
    y = group.size
    return x/(x+y)

fracPlacementsRenewedByCarrier = data.groupby(['CarrierGroupLocalID'])['_ChurnStatus'].agg(fracRenewed)

In [13]:
carriers = fracPlacementsRenewedByCarrier.index.unique()
data['_FracPlacementsRenewedByCarrier'] = pd.Series([0. for _ in range(data.shape[0])], dtype='float')
for carrier in carriers:
    carrierMask = (data.CarrierGroupLocalID == carrier)
    data.loc[carrierMask, '_FracPlacementsRenewedByCarrier'] = fracPlacementsRenewedByCarrier[carrier]

In [14]:
data.head()

Unnamed: 0,Client,PlacementClientLocalID,PlacementName,Coverage,ProductLine,CarrierGroup,PlacementCreatedDate/Time,PlacementCreatedBy,PlacementCreatedBy_ID,ResponseReceivedDate,PlacementSpecialist,PlacementRenewingStatus,PlacementStatus,DeclinationReason,PlacementId,PlacementEffectiveDate,PlacementExpiryDate,IncumbentIndicator,ParticipationStatusCode,PlacementClientSegmentCode,PlacementRenewingStatusCode,Limit,CoveragePremiumAmount,TriaPremium,TotalPremium,Comission%,ComissionAmount,ParticipationPercentage,CarrierGroupLocalID,ProductionCode,SubmissionSentDate,ProgramProductLocalCodeText,ApproachNonAdmittedMarketIndicator,CarrierIntegration,_DaysToExpiry,_CarrierResponseTime,_ChurnStatus,_FracPlacementsRenewedByClient,_FracPlacementsRenewedByCarrier
0,Global Technologies,SCR-0b810b6f4c20,SCR-8d9f15ee3a3c,General Liability,Energy and Power,Eastern Risk Management,2025-04-24T06:37:09.314837765,Kimberly Jackson,SCR-c54656cdfecb,2025-07-29,Mary Jackson,In progress,Quote,,SCR-76fd0b40a1cb,2025-09-30,2026-09-30,N,QUOTATION_STATUS_QUOTED,CLIENT_SEGMENT_RISK_MGMT,RENEWAL_STATUS_IN_PROGRESS,3558700.0,65304.28,1881.62,67311.79,8,5760.41,100,498,PRODUCTION_TYPE_NEW,NaT,SCR-262eac00ad8f,N,Not Applicable,365,,C,0.0,0.477273
1,Apex Enterprises,SCR-cfcd65ae0f7a,SCR-6b2e08c3f11d,General Liability,Casualty,Liberty Insurance Group,2025-04-25T15:21:40.575782415,Matthew Johnson,SCR-d47a7dd7c8ff,2025-07-29,Donald Martin,In progress,Quote,,SCR-07c81694713d,2025-09-15,2026-09-15,Y,QUOTATION_STATUS_QUOTED,CLIENT_SEGMENT_MIDDLE_MKT,RENEWAL_STATUS_IN_PROGRESS,82254400.0,1441557.71,55830.17,1581997.56,15,282637.79,100,815,PRODUCTION_TYPE_RENEWAL,NaT,SCR-af66d3bd1f7b,Y,Not Applicable,365,,C,0.48,0.484444
2,Omega Global,SCR-51dd14cf0f45,SCR-6d2b42de057e,General Liability,Casualty,United Coverage Corp,2025-03-13T18:03:34.574937777,Robert Young,SCR-0b51d24e469e,NaT,Michelle Anderson,In progress,Submitted,,SCR-b422393095aa,2025-09-15,2026-09-15,Y,QUOTATION_STATUS_SUBMITTED,CLIENT_SEGMENT_MIDDLE_MKT,RENEWAL_STATUS_IN_PROGRESS,225000.0,3279.01,0.0,3256.21,17,645.63,100,3093,PRODUCTION_TYPE_RENEWAL,2025-07-29,SCR-af66d3bd1f7b,N,Not Applicable,365,,C,0.214286,0.478673
3,Omega Global,SCR-51dd14cf0f45,SCR-6d2b42de057e,General Liability,Casualty,United Coverage Corp,2025-03-13T18:03:34.574937777,Robert Young,SCR-0b51d24e469e,NaT,Michelle Anderson,In progress,Submitted,,SCR-b422393095aa,2025-09-15,2026-09-15,Y,QUOTATION_STATUS_SUBMITTED,CLIENT_SEGMENT_MIDDLE_MKT,RENEWAL_STATUS_IN_PROGRESS,225000.0,0.0,0.0,0.0,17,0.0,100,3093,PRODUCTION_TYPE_RENEWAL,2025-07-29,SCR-af66d3bd1f7b,N,Not Applicable,365,,C,0.214286,0.478673
4,Omega Global,SCR-51dd14cf0f45,SCR-6d2b42de057e,General Liability,Casualty,Northern Trust Insurance,2025-03-13T18:03:34.574937777,Robert Young,SCR-0b51d24e469e,NaT,Michelle Anderson,In progress,Submitted,,SCR-b422393095aa,2025-09-15,2026-09-15,N,QUOTATION_STATUS_SUBMITTED,CLIENT_SEGMENT_MIDDLE_MKT,RENEWAL_STATUS_IN_PROGRESS,225000.0,0.0,0.0,0.0,17,0.0,100,4584,PRODUCTION_TYPE_RENEWAL,2025-07-30,SCR-af66d3bd1f7b,N,Not Applicable,365,,C,0.214286,0.459016


## Preparing Data for Training 

In [15]:
data.head(2)

Unnamed: 0,Client,PlacementClientLocalID,PlacementName,Coverage,ProductLine,CarrierGroup,PlacementCreatedDate/Time,PlacementCreatedBy,PlacementCreatedBy_ID,ResponseReceivedDate,PlacementSpecialist,PlacementRenewingStatus,PlacementStatus,DeclinationReason,PlacementId,PlacementEffectiveDate,PlacementExpiryDate,IncumbentIndicator,ParticipationStatusCode,PlacementClientSegmentCode,PlacementRenewingStatusCode,Limit,CoveragePremiumAmount,TriaPremium,TotalPremium,Comission%,ComissionAmount,ParticipationPercentage,CarrierGroupLocalID,ProductionCode,SubmissionSentDate,ProgramProductLocalCodeText,ApproachNonAdmittedMarketIndicator,CarrierIntegration,_DaysToExpiry,_CarrierResponseTime,_ChurnStatus,_FracPlacementsRenewedByClient,_FracPlacementsRenewedByCarrier
0,Global Technologies,SCR-0b810b6f4c20,SCR-8d9f15ee3a3c,General Liability,Energy and Power,Eastern Risk Management,2025-04-24T06:37:09.314837765,Kimberly Jackson,SCR-c54656cdfecb,2025-07-29,Mary Jackson,In progress,Quote,,SCR-76fd0b40a1cb,2025-09-30,2026-09-30,N,QUOTATION_STATUS_QUOTED,CLIENT_SEGMENT_RISK_MGMT,RENEWAL_STATUS_IN_PROGRESS,3558700.0,65304.28,1881.62,67311.79,8,5760.41,100,498,PRODUCTION_TYPE_NEW,NaT,SCR-262eac00ad8f,N,Not Applicable,365,,C,0.0,0.477273
1,Apex Enterprises,SCR-cfcd65ae0f7a,SCR-6b2e08c3f11d,General Liability,Casualty,Liberty Insurance Group,2025-04-25T15:21:40.575782415,Matthew Johnson,SCR-d47a7dd7c8ff,2025-07-29,Donald Martin,In progress,Quote,,SCR-07c81694713d,2025-09-15,2026-09-15,Y,QUOTATION_STATUS_QUOTED,CLIENT_SEGMENT_MIDDLE_MKT,RENEWAL_STATUS_IN_PROGRESS,82254400.0,1441557.71,55830.17,1581997.56,15,282637.79,100,815,PRODUCTION_TYPE_RENEWAL,NaT,SCR-af66d3bd1f7b,Y,Not Applicable,365,,C,0.48,0.484444


In [16]:
X = data[[
    # 'Client', 
    # 'PlacementClientLocalID', 
    # 'PlacementName', 
    'Coverage',
    'ProductLine', 
    #    'CarrierGroup', 
    # 'PlacementCreatedDate/Time',
    #    'PlacementCreatedBy',
        # 'PlacementCreatedBy_ID', 'ResponseReceivedDate',
    #    'PlacementSpecialist', 
    #    'PlacementRenewingStatus', 'PlacementStatus',
    #    'DeclinationReason', 'PlacementId', 'PlacementEffectiveDate',
    #    'PlacementExpiryDate', 
       'IncumbentIndicator', 
    #    'ParticipationStatusCode',
       'PlacementClientSegmentCode', 
    #    'PlacementRenewingStatusCode', 
       'Limit',
       'CoveragePremiumAmount', 'TriaPremium', 'TotalPremium', 'Comission%',
       'ComissionAmount', 
      #  'ParticipationPercentage', 
    #    'CarrierGroupLocalID',
    #    'ProductionCode', 'SubmissionSentDate', 'ProgramProductLocalCodeText',
       'ApproachNonAdmittedMarketIndicator', 
    #    'CarrierIntegration',
       '_DaysToExpiry', '_CarrierResponseTime',
       '_FracPlacementsRenewedByClient', '_FracPlacementsRenewedByCarrier']]
X.PlacementClientSegmentCode.replace(pd.NA, 'null', inplace=True)
X.IncumbentIndicator.replace(pd.NA, 'null', inplace=True)
X.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.PlacementClientSegmentCode.replace(pd.NA, 'null', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.IncumbentIndicator.replace(pd.NA, 'null', inplace=True)


Unnamed: 0,Coverage,ProductLine,IncumbentIndicator,PlacementClientSegmentCode,Limit,CoveragePremiumAmount,TriaPremium,TotalPremium,Comission%,ComissionAmount,ApproachNonAdmittedMarketIndicator,_DaysToExpiry,_CarrierResponseTime,_FracPlacementsRenewedByClient,_FracPlacementsRenewedByCarrier
0,General Liability,Energy and Power,N,CLIENT_SEGMENT_RISK_MGMT,3558700.0,65304.28,1881.62,67311.79,8,5760.41,N,365,,0.0,0.477273
1,General Liability,Casualty,Y,CLIENT_SEGMENT_MIDDLE_MKT,82254400.0,1441557.71,55830.17,1581997.56,15,282637.79,Y,365,,0.48,0.484444
2,General Liability,Casualty,Y,CLIENT_SEGMENT_MIDDLE_MKT,225000.0,3279.01,0.0,3256.21,17,645.63,N,365,,0.214286,0.478673
3,General Liability,Casualty,Y,CLIENT_SEGMENT_MIDDLE_MKT,225000.0,0.0,0.0,0.0,17,0.0,N,365,,0.214286,0.478673
4,General Liability,Casualty,N,CLIENT_SEGMENT_MIDDLE_MKT,225000.0,0.0,0.0,0.0,17,0.0,N,365,,0.214286,0.459016


In [17]:
X.columns


Index(['Coverage', 'ProductLine', 'IncumbentIndicator',
       'PlacementClientSegmentCode', 'Limit', 'CoveragePremiumAmount',
       'TriaPremium', 'TotalPremium', 'Comission%', 'ComissionAmount',
       'ApproachNonAdmittedMarketIndicator', '_DaysToExpiry',
       '_CarrierResponseTime', '_FracPlacementsRenewedByClient',
       '_FracPlacementsRenewedByCarrier'],
      dtype='object')

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

scaler = StandardScaler()
cat_encoder = OneHotEncoder()

numerical_features = ['Limit', 'CoveragePremiumAmount', 'TriaPremium', 'TotalPremium', 'Comission%', 'ComissionAmount', '_DaysToExpiry',
       '_CarrierResponseTime', '_FracPlacementsRenewedByClient',
       '_FracPlacementsRenewedByCarrier']
categorical_features = ['Coverage', 'ProductLine', 'IncumbentIndicator', 'PlacementClientSegmentCode', 'ApproachNonAdmittedMarketIndicator']

transformer = ColumnTransformer([
    ('numTf', scaler, numerical_features),
    ('catTf', cat_encoder, categorical_features)
])

In [19]:
XTf = transformer.fit_transform(X)

In [20]:
y = data[['_ChurnStatus']]
label_encoder = LabelEncoder()
yTf = label_encoder.fit_transform(y)

  y = column_or_1d(y, warn=True)


### Getting Test Train Split

In [21]:
from sklearn.model_selection import train_test_split

XTrain, XTest, yTrain, yTest = train_test_split(XTf, yTf, train_size=0.8, random_state=69)

### Oversampling 

In [22]:
from imblearn.over_sampling import RandomOverSampler
oversampler = RandomOverSampler()
XTrain, yTrain = oversampler.fit_resample(XTrain, yTrain)

## Training the Model

In [23]:
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(XTrain, yTrain)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


## Checking F1 Score

In [24]:
from sklearn.metrics import f1_score
yPred = model.predict(XTest)

f1_score(yTest, yPred)

0.9938650306748467

## Saving the Model

In [25]:
import joblib

joblib.dump(model, 'churn_model.pkl')
joblib.dump(transformer, 'data_transformer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
joblib.dump(list(X.columns), 'features.pkl')

['features.pkl']

## Getting Feature Weights

In [26]:
import os
os.chdir('../')

In [27]:
from prioritization import time_to_expiry_score, premium_at_risk_score, past_performance_score, churn_prob_score

In [28]:
sset = pd.DataFrame({
    'time_to_expiry_score' : time_to_expiry_score(data._DaysToExpiry),
    'premium_at_risk_score' : premium_at_risk_score(data.TotalPremium),
    'client_past_performance_score' : past_performance_score(data._FracPlacementsRenewedByClient),
    'carrier_past_performance_score' : past_performance_score(data._FracPlacementsRenewedByCarrier),
    'churn_prob_score': churn_prob_score(data)
})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.PlacementClientSegmentCode.replace(pd.NA, 'null', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.IncumbentIndicator.replace(pd.NA, 'null', inplace=True)


In [29]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

scaler = StandardScaler()
label_encoder = LabelEncoder()

In [30]:
XTf = scaler.fit_transform(sset)
yTf = label_encoder.fit_transform(data._ChurnStatus)

In [31]:
from sklearn.ensemble import RandomForestClassifier
weights_model = RandomForestClassifier(random_state=69)
weights_model.fit(XTf, yTf)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True
