In [1]:
import pandas as pd
import numpy as np


In [2]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
calls_df = pd.read_csv('../data/raw/support_calls.csv')

In [5]:
def label_churn_risk(text):
    if isinstance(text, str):
        text_lower = text.lower()
        if any(keyword in text_lower for keyword in ["cancel", "closing", "leave", "unhappy"]):
            return 1
    return 0

calls_df['churn_label'] = calls_df['call_transcript'].apply(label_churn_risk)

print("Distribution of churn_label:\n", calls_df['churn_label'].value_counts())

Distribution of churn_label:
 churn_label
0    16804
1     3196
Name: count, dtype: int64


In [6]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

texts = calls_df['call_transcript'].tolist()
labels = calls_df['churn_label'].tolist()

Downloading: 100%|██████████| 232k/232k [00:00<00:00, 622kB/s] 
Downloading: 100%|██████████| 48.0/48.0 [00:00<00:00, 85.1kB/s]
Downloading: 100%|██████████| 483/483 [00:00<00:00, 1.17MB/s]


In [7]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [8]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

import torch
class ChurnDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = ChurnDataset(train_encodings, train_labels)
val_dataset = ChurnDataset(val_encodings, val_labels)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=50
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

Downloading: 100%|██████████| 268M/268M [00:11<00:00, 22.8MB/s] 
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly init

In [9]:
trainer.train()

***** Running training *****
  Num examples = 16000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2000
  Number of trainable parameters = 66955010
  2%|▎         | 50/2000 [01:07<41:12,  1.27s/it] 

{'loss': 0.2669, 'learning_rate': 4.899497487437186e-05, 'epoch': 0.03}


  5%|▌         | 100/2000 [02:32<1:02:05,  1.96s/it]

{'loss': 0.0035, 'learning_rate': 4.7738693467336685e-05, 'epoch': 0.05}


  8%|▊         | 150/2000 [03:45<44:09,  1.43s/it]  

{'loss': 0.0009, 'learning_rate': 4.648241206030151e-05, 'epoch': 0.07}


 10%|█         | 200/2000 [04:59<55:30,  1.85s/it]  

{'loss': 0.0404, 'learning_rate': 4.522613065326633e-05, 'epoch': 0.1}


 12%|█▎        | 250/2000 [06:43<1:33:15,  3.20s/it]

{'loss': 0.0195, 'learning_rate': 4.396984924623116e-05, 'epoch': 0.12}


 15%|█▌        | 300/2000 [08:11<41:56,  1.48s/it]  

{'loss': 0.0517, 'learning_rate': 4.271356783919598e-05, 'epoch': 0.15}


 18%|█▊        | 350/2000 [09:26<47:34,  1.73s/it]

{'loss': 0.0673, 'learning_rate': 4.1457286432160806e-05, 'epoch': 0.17}


 20%|██        | 400/2000 [10:50<46:54,  1.76s/it]

{'loss': 0.1041, 'learning_rate': 4.020100502512563e-05, 'epoch': 0.2}


 22%|██▎       | 450/2000 [12:02<39:57,  1.55s/it]

{'loss': 0.0194, 'learning_rate': 3.8944723618090455e-05, 'epoch': 0.23}


 25%|██▌       | 500/2000 [13:37<38:54,  1.56s/it]  Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json


{'loss': 0.0795, 'learning_rate': 3.768844221105528e-05, 'epoch': 0.25}


Model weights saved in ./results/checkpoint-500/pytorch_model.bin
 28%|██▊       | 550/2000 [15:13<44:59,  1.86s/it]  

{'loss': 0.0669, 'learning_rate': 3.64321608040201e-05, 'epoch': 0.28}


 30%|███       | 600/2000 [16:41<33:28,  1.43s/it]  

{'loss': 0.0643, 'learning_rate': 3.517587939698493e-05, 'epoch': 0.3}


 32%|███▎      | 650/2000 [17:50<27:28,  1.22s/it]

{'loss': 0.018, 'learning_rate': 3.391959798994975e-05, 'epoch': 0.33}


 35%|███▌      | 700/2000 [18:56<30:51,  1.42s/it]

{'loss': 0.0496, 'learning_rate': 3.2663316582914576e-05, 'epoch': 0.35}


 38%|███▊      | 750/2000 [20:07<34:30,  1.66s/it]

{'loss': 0.0449, 'learning_rate': 3.14070351758794e-05, 'epoch': 0.38}


 40%|████      | 800/2000 [21:35<26:09,  1.31s/it]  

{'loss': 0.0036, 'learning_rate': 3.015075376884422e-05, 'epoch': 0.4}


 42%|████▎     | 850/2000 [22:53<26:57,  1.41s/it]

{'loss': 0.0007, 'learning_rate': 2.8894472361809045e-05, 'epoch': 0.42}


 45%|████▌     | 900/2000 [23:57<22:48,  1.24s/it]

{'loss': 0.0005, 'learning_rate': 2.763819095477387e-05, 'epoch': 0.45}


 48%|████▊     | 950/2000 [24:58<20:41,  1.18s/it]

{'loss': 0.0003, 'learning_rate': 2.6381909547738694e-05, 'epoch': 0.47}


 50%|█████     | 1000/2000 [25:59<19:45,  1.19s/it]Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json


{'loss': 0.0003, 'learning_rate': 2.5125628140703518e-05, 'epoch': 0.5}


Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
 52%|█████▎    | 1050/2000 [27:04<19:45,  1.25s/it]

{'loss': 0.0002, 'learning_rate': 2.3869346733668342e-05, 'epoch': 0.53}


 55%|█████▌    | 1100/2000 [28:07<18:57,  1.26s/it]

{'loss': 0.0002, 'learning_rate': 2.2613065326633167e-05, 'epoch': 0.55}


 57%|█████▊    | 1150/2000 [29:10<17:00,  1.20s/it]

{'loss': 0.0002, 'learning_rate': 2.135678391959799e-05, 'epoch': 0.57}


 60%|██████    | 1200/2000 [30:12<17:23,  1.30s/it]

{'loss': 0.0002, 'learning_rate': 2.0100502512562815e-05, 'epoch': 0.6}


 62%|██████▎   | 1250/2000 [31:14<15:32,  1.24s/it]

{'loss': 0.0001, 'learning_rate': 1.884422110552764e-05, 'epoch': 0.62}


 65%|██████▌   | 1300/2000 [32:15<14:27,  1.24s/it]

{'loss': 0.0001, 'learning_rate': 1.7587939698492464e-05, 'epoch': 0.65}


 68%|██████▊   | 1350/2000 [33:18<13:25,  1.24s/it]

{'loss': 0.0001, 'learning_rate': 1.6331658291457288e-05, 'epoch': 0.68}


 70%|███████   | 1400/2000 [34:19<11:45,  1.18s/it]

{'loss': 0.0001, 'learning_rate': 1.507537688442211e-05, 'epoch': 0.7}


 72%|███████▎  | 1450/2000 [35:21<10:57,  1.20s/it]

{'loss': 0.0001, 'learning_rate': 1.3819095477386935e-05, 'epoch': 0.72}


 75%|███████▌  | 1500/2000 [36:26<10:26,  1.25s/it]Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json


{'loss': 0.0001, 'learning_rate': 1.2562814070351759e-05, 'epoch': 0.75}


Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
 78%|███████▊  | 1550/2000 [37:48<19:43,  2.63s/it]

{'loss': 0.0001, 'learning_rate': 1.1306532663316583e-05, 'epoch': 0.78}


 80%|████████  | 1600/2000 [39:00<09:00,  1.35s/it]

{'loss': 0.0001, 'learning_rate': 1.0050251256281408e-05, 'epoch': 0.8}


 82%|████████▎ | 1650/2000 [40:17<13:38,  2.34s/it]

{'loss': 0.0001, 'learning_rate': 8.793969849246232e-06, 'epoch': 0.82}


 85%|████████▌ | 1700/2000 [41:29<06:45,  1.35s/it]

{'loss': 0.0001, 'learning_rate': 7.537688442211055e-06, 'epoch': 0.85}


 88%|████████▊ | 1750/2000 [42:31<05:09,  1.24s/it]

{'loss': 0.0001, 'learning_rate': 6.2814070351758795e-06, 'epoch': 0.88}


 90%|█████████ | 1800/2000 [43:32<04:09,  1.25s/it]

{'loss': 0.0001, 'learning_rate': 5.025125628140704e-06, 'epoch': 0.9}


 92%|█████████▎| 1850/2000 [44:35<03:00,  1.21s/it]

{'loss': 0.0001, 'learning_rate': 3.7688442211055276e-06, 'epoch': 0.93}


 95%|█████████▌| 1900/2000 [45:38<02:01,  1.21s/it]

{'loss': 0.0001, 'learning_rate': 2.512562814070352e-06, 'epoch': 0.95}


 98%|█████████▊| 1950/2000 [46:42<01:04,  1.30s/it]

{'loss': 0.0001, 'learning_rate': 1.256281407035176e-06, 'epoch': 0.97}


100%|██████████| 2000/2000 [47:46<00:00,  1.23s/it]Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json


{'loss': 0.0039, 'learning_rate': 0.0, 'epoch': 1.0}


Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 8
                                                   
100%|██████████| 2000/2000 [51:16<00:00,  1.23s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 2000/2000 [51:16<00:00,  1.54s/it]

{'eval_loss': 5.013004192733206e-05, 'eval_runtime': 208.6831, 'eval_samples_per_second': 19.168, 'eval_steps_per_second': 2.396, 'epoch': 1.0}
{'train_runtime': 3076.3433, 'train_samples_per_second': 5.201, 'train_steps_per_second': 0.65, 'train_loss': 0.022710809982614592, 'epoch': 1.0}





TrainOutput(global_step=2000, training_loss=0.022710809982614592, metrics={'train_runtime': 3076.3433, 'train_samples_per_second': 5.201, 'train_steps_per_second': 0.65, 'train_loss': 0.022710809982614592, 'epoch': 1.0})

In [10]:
val_preds = trainer.predict(val_dataset)
val_pred_labels = np.argmax(val_preds.predictions, axis=1)

***** Running Prediction *****
  Num examples = 4000
  Batch size = 8
100%|██████████| 500/500 [04:02<00:00,  2.07it/s]


In [11]:
print("\nClassification Report:")
print(classification_report(val_labels, val_pred_labels))

print("\nNLP-based Churn Prediction Complete.")


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3377
           1       1.00      1.00      1.00       623

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000


NLP-based Churn Prediction Complete.
