In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import numpy as np
import warnings
import joblib
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [3]:
file_path = '/content/gdrive/My Drive/telco.csv'
data = pd.read_csv(file_path)
data['Churn_encoded'] = data['Churn'].map({"Stayed": 0, "Churned": 1})
data['Churn'].value_counts()
1869/(1869+5174)

### Churn rate for telco data set is 26.5%

0.2653698707936959

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, precision_score, recall_score, precision_recall_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

category_cols = ['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']


def prep_data(category_cols, data):
  data_encoded = pd.get_dummies(data , columns = category_cols )

  data_encoded['TotalCharges'] = data_encoded['TotalCharges'].replace(' ', np.nan)
  data_encoded['TotalCharges'] = data_encoded['TotalCharges'].astype(float)
  drop_cols = ['Unnamed: 0', 'customerID']
  data_encoded.drop(drop_cols, axis=1, inplace=True)
  data_encoded = data_encoded.dropna()

  mms = MinMaxScaler()
  data_encoded[['tenure','MonthlyCharges','TotalCharges']] = mms.fit_transform(data_encoded[['tenure','MonthlyCharges','TotalCharges']])

  return data_encoded

In [None]:
def split_data(df):
  y = df['Churn_encoded'].values
  X = df.drop(columns=['Churn', 'Churn_encoded'], axis=1)

  X_train, X_temp, y_train, y_temp = train_test_split(
      X, y, test_size=0.3, random_state=42, stratify=y
  )

  X_val, X_test, y_val, y_test = train_test_split(
      X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
  )
  return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
def benchmark_classifiers(X_train, y_train, X_val, y_val):
    model_list = [RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs = -1),
                  LogisticRegression()]
    model_names = ["RandomForestClassifier", "LogisticRegression"]
    model_data = zip(model_list, model_names)

    acc_list = []
    model_trained = []
    recall_list = []
    precision_list = []
    roc_auc_scores = []
    pr_auc_scores = []

    for model, name in model_data:
        model.fit(X_train, y_train)
        # y_pred_class = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_val)[:, 1]
        model_trained.append(model)
        prec, rec, thresh = precision_recall_curve(y_val, y_pred_proba)
        balance_idx = np.argmin(np.abs(prec - rec))

        best_thresh = thresh[balance_idx]
        print(f"Threshold for balanced precision/recall: {best_thresh:.3f}")
        print(f"Precision: {prec[balance_idx]:.3f}, Recall: {rec[balance_idx]:.3f}")

        y_pred_class = (y_pred_proba >= best_thresh).astype(int)
        acc = accuracy_score(y_val, y_pred_class)

        recall = recall_score(y_val, y_pred_class)
        recall_list.append(recall)
        print(f"Recall: {recall:.4f}")
        roc_auc = roc_auc_score(y_val, y_pred_proba)
        roc_auc_scores.append(roc_auc)
        print(f"ROC AUC: {roc_auc:.4f}")

        precision_at_default = precision_score(y_val, y_pred_class)
        precision_list.append(precision_at_default)
        print(f"Precision (at default threshold): {precision_at_default:.4f}")

        precision_curve, recall_curve, thresholds = precision_recall_curve(y_val, y_pred_proba)
        pr_auc = auc(recall_curve, precision_curve)
        pr_auc_scores.append(pr_auc)
        print(f"PR AUC: {pr_auc:.4f}")

        acc_list.append(acc)

    acc_df = pd.DataFrame(columns = ['accuracy'])
    acc_df['accuracy'] = acc_list
    acc_df['classifier'] = model_names
    acc_df['recall'] = recall_list
    acc_df['precision'] = precision_list
    acc_df['roc_auc'] = roc_auc_scores
    acc_df['pr_auc'] = pr_auc_scores
    acc_df = acc_df[['classifier', 'accuracy', 'recall', 'precision', 'roc_auc', 'pr_auc']]

    return acc_df, model_trained



## Base Model

In [None]:
data2 = prep_data(category_cols, data)
X_train, X_val, X_test, y_train, y_val, y_test = split_data(data2)
y_test_b = y_test.copy()
pd.DataFrame(y_test_b).to_csv('/content/gdrive/MyDrive/y_test_b.csv', index=False)

In [None]:
acc_df, model_trained = benchmark_classifiers(X_train, y_train, X_val, y_val)
clf = model_trained[1]
joblib.dump(clf, "/content/gdrive/MyDrive/random_forest_model_v0.pkl")
clf = model_trained[0]
joblib.dump(clf, "/content/gdrive/MyDrive/logistic_model_v0.pkl")

acc_df['data'] = 'v0'
acc_df.to_csv('/content/gdrive/MyDrive/acc_df_v0.csv')

Threshold for balanced precision/recall: 0.387
Precision: 0.587, Recall: 0.587
Recall: 0.5872
ROC AUC: 0.8142
Precision (at default threshold): 0.5872
PR AUC: 0.5921
Threshold for balanced precision/recall: 0.451
Precision: 0.658, Recall: 0.658
Recall: 0.6584
ROC AUC: 0.8530
Precision (at default threshold): 0.6584
PR AUC: 0.6547


## GENERATE LLM DATA

In [None]:
# import os
# import pandas as pd
# import random
# from openai import OpenAI
# # os.environ["OPENAI_API_KEY"]

# client = OpenAI(os.environ["OPENAI_API_KEY"])

# # Few example churn-related contexts to guide LLM
# prompt_template = """
# You are generating synthetic customer feedback for a telco churn dataset.

# Customer details:
# - Gender: {gender}
# - Senior Citizen: {SeniorCitizen}
# - Partner: {partner}
# - Tenure: {tenure} months
# - Contract type: {contract}
# - Monthly charges: {charges}
# - Churned: {churn}

# Task:
# Generate a short customer feedback statement (1-3 sentences) that reflects
# their likelihood of churn. Make it realistic and vary tone across examples.
# """

# def generate_feedback(row):
#     prompt = prompt_template.format(
#         gender=row["gender"],
#         SeniorCitizen=row["SeniorCitizen"],
#         partner=row["Partner"],
#         tenure=row["tenure"],
#         contract=row["Contract"],
#         charges=row["MonthlyCharges"],
#         churn=row["Churn"]
#     )

#     # Call the LLM
#     response = client.chat.completions.create(
#         model="gpt-4o-mini",   # cost-efficient, or use gpt-4o
#         messages=[{"role": "user", "content": prompt}],
#         max_tokens=60,
#         temperature=0.8,
#     )

#     return response.choices[0].message.content.strip()

# # # Generate feedback for a small subset (to test)
# df_sample = data.sample(3, random_state=42)
# df_sample["customer_feedback"] = df_sample.apply(generate_feedback, axis=1)


In [None]:
### should have chunked this....
### How long did this take????
### how much $?

In [None]:
# df = []
# chunk = 500

# for i in range(0, len(data2), chunk):
#   current_chunk = i
#   remainder = len(data2) - i

#   if remainder < 0:
#     break
#   elif remainder < chunk:
#     data_chunk = data2.iloc[i:i+remainder]
#   else:
#     data_chunk = data2.iloc[i:i+chunk]

#   data_chunk["customer_feedback"] = data_chunk.apply(generate_feedback, axis=1)

#   # file_path = f"/content/drive/MyDrive/Colab Notebooks/Outputs/telco_feedback_{chunk}.csv"
#   # data_chunk.to_csv(file_path, index=False)

#   df.append(data_chunk)

# data_feedback = pd.concat(df, axis=0)
# print(data_feedback.shape)
# data_feedback.to_csv("/content/drive/MyDrive/telco_feedback_7500.csv", index=False)

# Incorporating feedback features

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')
# data_feedback1 = pd.read_csv("/content/gdrive/MyDrive/telco_feedback_4000.csv")
# data_feedback2 = pd.read_csv("/content/gdrive/MyDrive/telco_feedback_7500.csv")
# data_feedback1['Churn_encoded'] = data_feedback1['Churn'].map({"Stayed": 0, "Churned": 1})
# data_feedback2['Churn_encoded'] = data_feedback2['Churn'].map({"Stayed": 0, "Churned": 1})
# all_data = pd.concat([data_feedback1, data_feedback2], axis=0)
# all_data.to_csv("/content/gdrive/MyDrive/telco_feedback_all.csv", index=False)

## 1. Batch / Bulk Inference

Instead of running one input at a time, process multiple feedback items in batches.

Transformers libraries like Hugging Face support DataLoader + GPU batching, which can increase throughput 5–10x.

If you’re CPU-bound, even batching 10–50 texts per forward pass reduces overhead.

In [None]:
all_data = pd.read_csv("/content/gdrive/MyDrive/telco_feedback_all.csv")
customer_feedback = list(all_data['customer_feedback'].values)
len(customer_feedback)

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
print(torch.cuda.is_available())  # should be True

# Sentiment model
sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model_name, device=0)  # device=0 for GPU

# Zero-shot classification
zero_shot_model_name = "facebook/bart-large-mnli"
zero_shot_pipeline = pipeline("zero-shot-classification", model=zero_shot_model_name, device=0)


True


Device set to use cuda:0


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
def batchify(lst, batch_size):
    for i in range(0, len(lst), batch_size):
        yield lst[i:i + batch_size]

batch_size = 40  # adjust depending on GPU/CPU memory

In [None]:
import torch
import time
start_time = time.perf_counter()

all_sentiments = []

with torch.no_grad():  # disable gradient calculations
    for batch in batchify(customer_feedback, batch_size):
        results = sentiment_pipeline(batch)
        for i in range(len(results)):
            label = results[i]['label']
            all_sentiments.append(label)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time/60:.4f} minutes")

Elapsed time: 1.0987 minutes


In [None]:
### Nice to have but no time

# import torch
# import time

# start_time = time.perf_counter()
# candidate_labels = ["billing", "customer service", "product quality", "technical issues", "other"]

# all_classifications = []

# with torch.no_grad():  # disable gradient calculations
#     for batch in batchify(customer_feedback, batch_size):
#       results = zero_shot_pipeline(batch, candidate_labels)
#       category = results[0]['labels']
#       all_classifications.extend(category)

# ### Cache results if feedback don't change often
# # import pickle
# # with open("sentiment_results.pkl", "wb") as f:
# #     pickle.dump(all_sentiments, f)

# end_time = time.perf_counter()
# elapsed_time = end_time - start_time
# print(f"Elapsed time: {elapsed_time/60:.4f} minutes")

In [None]:
category_cols.append('sentiment')

In [None]:
all_data['sentiment'] = all_sentiments
data_senti = prep_data(category_cols, all_data)
data_senti.drop('customer_feedback', axis=1, inplace=True)
X_train, X_val, X_test, y_train, y_val, y_test = split_data(data_senti)
pd.DataFrame(X_test).to_csv('/content/gdrive/MyDrive/X_test_v1.csv', index=False)

In [None]:
X_test.to_csv('X_test_v1.csv', index=False)

In [None]:
acc_df, model_trained = benchmark_classifiers(X_train, y_train, X_val, y_val)
clf = model_trained[0]
joblib.dump(clf, "/content/gdrive/MyDrive/logistic_model_v1.pkl")
clf = model_trained[1]
joblib.dump(clf, "/content/gdrive/MyDrive/random_forest_model_v1.pkl")

acc_df['data'] = 'v1'
acc_df.to_csv('/content/gdrive/MyDrive/acc_df_v1.csv')

Threshold for balanced precision/recall: 0.405
Precision: 0.826, Recall: 0.826
Recall: 0.8256
ROC AUC: 0.9608
Precision (at default threshold): 0.8256
PR AUC: 0.8965
Threshold for balanced precision/recall: 0.728
Precision: 0.875, Recall: 0.875
Recall: 0.8754
ROC AUC: 0.9767
Precision (at default threshold): 0.8754
PR AUC: 0.9183


## 2. Semi-Supervised / Distillation Approach

Since your zero-shot models already generate labels:

Sample a representative subset of your customer data.

Run your zero-shot models once to label them.

Train a smaller student model on this pseudo-labeled dataset:

DistilBERT, TinyBERT, or even a non-transformer classifier.

Deploy the student model for the full dataset.

Benefit: You retain the “LLM intelligence” but cut inference cost drastically.

### How many samples are enough for fine-tuning a student model?

### Option A: DistilBERT / TinyBERT Fine-Tuning

In [None]:
import os
from transformers import DataCollatorWithPadding
from datasets import Dataset
from transformers import AutoTokenizer, DistilBertForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
os.environ["WANDB_DISABLED"] = "true"

texts = customer_feedback
labels = all_sentiments

dataset = Dataset.from_dict({'text': texts, 'label': labels})
# id2label = {0: "NEGATIVE", 1: "POSITIVE"}

dataset = dataset.rename_column("label", "old_labels")
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

def encode_labels(example):
    example["label"] = label2id[example["old_labels"]]
    return example

dataset = dataset.map(encode_labels)
print(dataset[0]["label"])  # 0 or 1 (int)

Map:   0%|          | 0/7043 [00:00<?, ? examples/s]

1


In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True)

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

start_time = time.perf_counter()

training_args = TrainingArguments(output_dir="./test_trainer", per_device_train_batch_size=8, report_to="none")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time/60:.4f} minutes")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/7043 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,0.2021
1000,0.1593
1500,0.1218
2000,0.0953
2500,0.0587


Elapsed time: 4.1490 minutes


In [None]:
texts = all_data.iloc[4000:]['customer_feedback'].values
print(len(texts))
dataset_test = Dataset.from_dict({'text': texts})

3043


### This is to generate sentiment column using the student model. Then use result (predicted sentiment) as a feature in Binary Classification Model


In [None]:
start_time = time.perf_counter()

encoded_dataset = dataset_test.map(tokenize, batched=True)
encoded_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

predictions = trainer.predict(encoded_dataset)
logits = predictions.predictions
predicted_classes = np.argmax(logits, axis=1)
print(predicted_classes[:10])

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time/60:.4f} minutes")

Map:   0%|          | 0/3043 [00:00<?, ? examples/s]

[0 1 1 1 1 0 1 1 1 1]
Elapsed time: 0.1396 minutes


In [None]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
predicted_labels = [id2label[i] for i in predicted_classes]
predicted_labels[:5]

['NEGATIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE', 'POSITIVE']

In [None]:
# Chain first 4000 labels and next 4000 (predicted) labels for sentiment column on the dataset
cut_off = 4000
sentiment_list = all_sentiments[:cut_off] + predicted_labels
all_data['sentiment'] = sentiment_list
data_senti2 = prep_data(category_cols, all_data)
data_senti2.drop('customer_feedback', axis=1, inplace=True)

In [None]:
X_train, X_val, X_test, y_train, y_val, y_test = split_data(data_senti2)
pd.DataFrame(X_test).to_csv('/content/gdrive/MyDrive/X_test_v2.csv', index=False)

In [None]:
X_test.to_csv('X_test_v2.csv', index=False)

In [None]:
acc_df, model_trained = benchmark_classifiers(X_train, y_train, X_val, y_val)
clf = model_trained[0]
joblib.dump(clf, "/content/gdrive/MyDrive/logistic_model_v2.pkl")
clf = model_trained[1]
joblib.dump(clf, "/content/gdrive/MyDrive/random_forest_model_v2.pkl")
acc_df['data'] = 'v2'
acc_df.to_csv('/content/gdrive/MyDrive/acc_df_v2.csv')

Threshold for balanced precision/recall: 0.378
Precision: 0.811, Recall: 0.811
Recall: 0.8114
ROC AUC: 0.9469
Precision (at default threshold): 0.8114
PR AUC: 0.8636
Threshold for balanced precision/recall: 0.734
Precision: 0.875, Recall: 0.875
Recall: 0.8754
ROC AUC: 0.9764
Precision (at default threshold): 0.8754
PR AUC: 0.9171


### Option B: Non-Transformer Lightweight Model (DID NOT DO BUT CAN WRITE ABOUT IT)


In [None]:
# from sentence_transformers import SentenceTransformer
# from sklearn.linear_model import LogisticRegression

# # Generate pseudo labels
# pseudo_labels = []a
# for feedback, sentiment in zip(customer_feedback, all_sentiments):
#     pseudo_labels.append({'text': feedback, 'sentiment': sentiment})
# pseudo_labels

In [None]:
# # Embeddings
# embedder = SentenceTransformer('all-MiniLM-L6-v2')
# X = embedder.encode([x['text'] for x in pseudo_labels], show_progress_bar=True)
# y = [0 if x['sentiment']=='NEGATIVE' else 1 for x in pseudo_labels]

# # Train classifier
# clf = LogisticRegression(max_iter=1000)
# clf.fit(X, y)

### create sentiment col for the next 4000 rows
# clf.predict( ... )

## 3. Embedding + Lightweight Classifier Pipeline

You can combine embeddings with small classifiers:

Generate sentence embeddings (e.g., all-MiniLM-L6-v2) for all feedback.

Cluster / reduce dimensions (UMAP, PCA) if needed.

Train simple classifiers (Logistic Regression, XGBoost, LightGBM) on these embeddings to predict sentiment / category.

Why it’s cheaper:

One forward pass through a small embedding model is faster than repeated zero-shot LLM calls.

Embeddings are reusable; you can compute them once and store them.

Classifier inference is near-instant on CPU.

In [None]:
from sentence_transformers import SentenceTransformer
import umap

all_data = pd.read_csv("/content/gdrive/MyDrive/telco_feedback_all.csv")
model = SentenceTransformer('all-MiniLM-L6-v2')
all_data["embedding"] = all_data["customer_feedback"].apply(lambda x: model.encode(x).tolist())
embedding_series = all_data['embedding']
embeddings_array = np.stack(embedding_series.values)
reducer = umap.UMAP(n_components=5, random_state=42)
reduced_embeddings = reducer.fit_transform(embeddings_array)
reduced_df = pd.DataFrame(reduced_embeddings, columns=[f'umap_{i}' for i in range(reduced_embeddings.shape[1])])

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  warn(


In [None]:
all_data = all_data.reset_index(drop=True)
print(all_data.index.is_unique)     # should be True
print(reduced_df.index.is_unique)   # should be True
all_data_1 = pd.concat([all_data, reduced_df], axis=1)

True
True


Attempt to get a representative text from each cluster using HBSCAN not successful?

In [None]:
category_cols.remove('sentiment')
all_data_1_2 = prep_data(category_cols, all_data_1)
all_data_1_2.drop(['customer_feedback', 'embedding'], axis=1, inplace=True)
X_train, X_val, X_test, y_train, y_val, y_test = split_data(all_data_1_2)

In [None]:
acc_df, model_trained = benchmark_classifiers(X_train, y_train, X_val, y_val)
clf = model_trained[0]
joblib.dump(clf, "/content/gdrive/MyDrive/logistic_model_v3.pkl")
clf = model_trained[1]
joblib.dump(clf, "/content/gdrive/MyDrive/random_forest_model_v3.pkl")
acc_df['data'] = 'v3'
acc_df.to_csv('/content/gdrive/MyDrive/acc_df_v3.csv')

Threshold for balanced precision/recall: 0.362
Precision: 0.630, Recall: 0.630
Recall: 0.6299
ROC AUC: 0.8501
Precision (at default threshold): 0.6299
PR AUC: 0.6646
Threshold for balanced precision/recall: 0.430
Precision: 0.779, Recall: 0.779
Recall: 0.7794
ROC AUC: 0.9481
Precision (at default threshold): 0.7794
PR AUC: 0.8850


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Sanity check Feedback generated by LLM

In [8]:
# read in all acc_df
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [1]:
# Option B — Use Embedding Similarity

# Compute embedding similarity between generated text and archetypal “churn” or “loyalty” texts.

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

# Reference archetypes
churn_text = "I am unhappy with the service and plan to cancel."
stay_text = "I am satisfied and plan to continue my subscription."

churn_emb = model.encode(churn_text, convert_to_tensor=True)
stay_emb = model.encode(stay_text, convert_to_tensor=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
import pandas as pd

df = pd.read_csv("/content/gdrive/MyDrive/telco_feedback_all.csv")
df['Churn_encoded'] = df['Churn'].map({"Stayed": 0, "Churned": 1})
df["embedding"] = df["customer_feedback"].apply(lambda x: model.encode(x, convert_to_tensor=True))
df["sim_churn"] = df["embedding"].apply(lambda x: float(util.cos_sim(x, churn_emb)))
df["sim_stay"] = df["embedding"].apply(lambda x: float(util.cos_sim(x, stay_emb)))

df["churn_alignment"] = df["sim_churn"] - df["sim_stay"]

# Evaluate correlation
alignment_corr = df[["churn_alignment", "Churn_encoded"]].corr().iloc[0,1]
print(f"Alignment correlation: {alignment_corr:.3f}")

# Alignment correlation: 0.632


Alignment correlation: 0.632


In [13]:
# Option A — Use Sentiment Polarity

# If the LLM-generated text is supposed to reflect emotions (complaints, praise, etc.):

from transformers import pipeline
import pandas as pd


df = pd.read_csv("/content/gdrive/MyDrive/telco_feedback_all.csv")
df['Churn_encoded'] = df['Churn'].map({"Stayed": 0, "Churned": 1})

# Example: Sentiment pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

df["sentiment"] = df["customer_feedback"].apply(lambda x: sentiment_analyzer(x)[0]["label"])
df["sentiment_score"] = df["customer_feedback"].apply(lambda x: sentiment_analyzer(x)[0]["score"])

# Convert to numeric
df["sentiment_num"] = df["sentiment"].map({"POSITIVE": 0, "NEGATIVE": 1})

# Compare with churn
correlation = df[["sentiment_num", "Churn_encoded"]].corr().iloc[0,1]
print(f"Correlation between sentiment and churn: {correlation:.3f}")

# Correlation between sentiment and churn: 0.877


Device set to use cuda:0


Correlation between sentiment and churn: 0.877


## PICK One or Two Models to get business impact calculations


In [None]:
# read in all acc_df
from google.colab import drive
drive.mount('/content/gdrive')

In [11]:
import pandas as pd
import numpy as np
import warnings

acc_df_v0 = pd.read_csv("/content/gdrive/MyDrive/acc_df_v0.csv")
acc_df_v1 = pd.read_csv("/content/gdrive/MyDrive/acc_df_v1.csv")
acc_df_v2 = pd.read_csv("/content/gdrive/MyDrive/acc_df_v2.csv")
acc_df_v3 = pd.read_csv("/content/gdrive/MyDrive/acc_df_v3.csv")

acc_df = pd.concat([acc_df_v0, acc_df_v1, acc_df_v2, acc_df_v3], axis=0)
acc_df.drop('Unnamed: 0', axis=1, inplace=True)
acc_df.sort_values('roc_auc', ascending=False, inplace=True)
acc_df

Unnamed: 0,classifier,accuracy,recall,precision,roc_auc,pr_auc,data
1,LogisticRegression,0.933649,0.875445,0.875445,0.976675,0.918284,v1
1,LogisticRegression,0.933649,0.875445,0.875445,0.976418,0.917146,v2
0,RandomForestClassifier,0.907109,0.825623,0.825623,0.960847,0.896512,v1
1,LogisticRegression,0.882464,0.779359,0.779359,0.948077,0.885013,v3
0,RandomForestClassifier,0.899526,0.811388,0.811388,0.946872,0.863621,v2
1,LogisticRegression,0.818009,0.658363,0.658363,0.852959,0.654731,v0
0,RandomForestClassifier,0.802844,0.629893,0.629893,0.850115,0.664573,v3
0,RandomForestClassifier,0.780095,0.587189,0.587189,0.814239,0.592146,v0


In [12]:
import joblib

X_test_v1 = pd.read_csv('/content/gdrive/MyDrive/X_test_v1.csv')
X_test_v2 = pd.read_csv('/content/gdrive/MyDrive/X_test_v2.csv')
log_model_v1 = joblib.load("/content/gdrive/MyDrive/logistic_model_v1.pkl")
log_model_v2 = joblib.load("/content/gdrive/MyDrive/logistic_model_v2.pkl")

In [13]:
y_scores_v1 = log_model_v1.predict_proba(X_test_v1)[:, 1]
y_scores_v2 = log_model_v2.predict_proba(X_test_v2)[:, 1]

## Business Impact Calculation

In [14]:
file_path = '/content/gdrive/My Drive/telco.csv'
data = pd.read_csv(file_path)
data['Churn_encoded'] = data['Churn'].map({"Stayed": 0, "Churned": 1})

In [15]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
churn_rate = data['Churn_encoded'].mean()
customer_ltv = data['TotalCharges'].mean()
retention_cost = data['MonthlyCharges'].mean()*0.2

In [16]:
print(f'churn rate : {churn_rate}')
print(f'customer_ltv : {customer_ltv}')
print(f'retention_cost : {retention_cost}')

churn rate : 0.2653698707936959
customer_ltv : 2283.3004408418656
retention_cost : 12.952338492119836


In [17]:
y_test_b = pd.read_csv('/content/gdrive/MyDrive/y_test_b.csv')['0'].values
total_customers = len(y_test_b)

In [29]:
from sklearn.metrics import roc_curve
import pandas as pd
import numpy as np

pd.set_option('display.float_format', '{:,.2f}'.format)

def optimal_threshold_for_roi(y_true, y_scores, total_customers, churn_rate, customer_ltv, retention_cost, model_version):
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    best_result = None

    for thresh, recall, fp_rate in zip(thresholds, tpr, fpr):
        actual_churners = int(total_customers * churn_rate)
        actual_non_churners = total_customers - actual_churners

        tp = int(recall * actual_churners)
        fn = actual_churners - tp
        fp = int(fp_rate * actual_non_churners)
        flagged_customers = tp + fp

        precision = tp / (tp + fp) if flagged_customers > 0 else 0

        campaign_cost = flagged_customers * retention_cost
        revenue_protected = tp * customer_ltv
        revenue_lost = fn * customer_ltv
        net_savings = revenue_protected - campaign_cost

        best_savings = -np.inf
        best_thresh = None
        best_stats = None

        if net_savings > best_savings:
            best_savings = net_savings
            best_thresh = thresh
            best_stats  = {
                "model_version": model_version,
                "threshold": thresh,
                "precision": precision,
                "recall": recall,
                # "true_positives": tp,
                # "false_positives": fp,
                # "false_negatives": fn,
                "flagged_customers": flagged_customers,
                "campaign_cost": campaign_cost,
                "revenue_protected": revenue_protected,
                "revenue_lost": revenue_lost,
                "net_savings": net_savings,
            }

    return pd.DataFrame(best_stats, index=[0])


In [30]:
from sklearn.metrics import precision_recall_curve

def optimal_threshold_for_roi_pr(y_true, y_scores, total_customers, churn_rate, customer_ltv, retention_cost,
                              model_version, min_precision=0.8):
    """
    Find the threshold that maximizes net savings using precision-recall curve.

    Args:
        y_true: ground truth labels (0/1)
        y_scores: predicted probabilities for positive class (churn=1)
        total_customers: total number of customers
        churn_rate: fraction of customers that churn
        customer_ltv: revenue per churned customer
        retention_cost: cost to attempt retention per flagged customer
        min_precision: optional minimum precision constraint to reduce false positives

    Returns:
        dict: best threshold, corresponding net savings, and stats
    """
    prec, rec, thresholds = precision_recall_curve(y_true, y_scores)

    # precision_recall_curve returns thresholds of length n-1, append 1 for alignment
    thresholds = np.append(thresholds, 1.0)

    best_savings = -np.inf
    best_thresh = None
    best_stats = None

    for p, r, t in zip(prec, rec, thresholds):
        if p < min_precision:
            continue  # skip thresholds that do not satisfy precision constraint

        # Use your ROI function
        actual_churners = int(total_customers * churn_rate)
        actual_non_churners = total_customers - actual_churners

        true_positives = int(r * actual_churners)
        false_negatives = actual_churners - true_positives
        false_positives = int(true_positives * (1 / p - 1))
        flagged_customers = true_positives + false_positives

        campaign_cost = flagged_customers * retention_cost
        revenue_protected = true_positives * customer_ltv
        revenue_lost = false_negatives * customer_ltv
        net_savings = revenue_protected - campaign_cost

        if net_savings > best_savings:
            best_savings = net_savings
            best_thresh = t
            best_stats = {
                "model_version": model_version,
                "threshold": t,
                "precision": p,
                "recall": r,
                # "true_positives": true_positives,
                # "false_positives": false_positives,
                # "false_negatives": false_negatives,
                "flagged_customers": flagged_customers,
                "campaign_cost": campaign_cost,
                "revenue_protected": revenue_protected,
                "revenue_lost": revenue_lost,
                "net_savings": net_savings,

            }
        best_result_df = pd.DataFrame(best_stats, index=[0])

    return best_result_df


In [31]:
model_version = 'Logistic V1'
result_v1_roc = optimal_threshold_for_roi(y_test_b, y_scores_v1, total_customers,
                                            churn_rate, customer_ltv, retention_cost,
                                            model_version)
result_v1_roc['evaluation'] = 'roc'

In [32]:
result_v1_pr = optimal_threshold_for_roi_pr(y_test_b, y_scores_v1, total_customers,
                                            churn_rate, customer_ltv, retention_cost,
                                            model_version, min_precision=0.8)
result_v1_pr['evaluation'] = 'prc'
results = pd.concat([result_v1_roc, result_v1_pr], axis=0)
results

Unnamed: 0,model_version,threshold,precision,recall,flagged_customers,campaign_cost,revenue_protected,revenue_lost,net_savings,evaluation
0,Logistic V1,0.03,0.26,1.0,1055,13664.72,637040.82,0.0,623376.11,roc
0,Logistic V1,0.4,0.8,0.82,284,3678.46,522875.8,114165.02,519197.34,prc


In [33]:
model_version = 'Logistic V2'
result_v2_roc = optimal_threshold_for_roi(y_test_b, y_scores_v2, total_customers,
                                            churn_rate, customer_ltv, retention_cost,
                                            model_version)
result_v2_roc['evaluation'] = 'roc'
result_v2_pr = optimal_threshold_for_roi_pr(y_test_b, y_scores_v2, total_customers,
                                            churn_rate, customer_ltv, retention_cost,
                                            model_version, min_precision=0.8)
result_v2_pr['evaluation'] = 'prc'
results = pd.concat([result_v2_roc, result_v2_pr], axis=0)
results

Unnamed: 0,model_version,threshold,precision,recall,flagged_customers,campaign_cost,revenue_protected,revenue_lost,net_savings,evaluation
0,Logistic V2,0.03,0.26,1.0,1055,13664.72,637040.82,0.0,623376.11,roc
0,Logistic V2,0.37,0.8,0.82,284,3678.46,522875.8,114165.02,519197.34,prc


In [11]:
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output

# --- helper for ROI computation ---
def churn_roi_summary(customer_ltv, retention_cost, churn_rate, total_customers,
                      precision, recall):
    actual_churners = int(total_customers * churn_rate)
    actual_nonchurners = total_customers - actual_churners
    tp = int(recall * actual_churners)
    fn = actual_churners - tp
    fp = int(tp * (1 / precision - 1))
    flagged = tp + fp

    campaign_cost = flagged * retention_cost
    revenue_protected = tp * customer_ltv
    revenue_lost = fn * customer_ltv
    net_savings = revenue_protected - campaign_cost

    return {
        "Flagged Customers": flagged,
        "Revenue Protected ($)": revenue_protected,
        "Campaign Cost ($)": campaign_cost,
        "Revenue Lost ($)": revenue_lost,
        "Net Savings ($)": net_savings,
        "Precision": round(precision,3),
        "Recall": round(recall,3)
    }

# --- interactive UI ---

# wide_layout = widgets.Layout(width='400px')  # controls input box width
label_style = {'description_width': 'initial'}  # allow long labels to wrap or fully display

ltv_in = widgets.FloatText(
    value=500,
    description="Customer Lifetime Value ($)",
    # layout=wide_layout,
    style=label_style
)

cost_in = widgets.FloatText(
    value=20,
    description="Retention Cost ($)",
    # layout=wide_layout,
    style=label_style
)

churn_in = widgets.FloatSlider(
    value=0.25, min=0.01, max=0.6, step=0.01,
    description="Churn Rate",
    # layout=wide_layout,
    style=label_style,
    readout_format=".2f"
)

cust_in = widgets.IntText(
    value=10000,
    description="Total Customers",
    # layout=wide_layout,
    style=label_style
)

prec_in = widgets.FloatSlider(
    value=0.9, min=0.1, max=1.0, step=0.01,
    description="Model Precision",
    # layout=wide_layout,
    style=label_style,
    readout_format=".2f"
)

rec_in = widgets.FloatSlider(
    value=0.6, min=0.1, max=1.0, step=0.01,
    description="Model Recall",
    # layout=wide_layout,
    style=label_style,
    readout_format=".2f"
)

out = widgets.Output()

def update_table(_=None):
    with out:
        clear_output()
        res = churn_roi_summary(
            ltv_in.value, cost_in.value, churn_in.value, cust_in.value,
            prec_in.value, rec_in.value
        )
        df = pd.DataFrame([res])
        display(df.style.format({
            "Revenue Protected ($)": "{:,.0f}",
            "Campaign Cost ($)": "{:,.0f}",
            "Revenue Lost ($)": "{:,.0f}",
            "Net Savings ($)": "{:,.0f}"
        }).background_gradient(subset=["Net Savings ($)"], cmap="Greens"))

for w in [ltv_in, cost_in, churn_in, cust_in, prec_in, rec_in]:
    w.observe(update_table, names="value")

# with out:
#   display(widgets.VBox([ltv_in, cost_in, churn_in, cust_in, prec_in, rec_in, out]))
# display(out)

display(widgets.VBox([ltv_in, cost_in, churn_in, cust_in, prec_in, rec_in, out]))
update_table()




VBox(children=(FloatText(value=500.0, description='Customer Lifetime Value ($)', style=DescriptionStyle(descri…

In [8]:
from ipywidgets import Widget
Widget.close_all()

In [10]:
!jupyter nbconvert --to html --ClearOutputPreprocessor.enabled=True gdrive/MyDrive/Colab\ Notebooks/Churn_GENAI.ipynb


[NbConvertApp] Converting notebook gdrive/MyDrive/Colab Notebooks/Churn_GENAI.ipynb to html
[NbConvertApp] Writing 723383 bytes to gdrive/MyDrive/Colab Notebooks/Churn_GENAI.html


In [1]:
# !pip install --upgrade ipywidgets
# !jupyter nbextension enable --py widgetsnbextension


In [39]:
print(f'churn rate : {churn_rate}')
print(f'customer_ltv : {customer_ltv}')
print(f'retention_cost : {retention_cost}')
print(f'recall : {result_v2_pr['recall'].values[0]}')
print(f'precision : {result_v2_pr['precision'].values[0]}')

churn rate : 0.2653698707936959
customer_ltv : 2283.3004408418656
retention_cost : 12.952338492119836
recall : 0.8214285714285714
precision : 0.8041958041958042
