In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bertopic import BERTopic
import joblib
import warnings

# --- Setup ---
# NOTE: this silences *all* warnings for the rest of the session.
warnings.filterwarnings('ignore')
print("--- Setup: Loading libraries ---")

# Make sure the NLTK corpora used below are available locally.
# nltk.data.find() expects a path relative to an nltk_data directory
# (e.g. 'corpora/wordnet'). A bare package name such as 'wordnet' is never
# found, which triggered a download attempt on every single run.
for resource_path, package_name, label in (
    ('corpora/stopwords', 'stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet', 'WordNet'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        print(f"Downloading NLTK {label}...")
        nltk.download(package_name)
print("NLTK resources are ready. ✓")


# --- 1. Load Data ---
print("\n--- 1. Loading Data ---")
try:
    df = pd.read_csv('telco_churn_with_all_feedback.csv')
except FileNotFoundError:
    print("Error: 'telco_churn_with_all_feedback.csv' not found.")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down rather than reporting an ordinary error, whereas
    # re-raising stops the cell with a clear traceback.
    raise

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on a column selection. The chained in-place form is deprecated in recent
# pandas releases and does not modify `df` when Copy-on-Write is enabled.
df['CustomerFeedback'] = df['CustomerFeedback'].fillna('')
print("Data loaded successfully.")

# --- 2. Comprehensive Text Preprocessing ---
print("\n--- 2. Performing Text Preprocessing for Topic Modeling ---")
# Module-level helpers read by preprocess_for_topic_modeling():
#   lemmatizer - WordNet lemmatizer instance, created once and reused
#   stop_words - English stop words held in a set for fast membership tests
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess_for_topic_modeling(text):
    """
    Clean one feedback string for topic modeling.

    The text is lower-cased, punctuation is replaced by whitespace, digit
    runs are removed, and the result is split on whitespace. Stop words are
    dropped and the remaining tokens are lemmatized.

    Relies on the module-level ``lemmatizer`` and ``stop_words`` objects.

    Args:
        text: Raw feedback. Anything that is not a ``str`` (e.g. NaN/None)
            is treated as missing.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when the
        input is not a string or nothing is left after filtering.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Replace punctuation with a space rather than deleting it. Deleting fused
    # neighbouring words ("slow.the" -> "slowthe", "high-speed" -> "highspeed")
    # and turned contractions into tokens such as "dont" that are not in the
    # stop-word list. Splitting yields fragments ("don", "t") instead, which
    # the NLTK English stop-word list is expected to contain.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', '', text)  # remove numbers
    tokens = text.split()
    # Filter on the raw token first, then lemmatize only what is kept.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean every feedback entry and store the result in a new column,
# leaving the raw 'CustomerFeedback' text untouched.
print("Applying preprocessing to 'CustomerFeedback' column...")
df['ProcessedFeedback'] = df['CustomerFeedback'].map(preprocess_for_topic_modeling)
print("Text preprocessing complete.")
print("Sample of preprocessed feedback:")
# Raw and cleaned text side by side for the first rows, as a quick sanity check.
print(df.loc[:, ['CustomerFeedback', 'ProcessedFeedback']].head())


# --- 3. Train BERTopic Model ---
print("\n--- 3. Training BERTopic Model ---")
# Keep only rows whose processed feedback is non-empty, in their original order.
feedback_docs = df.loc[df['ProcessedFeedback'] != '', 'ProcessedFeedback'].tolist()
# NOTE: despite the variable name, no sub-sampling happens here; this is a
# copy of the full document list. Slice it (e.g. feedback_docs[:2000]) for a
# faster run, and apply the same slice wherever topics are joined back to df.
sample_docs = list(feedback_docs)

print(f"Training topic model on {len(sample_docs)} documents...")
# Configure BERTopic; min_topic_size is the knob used here to control the
# number of topics.
topic_model = BERTopic(
    language="english",
    min_topic_size=30,
    calculate_probabilities=True,
    verbose=False,
)

# Fit on the documents and get the topic assignments (`topics`) together
# with the probabilities (`probs`) in one call.
topics, probs = topic_model.fit_transform(sample_docs)
print("Topic modeling complete. ✓")
print("Topic modeling complete. ✓")


# --- 4. Analyze and Visualize Topics ---
print("\n--- 4. Analyzing Discovered Topics ---")

# Summary table of the discovered topics (frequency, name, etc.).
# Topic -1 collects outliers: documents that were not assigned to a topic.
topic_info = topic_model.get_topic_info()
print("Top 10 Discovered Topics:")
# Eleven rows are shown so the outlier row plus ten topics fit in the preview.
print(topic_info.head(11))

# Bar chart of the most frequent topics and their top words.
print("\nGenerating Topic Frequency Bar Chart...")
# The top 10 topics are visualized, excluding the outlier topic (-1).
topic_barchart = topic_model.visualize_barchart(
    top_n_topics=10,
    n_words=5,
    title="Top 10 Most Frequent Topics",
)
topic_barchart.show()


# --- 5. Save the Topic Model and Results ---
print("\n--- 5. Saving Topic Model and Data ---")

# Rebuild the rows that were fed to the model: the same non-empty filter in
# the same order, so `topics` lines up with them one-to-one. Work on a copy
# so that `df` itself is not modified.
df_sample = df.loc[df['ProcessedFeedback'] != ''].copy()
df_sample['Topic'] = topics

# Persist the fitted model with joblib.
joblib.dump(topic_model, 'bertopic_model.pkl')
print("BERTopic model saved to 'bertopic_model.pkl'")

# Write the rows with their topic assignments, without the index column.
df_sample.to_csv('telco_churn_with_topics_sample.csv', index=False)
print("Sample data with topics saved to 'telco_churn_with_topics_sample.csv'")

print("\nTopic modeling process is complete.")



--- Setup: Loading libraries ---
Downloading NLTK WordNet...
NLTK resources are ready. ✓

--- 1. Loading Data ---
Data loaded successfully.

--- 2. Performing Text Preprocessing for Topic Modeling ---
Applying preprocessing to 'CustomerFeedback' column...


[nltk_data] Downloading package wordnet to C:\Users\P RAJ
[nltk_data]     KIRAN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Text preprocessing complete.
Sample of preprocessed feedback:
                                    CustomerFeedback  \
0  I have been using the DSL internet service fro...   
1  I have been a customer with this company for o...   
2  I recently signed up for DSL internet service ...   
3  I have been a loyal customer with this company...   
4  I recently switched to this fiber optic intern...   

                                   ProcessedFeedback  
0  using dsl internet service provider past month...  
1  customer company two half year satisfied servi...  
2  recently signed dsl internet service provider ...  
3  loyal customer company month satisfied service...  
4  recently switched fiber optic internet service...  

--- 3. Training BERTopic Model ---
Training topic model on 7043 documents...
Topic modeling complete. ✓

--- 4. Analyzing Discovered Topics ---
Top 10 Discovered Topics:
    Topic  Count                                 Name  \
0      -1    584     -1_dsl_provider_servic


--- 5. Saving Topic Model and Data ---
BERTopic model saved to 'bertopic_model.pkl'
Sample data with topics saved to 'telco_churn_with_topics_sample.csv'

Topic modeling process is complete.


In [None]:
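# Optional environment setup (presumably needed for the Plotly chart to render - confirm).
# Uncomment, run once, then restart the kernel:
# %pip install "plotly==5.22.0" "nbformat>=4.2.0"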


Collecting plotly==5.22.0
  Downloading plotly-5.22.0-py3-none-any.whl.metadata (7.1 kB)
Downloading plotly-5.22.0-py3-none-any.whl (16.4 MB)
   ---------------------------------------- 0.0/16.4 MB ? eta -:--:--
   --------- ------------------------------ 3.9/16.4 MB 19.6 MB/s eta 0:00:01
   ------------------------ --------------- 10.0/16.4 MB 23.9 MB/s eta 0:00:01
   ------------------------------------- -- 15.5/16.4 MB 24.3 MB/s eta 0:00:01
   ---------------------------------------- 16.4/16.4 MB 23.0 MB/s  0:00:00
Installing collected packages: plotly
  Attempting uninstall: plotly
    Found existing installation: plotly 6.2.0
    Uninstalling plotly-6.2.0:
      Successfully uninstalled plotly-6.2.0
Successfully installed plotly-5.22.0
Note: you may need to restart the kernel to use updated packages.
