In [None]:
import pandas as pd

# Review dataset, given as a path relative to the notebook's working directory.
file_path = 'Octopus_cleaned_data.csv'
# Load the CSV into the DataFrame that every later cell reads from and extends.
df = pd.read_csv(file_path)

## Step 1: Preprocess Text Data
Use the `Cleaned_Text` column as input.
Tokenize the text, merge negation words with the word that follows them, remove stopwords, and lemmatize.

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import nltk

# Fetch the NLTK data packages this notebook downloads: the stop-word list,
# the two Punkt tokenizer packages and WordNet.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Negation words that handle_negations fuses with the token that follows them.
negation_words = {"not", "no", "never", "none", "nothing", "nobody", "nowhere", "neither", "nor", "without"}


def handle_negations(tokens):
    """Fuse each negation word with the word that follows it.

    A negation word immediately followed by a word-like token is replaced by
    the single token ``"<negation>_<next>"`` (``["not", "good"]`` becomes
    ``["not_good"]``) and the following token is consumed.

    The merge is skipped when the following token contains no letter or digit
    (for example "," or "."). Merging in that case would create junk tokens
    such as ``"no_,"`` that a later punctuation filter cannot recognise.

    Args:
        tokens: Sequence of string tokens, in document order.

    Returns:
        A new list of tokens; the input sequence is not modified.
    """
    processed_tokens = []
    position = 0
    total = len(tokens)

    while position < total:
        token = tokens[position]
        follower = tokens[position + 1] if position + 1 < total else None

        can_merge = (
            token in negation_words
            and follower is not None
            # Only real words are worth negating; skip pure punctuation.
            and any(char.isalnum() for char in follower)
        )

        if can_merge:
            processed_tokens.append(f"{token}_{follower}")
            position += 2  # the follower is now part of the merged token
        else:
            processed_tokens.append(token)
            position += 1

    return processed_tokens

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Set form of string.punctuation so tokens can be checked character by character.
punctuation_chars = set(string.punctuation)


def preprocess_text_with_negations(text):
    """Turn one review into a space-joined string of cleaned tokens.

    Steps, in order: lower-case and tokenize the text, merge negations via
    ``handle_negations``, drop stop words and punctuation-only tokens, and
    lemmatize what is left.

    Args:
        text: The raw review text, or a missing value.
            NOTE(review): non-missing values are assumed to be ``str``
            because ``.lower()`` is called on them - confirm against the data.

    Returns:
        The processed tokens joined by single spaces; ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    tokens = word_tokenize(text.lower())
    tokens = handle_negations(tokens)

    processed_tokens = [
        lemmatizer.lemmatize(token) for token in tokens
        if token not in stop_words
        # `token in string.punctuation` would be a substring test and lets
        # multi-character punctuation such as "..." through; instead drop any
        # token that consists solely of punctuation characters.
        and not all(char in punctuation_chars for char in token)
    ]
    return " ".join(processed_tokens)


df['Preprocessed_Text'] = df['Cleaned_Text'].apply(preprocess_text_with_negations)

## Step 2: Extract Features
Convert the text data into numerical features using a vectorization technique. Here we use scikit-learn's `CountVectorizer` (term counts over unigrams and bigrams).

### 2.1 Initialize and Fit the Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import numpy as np

# Count unigrams and bigrams, keeping at most 1200 features.
count_vectorizer = CountVectorizer(max_features=1200, ngram_range=(1, 2))
# Learn the vocabulary from the preprocessed reviews and build the count matrix.
count_matrix = count_vectorizer.fit_transform(df['Preprocessed_Text'])
# Feature names in column order of count_matrix.
count_feature_names = count_vectorizer.get_feature_names_out()

# Dense, column-labelled copy of the count matrix.
count_df = pd.DataFrame(count_matrix.toarray(), columns=count_feature_names)

## Step 3: Apply Topic Modeling
Use Latent Dirichlet Allocation (LDA) to uncover hidden topics within the reviews.

Steps:
1. LDA Topic Modeling
2. Extracting Top Words
3. Automated Topic-to-Category Mapping
4. Topic Coherence

### 3.1 Latent Dirichlet Allocation (LDA)
We fit an LDA model with `n_components=3`, matching the number of predefined categories (Customer Service, Pricing, and Product Quality).

In [None]:
# Configure a 3-topic LDA model; random_state pins the seed passed to the
# estimator and learning_method='online' selects its online update scheme.
lda_model = LatentDirichletAllocation(
    n_components=3,
    random_state=150,
    learning_method='online',
    max_iter=10
)

# Fit the topics on the document-term count matrix built by count_vectorizer.
lda_model.fit(count_matrix)

### 3.2 Extracting Top Words
Extract the top 12 words of each topic so that the topics can be interpreted.

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Collect the highest-weighted feature names of every topic.

    Args:
        model: Fitted topic model exposing ``components_``, iterated as one
            array of feature weights per topic.
        feature_names: Sequence mapping a feature index to its name.
        no_top_words: How many names to keep per topic.

    Returns:
        A list with one entry per topic; each entry lists that topic's
        feature names ordered from largest to smallest weight.
    """
    def ranked_names(weights):
        # argsort is ascending, so flip it to put the heaviest features first.
        descending = weights.argsort()[::-1]
        return [feature_names[idx] for idx in descending[:no_top_words]]

    return [ranked_names(weights) for weights in model.components_]

no_top_words = 12
lda_topics = display_topics(lda_model, count_feature_names, no_top_words)

# Label the topics "Topic 1", "Topic 2", ... in the order the model returns them.
topic_words = {}
for topic_number, words in enumerate(lda_topics, start=1):
    topic_words[f"Topic {topic_number}"] = words

# One row per topic, one column per word rank.
topic_words_df = pd.DataFrame(topic_words).transpose()
print("LDA Topics and Top Words:")
topic_words_df

LDA Topics and Top Words:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Topic 1,kind,problem,operator,quick,thanks,thank,helpful,response,kindness,speed,professional,fast
Topic 2,service,customer,customer service,whatsapp,via,excellent,fast,efficient,rate,change,always,request
Topic 3,octopus,energy,bill,rate,company,price,offer,octopus energy,supplier,first,month,also


### 3.3 Automated Topic-to-Category Mapping
Use cosine similarity between the LDA topics and the predefined category descriptions to assign each topic to a category automatically.

In [None]:
predefined_categories = ["Customer Service", "Pricing", "Product Quality"]
predefined_descriptions = ["customer support help", "price cost value bill offer", "quality product energy supply efficient problem"]

# The vocabulary is learned from the category descriptions only, so topic
# words that appear in no description do not contribute to the similarity.
vectorizer = CountVectorizer()
category_vectors = vectorizer.fit_transform(predefined_descriptions)
topic_documents = [" ".join(topic) for topic in lda_topics]
topic_vectors = vectorizer.transform(topic_documents)

# Rows are topics, columns are categories.
similarity = cosine_similarity(topic_vectors, category_vectors)
best_category_index = similarity.argmax(axis=1)
mapped_categories = [predefined_categories[i] for i in best_category_index]

print("\nTopic-to-Category Mapping:\n")
for idx, category in enumerate(mapped_categories, start=1):
    print(f"Topic {idx}: {category}")


Topic-to-Category Mapping:

Topic 1: Product Quality
Topic 2: Customer Service
Topic 3: Pricing


### 3.4 Topic Coherence
Compute a coherence score with Gensim's `CoherenceModel` as a metric for how interpretable the topics are.

In [None]:
!pip install gensim



In [None]:
from gensim.corpora.dictionary import Dictionary

# Whitespace-tokenised documents; these are the tokens gensim works with.
processed_tokens_list = [text.split() for text in df['Preprocessed_Text']]
dictionary = Dictionary(processed_tokens_list)

corpus = [dictionary.doc2bow(text) for text in processed_tokens_list]

# Top (word, weight) pairs per topic. Column i of lda_model.components_ is the
# i-th CountVectorizer feature, so the word must be looked up in
# count_feature_names; the gensim dictionary assigns its own, unrelated ids.
gensim_topics = [
    [(count_feature_names[i], topic[i]) for i in topic.argsort()[:-no_top_words - 1:-1]]
    for topic in lda_model.components_
]

# Keep only words the gensim dictionary knows. Bigram features such as
# "customer service" contain a space and therefore never occur as a single
# token in processed_tokens_list.
coherence_topics = [
    [word for word, _ in topic if word in dictionary.token2id]
    for topic in gensim_topics
]

# Compute coherence score
coherence_model = CoherenceModel(
    topics=coherence_topics,
    texts=processed_tokens_list,
    dictionary=dictionary,
    coherence='c_v'
)
coherence_score = coherence_model.get_coherence()
print("\nCoherence Score for LDA Topics:", coherence_score)


Coherence Score for LDA Topics: 0.5708287206681072


## Step 4: Clustering
Perform clustering (K-Means) on the extracted features.

Steps:
* Choose the number of clusters as 3 (based on the number of categories).
* Analyze cluster centers or most frequent words in each cluster.
* Assign cluster labels to predefined categories manually.


In [None]:
# Fit K-Means with three clusters on the document-term counts.
kmeans_model = KMeans(n_clusters=3, max_iter=200, random_state=82).fit(count_matrix)

df['Cluster_Label'] = kmeans_model.labels_

cluster_centers = kmeans_model.cluster_centers_
top_n = 15

# Extract top terms for each cluster
cluster_keywords = {}
for cluster_num, center in enumerate(cluster_centers):
    # Feature indices ordered by centroid weight, heaviest first.
    top_feature_indices = center.argsort()[::-1][:top_n]
    top_features = [count_feature_names[i] for i in top_feature_indices]
    cluster_keywords[f"Cluster {cluster_num + 1}"] = top_features

# Map clusters to predefined categories manually
cluster_to_category = {
    0: "Product Quality",
    1: "Customer Service",
    2: "Pricing",
}

assigned_category = df['Cluster_Label'].map(cluster_to_category)
df['Assigned_Category'] = assigned_category

print("Cluster Keywords:")
print(pd.DataFrame(cluster_keywords))
print("\nAssigned Categories:")
df[['Cleaned_Text', 'Cluster_Label', 'Assigned_Category']].head(10)


Cluster Keywords:
    Cluster 1           Cluster 2       Cluster 3
0        kind            customer         octopus
1    operator             service          energy
2      thanks    customer service        customer
3        fast           excellent            rate
4   excellent             octopus  octopus energy
5     problem                rate            bill
6     service                kind        operator
7     company                fast           offer
8       thank            whatsapp        contract
9       quick              thanks           month
10      clear            operator         service
11   response               great         company
12      great              always          change
13    octopus  excellent customer        supplier
14       rate             company          thanks

Assigned Categories:


Unnamed: 0,Cleaned_Text,Cluster_Label,Assigned_Category
0,registered today even though received bill mom...,2,Pricing
1,thanks laila solved one problem none attributa...,0,Product Quality
2,fantastic customer service opportunity speak m...,1,Customer Service
3,switched octopus energy luce july january also...,2,Pricing
4,switched octopus energy almost year ago say ha...,2,Pricing
5,ive month say ive month say excellent supplier...,2,Pricing
6,lowest price market fixed rate year modern eff...,1,Customer Service
7,contract flex mono tariff considering increasi...,0,Product Quality
8,meanwhile speed telephone response immediate r...,0,Product Quality
9,octopus customer officially today even though ...,2,Pricing


## Step 5: Evaluate Results
Analyze overlaps or ambiguities between clusters/topics and predefined categories.

In [None]:
cluster_keywords_debug = {}
top_n = 10
# Derive the cluster count from the fitted centroids instead of hard-coding it.
num_clusters = len(cluster_centers)

# Top-n terms of every cluster, ranked by centroid weight (largest first).
for cluster_num in range(num_clusters):
    top_feature_indices = cluster_centers[cluster_num].argsort()[-top_n:][::-1]
    top_features = [count_feature_names[i] for i in top_feature_indices]
    cluster_keywords_debug[f"Cluster {cluster_num + 1}"] = top_features

# Number of reviews per cluster label.
cluster_distribution = df['Cluster_Label'].value_counts()

# First three reviews of every cluster, ordered by cluster label. The stable
# sort keeps the original row order inside each cluster, and selecting the
# columns before head() avoids groupby.apply, which made pandas warn because
# the applied function also received the grouping column.
sample_per_cluster = (
    df.sort_values('Cluster_Label', kind='stable')
    .groupby('Cluster_Label')[['Cleaned_Text', 'Assigned_Category']]
    .head(3)
    .reset_index(drop=True)
)

# Keywords shared by each pair of clusters; sorted so the output is the same
# on every run (set iteration order is not).
overlapping_keywords = {}
for cluster_a in range(num_clusters):
    for cluster_b in range(cluster_a + 1, num_clusters):
        overlap = set(cluster_keywords_debug[f"Cluster {cluster_a + 1}"]).intersection(
            set(cluster_keywords_debug[f"Cluster {cluster_b + 1}"])
        )
        overlapping_keywords[f"Cluster {cluster_a + 1} & Cluster {cluster_b + 1}"] = sorted(overlap)


Cluster_Distribution = cluster_distribution.to_frame()
Sample_Reviews_by_Cluster = sample_per_cluster
OverlappingKeywords = pd.DataFrame(list(overlapping_keywords.items()), columns=["Clusters", "Overlapping Keywords"])

OverlappingKeywords.head()

  sample_per_cluster = df.groupby('Cluster_Label').apply(


Unnamed: 0,Clusters,Overlapping Keywords
0,Cluster 1 & Cluster 2,"[excellent, thanks, fast, service, kind]"
1,Cluster 1 & Cluster 3,[operator]
2,Cluster 2 & Cluster 3,"[customer, octopus, rate]"


In [None]:
# Number of reviews assigned to each K-Means cluster label.
Cluster_Distribution.head()

Unnamed: 0_level_0,count
Cluster_Label,Unnamed: 1_level_1
0,6097
1,1698
2,1050


In [None]:
# Show every sampled row: the frame holds up to three reviews per cluster, and
# .head() would cut it off after five rows, hiding the last cluster's samples.
Sample_Reviews_by_Cluster

Unnamed: 0,Cleaned_Text,Assigned_Category
0,thanks laila solved one problem none attributa...,Product Quality
1,contract flex mono tariff considering increasi...,Product Quality
2,meanwhile speed telephone response immediate r...,Product Quality
3,fantastic customer service opportunity speak m...,Customer Service
4,lowest price market fixed rate year modern eff...,Customer Service
