In [1]:
# Problem Statement:
# Analyzing large volumes of Google Play reviews manually is slow and makes it hard to identify recurring user complaints.
# Data: 0.6M+ Google Play Store reviews including text, rating, and app metadata.
# Goal: Automatically cluster similar complaints, extract key themes, and simulate clarifying follow-up questions for negative reviews.
# Business Impact: Enables faster triage of customer issues, prioritization of product fixes, and proactive customer engagement.
# Outcome: Reduced manual review time, improved product quality, and higher customer satisfaction.

In [2]:
!pip install nltk




In [3]:
!pip install gensim transformers torch scikit-learn

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda

In [None]:
!pip install bertopic gsdmm transformers sentence-transformers umap-learn


Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
[31mERROR: Could not find a version that satisfies the requirement gsdmm (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for gsdmm[0m[31m
[0m^C


In [6]:
import pandas as pd

In [7]:
from google.colab import files
uploaded = files.upload()

Saving googleplaystore_user_reviews.csv to googleplaystore_user_reviews (1).csv


In [8]:
# Load the uploaded reviews CSV into a DataFrame (change the filename here if yours differs).
# NOTE(review): the upload log above shows the new copy was saved as
# 'googleplaystore_user_reviews (1).csv', so this reads the file that already existed
# under the original name - confirm that is the intended copy.
df = pd.read_csv('googleplaystore_user_reviews.csv')
# Preview the first rows
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,sortOrder,appId
0,gp:AOqpTOEhZuqSqqWnaKRgv-9ABYdajFUB0WugPGh-SG-...,Eric Tie,https://play-lh.googleusercontent.com/a-/AOh14...,I cannot open the app anymore,1,0,5.4.0.6,2020-10-27 21:24:41,,,newest,com.anydo
1,gp:AOqpTOH0WP4IQKBZ2LrdNmFy_YmpPCVrV3diEU9KGm3...,john alpha,https://play-lh.googleusercontent.com/a-/AOh14...,I have been begging for a refund from this app...,1,0,,2020-10-27 14:03:28,"Please note that from checking our records, yo...",2020-10-27 15:05:52,newest,com.anydo
2,gp:AOqpTOEMCkJB8Iq1p-r9dPwnSYadA5BkPWTf32Z1azu...,Sudhakar .S,https://play-lh.googleusercontent.com/a-/AOh14...,Very costly for the premium version (approx In...,1,0,,2020-10-27 08:18:40,,,newest,com.anydo
3,gp:AOqpTOGFrUWuKGycpje8kszj3uwHN6tU_fd4gLVFy9z...,SKGflorida@bellsouth.net DAVID S,https://play-lh.googleusercontent.com/-75aK0WF...,"Used to keep me organized, but all the 2020 UP...",1,0,,2020-10-26 13:28:07,What do you find troublesome about the update?...,2020-10-26 14:58:29,newest,com.anydo
4,gp:AOqpTOHls7DW8wmDFzTkHwxuqFkdNQtKHmO6Pt9jhZE...,Louann Stoker,https://play-lh.googleusercontent.com/-pBcY_Z-...,Dan Birthday Oct 28,1,0,5.6.0.7,2020-10-26 06:10:50,,,newest,com.anydo


# Task
Analyze customer reviews in the dataframe `df` to identify complaint clusters using word2vec, TF-IDF, BERTopic, and GSDMM. Additionally, use Llama3 with HuggingFace embeddings to generate follow-up questions for negative reviews to uncover deeper patterns.

## Load and preprocess data

### Subtask:
Load the `df` dataframe and perform initial preprocessing steps on the 'content' column, such as handling missing values, removing noise (e.g., special characters, URLs), and tokenization.


**Reasoning**:
Inspect the DataFrame and handle missing values in the 'content' column.



In [9]:
# Inspect the DataFrame
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() would additionally render a stray "None" in the output.
df.info()

# Count missing values in the 'content' column before cleaning
display(df['content'].isnull().sum())

# Drop rows with missing 'content' values (the row index is not reset)
df.dropna(subset=['content'], inplace=True)

# Verify that missing values have been handled (expected count: 0)
display(df['content'].isnull().sum())

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,sortOrder,appId
0,gp:AOqpTOEhZuqSqqWnaKRgv-9ABYdajFUB0WugPGh-SG-...,Eric Tie,https://play-lh.googleusercontent.com/a-/AOh14...,I cannot open the app anymore,1,0,5.4.0.6,2020-10-27 21:24:41,,,newest,com.anydo
1,gp:AOqpTOH0WP4IQKBZ2LrdNmFy_YmpPCVrV3diEU9KGm3...,john alpha,https://play-lh.googleusercontent.com/a-/AOh14...,I have been begging for a refund from this app...,1,0,,2020-10-27 14:03:28,"Please note that from checking our records, yo...",2020-10-27 15:05:52,newest,com.anydo
2,gp:AOqpTOEMCkJB8Iq1p-r9dPwnSYadA5BkPWTf32Z1azu...,Sudhakar .S,https://play-lh.googleusercontent.com/a-/AOh14...,Very costly for the premium version (approx In...,1,0,,2020-10-27 08:18:40,,,newest,com.anydo
3,gp:AOqpTOGFrUWuKGycpje8kszj3uwHN6tU_fd4gLVFy9z...,SKGflorida@bellsouth.net DAVID S,https://play-lh.googleusercontent.com/-75aK0WF...,"Used to keep me organized, but all the 2020 UP...",1,0,,2020-10-26 13:28:07,What do you find troublesome about the update?...,2020-10-26 14:58:29,newest,com.anydo
4,gp:AOqpTOHls7DW8wmDFzTkHwxuqFkdNQtKHmO6Pt9jhZE...,Louann Stoker,https://play-lh.googleusercontent.com/-pBcY_Z-...,Dan Birthday Oct 28,1,0,5.6.0.7,2020-10-26 06:10:50,,,newest,com.anydo


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12495 entries, 0 to 12494
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              12495 non-null  object
 1   userName              12495 non-null  object
 2   userImage             12495 non-null  object
 3   content               12495 non-null  object
 4   score                 12495 non-null  int64 
 5   thumbsUpCount         12495 non-null  int64 
 6   reviewCreatedVersion  10333 non-null  object
 7   at                    12495 non-null  object
 8   replyContent          5818 non-null   object
 9   repliedAt             5818 non-null   object
 10  sortOrder             12495 non-null  object
 11  appId                 12495 non-null  object
dtypes: int64(2), object(10)
memory usage: 1.1+ MB


None

0

0

**Reasoning**:
Clean the 'content' column by removing special characters and URLs, then tokenize the cleaned text.



**Reasoning**:
The previous command failed because the 'punkt_tab' resource was not found. I will download this resource and re-run the code to clean and tokenize the text.



In [10]:
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab') # Download the missing resource

def clean_text(text):
    """Normalize a raw review string before tokenization.

    URL-like tokens (starting with ``http`` or ``www``) are removed first, then every
    character that is not an ASCII letter or whitespace is dropped (digits and
    punctuation included), and the remainder is lower-cased.

    Args:
        text: Raw review text; must be a ``str``.

    Returns:
        The cleaned, lower-cased text. Whitespace is left as-is, so removed tokens
        can leave double spaces behind.
    """
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    letters_only = re.sub(r'[^A-Za-z\s]', '', without_urls)
    return letters_only.lower()

# Normalize every review with clean_text, then split the normalized text into word tokens.
df['cleaned_content'] = df['content'].apply(clean_text)
df['tokenized_content'] = df['cleaned_content'].apply(word_tokenize)

# Show raw, cleaned and tokenized text side by side for the first rows.
display(df[['content', 'cleaned_content', 'tokenized_content']].head())

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,content,cleaned_content,tokenized_content
0,I cannot open the app anymore,i cannot open the app anymore,"[i, can, not, open, the, app, anymore]"
1,I have been begging for a refund from this app...,i have been begging for a refund from this app...,"[i, have, been, begging, for, a, refund, from,..."
2,Very costly for the premium version (approx In...,very costly for the premium version approx ind...,"[very, costly, for, the, premium, version, app..."
3,"Used to keep me organized, but all the 2020 UP...",used to keep me organized but all the updates...,"[used, to, keep, me, organized, but, all, the,..."
4,Dan Birthday Oct 28,dan birthday oct,"[dan, birthday, oct]"


## Feature extraction

### Subtask:
Experiment with different techniques like word2vec, TF-IDF, and Hugging Face embeddings to convert the text data into numerical representations.


**Reasoning**:
Calculate TF-IDF features, train a Word2Vec model, generate Word2Vec embeddings, prepare data for Hugging Face embeddings, and generate Hugging Face embeddings as per the instructions.



**Reasoning**:
The previous command failed because the `gensim` library was not installed. Install `gensim` and `transformers` using pip.



**Reasoning**:
The necessary libraries are now installed. Re-run the code to calculate TF-IDF features, train a Word2Vec model and generate embeddings, and generate Hugging Face embeddings.



In [11]:
!pip uninstall -y numpy gensim
!pip install numpy gensim

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Collecting numpy
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy, gensim
[31mERROR: pip's dependency resolver does not currently 

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from transformers import AutoModel, AutoTokenizer
import torch

# 1 and 2: Calculate and store TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=1000) # Cap the vocabulary at 1000 terms for manageability
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_content'].fillna('')) # fillna('') guards against NaN text
# Densify the sparse matrix into a DataFrame with one column per vocabulary term.
# The frame is built from a plain array, so it gets a fresh default index rather than df's index.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
display(tfidf_df.head())

# Steps 3-5: train Word2Vec on the tokenized reviews and store one averaged vector per review.
# The training corpus is a list of token lists (one per review): every token is coerced to
# str, and entries that are not lists or that are empty are left out of the corpus.
tokenized_corpus = [
    [str(token) for token in tokens]
    for tokens in df['tokenized_content']
    if isinstance(tokens, list) and tokens
]

if not tokenized_corpus:
    print("Word2Vec model could not be trained due to an empty or invalid tokenized corpus.")
else:
    word2vec_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

    # Step 4: a review embedding is the mean of its in-vocabulary word vectors.
    def review_to_vec(review):
        """Average the Word2Vec vectors of the words in ``review``.

        Words missing from the model vocabulary are skipped; when no word is known
        (or the review is empty) a zero vector of the model's size is returned.
        The result is a plain Python list so it can be stored in a DataFrame cell.
        """
        vectors = [word2vec_model.wv[word] for word in review if word in word2vec_model.wv]
        if not vectors:
            return [0] * word2vec_model.vector_size
        return (sum(vectors) / len(vectors)).tolist()

    df['word2vec_embedding'] = df['tokenized_content'].apply(review_to_vec)
    display(df[['tokenized_content', 'word2vec_embedding']].head())


# 6: Prepare the cleaned_content column for Hugging Face embeddings
# (plain Python list of strings; NaN entries become empty strings)
cleaned_content_list = df['cleaned_content'].fillna('').tolist()

# 7 and 8: Load a pre-trained Hugging Face model and generate embeddings
# Using a Sentence-BERT model for sentence embeddings.
# It is loaded through the generic AutoModel/AutoTokenizer classes, so pooling the
# token outputs into one sentence vector has to be done by the notebook itself.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_hf_embeddings(text_list, tokenizer, model, batch_size=32):
    """Embed texts with a Hugging Face encoder using attention-masked mean pooling.

    Texts are tokenized in batches with padding. Averaging ``last_hidden_state``
    over *all* positions would mix the padding tokens into the sentence vector and
    make an embedding depend on the other texts in its batch, so the mean is taken
    only over positions whose ``attention_mask`` is 1.

    Args:
        text_list: List of strings to embed.
        tokenizer: Callable returning a mapping with at least ``attention_mask``
            (plus whatever keyword inputs ``model`` expects) as PyTorch tensors.
        model: Callable whose output exposes ``last_hidden_state`` with shape
            (batch, tokens, hidden).
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A list with one embedding (list of floats) per input text, in input order.
    """
    embeddings = []
    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = model(**encoded_input)
        token_embeddings = model_output.last_hidden_state
        # Broadcast the (batch, tokens) mask over the hidden dimension.
        mask = encoded_input['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # clamp avoids a division by zero if a row had no attended tokens at all.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        sentence_embeddings = summed / token_counts
        embeddings.extend(sentence_embeddings.tolist())
    return embeddings

# 9: Store the generated Hugging Face embeddings
# (one list of floats per review, in the same row order as cleaned_content_list)
df['hf_embedding'] = get_hf_embeddings(cleaned_content_list, tokenizer, model)
display(df[['cleaned_content', 'hf_embedding']].head())

Unnamed: 0,ability,able,about,absolutely,access,account,accounts,across,activities,activity,...,year,yearly,years,yes,yet,you,your,youre,yourself,youve
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.231524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,tokenized_content,word2vec_embedding
0,"[i, can, not, open, the, app, anymore]","[-0.09764262288808823, 0.8497353792190552, 0.0..."
1,"[i, have, been, begging, for, a, refund, from,...","[-0.18548421561717987, 0.4091920256614685, 0.3..."
2,"[very, costly, for, the, premium, version, app...","[-0.2908836901187897, 0.5418280959129333, 0.24..."
3,"[used, to, keep, me, organized, but, all, the,...","[-0.21876518428325653, 0.464526891708374, 0.07..."
4,"[dan, birthday, oct]","[-0.04106653481721878, 0.083074651658535, 0.03..."


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

KeyboardInterrupt: 

**Reasoning**:
The error indicates a binary incompatibility issue with numpy and gensim. This often happens when numpy is installed with a different version than what gensim expects. A common fix is to uninstall and reinstall numpy and gensim.



**Reasoning**:
The previous command was interrupted during the installation process. Re-run the installation command to ensure `numpy` and `gensim` are installed correctly.



**Reasoning**:
The libraries are now installed. Re-run the code to calculate TF-IDF features, train a Word2Vec model and generate embeddings, and generate Hugging Face embeddings as originally planned.



In [None]:
# The feature-extraction cell above transforms each document (review) into numerical
# representations using three distinct strategies: TF-IDF, Word2Vec, and transformer embeddings.

In [None]:
df.head()

In [None]:

from bertopic import BERTopic
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import torch

# 1️⃣ BERTopic on Hugging Face embeddings
print("Running BERTopic...")
# Stack the per-review embedding lists into a single 2-D array (one row per review).
hf_embeddings_array = np.array(df['hf_embedding'].tolist())

# Optional: Dimensionality reduction for speed
# NOTE(review): PCA is created without random_state and BERTopic without a seeded
# reducer, so topic ids may differ between runs - confirm whether reproducibility matters.
pca = PCA(n_components=50)
reduced_embeddings = pca.fit_transform(hf_embeddings_array)

# Fit BERTopic on the cleaned text, passing the precomputed (reduced) embeddings.
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(df['cleaned_content'], reduced_embeddings)
# Store the topic id assigned to each review.
df['bertopic_cluster'] = topics
display(df[['cleaned_content', 'bertopic_cluster']].head())

# 2️⃣ GSDMM clustering on tokenized content
from gsdmm import MovieGroupProcess
from collections import Counter

print("Running GSDMM...")
# Keep only tokens longer than two characters to drop very short, low-signal words.
tokenized_docs = df['tokenized_content'].apply(lambda x: [w for w in x if len(w) > 2]).tolist()

# GSDMM needs the vocabulary size, i.e. the number of distinct tokens in the corpus.
vocab = set(w for doc in tokenized_docs for w in doc)
n_terms = len(vocab)

mgp = MovieGroupProcess(K=20, alpha=0.1, beta=0.1, n_iters=30)
# fit() returns the final cluster label of every document, in input order.
# MovieGroupProcess has no `cluster_doc_distribution` method, so the labels returned
# by fit() are used directly instead of re-scoring the documents.
y = mgp.fit(tokenized_docs, n_terms)
# Reviews left with no tokens after filtering carry no signal; mark them as -1.
df['gsdmm_cluster'] = [int(label) if doc else -1 for doc, label in zip(tokenized_docs, y)]
display(df[['cleaned_content', 'gsdmm_cluster']].head())

# 3️⃣ Filter for negative reviews
def is_negative(row):
    """Tell whether a review row counts as negative.

    A row is negative when its 'score' is 2 or lower. If the notebook-level ``df``
    has no 'score' column, every row is treated as not negative (there is no
    sentiment-analysis fallback).
    """
    if 'score' not in df.columns:
        return False
    return row['score'] <= 2

# Evaluate is_negative row by row and keep an independent copy of the matching rows.
df_neg = df[df.apply(is_negative, axis=1)].copy()
print(f"Negative reviews: {len(df_neg)}")

# 4️⃣ Automated follow-up generation
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Seq2seq generator used to draft replies.
# NOTE(review): the task description mentions Llama3, but the model loaded here is
# google/flan-t5-base - confirm which one is intended.
gen_model_name = "google/flan-t5-base"
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)

def generate_followup(review_text, cluster_label, max_length=128):
    """Draft a short support reply for one review with the seq2seq generator.

    Args:
        review_text: Text of the review to respond to.
        cluster_label: Cluster identifier inserted into the prompt as context.
        max_length: Maximum length passed to ``generate``.

    Returns:
        The decoded text of the first generated sequence, without special tokens.
    """
    prompt = f"""You are a helpful app support agent.
User complaint cluster: {cluster_label}
Review: "{review_text}"
Write a short empathetic, professional reply (2-3 sentences) acknowledging the issue, suggesting a fix, and inviting more details."""
    # Tokenize the prompt, truncating it if it exceeds the tokenizer's limit.
    encoded_prompt = gen_tokenizer(prompt, truncation=True, return_tensors="pt")
    # Beam search with 4 beams.
    generated = gen_model.generate(
        **encoded_prompt,
        max_length=max_length,
        num_beams=4,
        early_stopping=True,
    )
    first_sequence = generated[0]
    return gen_tokenizer.decode(first_sequence, skip_special_tokens=True)

# Apply follow-up generation to first 5 negative reviews
# (the result is aligned on the index, so all other rows get NaN in 'followup')
df_neg['followup'] = df_neg.head(5).apply(
    lambda row: generate_followup(row['cleaned_content'], row['bertopic_cluster']), axis=1
)

# Show only the sampled rows: the rest of df_neg has no follow-up yet, so displaying
# the whole frame would mostly show NaN.
display(df_neg[['cleaned_content', 'bertopic_cluster', 'followup']].head(5))

# 5️⃣ Save results
# All negative reviews are written; only the sampled rows have a non-empty 'followup'.
df_neg[['cleaned_content', 'bertopic_cluster', 'followup']].to_csv("negative_review_followups.csv", index=False)
print("Saved 'negative_review_followups.csv'")


In [None]:
!pip uninstall -y numpy scipy gensim
!pip install numpy scipy gensim

# Google Play Store Review Analysis

This notebook aims to analyze Google Play Store reviews to identify complaint clusters, extract key themes, and simulate clarifying follow-up questions for negative reviews.

**Problem Statement:**
Analyzing large volumes of Google Play reviews manually is slow and makes it hard to identify recurring user complaints.

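**Data:** 0.6M+ Google Play Store reviews including text, rating, and app metadata. (Note: the `googleplaystore_user_reviews.csv` file loaded in this notebook contains 12,495 reviews.)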

**Goal:** Automatically cluster similar complaints, extract key themes, and simulate clarifying follow-up questions for negative reviews.

**Business Impact:** Enables faster triage of customer issues, prioritization of product fixes, and proactive customer engagement.

**Outcome:** Reduced manual review time, improved product quality, and higher customer satisfaction.

## Setup

Install necessary libraries.

In [5]:
!pip install nltk gensim transformers torch scikit-learn bertopic sentence-transformers umap-learn

Collecting bertopic
  Using cached bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Downloading bertopic-0.17.3-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.3


## Load and preprocess data

Load the dataset and perform initial preprocessing steps on the 'content' column, such as handling missing values, removing noise (e.g., special characters, URLs), and tokenization.

In [None]:
import pandas as pd
from google.colab import files
import re
import nltk
from nltk.tokenize import word_tokenize

# Upload the dataset file (if not already present)
# uploaded = files.upload() # Uncomment this line if you need to upload the file

# Replace 'googleplaystore_user_reviews.csv' with your actual filename
try:
    df = pd.read_csv('googleplaystore_user_reviews.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'googleplaystore_user_reviews.csv' not found. Please upload the file.")
    # Stop the cell here: continuing would either fail with a NameError on `df`
    # or silently reuse a stale `df` left over from an earlier run.
    raise

# Inspect the DataFrame
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() would additionally render a stray "None" in the output.
df.info()

# Check for missing values in the 'content' column
print("\nMissing values in 'content' column before dropping:")
display(df['content'].isnull().sum())

# Drop rows with missing 'content' values (the row index is not reset)
df.dropna(subset=['content'], inplace=True)

# Verify that missing values have been handled (expected count: 0)
print("\nMissing values in 'content' column after dropping:")
display(df['content'].isnull().sum())

# Download necessary NLTK data
# word_tokenize needs the 'punkt_tab' resource (the first tokenization attempt in this
# notebook failed until it was downloaded), so that is the resource checked here.
# nltk.data.find raises LookupError when a resource is missing; `nltk.downloader` has no
# `DownloadError` class, so naming it in an except clause would itself raise AttributeError.
try:
    nltk.data.find('tokenizers/punkt_tab/english/')
except LookupError:
    nltk.download('punkt_tab')


# Define text cleaning function
def clean_text(text):
    """Normalize a raw review value before tokenization.

    Non-string input (for example NaN) yields an empty string. For strings, URL-like
    tokens (starting with ``http`` or ``www``) are removed, every character that is
    not an ASCII letter or whitespace is dropped, and the result is lower-cased.

    Args:
        text: Raw review content; any type is accepted.

    Returns:
        The cleaned, lower-cased text, or ``""`` for non-string input.
    """
    if isinstance(text, str):
        without_urls = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        letters_only = re.sub(r'[^A-Za-z\s]', '', without_urls)
        return letters_only.lower()
    return ""

# Apply cleaning and tokenization:
# clean_text normalizes each review, word_tokenize then splits it into word tokens.
df['cleaned_content'] = df['content'].apply(clean_text)
df['tokenized_content'] = df['cleaned_content'].apply(word_tokenize)

# Show raw, cleaned and tokenized text side by side for the first rows.
display(df[['content', 'cleaned_content', 'tokenized_content']].head())

## Feature extraction

Experiment with different techniques like word2vec, TF-IDF, and Hugging Face embeddings to convert the text data into numerical representations.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np

# 1 and 2: Calculate and store TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=1000) # Cap the vocabulary at 1000 terms for manageability
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_content'].fillna('')) # fillna('') guards against NaN text
# Densify the sparse matrix into a DataFrame with one column per vocabulary term.
# The frame is built from a plain array, so it gets a fresh default index rather than df's index.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Features:")
display(tfidf_df.head())

# Steps 3-5: train Word2Vec on the tokenized reviews and store one averaged vector per review.
# The training corpus is a list of token lists (one per review): every token is coerced to
# str, and entries that are not lists or that are empty are left out of the corpus.
tokenized_corpus = [
    [str(token) for token in tokens]
    for tokens in df['tokenized_content']
    if isinstance(tokens, list) and tokens
]

word2vec_model = None # Stays None when there is nothing to train on
if not tokenized_corpus:
    print("Word2Vec model could not be trained due to an empty or invalid tokenized corpus.")
else:
    print("\nTraining Word2Vec model...")
    word2vec_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)
    print("Word2Vec model trained.")

    # Step 4: a review embedding is the mean of its in-vocabulary word vectors.
    def review_to_vec(review):
        """Average the Word2Vec vectors of the words in ``review``.

        Returns a 100-element zero vector if no model is available, and a zero
        vector of the model's size when none of the review's words is in the
        vocabulary. Otherwise the mean vector is returned as a plain Python list.
        """
        if word2vec_model is None:
            return [0] * 100

        vectors = [word2vec_model.wv[word] for word in review if word in word2vec_model.wv]
        if not vectors:
            return [0] * word2vec_model.vector_size
        return (sum(vectors) / len(vectors)).tolist()

    df['word2vec_embedding'] = df['tokenized_content'].apply(review_to_vec)
    print("Word2Vec Embeddings:")
    display(df[['tokenized_content', 'word2vec_embedding']].head())


# 6: Prepare the cleaned_content column for Hugging Face embeddings
# (plain Python list of strings; NaN entries become empty strings)
cleaned_content_list = df['cleaned_content'].fillna('').tolist()

# 7 and 8: Load a pre-trained Hugging Face model and generate embeddings
# Using a Sentence-BERT model for sentence embeddings.
# It is loaded through the generic AutoModel/AutoTokenizer classes, so pooling the
# token outputs into one sentence vector has to be done by the notebook itself.
print("\nLoading Hugging Face model for embeddings...")
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
print("Hugging Face model loaded.")

def get_hf_embeddings(text_list, tokenizer, model, batch_size=32):
    """Embed texts with a Hugging Face encoder using attention-masked mean pooling.

    Texts are tokenized in batches with padding. Averaging ``last_hidden_state``
    over *all* positions would mix the padding tokens into the sentence vector and
    make an embedding depend on the other texts in its batch, so the mean is taken
    only over positions whose ``attention_mask`` is 1. When CUDA is available the
    model is moved to the GPU once and each batch is sent there for the forward pass.

    Args:
        text_list: Iterable of texts; every item is converted with ``str()``.
        tokenizer: Callable returning a mapping with at least ``attention_mask``
            (plus whatever keyword inputs ``model`` expects) as PyTorch tensors.
        model: Callable whose output exposes ``last_hidden_state`` with shape
            (batch, tokens, hidden).
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A list with one embedding (list of floats) per input text, in input order.
    """
    embeddings = []
    # Ensure text_list contains only strings
    text_list = [str(text) for text in text_list]

    # Decide on the device once and move the model a single time, not once per batch.
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.to('cuda')

    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        if use_cuda:
            encoded_input = {k: v.to('cuda') for k, v in encoded_input.items()}

        with torch.no_grad():
            model_output = model(**encoded_input)

        token_embeddings = model_output.last_hidden_state
        # Broadcast the (batch, tokens) mask over the hidden dimension.
        mask = encoded_input['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # clamp avoids a division by zero if a row had no attended tokens at all.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        sentence_embeddings = summed / token_counts
        # Move embeddings back to CPU for storage if on GPU
        embeddings.extend(sentence_embeddings.cpu().tolist())
    return embeddings

# 9: Store the generated Hugging Face embeddings
# (one list of floats per review, in the same row order as cleaned_content_list)
print("Generating Hugging Face embeddings...")
df['hf_embedding'] = get_hf_embeddings(cleaned_content_list, tokenizer, model)
print("Hugging Face Embeddings:")
display(df[['cleaned_content', 'hf_embedding']].head())

## Clustering and Follow-up Generation

Cluster reviews using BERTopic and GSDMM. Filter negative reviews and generate automated follow-up questions for a sample of them.

In [None]:
from bertopic import BERTopic
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from gsdmm import MovieGroupProcess
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# 1️⃣ BERTopic on Hugging Face embeddings
print("Running BERTopic...")
# Guard: the hf_embedding column must exist and its first entry must not be None.
# (Only the first row is inspected; other rows are not validated here.)
if 'hf_embedding' in df.columns and df['hf_embedding'].iloc[0] is not None:
    # Stack the per-review embedding lists into a single 2-D array (one row per review).
    hf_embeddings_array = np.array(df['hf_embedding'].tolist())

    # Optional: Dimensionality reduction for speed (adjust n_components as needed)
    # PCA cannot produce more components than there are features or samples, so cap at both.
    n_components = min(50, hf_embeddings_array.shape[1], hf_embeddings_array.shape[0])
    if n_components > 0:
        # NOTE(review): PCA is created without random_state and BERTopic without a seeded
        # reducer, so topic ids may differ between runs - confirm whether that matters.
        pca = PCA(n_components=n_components)
        reduced_embeddings = pca.fit_transform(hf_embeddings_array)
    else:
        reduced_embeddings = hf_embeddings_array # Use original if PCA not possible

    # Fit BERTopic on the cleaned text, passing the precomputed (reduced) embeddings.
    topic_model = BERTopic(verbose=True)
    topics, probs = topic_model.fit_transform(df['cleaned_content'].fillna(''), reduced_embeddings)
    # Store the topic id assigned to each review.
    df['bertopic_cluster'] = topics
    print("BERTopic Clusters:")
    display(df[['cleaned_content', 'bertopic_cluster']].head())
else:
    print("Hugging Face embeddings not found or are empty. Skipping BERTopic.")
    df['bertopic_cluster'] = -1 # Default label so downstream cells can still read the column


# 2️⃣ GSDMM clustering on tokenized content
print("\nRunning GSDMM...")
# Keep only string tokens longer than two characters; non-list entries become empty docs.
tokenized_docs = df['tokenized_content'].apply(lambda x: [str(w) for w in x if isinstance(w, str) and len(w) > 2] if isinstance(x, list) else []).tolist()

# Create vocabulary (GSDMM needs its size)
vocab = set(w for doc in tokenized_docs for w in doc)
n_terms = len(vocab)

if tokenized_docs and n_terms > 0: # Check if there are documents and a vocabulary
    mgp = MovieGroupProcess(K=20, alpha=0.1, beta=0.1, n_iters=30)
    # Encode words as integer IDs before clustering.
    word_to_id = {word: i for i, word in enumerate(vocab)}

    # Remember the row position of every non-empty document so the labels can be written
    # back to the right rows; documents left empty by the filter are not clustered.
    kept_positions = [pos for pos, doc in enumerate(tokenized_docs) if doc]
    tokenized_docs_ids = [[word_to_id[word] for word in tokenized_docs[pos]] for pos in kept_positions]

    print(f"Running GSDMM with {len(tokenized_docs_ids)} documents and {n_terms} terms.")
    # fit() returns the final cluster label of every document, in input order.
    # MovieGroupProcess has no `cluster_doc_distribution` method, so the labels returned
    # by fit() are used directly instead of re-scoring the documents.
    y = mgp.fit(tokenized_docs_ids, n_terms)

    # Rows that were not clustered keep the default label -1.
    gsdmm_clusters = [-1] * len(df)
    for pos, label in zip(kept_positions, y):
        gsdmm_clusters[pos] = int(label)
    df['gsdmm_cluster'] = gsdmm_clusters

    print("GSDMM Clusters:")
    display(df[['cleaned_content', 'gsdmm_cluster']].head())
else:
    print("No tokenized documents or vocabulary found for GSDMM. Skipping GSDMM.")
    df['gsdmm_cluster'] = -1 # Assign a default value


# 3️⃣ Filter for negative reviews
# A review counts as negative when its 'score' is 2 or lower; without a 'score'
# column there is nothing to filter on and df_neg is left empty.
if 'score' not in df.columns:
    print("\n'score' column not found. Cannot filter negative reviews by score.")
    df_neg = pd.DataFrame() # Create an empty DataFrame
else:
    df_neg = df.loc[df['score'] <= 2].copy()
    print(f"\nNegative reviews (score <= 2): {len(df_neg)}")

# 4️⃣ Automated follow-up generation
# Everything below runs only when at least one negative review exists.
if not df_neg.empty:
    print("\nAutomated follow-up generation...")
    # Seq2seq generator used to draft replies.
    # NOTE(review): the task description mentions Llama3, but the model loaded here is
    # google/flan-t5-base - confirm which one is intended.
    gen_model_name = "google/flan-t5-base"
    gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
    gen_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name)

    # Move model to GPU if available
    if torch.cuda.is_available():
        gen_model.to('cuda')

    def generate_followup(row, max_length=128):
        """Draft a short support reply for one review row.

        Reads 'cleaned_content' from ``row`` and picks a cluster label: the BERTopic
        cluster when present and not -1, otherwise the GSDMM cluster when present and
        not -1, otherwise the string 'general issue'.

        Returns the decoded generation, "N/A: Empty review content." for empty text,
        or an "Error generating followup: ..." string if generation raises.
        """
        review_text = row['cleaned_content'] if 'cleaned_content' in row else ''
        # Use BERTopic cluster if available, otherwise use GSDMM or a default
        cluster_label = row['bertopic_cluster'] if 'bertopic_cluster' in row and row['bertopic_cluster'] != -1 else \
                        (row['gsdmm_cluster'] if 'gsdmm_cluster' in row and row['gsdmm_cluster'] != -1 else 'general issue')

        # Nothing to respond to when the cleaned text is empty.
        if not review_text:
            return "N/A: Empty review content."

        prompt = f"""You are a helpful app support agent.
User complaint cluster: {cluster_label}
Review: "{review_text}"
Write a short empathetic, professional reply (2-3 sentences) acknowledging the issue, suggesting a fix, and inviting more details."""
        inputs = gen_tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)

        # Move inputs to GPU if available
        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k, v in inputs.items()}

        # Best effort: a failed generation is recorded as text instead of aborting the cell.
        try:
            outputs = gen_model.generate(**inputs, max_length=max_length, num_beams=4, early_stopping=True)
            return gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            return f"Error generating followup: {e}"


    # Apply follow-up generation to first 5 negative reviews for demonstration
    # (or to all of them when fewer than 5 exist). The result is aligned on the index,
    # so rows outside the sample get NaN in 'followup'.
    # This check is always true here because the enclosing branch requires a non-empty df_neg.
    if len(df_neg) > 0:
        df_neg['followup'] = df_neg.head(min(5, len(df_neg))).apply(generate_followup, axis=1)
        display(df_neg[['cleaned_content', 'bertopic_cluster', 'gsdmm_cluster', 'followup']].head(min(5, len(df_neg)))) # Show only the sampled rows
    else:
        print("No negative reviews to generate follow-ups for.")

    # 5️⃣ Save results
    # All negative reviews are written; only the sampled rows have a non-empty 'followup'.
    if len(df_neg) > 0:
        # Select relevant columns for saving
        cols_to_save = ['content', 'cleaned_content', 'score', 'bertopic_cluster', 'gsdmm_cluster', 'followup']
        # Filter for columns that actually exist in df_neg
        cols_to_save_existing = [col for col in cols_to_save if col in df_neg.columns]
        df_neg[cols_to_save_existing].to_csv("negative_review_followups.csv", index=False)
        print("\nSaved 'negative_review_followups.csv' with follow-ups for sample negative reviews.")
    else:
        print("\nNo negative reviews to save results for.")
else:
    print("\nSkipping follow-up generation and saving as no negative reviews were found.")