In [38]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [39]:
# Download the NLTK resources this notebook needs (packages that are already
# present are only reported as up to date, as the log output below shows):
# punkt -> tokenization
# stopwords -> stop-word removal
# NOTE(review): recent NLTK releases may look for "punkt_tab" rather than
# "punkt" when tokenizing — confirm against the installed version.
nltk.download("punkt")
# Last expression of the cell, so its return value is what the cell displays.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/repl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/repl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
# Read the reviews CSV from the working directory and show its first rows
reviews = pd.read_csv(filepath_or_buffer="reviews.csv")
reviews.head(n=5)

Unnamed: 0,content,score
0,I cannot open the app anymore,1
1,I have been begging for a refund from this app...,1
2,Very costly for the premium version (approx In...,1
3,"Used to keep me organized, but all the 2020 UP...",1
4,Dan Birthday Oct 28,1


In [41]:
# Keep only the review text of rows whose score is 2 or lower
reviews_negative = reviews.loc[reviews["score"] <= 2, "content"]

In [42]:
# Create a function that tokenizes reviews and removes non-alpha characters and stopwords

# Build the English stop-word set once, up front. Calling stopwords.words()
# inside the token filter repeats the lookup for every single token and makes
# each membership test a linear scan over a list.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Tokenize a review and drop non-alphabetic tokens and English stop words.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example NaN read from an
        empty CSV field) is treated as an empty review.

    Returns
    -------
    str
        The surviving tokens, in their original order and casing, joined by
        single spaces. Empty when nothing survives the filter.
    """
    if not isinstance(text, str):
        # word_tokenize expects text; missing values become an empty document.
        return ""
    filtered_tokens = [
        token
        for token in word_tokenize(text)
        # Stop words are matched case-insensitively; the kept token is unchanged.
        if token.isalpha() and token.lower() not in _ENGLISH_STOPWORDS
    ]
    return " ".join(filtered_tokens)

In [43]:
# Clean every negative review element-wise with preprocess_text
reviews_negative_filtered = reviews_negative.map(preprocess_text)

In [44]:
# Wrap the cleaned reviews in a dataframe with a single "review" column
preprocessed_reviews = pd.DataFrame(dict(review=reviews_negative_filtered))

In [45]:
# Turn the cleaned reviews into a TF-IDF document-term matrix; the fitted
# vectorizer is kept because its vocabulary is needed later
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(raw_documents=preprocessed_reviews["review"])

In [46]:
# Group the reviews into five clusters with K-means on the TF-IDF matrix
# (random_state is pinned to 42)
clust_kmeans = KMeans(random_state=42, n_clusters=5)
pred_labels = clust_kmeans.fit(tfidf_matrix).labels_

In [47]:
# Store predicted labels as categories in the dataframe.
# `categories` is the label array converted to a plain Python list; the same
# list is reused further down when collecting the rows of each cluster.
categories = pred_labels.tolist()
# Adds (or overwrites) the "category" column in place, one label per review row.
preprocessed_reviews["category"] = categories

In [48]:
# Get the feature names (terms) from the fitted vectorizer; they are looked up
# by column index when reporting the top term of each cluster.
terms = vectorizer.get_feature_names_out()

In [49]:
# Find the top term for each cluster label: the vocabulary term with the
# largest summed TF-IDF weight over the cluster's reviews. The value is stored
# under 'frequency', but it is a TF-IDF total rather than a raw word count.
topic_terms_list = []

# Convert the labels once so each cluster's rows can be picked with a
# vectorised comparison instead of re-enumerating the whole list per cluster.
cluster_labels = np.asarray(categories)

for cluster in range(clust_kmeans.n_clusters):
    # Row positions (in tfidf_matrix order) of the reviews assigned to this cluster.
    cluster_indices = np.flatnonzero(cluster_labels == cluster)
    # The per-term column sums come back 2-D (1 x n_terms); flatten to 1-D.
    cluster_term_freq = np.asarray(tfidf_matrix[cluster_indices].sum(axis=0)).ravel()
    # argmax finds the maximum in a single pass; sorting every term is not needed.
    # On an exact tie it returns the first maximal index.
    top_term_index = cluster_term_freq.argmax()
    topic_terms_list.append(
        {
            'category': cluster,
            'term': terms[top_term_index],
            'frequency': cluster_term_freq[top_term_index],
        }
    )

In [50]:
# Collect the per-cluster records into a dataframe (one row per cluster)
topic_terms = pd.DataFrame(data=topic_terms_list)

In [51]:
# Show the top term found for each cluster as plain text
print(topic_terms)

   category      term   frequency
0         0   account   50.977244
1         1   version   67.720235
2         2  calendar   76.045613
3         3       app  182.736681
4         4      good   37.173626
