#### This notebook analyzes negative reviews by preprocessing the text, vectorizing it with TF-IDF, applying K-means clustering to group similar reviews, and identifying the key theme of each cluster through its highest-weighted term. This process can help us understand common customer complaints or sentiments expressed in negative reviews.

## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

## Download necessary files from NLTK

In [2]:
# word_tokenize relies on the Punkt tokenizer models. Older NLTK releases load
# them from the 'punkt' package, while recent releases (3.9+) look for
# 'punkt_tab' instead, so both are requested to keep the notebook runnable on
# either version. nltk.download() skips packages that are already up to date.
nltk.download('punkt')      # Tokenization (older NLTK releases)
nltk.download('punkt_tab')  # Tokenization (recent NLTK releases)
nltk.download('stopwords')  # Stop words removal

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Load the reviews dataset and preview it

In [5]:
# Read the raw reviews file into a DataFrame and preview its first rows
reviews_csv_path = '/content/reviews.csv'
reviews = pd.read_csv(reviews_csv_path)
reviews.head()

Unnamed: 0,content,score
0,I cannot open the app anymore,1
1,I have been begging for a refund from this app...,1
2,Very costly for the premium version (approx In...,1
3,"Used to keep me organized, but all the 2020 UP...",1
4,Dan Birthday Oct 28,1


## Step 1: Preprocess the negative reviews

In [7]:
# Keep only the text of the negative reviews, i.e. rows whose score is 1 or 2
is_negative = reviews['score'].isin([1, 2])
negative_reviews_tmp = reviews.loc[is_negative, 'content']
negative_reviews_tmp

0                            I cannot open the app anymore
1        I have been begging for a refund from this app...
2        Very costly for the premium version (approx In...
3        Used to keep me organized, but all the 2020 UP...
4                                      Dan Birthday Oct 28
                               ...                        
11940    I loved it until I realized that the very feat...
11941    Gave it a test run and tried out the notificat...
11942    Looks great but since installing, my device on...
11943    This app looked good until I had to purchase i...
11944                                             It's OK!
Name: content, Length: 4850, dtype: object

In [10]:
def preprocess_text(text):
  """Tokenize a review and drop stop words and non-alphabetic tokens.

  Args:
    text: Raw review text. Any non-string value (for example the NaN that
      pandas produces for an empty CSV field) is treated as an empty review.

  Returns:
    The surviving tokens joined by single spaces, with their original
    casing preserved. An empty string if nothing survives.
  """
  if not isinstance(text, str):
    return ""

  # Build the stop-word set once per call. Calling stopwords.words() inside
  # the comprehension would rebuild the list for every token and scan it
  # linearly on each membership test.
  english_stop_words = set(stopwords.words('english'))

  tokens = word_tokenize(text)
  # Removing stop words and non-alpha characters (special characters)
  filtered_tokens = [
    token for token in tokens
    if token.isalpha() and token.lower() not in english_stop_words
  ]
  return " ".join(filtered_tokens)

In [15]:
# Clean every negative review, one element at a time, with preprocess_text
negative_reviews_cleaned = negative_reviews_tmp.map(preprocess_text)
negative_reviews_cleaned

0                                         open app anymore
1                 begging refund app month nobody replying
2        costly premium version approx Indian Rupees pe...
3        Used keep organized UPDATES made mess things c...
4                                         Dan Birthday Oct
                               ...                        
11940    loved realized feature got download first plac...
11941    Gave test run tried notifications hear thing A...
11942    Looks great since installing device lasts half...
11943    app looked good purchase get week view everyti...
11944                                                   OK
Name: content, Length: 4850, dtype: object

In [14]:
# Wrap the cleaned reviews in a single-column DataFrame named "reviews"
preprocessed_reviews = negative_reviews_cleaned.to_frame(name="reviews")
preprocessed_reviews.head()

Unnamed: 0,reviews
0,open app anymore
1,begging refund app month nobody replying
2,costly premium version approx Indian Rupees pe...
3,Used keep organized UPDATES made mess things c...
4,Dan Birthday Oct


## Step 2: Vectorize the cleaned negative reviews using TF-IDF

In [19]:
# Learn the TF-IDF vocabulary from the cleaned reviews and encode each review
# as one row of a sparse document-term matrix
review_texts = preprocessed_reviews['reviews']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(review_texts)
tfidf_matrix.size  # number of stored entries in the sparse matrix

68707

## Step 3: Apply K-means clustering to tfidf_matrix

In [20]:
# Apply K-means clustering (store the model as clust_kmeans).
# n_init is pinned explicitly: leaving it unset raises a FutureWarning on
# scikit-learn versions where the default is about to change, and on newer
# versions the default 'auto' runs fewer initializations, which can change
# the resulting clusters even with a fixed random_state.
clust_kmeans = KMeans(n_clusters=5, n_init=10, random_state=500)
pred_labels = clust_kmeans.fit_predict(tfidf_matrix)

  super()._check_params_vs_input(X, default_n_init=10)


In [22]:
# Convert the predicted labels to a plain Python list called categories and
# attach them to the reviews as a 'category' column
categories = [int(label) for label in pred_labels]
preprocessed_reviews['category'] = categories
preprocessed_reviews

Unnamed: 0,reviews,category
0,open app anymore,0
1,begging refund app month nobody replying,0
2,costly premium version approx Indian Rupees pe...,4
3,Used keep organized UPDATES made mess things c...,2
4,Dan Birthday Oct,2
...,...,...
11940,loved realized feature got download first plac...,4
11941,Gave test run tried notifications hear thing A...,2
11942,Looks great since installing device lasts half...,2
11943,app looked good purchase get week view everyti...,0


## Step 4: For each unique cluster label, find the top term (the term with the highest summed TF-IDF weight)

In [25]:
# Get the feature names (terms) from the vectorizer
terms = vectorizer.get_feature_names_out()

# Cluster label of every review as an array, so membership can be tested
# with one vectorized comparison per cluster
category_labels = np.asarray(categories)

# One record per cluster; converted to a DataFrame in a later cell
topic_terms_list = []

for cluster in range(clust_kmeans.n_clusters):
  # Row positions of the reviews assigned to the current cluster
  cluster_indices = np.flatnonzero(category_labels == cluster)

  # Sum the tf-idf scores for each term in the cluster
  cluster_tfidf_sum = tfidf_matrix[cluster_indices].sum(axis=0)
  cluster_term_freq = np.asarray(cluster_tfidf_sum).ravel()

  # Position of the highest summed weight. argmax finds it directly without
  # sorting the whole vocabulary; on an exact tie it picks the first index.
  top_term_index = int(cluster_term_freq.argmax())

  # Record three fields for this cluster:
  # - category: label / cluster assigned from K-means
  # - term: the identified top term
  # - frequency: the term's summed tf-idf weight within the category
  topic_terms_list.append(
    {
      "category": cluster,
      "term": terms[top_term_index],
      "frequency": cluster_term_freq[top_term_index],
    }
  )

[0, 1, 6, 8, 9, 10, 12, 14, 20, 25, 27, 29, 30, 37, 48, 64, 66, 71, 82, 85, 94, 97, 105, 110, 118, 119, 123, 125, 128, 131, 132, 139, 141, 143, 151, 154, 159, 163, 164, 165, 167, 171, 176, 180, 183, 188, 189, 199, 200, 202, 203, 206, 208, 219, 221, 224, 225, 227, 228, 233, 234, 236, 247, 250, 269, 272, 279, 286, 288, 292, 296, 302, 304, 305, 306, 317, 319, 329, 333, 335, 336, 339, 351, 361, 370, 374, 380, 382, 383, 387, 390, 396, 397, 404, 408, 413, 414, 415, 416, 417, 434, 435, 441, 442, 446, 448, 449, 452, 454, 459, 468, 471, 472, 473, 482, 487, 492, 495, 502, 504, 514, 533, 536, 538, 547, 556, 558, 561, 572, 573, 577, 580, 582, 589, 590, 593, 596, 602, 605, 608, 623, 627, 632, 638, 647, 650, 656, 659, 666, 669, 674, 675, 679, 686, 700, 702, 724, 728, 729, 731, 733, 742, 743, 746, 754, 760, 762, 763, 770, 772, 776, 780, 782, 790, 791, 792, 794, 797, 800, 801, 808, 818, 820, 821, 823, 827, 832, 835, 837, 844, 847, 852, 853, 858, 859, 862, 863, 867, 868, 871, 877, 884, 886, 893, 895, 8

In [24]:
# Pandas DataFrame to store results from this step
topic_terms = pd.DataFrame(topic_terms_list)
topic_terms

Unnamed: 0,category,term,frequency
0,0,app,189.611081
1,1,tasks,58.099738
2,2,work,48.774678
3,3,good,37.940386
4,4,version,69.153458
