In [1]:
!pip install pandas transformers torch keybert nltk scikit-learn


Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers, keybert
Successfully installed keybert-0.8.5 sentence-transformers-3.2.0


In [2]:
import pandas as pd

# The input is a .jsonl file, so it is parsed one JSON record per line (lines=True).
file_path = '/kaggle/input/electronics-customer-review-limited/data2.jsonl'
df = pd.read_json(path_or_buf=file_path, lines=True)

# Preview the first five rows.
df.head(5)


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,3,Smells like gasoline! Going back!,First & most offensive: they reek of gasoline ...,[{'small_image_url': 'https://m.media-amazon.c...,B083NRGZMM,B083NRGZMM,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-07-18 22:58:37.948,0,True
1,1,Didn’t work at all lenses loose/broken.,These didn’t work. Idk if they were damaged in...,[],B07N69T6TM,B07N69T6TM,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2020-06-20 18:42:29.731,0,True
2,5,Excellent!,I love these. They even come with a carry case...,[],B01G8JO5F2,B01G8JO5F2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2018-04-07 09:23:37.534,0,True
3,5,Great laptop backpack!,I was searching for a sturdy backpack for scho...,[],B001OC5JKY,B001OC5JKY,AGGZ357AO26RQZVRLGU4D4N52DZQ,2010-11-20 18:41:35.000,18,True
4,5,Good Product,Good Product,[],B01G8JO5F2,B01G8JO5F2,AFDMZ4TRX3HXQQUGWAHJQTIF65BQ,2021-04-16 21:19:31.884,0,True


In [3]:
import re
import nltk
from nltk.corpus import stopwords
# Fetch the stop-word corpus (a no-op when it is already present), then keep the
# English list as a set so membership checks during preprocessing are cheap.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}
-
def preprocess_text(text):
    """Normalize a raw review into lowercase, stop-word-free tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lowercased and split on whitespace, and tokens present in the
    module-level ``stop_words`` set are dropped.

    Args:
        text: Raw review text. Non-string values (``None``, ``NaN`` from a
            missing field, numbers) are treated as an empty review.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings, so a single missing review
        # would otherwise abort the whole DataFrame .apply().
        return ''
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()
    return ' '.join(word for word in tokens if word not in stop_words)

# Clean every review; the raw text stays in 'text' so both can be compared.
df['processed_review'] = df['text'].map(preprocess_text)

# Show raw and cleaned text side by side for the first rows.
df.loc[:, ['text', 'processed_review']].head()


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,processed_review
0,First & most offensive: they reek of gasoline ...,first offensive reek gasoline sensitive allerg...
1,These didn’t work. Idk if they were damaged in...,work idk damaged shipping lenses loose somethi...
2,I love these. They even come with a carry case...,love even come carry case several sizes ear bu...
3,I was searching for a sturdy backpack for scho...,searching sturdy backpack school would allow c...
4,Good Product,good product


In [4]:
from transformers import BertTokenizer, BertModel
import torch

# Tokenizer and encoder are both loaded from the same 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')
model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')
-
def get_bert_embedding(text):
    """Embed text as the mean of BERT's last-layer token vectors.

    Uses the module-level ``tokenizer`` and ``model``. Input longer than 512
    tokens is truncated. Pooling is weighted by the attention mask so that
    padding positions (present when ``text`` is a list of strings of different
    lengths) do not dilute the average; for a single string the mask is all
    ones and the result is the plain token mean.

    Args:
        text: A string, or anything else the tokenizer accepts (e.g. a list
            of strings).

    Returns:
        A NumPy array with size-1 axes squeezed out: 1-D for a single string,
        one row per input for a list of several strings.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**encoded)

    hidden = outputs.last_hidden_state
    # Broadcast the (batch, tokens) mask over the hidden dimension.
    mask = encoded['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    # clamp guards against division by zero for a fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = ((hidden * mask).sum(dim=1) / token_counts).squeeze()
    return embeddings.numpy()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [5]:
import numpy as np

# Embed each cleaned review; every row gets its own call to get_bert_embedding.
df['bert_embedding'] = df['processed_review'].apply(get_bert_embedding)

# Stack the per-row vectors into a single array with one row per review.
embeddings_matrix = np.stack(df['bert_embedding'].to_numpy())
print(embeddings_matrix.shape)

(27601, 768)


In [7]:
from sklearn.cluster import KMeans

num_clusters = 5

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' (a single run for k-means++ init), so leaving it unset gives different
# clusters on different library versions even with a fixed random_state.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(embeddings_matrix)

# labels_ holds one cluster id per row of embeddings_matrix, in row order.
df['cluster'] = kmeans.labels_

df[['text', 'cluster']].head()




Unnamed: 0,text,cluster
0,First & most offensive: they reek of gasoline ...,1
1,These didn’t work. Idk if they were damaged in...,2
2,I love these. They even come with a carry case...,1
3,I was searching for a sturdy backpack for scho...,1
4,Good Product,0


In [9]:
from keybert import KeyBERT

# Keyword extractor backed by the 'bert-base-uncased' checkpoint.
kw_model = KeyBERT(model='bert-base-uncased')

def extract_keywords(review):
    """Extract 1- and 2-word key phrases from a review with ``kw_model``.

    Args:
        review: Review text. Anything that is not a ``str`` yields no keywords.

    Returns:
        Whatever ``kw_model.extract_keywords`` returns for the text, or an
        empty list for non-string input.
    """
    # Guard clause: skip non-text values instead of passing them to the model.
    if not isinstance(review, str):
        return []
    return kw_model.extract_keywords(
        review,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
    )

# Run keyword extraction on the cleaned text of every review.
df['keywords'] = df['processed_review'].map(extract_keywords)

# Show the original text next to its extracted keywords.
df.loc[:, ['text', 'keywords']].head()


Unnamed: 0,text,keywords
0,First & most offensive: they reek of gasoline ...,"[(adapter tripod, 0.5076), (photograph unplann..."
1,These didn’t work. Idk if they were damaged in...,"[(came loose, 0.5102), (lenses loose, 0.494), ..."
2,I love these. They even come with a carry case...,"[(charger fits, 0.591), (bud inserts, 0.5767),..."
3,I was searching for a sturdy backpack for scho...,"[(backpack flops, 0.5696), (grabbage plan, 0.5..."
4,Good Product,"[(good product, 1.0), (product, 0.8002), (good..."


In [12]:
# Leave the per-row embedding arrays out of the CSV: to_csv would write each
# array as its printed text form, which bloats the file and cannot be read back
# as numbers. drop() returns a copy, so df itself keeps the column in memory;
# errors='ignore' keeps this cell working if the column is absent.
df.drop(columns=['bert_embedding'], errors='ignore').to_csv('electronics-final.csv', index=False)

In [13]:
import ast
import json

import pandas as pd

# Read the CSV from the same relative path it was written to above.
# ASINs are forced to text so an all-digit id is not parsed as an integer
# (which would drop leading zeros).
df = pd.read_csv('electronics-final.csv', dtype={'asin': str})


def _parse_keywords(raw):
    """Turn the CSV text form of a keyword list back into [[phrase, score], ...]."""
    if not isinstance(raw, str) or not raw.strip():
        return []
    try:
        return [[phrase, score] for phrase, score in ast.literal_eval(raw)]
    except (ValueError, SyntaxError, TypeError):
        # Unparseable cell: keep the raw text rather than lose the data.
        return raw


# Build one record per row. An empty processed review is read back from CSV as
# NaN, which json would emit as the invalid token NaN, so it is restored to ''.
data_to_save = [
    {
        "asin": asin,
        "cluster_label": int(cluster),
        "keywords": _parse_keywords(keywords),
        "processed sentence": sentence,
    }
    for asin, cluster, keywords, sentence in zip(
        df["asin"],
        df["cluster"],
        df["keywords"],
        df["processed_review"].fillna(""),
    )
]

pretty_json = json.dumps(data_to_save, indent=4)

with open('electronics-final.json', 'w', encoding='utf-8') as json_file:
    json_file.write(pretty_json)
