In [84]:
import nltk
# Make sure the 'punkt' package is present before sent_tokenize is used below;
# when it is already installed the call only reports that it is up to date.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Importing necessary libraries

In [85]:
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from transformers import pipeline
from collections import defaultdict

Loading the `distilbert-base-uncased-finetuned-sst-2-english` model through the Transformers `pipeline` API for sentiment analysis

In [86]:
# Build the sentiment classifier once so every later cell can reuse it.
# device=0 is forwarded to the pipeline (device index 0 — presumably the first GPU;
# confirm an accelerator is attached before running this cell).
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=0,
)


In [87]:
def preprocess_text(text):
    """Normalise raw review text for term matching.

    The text is lower-cased and every run of non-word characters
    (anything outside ``[A-Za-z0-9_]`` in regex terms, including whitespace)
    is replaced by a single space. Leading/trailing spaces produced by that
    replacement are kept.
    """
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)

In [88]:
# scikit-learn's English stop words extended with terms that show up in almost
# every review and therefore carry no aspect information.
custom_stop_words = [
    *text.ENGLISH_STOP_WORDS,
    "product",
    "review",
    "buy",
    "use",
    "amazon",
]

In [89]:
def detect_aspects_with_tfidf(reviews, max_features=20):
    """Derive candidate aspect terms from a collection of reviews.

    Each review's ``"text"`` field is normalised with ``preprocess_text`` and a
    ``TfidfVectorizer`` (using ``custom_stop_words``) is fitted on the result.
    The vectorizer's vocabulary is capped by ``max_features``; per the
    scikit-learn documentation that cap keeps the terms with the highest term
    frequency across the corpus, not the highest TF-IDF weight.

    Args:
        reviews: Iterable of mappings, each providing a ``"text"`` string.
        max_features: Maximum number of terms to keep (defaults to 20, the
            value that was previously hard-coded).

    Returns:
        The feature names reported by ``get_feature_names_out()``.

    Raises:
        KeyError: If a review has no ``"text"`` key.
        ValueError: Propagated from scikit-learn when no vocabulary can be
            built (e.g. no reviews, or only stop words).
    """
    processed_texts = [preprocess_text(review["text"]) for review in reviews]

    # Only the fitted vocabulary is needed here, so the document-term matrix
    # that fit_transform() would return is never built into a local variable.
    vectorizer = TfidfVectorizer(stop_words=custom_stop_words, max_features=max_features)
    vectorizer.fit(processed_texts)

    return vectorizer.get_feature_names_out()

In [90]:
def split_text_into_chunks(text, max_length=500):
    """Split ``text`` into sentence-aligned chunks of at most ``max_length`` tokens.

    Sentences come from ``sent_tokenize`` and are measured with the tokenizer of
    the module-level ``sentiment_analyzer``. Consecutive sentences are packed
    into the same chunk (joined by a single space) until adding the next one
    would exceed ``max_length``.

    Args:
        text: The raw text to split.
        max_length: Token budget per chunk. The default of 500 presumably
            leaves headroom under the classifier's input limit for special
            tokens — confirm against the model configuration.

    Returns:
        A list of non-empty chunk strings; an empty list when ``text`` contains
        no sentences. A single sentence longer than ``max_length`` is not split
        further and is returned as its own (oversized) chunk.
    """
    tokenizer = sentiment_analyzer.tokenizer
    chunks = []
    current_sentences = []
    current_token_count = 0

    for sentence in sent_tokenize(text):
        sentence_token_count = len(tokenizer.tokenize(sentence))

        # Flush only when there is something to flush: previously an overflow
        # with an empty buffer (e.g. an over-long first sentence) emitted "".
        if current_sentences and current_token_count + sentence_token_count > max_length:
            chunks.append(" ".join(current_sentences).strip())
            current_sentences = []
            current_token_count = 0

        current_sentences.append(sentence)
        # Keep a running total instead of re-tokenizing the whole chunk for
        # every sentence. Assumes tokenization does not merge tokens across the
        # space that separates sentences (true for whitespace-splitting
        # tokenizers) — verify if the tokenizer is swapped.
        current_token_count += sentence_token_count

    if current_sentences:
        chunks.append(" ".join(current_sentences).strip())

    return chunks

In [91]:
def _review_is_positive(review_text):
    """Classify a whole review: True if every chunk is POSITIVE, False otherwise.

    Returns None when the text yields no non-empty chunk to classify.
    """
    chunks = [chunk for chunk in split_text_into_chunks(review_text) if chunk.strip()]
    if not chunks:
        return None

    for chunk in chunks:
        # truncation=True is forwarded to the tokenizer so a chunk that is still
        # longer than the model's input limit is cut instead of raising.
        label = sentiment_analyzer(chunk, truncation=True)[0]["label"]
        if label != "POSITIVE":
            # One non-positive chunk decides the outcome; skip the rest.
            return False
    return True


def generate_tags_for_review(review, aspects):
    """Build the tagged record for one review.

    An aspect counts as mentioned when it appears as a whole word (or whole
    word sequence) in the normalised review text, so "art" no longer matches
    "starts". Sentiment is computed once for the entire review — not per
    aspect — and every mentioned aspect receives the same ``"good <aspect>"``
    or ``"bad <aspect>"`` tag. The review is "good" only if no chunk of it is
    classified as negative.

    Args:
        review: Mapping with ``"asin"``, ``"user_id"``, ``"title"``, ``"text"``
            and ``"rating"`` keys.
        aspects: Iterable of aspect terms (lower-case, as produced from the
            normalised text).

    Returns:
        A dict with ``asin``, ``review_id``, ``title``, ``text``, ``rating`` and
        ``tags`` (list of tag strings in the order of ``aspects``). ``tags`` is
        empty when no aspect is mentioned or nothing could be classified.

    Raises:
        KeyError: If ``review`` lacks one of the required keys.
    """
    result = {
        "asin": review["asin"],
        # NOTE(review): review_id is filled from user_id — confirm this is intended.
        "review_id": review["user_id"],
        "title": review["title"],
        "text": review["text"],
        "rating": review["rating"],
        "tags": []
    }

    # preprocess_text leaves only word characters separated by single spaces,
    # so padding both sides with a space turns substring search into a
    # whole-word match.
    padded_text = f" {preprocess_text(review['text'])} "
    mentioned_aspects = [aspect for aspect in aspects if f" {aspect} " in padded_text]
    if not mentioned_aspects:
        # Skip model inference entirely when there is nothing to tag.
        return result

    # The sentiment does not depend on the aspect, so run the model once per
    # review rather than once per matched aspect.
    is_positive = _review_is_positive(review["text"])
    if is_positive is None:
        return result

    polarity = "good" if is_positive else "bad"
    result["tags"] = [f"{polarity} {aspect}" for aspect in mentioned_aspects]

    return result


In [92]:
jsonl_file_path = "/kaggle/input/electronics-customer-review-limited/data2.jsonl"
output_file_path = "consolidated_product_tags_tfidf.json"

# Load one review per JSONL line. Blank lines (e.g. a trailing newline at the
# end of the file) are skipped instead of crashing json.loads, and the encoding
# is pinned so the result does not depend on the platform default.
with open(jsonl_file_path, 'r', encoding='utf-8') as file:
    reviews = [json.loads(line) for line in file if line.strip()]

# Detect important aspects using TF-IDF
aspects = detect_aspects_with_tfidf(reviews)

# Process each review and generate sentiment-based tags for each aspect
processed_reviews = [generate_tags_for_review(review, aspects) for review in reviews]

# Consolidate tags by product (asin); a set removes duplicates across reviews
product_tags = defaultdict(set)
for review in processed_reviews:
    product_tags[review["asin"]].update(review["tags"])

# Save the consolidated tags to a JSON file. Tags are sorted because set
# iteration order is arbitrary and would otherwise make the output differ
# between runs.
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    json.dump({asin: sorted(tags) for asin, tags in product_tags.items()}, outfile, indent=4)

print(f"Consolidated product tags have been saved to {output_file_path}.")

Consolidated product tags have been saved to /kaggle/working/consolidated_product_tags_tfidf.json.
