In [None]:
# Install required libraries for PyTorch and Hugging Face Transformers
!pip install torch torchvision torchaudio transformers


In [None]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from collections import defaultdict, Counter
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Download the sentence tokenizer resources, requesting both names in order
for punkt_resource in ('punkt_tab', 'punkt'):
    nltk.download(punkt_resource)


In [None]:
# Upload CSV file from local system (requires the Google Colab runtime)
from google.colab import files
uploaded = files.upload()

# `uploaded` maps file names to contents; when nothing was uploaded (e.g. the
# dialog was cancelled) it is empty, and next(iter(...)) below would otherwise
# fail with an unhelpful bare StopIteration.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")

# Read the first uploaded CSV and keep only relevant columns,
# dropping every row in which any of those columns is missing
df = pd.read_csv(next(iter(uploaded)))
df = df[['productId', 'Title', 'Text']].dropna()


In [None]:
# Function to split review text into individual sentences
def split_into_sentences(review):
    """Return the list of sentences that ``sent_tokenize`` finds in ``review``."""
    sentences = sent_tokenize(review)
    return sentences


In [None]:
# Load Hugging Face sentiment analysis pipeline.
# The checkpoint is named explicitly instead of relying on the library's
# implicit default, so the label names the rest of the notebook compares
# against ('POSITIVE' / 'NEGATIVE', lower-cased) cannot change silently if the
# default model changes in a future release.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

# Function to classify each sentence as positive or negative
def get_sentiment(sentences):
    """Partition sentences by the label predicted by ``sentiment_analyzer``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A tuple ``(pos, neg)`` of lists holding the sentences labelled
        positive and negative respectively, in input order. Sentences whose
        label is neither 'positive' nor 'negative' (case-insensitive) are
        dropped.
    """
    buckets = {'positive': [], 'negative': []}
    for sentence in sentences:
        # truncation=True clips over-long inputs (e.g. an unpunctuated review
        # that tokenizes as one huge "sentence") to the model's maximum length
        # instead of letting the model raise on them.
        result = sentiment_analyzer(sentence, truncation=True)[0]
        bucket = buckets.get(result['label'].lower())
        if bucket is not None:
            bucket.append(sentence)
    return buckets['positive'], buckets['negative']


In [None]:
# Apply sentiment analysis to each review's sentences.
# Each review is classified once and the (pos, neg) pairs are collected in a
# plain list; the two columns are then attached explicitly. This avoids
# building a pd.Series per row and relying on apply() to expand the result
# into a two-column frame, and it also works when the frame has no rows.
sentiment_pairs = [get_sentiment(split_into_sentences(text)) for text in df['Text']]
df['pos_sents'] = pd.Series([pos for pos, _ in sentiment_pairs], index=df.index, dtype=object)
df['neg_sents'] = pd.Series([neg for _, neg in sentiment_pairs], index=df.index, dtype=object)


In [None]:
# Load the FLAN-T5 tokenizer and seq2seq model used for feature extraction;
# both are loaded from the same checkpoint, tokenizer first.
tokenizer, model = (
    loader.from_pretrained("google/flan-t5-base")
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)


In [None]:
# Function to extract short product features using FLAN-T5
def extract_features(sentences):
    """Ask the FLAN-T5 model for the product features mentioned in each sentence.

    Every sentence is placed into a few-shot prompt, the model's answer is
    decoded, split on commas, and each non-blank piece is stripped and
    lower-cased. Pieces from all sentences are returned in one flat list, in
    order; an empty ``sentences`` yields an empty list.
    """
    collected = []
    for sentence in sentences:
        prompt = f"""
Extract short product features like 'battery life', 'design', 'price', 'smell' from the sentence below.
Avoid opinions, full sentences, names, or reviews.

Example 1:
Sentence: "The design is beautiful and the battery life lasts all day."
Features: design, battery life

Example 2:
Sentence: "It’s too expensive and doesn’t clean well."
Features: price, cleaning performance

Sentence: "{sentence}"
Features:"""
        encoded = tokenizer(prompt, return_tensors="pt", truncation=True)
        generated = model.generate(**encoded, max_length=64)
        answer = tokenizer.decode(generated[0], skip_special_tokens=True)
        # Split the comma-separated answer, drop blank pieces, normalise case
        pieces = (piece.strip() for piece in answer.split(','))
        collected.extend(piece.lower() for piece in pieces if piece)
    return collected


In [None]:
# Run feature extraction over the positive sentences, then the negative ones,
# storing each result in its matching *_features column
for sentence_col, feature_col in (('pos_sents', 'pos_features'),
                                  ('neg_sents', 'neg_features')):
    df[feature_col] = df[sentence_col].apply(extract_features)


In [None]:
# Per-product summary: the title plus the pooled positive/negative features
summary = defaultdict(lambda: dict(title='', pos=[], neg=[]))

# Walk the rows column-wise and fold each review into its product's entry;
# the title seen on the last row of a product is the one that is kept
for pid, title, pos_feats, neg_feats in zip(
    df['productId'], df['Title'], df['pos_features'], df['neg_features']
):
    entry = summary[pid]
    entry['title'] = title
    entry['pos'].extend(pos_feats)
    entry['neg'].extend(neg_feats)


In [None]:
# Report, per product, the three most frequent positive and negative features
for product in summary.values():
    # Comma-joined top-3 feature names per polarity, or 'None' when there are none
    top_features = {
        polarity: ', '.join(
            name for name, _count in Counter(product[polarity]).most_common(3)
        ) or 'None'
        for polarity in ('pos', 'neg')
    }

    # Print the final summarized output
    print(f"Product: {product['title']}")
    print(f"  Most Appreciated Features: {top_features['pos']}")
    print(f"  Least Appreciated Features: {top_features['neg']}\n")
