<a href="https://colab.research.google.com/github/SujaaShri/Product-Feature-Extractor/blob/main/Feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from collections import defaultdict, Counter
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
# Fetch the Punkt sentence-tokenizer data that `sent_tokenize` (imported above)
# relies on. Both resource names are requested.
# NOTE(review): presumably 'punkt_tab' is what recent NLTK releases look up and
# 'punkt' is kept for older ones — confirm against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('punkt')


In [None]:
from google.colab import files
# Open the Colab file picker. `uploaded` is keyed by the uploaded file names;
# the first key is used as the path handed to pandas below.
uploaded = files.upload()
if not uploaded:
    # Cancelling the picker would otherwise surface as a bare StopIteration
    # from next(iter(uploaded)), which says nothing about the real cause.
    raise ValueError("No file was uploaded; please select the reviews CSV and re-run this cell.")

# Load CSV with expected columns: 'productId', 'Title', 'Text'
raw_df = pd.read_csv(next(iter(uploaded)))

# Keep only the columns the pipeline uses and drop rows missing any of them.
# The unfiltered frame stays available as `raw_df` for inspection.
df = raw_df[['productId', 'Title', 'Text']].dropna()


In [None]:
def split_into_sentences(review):
    """Break one review's text into sentences.

    Args:
        review: The review text to segment.

    Returns:
        Whatever `sent_tokenize` produces for `review` (its list of
        sentence strings), unchanged.
    """
    sentences = sent_tokenize(review)
    return sentences


In [None]:
# Name the checkpoint explicitly instead of relying on the library's implicit
# default for the "sentiment-analysis" task. This is the SST-2 DistilBERT model
# the default resolves to; pinning it keeps results reproducible and guarantees
# the POSITIVE/NEGATIVE label set that `get_sentiment` matches against.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

def get_sentiment(sentences):
    """Partition sentences into positive and negative groups.

    Args:
        sentences: Iterable of sentence strings (e.g. the output of
            `split_into_sentences`).

    Returns:
        A `(pos, neg)` tuple of lists holding the sentences labelled
        'positive' and 'negative' respectively, in their original order.
        Blank sentences are skipped, and sentences with any other label
        are dropped.
    """
    pos, neg = [], []

    # Blank strings carry no sentiment; skipping them avoids pointless
    # model calls and arbitrary labels.
    texts = [s for s in sentences if s and s.strip()]
    if not texts:
        return pos, neg

    # One pipeline call for the whole list. truncation=True clips inputs that
    # exceed the model's maximum sequence length; without it an over-long
    # "sentence" (e.g. an unpunctuated review) crashes the forward pass.
    results = sentiment_analyzer(texts, truncation=True)

    for sentence, result in zip(texts, results):
        label = result['label'].lower()
        if label == 'positive':
            pos.append(sentence)
        elif label == 'negative':
            neg.append(sentence)
    return pos, neg

def _sentiment_columns(review_text):
    """Return a two-element Series: the review's positive and negative sentences."""
    review_sentences = split_into_sentences(review_text)
    return pd.Series(get_sentiment(review_sentences))


# Each review expands into two cells, which land in the two new columns.
df[['pos_sents', 'neg_sents']] = df['Text'].apply(_sentiment_columns)


In [None]:
# Checkpoint shared by the tokenizer and the seq2seq model so the two cannot
# drift apart.
FLAN_T5_CHECKPOINT = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(FLAN_T5_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_T5_CHECKPOINT)


In [None]:
def extract_features(sentences, max_length=64):
    """Ask the seq2seq model which product features each sentence mentions.

    Args:
        sentences: Iterable of sentence strings.
        max_length: Value forwarded to `model.generate` as `max_length`
            (defaults to 64, the previously hard-coded value).

    Returns:
        A flat list of lower-cased, whitespace-stripped feature strings,
        in sentence order. Within one sentence each feature appears at
        most once; the same feature may repeat across sentences.
    """
    features = []
    for sentence in sentences:
        # A blank sentence would leave only the instructions (including the
        # example features) in the prompt, so there is nothing real to extract.
        if not sentence or not sentence.strip():
            continue

        prompt = f"""Extract only product features (like 'battery life', 'design', 'price', etc.) mentioned in the following sentence. Do not include opinions or full sentences.

Sentence: "{sentence}"

Features:"""
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs, max_length=max_length)
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # The model answers with a comma-separated list; normalise each item
        # and drop empty fragments.
        candidates = [f.strip().lower() for f in result.split(',') if f.strip()]

        # De-duplicate per sentence (order preserved) so a repetitive
        # generation cannot inflate a feature's frequency count downstream.
        features.extend(dict.fromkeys(candidates))
    return features


In [None]:
# Run feature extraction over both sentence groups; each source column of
# sentence lists yields a matching column of feature lists.
for sentence_col, feature_col in (('pos_sents', 'pos_features'),
                                  ('neg_sents', 'neg_features')):
    df[feature_col] = df[sentence_col].apply(extract_features)


In [None]:
# Per-product accumulator: the product title plus every positive and negative
# feature mention collected across that product's reviews.
summary = defaultdict(lambda: {'title': '', 'pos': [], 'neg': []})

review_columns = zip(df['productId'], df['Title'], df['pos_features'], df['neg_features'])
for pid, title, liked, disliked in review_columns:
    entry = summary[pid]
    entry['title'] = title  # the last review seen for a product supplies its title
    entry['pos'].extend(liked)
    entry['neg'].extend(disliked)

# Report the three most frequently mentioned features on each side.
for pid, feats in summary.items():
    pos_counts = Counter(feats['pos'])
    neg_counts = Counter(feats['neg'])
    pos_top = [name for name, _count in pos_counts.most_common(3)]
    neg_top = [name for name, _count in neg_counts.most_common(3)]
    print(f"Product: {feats['title']}")
    print(f"  Most Appreciated Features: {', '.join(pos_top) or 'None'}")
    print(f"  Least Appreciated Features: {', '.join(neg_top) or 'None'}\n")
