In [1]:
import pandas as pd                                       # Data handling[1]
import numpy as np                                        # Numerical operations[1]
import spacy                                              # NLP and aspect extraction[2]
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer  # Overall sentiment[3]
from textblob import TextBlob                            # Fallback sentiment[4]
from transformers import pipeline                        # Pretrained ABSA model[5]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import json
import gzip
from tqdm import tqdm  # For progress tracking (optional)

file_paths = [
    "Data/Reviews with images/Cell_Phones_and_Accessories_5.json.gz",
    "Data/Reviews with images/Magazine_Subscriptions_5.json.gz",
    "Data/Reviews with images/Appliances_5 (1).json.gz",
    "Data/Reviews with images/All_Beauty_5 (1).json.gz",
    "Data/Reviews with images/AMAZON_FASHION_5 (1).json.gz",
]

def process_file(file_path, sample_size=1000):
    """Process a single file with memory-efficient streaming"""
    valid_rows = []
    try:
        # Read file line-by-line without loading entire file
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            for line in f:
                try:
                    record = json.loads(line)
                    # Check if images exist and are non-empty
                    if isinstance(record.get('image'), list) and record['image']:
                        valid_rows.append(record)
                except json.JSONDecodeError:
                    continue
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return pd.DataFrame()
    
    # Sample if we have enough valid rows
    if len(valid_rows) > sample_size:
        return pd.DataFrame(valid_rows).sample(n=sample_size, random_state=42)
    return pd.DataFrame(valid_rows)

# Process files incrementally to save memory
df_chunks = []
for path in tqdm(file_paths, desc="Processing files"):
    df_chunk = process_file(path)
    if not df_chunk.empty:
        df_chunks.append(df_chunk)

# Combine results if we have data
if df_chunks:
    df_with_images = pd.concat(df_chunks, ignore_index=True)
    print("\nFinal shape:", df_with_images.shape)
    print(df_with_images[['reviewerID', 'asin', 'reviewText', 'image']].head())
else:
    print("No valid data found in any files")


Processing files: 100%|██████████| 5/5 [00:05<00:00,  1.00s/it]


Final shape: (2033, 12)
       reviewerID        asin  \
0  A2PFW17GTSAY2K  B00L8754LE   
1  A1INKWYN6XXQ6P  B00SZ3R5HA   
2  A22PKZZK5DSONS  B00W847AA4   
3  A1OU2FW26L47VV  B0193XB1W0   
4  A3D09D1C6DR1QW  B00MYYVQOY   

                                          reviewText  \
0  Was good for 9 months then 1 corner cracked th...   
1  I have never had a selfie stick, so I was exci...   
2  This is one of my favorite cases outside of th...   
3  My stock S5 battery wouldn't get me through a ...   
4  Alright, I'm attempting to be as thurough as I...   

                                               image  
0  [https://images-na.ssl-images-amazon.com/image...  
1  [https://images-na.ssl-images-amazon.com/image...  
2  [https://images-na.ssl-images-amazon.com/image...  
3  [https://images-na.ssl-images-amazon.com/image...  
4  [https://images-na.ssl-images-amazon.com/image...  





In [3]:
print(df_with_images['reviewText'][1])

I have never had a selfie stick, so I was excited to get this once and try it out.  When it came, I followed the directions to hook it up and I extend it and started taking pictures. My teenager said I was crazy taking so many pictures. But I was having so much fun trying this out. The wire that you plug into your phone, I couldn't figure out where to put it when it was contracted back down. So it was just hanging free. That was my only concern with it. But that wasn't such a big deal. I still had so much fun playing with it. It really was fun to extend it all the way out and tilt the phone up and down to take a bunch of pictures. It was so much fun. It folded down and I was able to put it in my purse. I have a Galaxy Note with a cover on it and it still fit in the selfie stick. I thought it might not because of my case. But it did and I didn't have to take my phone out of the case. I was really happy that I didn't have to take it out of it's case. I know a lot of places banned selfie 

In [4]:
# Print rows where 'reviewText' is not a string
non_string_mask = ~df_with_images['reviewText'].apply(lambda x: isinstance(x, str))
non_string_rows = df_with_images[non_string_mask]

print("Rows where 'reviewText' is NOT a string:")
print(non_string_rows[['reviewerID', 'asin', 'reviewText', 'image']])
print(f"\nTotal non-string 'reviewText' entries: {len(non_string_rows)}")
print(non_string_rows['reviewText'].apply(type))


Rows where 'reviewText' is NOT a string:
          reviewerID        asin reviewText  \
220   A1RHX83VZBU3ET  B01CE4BPNU        NaN   
1928  A1CKPC88NHMYGR  B001IKJOLW        NaN   
1941  A1CKPC88NHMYGR  B0058YEJ5K        NaN   
1954  A1CKPC88NHMYGR  B0014F7B98        NaN   
1967  A1CKPC88NHMYGR  B009MA34NY        NaN   
1980  A1CKPC88NHMYGR  B0092UF54A        NaN   
1993  A1CKPC88NHMYGR  B005AGO4LU        NaN   
2008  A1CKPC88NHMYGR  B010RRWKT4        NaN   
2021  A1CKPC88NHMYGR  B014IBJKNO        NaN   

                                                  image  
220   [https://images-na.ssl-images-amazon.com/image...  
1928  [https://images-na.ssl-images-amazon.com/image...  
1941  [https://images-na.ssl-images-amazon.com/image...  
1954  [https://images-na.ssl-images-amazon.com/image...  
1967  [https://images-na.ssl-images-amazon.com/image...  
1980  [https://images-na.ssl-images-amazon.com/image...  
1993  [https://images-na.ssl-images-amazon.com/image...  
2008  [https://images-na

In [5]:
vader = SentimentIntensityAnalyzer()                      # VADER analyzer[3]

def overall_sentiment(text):
    scores = vader.polarity_scores(text)
    polarity = scores["compound"]                        # Range [-1,1]
    subjectivity = TextBlob(text).sentiment.subjectivity # Range [0,1]
    return pd.Series({"polarity": polarity, "subjectivity": subjectivity})

# Apply to all reviews
df_with_images = df_with_images[df_with_images['reviewText'].apply(lambda x: isinstance(x, str))]
sent_scores = df_with_images["reviewText"].apply(overall_sentiment)
df2 = pd.concat([df_with_images, sent_scores], axis=1)                # Append features[1]
df2.head()  # Display first few rows

Unnamed: 0,image,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,style,polarity,subjectivity
0,[https://images-na.ssl-images-amazon.com/image...,2.0,,True,"09 14, 2015",A2PFW17GTSAY2K,B00L8754LE,Family Man,Was good for 9 months then 1 corner cracked th...,Was good for 9 months then 1 corner cracked th...,1442188800,"{'Color:': ' Red', 'Package Type:': ' Standard...",0.468,0.527173
1,[https://images-na.ssl-images-amazon.com/image...,4.0,,False,"02 22, 2016",A1INKWYN6XXQ6P,B00SZ3R5HA,KidShufflingMom,"I have never had a selfie stick, so I was exci...",Fun selfie stick and fits in my purse.,1456099200,{'Color:': ' Jet Black - WIRED'},0.9903,0.444298
2,[https://images-na.ssl-images-amazon.com/image...,5.0,9.0,False,"05 17, 2015",A22PKZZK5DSONS,B00W847AA4,JC,This is one of my favorite cases outside of th...,"Answers the age old question "" Is that a phone...",1431820800,,0.9959,0.534387
3,[https://images-na.ssl-images-amazon.com/image...,4.0,10.0,True,"08 31, 2016",A1OU2FW26L47VV,B0193XB1W0,cjr,My stock S5 battery wouldn't get me through a ...,Lots of battery life!,1472601600,{'Color:': ' Black'},0.8499,0.379821
4,[https://images-na.ssl-images-amazon.com/image...,5.0,21.0,True,"05 15, 2015",A3D09D1C6DR1QW,B00MYYVQOY,Tabz,"Alright, I'm attempting to be as thurough as I...",I seriously loved this product!,1431648000,,0.9975,0.51826


In [6]:
nlp = spacy.load("en_core_web_sm")               

def extract_aspects(text):
    doc = nlp(text)
    return [chunk.text.lower().strip() for chunk in doc.noun_chunks if len(chunk.text.split()) <= 3]

# Build global aspect list from top frequent chunks
all_chunks = df_with_images["reviewText"].apply(extract_aspects).explode()
top_aspects = all_chunks.value_counts().head(50).index.tolist()  # Top 50 aspects[6]

In [7]:
import spacy
import pandas as pd
from collections import Counter

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Define stopwords and uninformative terms to exclude
stopwords = nlp.Defaults.stop_words
custom_exclusions = {
    'i', 'it', 'you', 'that', 'this', 'they', 'me', 'we', 'them', 'which', 
    'who', 'something', 'anything', 'everything', 'someone', 'anyone', 'some',
    'all', 'a', 'an', 'the', 'what', 'there', 'here', 'other', 'others'
}

def extract_aspects(text):
    """Extract meaningful noun chunks from text"""
    doc = nlp(text)
    aspects = []
    
    for chunk in doc.noun_chunks:
        # Filter by length and content quality
        tokens = [token.text.lower() for token in chunk]
        chunk_text = chunk.text.lower().strip()
        
        # Skip if any of these conditions are true:
        if (
            len(chunk) > 3 or  # Too long
            chunk_text in custom_exclusions or  # In exclusion list
            all(token in stopwords for token in tokens) or  # All stopwords
            any(token in custom_exclusions for token in tokens)  # Contains excluded terms
        ):
            continue
            
        aspects.append(chunk_text)
        
    return aspects

# Build global aspect list
all_chunks = df_with_images["reviewText"].apply(extract_aspects).explode()

# Filter and get top aspects
aspect_counts = all_chunks.value_counts()
meaningful_aspects = [
    aspect for aspect in aspect_counts.index
    if not any(excl_word in aspect for excl_word in custom_exclusions)
]
top_aspects = aspect_counts.loc[meaningful_aspects].head(50).index.tolist()

print("Top 50 meaningful aspects:")
print(top_aspects)


Top 50 meaningful aspects:
['dryer', 'rods', 'my phone', 'works', 'lots', 'town', 'my house', 'reference', 'my vent', 'common sense', 'brush', 'one rod', 'rod', 'number', 'more rods', 'gobs', '24 foot', 'our vent', 'one cycle', 'use', '=', 'phone', 'people', 'pros', 'dust', 'photos', 'phones', 'cons', 'buttons', 'money', 'my feet', 'plenty', 'usb', 'color', 'work', 'my screen', 'order', 'my pocket', 'my purse', 'edge', 'no problem', 'dp', 'course', 'screen', 'my nexus', '100%', 'bulk', 'products', '-', 'photo']


In [8]:
print("\nTop 50 aspects:")
for aspect in top_aspects:
    print(aspect)


Top 50 aspects:
dryer
rods
my phone
works
lots
town
my house
reference
my vent
common sense
brush
one rod
rod
number
more rods
gobs
24 foot
our vent
one cycle
use
=
phone
people
pros
dust
photos
phones
cons
buttons
money
my feet
plenty
usb
color
work
my screen
order
my pocket
my purse
edge
no problem
dp
course
screen
my nexus
100%
bulk
products
-
photo


In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict

# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load ABSA model and tokenizer (with error handling)
try:
    model_name = "yangheng/deberta-v3-base-absa-v1.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)  # Disable fast tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model: {e}")
    # Fallback to smaller model
    model_name = "yangheng/deberta-v3-small-absa-v1.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    print("Using smaller model as fallback")

# Batch processing function for efficiency
def batch_process_aspect_sentiment(text_aspect_pairs, batch_size=8):
    """
    Process aspect sentiment in batches for efficiency
    Returns: dict of {(text, aspect): (label, score)}
    """
    results = {}
    num_batches = int(np.ceil(len(text_aspect_pairs) / batch_size))
    
    for i in tqdm(range(num_batches), desc="Processing ABSA batches"):
        batch = text_aspect_pairs[i*batch_size : (i+1)*batch_size]
        formatted_inputs = [f"[CLS] {text} [SEP] {aspect} [SEP]" for text, aspect in batch]
        
        try:
            inputs = tokenizer(
                formatted_inputs,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt"
            ).to(device)
            
            with torch.no_grad():
                outputs = model(**inputs)
            
            probs = torch.softmax(outputs.logits, dim=1)
            preds = torch.argmax(probs, dim=1)
            
            for j, (text, aspect) in enumerate(batch):
                label_id = preds[j].item()
                label = model.config.id2label[label_id]
                score = probs[j, label_id].item()
                results[(text, aspect)] = (label, score)
                
        except RuntimeError as e:
            print(f"Batch {i} failed: {e}")
            # Fallback to individual processing
            for text, aspect in batch:
                try:
                    inputs = tokenizer(
                        f"[CLS] {text} [SEP] {aspect} [SEP]",
                        return_tensors="pt",
                        truncation=True,
                        max_length=512
                    ).to(device)
                    
                    with torch.no_grad():
                        outputs = model(**inputs)
                    
                    probs = torch.softmax(outputs.logits, dim=1)
                    label_id = torch.argmax(probs).item()
                    label = model.config.id2label[label_id]
                    score = probs[0, label_id].item()
                    results[(text, aspect)] = (label, score)
                    
                except Exception as e:
                    print(f"Failed on ({text[:20]}..., {aspect}): {e}")
                    results[(text, aspect)] = ('Neutral', 0.0)
    
    return results

# Optimized ABSA feature extraction
def generate_absa_features(df, top_aspects, batch_size=32):
    """
    Generate ABSA features with aspect filtering and batch processing
    """
    # Step 1: Collect all (text, aspect) pairs to process
    text_aspect_pairs = []
    review_aspect_map = defaultdict(list)
    
    # First pass: Extract aspects and create processing list
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Preparing aspects"):
        text = row['reviewText']
        detected = set(extract_aspects(text)) & set(top_aspects)
        
        for aspect in detected:
            text_aspect_pairs.append((text, aspect))
            review_aspect_map[idx].append(aspect)
    
    # Step 2: Batch process all aspect-sentiment pairs
    sentiment_results = batch_process_aspect_sentiment(text_aspect_pairs, batch_size)
    
    # Step 3: Build features DataFrame
    features = []
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Building features"):
        feats = {}
        detected_aspects = review_aspect_map.get(idx, [])
        
        # Add features for detected aspects
        for aspect in detected_aspects:
            label, score = sentiment_results.get((row['reviewText'], aspect), ('Neutral', 0.0))
            
            if label == 'Positive':
                polarity = score
            elif label == 'Negative':
                polarity = -score
            else:
                polarity = 0.0
                
            feats[f"{aspect}_polarity"] = polarity
            feats[f"{aspect}_presence"] = 1
        
        # Add zero features for non-detected aspects
        for aspect in set(top_aspects) - set(detected_aspects):
            feats[f"{aspect}_polarity"] = 0.0
            feats[f"{aspect}_presence"] = 0
            
        features.append(feats)
    
    return pd.DataFrame(features)

# Generate ABSA features
absa_df = generate_absa_features(df_with_images, top_aspects)

# Combine with original data
df3 = pd.concat([df_with_images, absa_df], axis=1)
print("ABSA features added successfully!")
print(f"Final shape: {df3.shape}")


Using device: cuda
Model loaded successfully


Preparing aspects: 100%|██████████| 2024/2024 [01:01<00:00, 32.76it/s] 
Processing ABSA batches: 100%|██████████| 144/144 [02:29<00:00,  1.04s/it]
Building features: 100%|██████████| 2024/2024 [00:00<00:00, 28709.36it/s]

ABSA features added successfully!
Final shape: (2033, 112)





In [None]:
print(df3.iloc[0,])  

my nexus_polarity     0.0
my nexus_presence     0.0
phones_polarity       0.0
phones_presence       0.0
people_polarity       0.0
people_presence       0.0
my house_polarity     0.0
my house_presence     0.0
brush_polarity        0.0
brush_presence        0.0
edge_polarity         0.0
edge_presence         0.0
rods_polarity         0.0
rods_presence         0.0
my screen_polarity    0.0
my screen_presence    0.0
our vent_polarity     0.0
our vent_presence     0.0
products_polarity     0.0
products_presence     0.0
Name: 0, dtype: object


In [26]:
import pandas as pd

# Assume df3 is your DataFrame with ABSA features appended
# Identify all polarity columns (ending with '_polarity')
polarity_cols = [col for col in df3.columns if (col.endswith('_polarity') or col.endswith('_presence'))]

# Compute non-zero counts for each polarity column
non_zero_counts = (df3[polarity_cols] != 0.0).sum().sort_values(ascending=False)

# Display the counts
print("Non-zero counts per aspect polarity column:\n", non_zero_counts)
print("\nTotal non-zero polarity columns:", len(non_zero_counts))

Non-zero counts per aspect polarity column:
 my phone_presence    239
works_polarity       228
works_presence       228
lots_polarity        224
lots_presence        224
                    ... 
edge_presence         18
edge_polarity         18
my nexus_polarity     17
dp_polarity           17
photo_polarity        16
Length: 100, dtype: int64

Total non-zero polarity columns: 100
