In [5]:
# !pip install pandas numpy spacy transformers torch vaderSentiment textblob
# !python -m spacy download en_core_web_sm

In [4]:
# !pip install tiktoken sentencepiece protobuf transformers

In [2]:
import pandas as pd                                       # Data handling[1]
import numpy as np                                        # Numerical operations[1]
import spacy                                              # NLP and aspect extraction[2]
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer  # Overall sentiment[3]
from textblob import TextBlob                            # Fallback sentiment[4]
from transformers import pipeline                        # Pretrained ABSA model[5]

In [19]:
import pandas as pd
import json
import gzip
from tqdm import tqdm  # For progress tracking (optional)
import os

# Relative locations of the gzipped review dumps that this notebook reads.
file_paths = [
    "Data/Reviews with images/Cell_Phones_and_Accessories_5.json.gz",
    "Data/Reviews with images/Magazine_Subscriptions_5.json.gz",
    "Data/Reviews with images/Appliances_5 (1).json.gz",
    "Data/Reviews with images/All_Beauty_5 (1).json.gz",
    "Data/Reviews with images/AMAZON_FASHION_5 (1).json.gz",
]

# Category label for each dump. These keys carry an "amazon_hackon/" prefix that
# the entries in file_paths do not, so they are never compared path-to-path.
FILE_INFO = dict([
    ("amazon_hackon/Data/Reviews with images/Cell_Phones_and_Accessories_5.json.gz", "Cell_Phones_and_Accessories"),
    ("amazon_hackon/Data/Reviews with images/Magazine_Subscriptions_5.json.gz", "Magazine_Subscriptions"),
    ("amazon_hackon/Data/Reviews with images/Appliances_5 (1).json.gz", "Appliances"),
    ("amazon_hackon/Data/Reviews with images/All_Beauty_5 (1).json.gz", "All_Beauty"),
    ("amazon_hackon/Data/Reviews with images/AMAZON_FASHION_5 (1).json.gz", "AMAZON_FASHION"),
])

# Re-key the categories by bare filename so a lookup works regardless of the
# directory prefix used to reach the file.
filename_to_category = dict(
    (os.path.basename(full_path), label)
    for full_path, label in FILE_INFO.items()
)

def process_file(file_path, sample_size=1000):
    """Load the image-bearing reviews from one gzipped JSON-lines file.

    Every line is parsed as an independent JSON document. A record is kept only
    when it is a JSON object whose ``image`` value is a non-empty list and whose
    ``reviewText`` value is a string. Lines that are not valid JSON, or that
    decode to something other than an object (``null``, a list, a number, ...),
    are skipped instead of aborting the whole file.

    Args:
        file_path: Path to a gzip-compressed text file, one JSON document per line.
        sample_size: Upper bound on the number of rows returned. When more rows
            qualify, exactly ``sample_size`` are drawn with ``random_state=42``.

    Returns:
        A DataFrame of the kept records with an added ``category`` column, looked
        up in ``filename_to_category`` by the file's basename ("Unknown" when the
        basename is not registered). If the file does not exist, a message is
        printed and a completely empty DataFrame is returned.
    """
    valid_rows = []
    try:
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            for line in f:
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # json.loads happily returns lists, numbers, strings or None for
                # valid-but-unexpected lines; those have no .get() and would
                # otherwise raise AttributeError and kill the whole load.
                if not isinstance(record, dict):
                    continue
                images = record.get('image')
                if not (isinstance(images, list) and images):
                    continue
                if isinstance(record.get('reviewText'), str):
                    valid_rows.append(record)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return pd.DataFrame()

    df = pd.DataFrame(valid_rows)

    # Sample if we have more than sample_size (fixed seed keeps runs repeatable)
    if len(df) > sample_size:
        df = df.sample(n=sample_size, random_state=42)

    # Add category based on filename only, ignoring whatever directory was used
    category = filename_to_category.get(os.path.basename(file_path), "Unknown")
    df["category"] = category

    return df

# Load every configured file, keeping only the ones that yielded rows
df_chunks = []
for path in tqdm(file_paths, desc="Processing files"):
    df_chunk = process_file(path)
    if df_chunk.empty:
        continue
    df_chunks.append(df_chunk)

# Stack the per-file frames into one table with a fresh, continuous index
if not df_chunks:
    print("No valid data found in any files")
else:
    df_with_images = pd.concat(df_chunks, ignore_index=True)
    print("\nFinal shape:", df_with_images.shape)
    print(df_with_images[['category', 'image']].head())


Processing files: 100%|██████████| 5/5 [00:07<00:00,  1.42s/it]


Final shape: (2025, 13)
                      category  \
0  Cell_Phones_and_Accessories   
1  Cell_Phones_and_Accessories   
2  Cell_Phones_and_Accessories   
3  Cell_Phones_and_Accessories   
4  Cell_Phones_and_Accessories   

                                               image  
0  [https://images-na.ssl-images-amazon.com/image...  
1  [https://images-na.ssl-images-amazon.com/image...  
2  [https://images-na.ssl-images-amazon.com/image...  
3  [https://images-na.ssl-images-amazon.com/image...  
4  [https://images-na.ssl-images-amazon.com/image...  





In [20]:
# Spot-check the raw text of a single review to confirm reviewText loaded as expected.
print(df_with_images['reviewText'][1])

Not the greatest design but better than mine. Broke really easy in the first drop but nothing acrylic glue couldn't fix.  Acrylic glue doesn't fix hiking trials,


In [18]:
# # Print rows where 'reviewText' is not a string
# non_string_mask = ~df_with_images['reviewText'].apply(lambda x: isinstance(x, str))
# non_string_rows = df_with_images[non_string_mask]

# print("Rows where 'reviewText' is NOT a string:")
# print(non_string_rows[['reviewerID', 'asin', 'reviewText', 'image']])
# print(f"\nTotal non-string 'reviewText' entries: {len(non_string_rows)}")
# print(non_string_rows['reviewText'].apply(type))


In [22]:
# One shared VADER analyzer instance, reused by overall_sentiment for every review.
vader = SentimentIntensityAnalyzer()

def overall_sentiment(text):
    """Score the overall tone of one review.

    Returns a two-entry Series: ``polarity`` holds VADER's compound score
    (range [-1, 1]) and ``subjectivity`` holds TextBlob's subjectivity
    estimate (range [0, 1]) for the same text.
    """
    compound = vader.polarity_scores(text)["compound"]
    blob_sentiment = TextBlob(text).sentiment
    return pd.Series(
        [compound, blob_sentiment.subjectivity],
        index=["polarity", "subjectivity"],
    )

# Score every review whose body is an actual string (anything else is dropped first)
has_text = df_with_images['reviewText'].apply(lambda value: isinstance(value, str))
df_with_images = df_with_images[has_text]
sent_scores = df_with_images["reviewText"].apply(overall_sentiment)

# Place the polarity/subjectivity columns alongside the original review columns
df2 = pd.concat([df_with_images, sent_scores], axis=1)
df2.head()  # Display first few rows

Unnamed: 0,image,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,style,category,polarity,subjectivity
0,[https://images-na.ssl-images-amazon.com/image...,5.0,,False,"01 21, 2016",AD44H3YP65YHL,B00L9ICE9M,brian podolak,I bought this for my wife and she really loves...,Nice strong good looking case,1453334400,"{'Size:': ' iPhone 6', 'Color:': ' White'}",Cell_Phones_and_Accessories,0.992,0.745222
1,[https://images-na.ssl-images-amazon.com/image...,2.0,,True,"08 20, 2016",A2EAW9C31JE4L6,B00OS9E6AO,SmokingChicken,Not the greatest design but better than mine. ...,Hold it,1471651200,{'Color:': ' Black'},Cell_Phones_and_Accessories,0.1545,0.666667
2,[https://images-na.ssl-images-amazon.com/image...,5.0,,False,"11 30, 2015",A1DJ9ZJH1RKQIE,B017U7FQSG,Doleman,"Really likes these cables, most of the micro U...",Packs of 5 colorful cables~,1448841600,{'Color:': ' 6.6ft-Red+Orange+Yellow+Green+Blue'},Cell_Phones_and_Accessories,0.9743,0.3962
3,[https://images-na.ssl-images-amazon.com/image...,4.0,3.0,False,"10 5, 2015",A2XOUBTCDPFMBB,B0126SWVPA,inkling,Gooseneck mount:\nOverall: sturdy quality for ...,Gooseneck cell phone holder: great quality ove...,1444003200,{'Color:': ' Gooseneck Phone holder'},Cell_Phones_and_Accessories,0.9272,0.525836
4,[https://images-na.ssl-images-amazon.com/image...,5.0,,False,"08 16, 2016",A2PNMOCPBH09K8,B013OZ6J5C,Timothy Lillis,I was worried because my bike was a little cro...,Better than I expected for prixe,1471305600,{'Color:': ' Basic Version'},Cell_Phones_and_Accessories,0.9881,0.515598


In [23]:
# Persist the reviews together with their polarity and subjectivity columns.
# The shape is printed first as a quick sanity check on what gets written.
print(df2.shape)
df2.to_pickle("datasetFinal.pkl")  # relative path: lands in the current working directory

(2025, 15)


In [6]:
# nlp = spacy.load("en_core_web_sm")               

# def extract_aspects(text):
#     doc = nlp(text)
#     return [chunk.text.lower().strip() for chunk in doc.noun_chunks if len(chunk.text.split()) <= 3]

# # Build global aspect list from top frequent chunks
# all_chunks = df_with_images["reviewText"].apply(extract_aspects).explode()
# top_aspects = all_chunks.value_counts().head(50).index.tolist()  # Top 50 aspects[6]

In [13]:
# import spacy
# import pandas as pd
# from collections import Counter

# # Load spaCy model
# nlp = spacy.load("en_core_web_sm")

# # Define stopwords and uninformative terms to exclude
# stopwords = nlp.Defaults.stop_words
# custom_exclusions = {
#     'i', 'it', 'you', 'that', 'this', 'they', 'me', 'we', 'them', 'which', 
#     'who', 'something', 'anything', 'everything', 'someone', 'anyone', 'some',
#     'all', 'a', 'an', 'the', 'what', 'there', 'here', 'other', 'others'
# }

# def extract_aspects(text):
#     """Extract meaningful noun chunks from text"""
#     doc = nlp(text)
#     aspects = []
    
#     for chunk in doc.noun_chunks:
#         # Filter by length and content quality
#         tokens = [token.text.lower() for token in chunk]
#         chunk_text = chunk.text.lower().strip()
        
#         # Skip if any of these conditions are true:
#         if (
#             len(chunk) > 3 or  # Too long
#             chunk_text in custom_exclusions or  # In exclusion list
#             all(token in stopwords for token in tokens) or  # All stopwords
#             any(token in custom_exclusions for token in tokens)  # Contains excluded terms
#         ):
#             continue
            
#         aspects.append(chunk_text)
        
#     return aspects

# # Build global aspect list
# all_chunks = df_with_images["reviewText"].apply(extract_aspects).explode()

# # Filter and get top aspects
# aspect_counts = all_chunks.value_counts()
# meaningful_aspects = [
#     aspect for aspect in aspect_counts.index
#     if not any(excl_word in aspect for excl_word in custom_exclusions)
# ]
# top_aspects = aspect_counts.loc[meaningful_aspects].head(50).index.tolist()

# print("Top 50 meaningful aspects:")
# print(top_aspects)


In [12]:
# print("\nTop 50 aspects:")
# for aspect in top_aspects:
#     print(aspect)

In [11]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# import torch
# import pandas as pd
# import numpy as np
# from tqdm import tqdm
# from collections import defaultdict

# # Initialize device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Load ABSA model and tokenizer (with error handling)
# try:
#     model_name = "yangheng/deberta-v3-base-absa-v1.1"
#     tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)  # Disable fast tokenizer
#     model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
#     print("Model loaded successfully")
# except Exception as e:
#     print(f"Error loading model: {e}")
#     # Fallback to smaller model
#     model_name = "yangheng/deberta-v3-small-absa-v1.1"
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
#     print("Using smaller model as fallback")

# # Batch processing function for efficiency
# def batch_process_aspect_sentiment(text_aspect_pairs, batch_size=8):
#     """
#     Process aspect sentiment in batches for efficiency
#     Returns: dict of {(text, aspect): (label, score)}
#     """
#     results = {}
#     num_batches = int(np.ceil(len(text_aspect_pairs) / batch_size))
    
#     for i in tqdm(range(num_batches), desc="Processing ABSA batches"):
#         batch = text_aspect_pairs[i*batch_size : (i+1)*batch_size]
#         formatted_inputs = [f"[CLS] {text} [SEP] {aspect} [SEP]" for text, aspect in batch]
        
#         try:
#             inputs = tokenizer(
#                 formatted_inputs,
#                 padding=True,
#                 truncation=True,
#                 max_length=512,
#                 return_tensors="pt"
#             ).to(device)
            
#             with torch.no_grad():
#                 outputs = model(**inputs)
            
#             probs = torch.softmax(outputs.logits, dim=1)
#             preds = torch.argmax(probs, dim=1)
            
#             for j, (text, aspect) in enumerate(batch):
#                 label_id = preds[j].item()
#                 label = model.config.id2label[label_id]
#                 score = probs[j, label_id].item()
#                 results[(text, aspect)] = (label, score)
                
#         except RuntimeError as e:
#             print(f"Batch {i} failed: {e}")
#             # Fallback to individual processing
#             for text, aspect in batch:
#                 try:
#                     inputs = tokenizer(
#                         f"[CLS] {text} [SEP] {aspect} [SEP]",
#                         return_tensors="pt",
#                         truncation=True,
#                         max_length=512
#                     ).to(device)
                    
#                     with torch.no_grad():
#                         outputs = model(**inputs)
                    
#                     probs = torch.softmax(outputs.logits, dim=1)
#                     label_id = torch.argmax(probs).item()
#                     label = model.config.id2label[label_id]
#                     score = probs[0, label_id].item()
#                     results[(text, aspect)] = (label, score)
                    
#                 except Exception as e:
#                     print(f"Failed on ({text[:20]}..., {aspect}): {e}")
#                     results[(text, aspect)] = ('Neutral', 0.0)
    
#     return results

# # Optimized ABSA feature extraction
# def generate_absa_features(df, top_aspects, batch_size=32):
#     """
#     Generate ABSA features with aspect filtering and batch processing
#     """
#     # Step 1: Collect all (text, aspect) pairs to process
#     text_aspect_pairs = []
#     review_aspect_map = defaultdict(list)
    
#     # First pass: Extract aspects and create processing list
#     for idx, row in tqdm(df.iterrows(), total=len(df), desc="Preparing aspects"):
#         text = row['reviewText']
#         detected = set(extract_aspects(text)) & set(top_aspects)
        
#         for aspect in detected:
#             text_aspect_pairs.append((text, aspect))
#             review_aspect_map[idx].append(aspect)
    
#     # Step 2: Batch process all aspect-sentiment pairs
#     sentiment_results = batch_process_aspect_sentiment(text_aspect_pairs, batch_size)
    
#     # Step 3: Build features DataFrame
#     features = []
#     for idx, row in tqdm(df.iterrows(), total=len(df), desc="Building features"):
#         feats = {}
#         detected_aspects = review_aspect_map.get(idx, [])
        
#         # Add features for detected aspects
#         for aspect in detected_aspects:
#             label, score = sentiment_results.get((row['reviewText'], aspect), ('Neutral', 0.0))
            
#             if label == 'Positive':
#                 polarity = score
#             elif label == 'Negative':
#                 polarity = -score
#             else:
#                 polarity = 0.0
                
#             feats[f"{aspect}_polarity"] = polarity
#             feats[f"{aspect}_presence"] = 1
        
#         # Add zero features for non-detected aspects
#         for aspect in set(top_aspects) - set(detected_aspects):
#             feats[f"{aspect}_polarity"] = 0.0
#             feats[f"{aspect}_presence"] = 0
            
#         features.append(feats)
    
#     return pd.DataFrame(features)

# # Generate ABSA features
# absa_df = generate_absa_features(df_with_images, top_aspects)

# # Combine with original data
# df3 = pd.concat([df_with_images, absa_df], axis=1)
# print("ABSA features added successfully!")
# print(f"Final shape: {df3.shape}")


In [9]:
# print(df3.iloc[0,])  

In [10]:
# import pandas as pd

# # Assume df3 is your DataFrame with ABSA features appended
# # Identify all ABSA feature columns (ending with '_polarity' or '_presence')
# polarity_cols = [col for col in df3.columns if (col.endswith('_polarity') or col.endswith('_presence'))]

# # Compute non-zero counts for each ABSA feature column
# non_zero_counts = (df3[polarity_cols] != 0.0).sum().sort_values(ascending=False)

# # Display the counts
# print("Non-zero counts per aspect polarity column:\n", non_zero_counts)
# print("\nTotal non-zero polarity columns:", len(non_zero_counts))