In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import os
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

In [3]:
print(f'PyTorch version: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')

PyTorch version: 2.5.1
CUDA available: False


In [4]:
# Test Pytorch

classifier = pipeline("sentiment-analysis")
classifier("We are very happy to show you the 🤗 Transformers library.")
results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])
for result in results:
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


label: POSITIVE, with score: 0.9998
label: NEGATIVE, with score: 0.5309


In [5]:
reviews = pd.read_csv('../rawdata/cleaned_reviews.csv')
summary = pd.read_csv('../rawdata/asin_summary.csv')

In [6]:
reviews.head()

Unnamed: 0,rating,title_x,text,asin,timestamp,helpful_vote,title_y,features,price
0,5,Fast!,I have been using laptops for the past 30 year...,B089HR6CQP,1601470000000.0,0,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0
1,5,Works Great - No Problems - Great Value for Price,"I read a bunch of negative reviews, so I wante...",B089HR6CQP,1626710000000.0,0,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0
2,5,Excellent gaming computer for the price,"Runs smooth, fast, and works well for gaming. ...",B089HR6CQP,1614820000000.0,0,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0
3,1,Overheating on setup,I'm impressed. I've never seen a computer so w...,B089HR6CQP,1626820000000.0,0,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0
4,1,Don’t buy,"I bought 2 of these laptops, asked for replace...",B089HR6CQP,1611410000000.0,22,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0


In [7]:
summary.head()

Unnamed: 0,asin,avg_rating,num_reviews,price,title_y,features,os,color
0,B004PANKIA,5.0,1,,FusionTech ABC 15 Laptop - High-Octane Enterta...,"[Genuine Windows 7 Home Premium, 64bit, Intel ...",Windows 7;,Silver
1,B0052F35I2,3.75,12,,"FusionTech ABC 17 Laptop, i7-2630QM, 8GB DDR3 ...","[Intel Core i7 Processor 2GHz, 6GB DDR3 RAM, 6...",Windows 7,
2,B0057CAGUA,5.0,1,,FusionTech Worktop 17R i17R Core i5-2430M 2.4G...,[],Microsoft Windows 7 Home Premium (64 bit),Red
3,B005SDDXF8,4.333333,6,,"FusionTech ABC 15 L502X Intel Core i5-2410, 2....",[2nd generation Intel Core i5-2 410M processor...,Windows 7 Home Premium (64-bit),
4,B0081YPX3Q,2.642857,14,,FusionTech ABC15-9375sLV 15-Inch Laptop (2.1 G...,"[Intel Core i7 3612QM Processor 2.1GHz, 8 GB D...",Windows 7,Silver


In [8]:
# Function to map sentiment labels to numerical values
def map_sentiment(label):
    if label == "NEGATIVE":
        return -1
    elif label == "POSITIVE":
        return 1

# Function to split text into chunks
def split_into_chunks(text, max_length=128):
    words = text.split()
    for i in range(0, len(words), max_length):
        yield " ".join(words[i:i + max_length])

# Function to run sentiment analysis on a single text
def analyze_sentiment(text):
    # Check for NaN, null, or empty strings
    if pd.isna(text) or not isinstance(text, str) or text.strip() == "":
        return np.nan  # Return NaN for invalid inputs

    # Split the text into chunks if it's too long
    chunks = list(split_into_chunks(text, max_length=128))  # Adjust chunk size as needed
    results = classifier(chunks)
    
    # Map sentiment labels to numerical values and calculate the average score
    scores = [map_sentiment(result['label']) for result in results]
    avg_score = round(np.mean(scores))  # Calculate and round the average score
    
    return avg_score

In [9]:
# Apply sentiment analysis to 'title_x' column
reviews['sentiment_title'] = reviews['title_x'].apply(analyze_sentiment)

In [10]:
# Apply sentiment analysis to 'text' column
reviews['sentiment_text'] = reviews['text'].apply(analyze_sentiment)

In [11]:
reviews.head()

Unnamed: 0,rating,title_x,text,asin,timestamp,helpful_vote,title_y,features,price,sentiment_title,sentiment_text
0,5,Fast!,I have been using laptops for the past 30 year...,B089HR6CQP,1601470000000.0,0,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0,1,1.0
1,5,Works Great - No Problems - Great Value for Price,"I read a bunch of negative reviews, so I wante...",B089HR6CQP,1626710000000.0,0,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0,1,1.0
2,5,Excellent gaming computer for the price,"Runs smooth, fast, and works well for gaming. ...",B089HR6CQP,1614820000000.0,0,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0,1,1.0
3,1,Overheating on setup,I'm impressed. I've never seen a computer so w...,B089HR6CQP,1626820000000.0,0,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0,-1,-1.0
4,1,Don’t buy,"I bought 2 of these laptops, asked for replace...",B089HR6CQP,1611410000000.0,22,"FusionTech Gaming G3 15 3500, 15.6 inch FHD La...",[15.6 inch FHD (1920 x 1080) 120Hz 250 nits WV...,799.0,-1,-1.0


In [12]:
print(summary.columns)
print(reviews.columns)

Index(['asin', 'avg_rating', 'num_reviews', 'price', 'title_y', 'features',
       'os', 'color'],
      dtype='object')
Index(['rating', 'title_x', 'text', 'asin', 'timestamp', 'helpful_vote',
       'title_y', 'features', 'price', 'sentiment_title', 'sentiment_text'],
      dtype='object')


In [13]:
# Step 1: Combine sentiment_title and sentiment_text for each review
reviews['combined_sentiment'] = reviews['sentiment_title'] + reviews['sentiment_text']

# Step 2: Group by asin and calculate total negative and positive reviews
summary_sentiments = reviews.groupby('asin')['combined_sentiment'].apply(
    lambda x: pd.Series({
        'total_negative': (x < 0).sum(),
        'total_positive': (x > 0).sum()
    })
).unstack()

# Step 3: Merge the calculated sentiments into the summary table
summary = summary.merge(summary_sentiments, on='asin', how='left')

In [14]:
summary.head()

Unnamed: 0,asin,avg_rating,num_reviews,price,title_y,features,os,color,total_negative,total_positive
0,B004PANKIA,5.0,1,,FusionTech ABC 15 Laptop - High-Octane Enterta...,"[Genuine Windows 7 Home Premium, 64bit, Intel ...",Windows 7;,Silver,0,1
1,B0052F35I2,3.75,12,,"FusionTech ABC 17 Laptop, i7-2630QM, 8GB DDR3 ...","[Intel Core i7 Processor 2GHz, 6GB DDR3 RAM, 6...",Windows 7,,3,9
2,B0057CAGUA,5.0,1,,FusionTech Worktop 17R i17R Core i5-2430M 2.4G...,[],Microsoft Windows 7 Home Premium (64 bit),Red,0,0
3,B005SDDXF8,4.333333,6,,"FusionTech ABC 15 L502X Intel Core i5-2410, 2....",[2nd generation Intel Core i5-2 410M processor...,Windows 7 Home Premium (64-bit),,2,4
4,B0081YPX3Q,2.642857,14,,FusionTech ABC15-9375sLV 15-Inch Laptop (2.1 G...,"[Intel Core i7 3612QM Processor 2.1GHz, 8 GB D...",Windows 7,Silver,7,4


In [15]:
# Convert timestamp to datetime
reviews['timestamp'] = pd.to_datetime(reviews['timestamp'])

# Step 1: Calculate sentiment_ratio_positive
sentiment_summary = reviews.groupby('asin').agg(
    total_positive=('sentiment_title', lambda x: (x + reviews.loc[x.index, 'sentiment_text'] > 0).sum()),
)
sentiment_summary['sentiment_ratio_positive'] = sentiment_summary['total_positive'] / summary.set_index('asin')['num_reviews']

# Step 2: Calculate first_review_date and last_review_date
first_review_date = reviews.groupby('asin')['timestamp'].min()
last_review_date = reviews.groupby('asin')['timestamp'].max()
review_period_hours = (last_review_date - first_review_date).dt.total_seconds() / 3600

# Step 3: Calculate review_frequency (reviews per hour)
review_frequency = summary.set_index('asin')['num_reviews'] / review_period_hours

# Step 4: Calculate value_for_money_score
summary['value_for_money_score'] = summary['avg_rating'] / summary['price']

# Step 5: Merge all calculated fields into the summary DataFrame
summary = summary.merge(sentiment_summary[['sentiment_ratio_positive']], on='asin', how='left')
summary = summary.merge(first_review_date.rename('first_review_date'), on='asin', how='left')
summary = summary.merge(last_review_date.rename('last_review_date'), on='asin', how='left')
summary = summary.merge(review_period_hours.rename('review_period_hours'), on='asin', how='left')
summary = summary.merge(review_frequency.rename('review_frequency'), on='asin', how='left')

In [16]:
print(summary.columns)
print(reviews.columns)

Index(['asin', 'avg_rating', 'num_reviews', 'price', 'title_y', 'features',
       'os', 'color', 'total_negative', 'total_positive',
       'value_for_money_score', 'sentiment_ratio_positive',
       'first_review_date', 'last_review_date', 'review_period_hours',
       'review_frequency'],
      dtype='object')
Index(['rating', 'title_x', 'text', 'asin', 'timestamp', 'helpful_vote',
       'title_y', 'features', 'price', 'sentiment_title', 'sentiment_text',
       'combined_sentiment'],
      dtype='object')


In [17]:
summary.to_csv('../rawdata/processed_summary.csv', index=False)
reviews.to_csv('../rawdata/processed_reviews.csv', index=False)