# Data Exploration

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import os
import sys

# Point the kernel at the project root (the parent of the directory the
# notebook starts in) so relative data paths and the `src` package resolve
# the same way for every cell below.
#
# The root is computed only once per kernel session: the ".." is resolved
# against the current working directory, which this very cell changes, so
# recomputing it on a re-run would climb one more directory level each time.
if "project_root" not in globals():
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
os.chdir(project_root)

# Register the root for imports only once; re-running the cell must not pile
# up duplicate entries in sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

print("Working dir:", os.getcwd())

Working dir: c:\Users\Asus\Desktop\llm projects\ReviewInsight-Finetune


In [4]:
from src.data_curation.load_data import load_raw_data

# Load the raw review dump through the project's loader. The path is relative
# to the working directory, which the setup cell switched to the project root.
df = load_raw_data("artifacts/data/raw/Electronics_5.json")

Loading raw data from artifacts/data/raw/Electronics_5.json...
Loaded 1689188 records with 3 columns.


In [5]:
# Dataset info: (rows, columns) of the raw review frame
print(f"Shape of the dataframe: {df.shape}")


Shape of the dataframe: (1689188, 3)


In [6]:
# Column names, dtypes, non-null counts and approximate memory use
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1689188 entries, 0 to 1689187
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   product_id      1689188 non-null  object
 1   review_body     1689188 non-null  object
 2   review_summary  1689188 non-null  object
dtypes: object(3)
memory usage: 38.7+ MB


In [7]:
# Peek at the first three rows
df.head(3)

Unnamed: 0,product_id,review_body,review_summary
0,528881469,We got this GPS for my husband who is an (OTR)...,Gotta have GPS!
1,528881469,"I'm a professional OTR truck driver, and I bou...",Very Disappointed
2,528881469,"Well, what can I say. I've had this unit in m...",1st impression


In [8]:
# Print the first review body in full; the table preview truncates long text
print(df.iloc[0]["review_body"])

We got this GPS for my husband who is an (OTR) over the road trucker.  Very Impressed with the shipping time, it arrived a few days earlier than expected...  within a week of use however it started freezing up... could of just been a glitch in that unit.  Worked great when it worked!  Will work great for the normal person as well but does have the "trucker" option. (the big truck routes - tells you when a scale is coming up ect...)  Love the bigger screen, the ease of use, the ease of putting addresses into memory.  Nothing really bad to say about the unit with the exception of it freezing which is probably one in a million and that's just my luck.  I contacted the seller and within minutes of my email I received a email back with instructions for an exchange! VERY impressed all the way around!


In [9]:
# Data Quality

# NOTE(review): isna() only counts real nulls; empty-string reviews are not
# flagged here (the length statistics further down report a minimum of 0).
print("Missing values:\n", df.isna().sum())

# Rows that repeat an earlier (product_id, review_body) pair.
duplicates = df.duplicated(["product_id", "review_body"]).sum()
print(f"Total duplicates: {duplicates}")


Missing values:
 product_id        0
review_body       0
review_summary    0
dtype: int64
Total duplicates: 210


In [10]:
# Product coverage: number of distinct product ids that have reviews
unique_products = df["product_id"].nunique(dropna=True)
print(f"Unique products: {unique_products}")

Unique products: 63001


In [11]:
# Number of (non-null) review bodies per product
reviews_per_product = df.groupby("product_id")["review_body"].count()

# Most-reviewed products first
print(
    "\nTop 10 products with most reviews:\n\n",
    reviews_per_product.sort_values(ascending=False).head(10),
)

# Least-reviewed products first (ascending is the sort_values default)
print(
    "\nTop 10 products with fewest reviews:\n",
    reviews_per_product.sort_values().head(10),
)


Top 10 products with most reviews:

 product_id
B007WTAJTO    4915
B003ES5ZUU    4143
B00DR0PDNE    3798
B0019EHU8G    3435
B002WE6D44    2813
B003ELYQGG    2652
B0002L5R78    2599
B009SYZ8OC    2542
B00BGGDVOO    2104
B002V88HFE    2082
Name: review_body, dtype: int64

Top 10 products with fewest reviews:
 product_id
B000FLS78Q    5
B000FLNU4C    5
B000FLD01K    5
B000FL8LZA    5
B000FOL7Q2    5
B000FPE1D2    5
B000FPCN1O    5
B000FOVTDS    5
B000FHBNI6    5
B000FGVFP8    5
Name: review_body, dtype: int64


In [12]:
# Review Stats
# -------------------
# Descriptive statistics of the per-product review counts.
print("Min reviews per product:", reviews_per_product.min())
print("Max reviews per product:", reviews_per_product.max())
print("Average reviews per product:", reviews_per_product.mean())
print("Median reviews per product:", reviews_per_product.median())
# Series.mode() returns a Series because several values can tie for the mode.
# Print the plain values so this line reads like the other statistics instead
# of dumping the Series repr with its index and dtype footer.
print("Mode reviews per product:", reviews_per_product.mode().tolist())

Min reviews per product: 5
Max reviews per product: 4915
Average reviews per product: 26.81208234789924
Median reviews per product: 11.0
Mod reviews per product: 0    5
Name: review_body, dtype: int64


In [13]:
# Text Length Analysis
# -------------------
# Adds two columns to `df` in place: the review body length in characters and
# in whitespace-separated tokens (str.split() with no argument).
# NOTE(review): because `df` itself is modified, these columns are also part of
# the frame later handed to preprocess_reviews.
df["review_length_chars"] = df["review_body"].str.len()
df["review_length_words"] = df["review_body"].str.split().str.len()

In [14]:
# Summary stats for both length columns, printed under their own heading
for length_heading, length_column in (
    ("\nReview length (characters):", "review_length_chars"),
    ("\nReview length (words):", "review_length_words"),
):
    print(length_heading)
    print(df[length_column].describe())


Review length (characters):
count    1.689188e+06
mean     6.342017e+02
std      9.030145e+02
min      0.000000e+00
25%      1.770000e+02
50%      3.420000e+02
75%      7.240000e+02
max      3.270300e+04
Name: review_length_chars, dtype: float64

Review length (words):
count    1.689188e+06
mean     1.162784e+02
std      1.614467e+02
min      0.000000e+00
25%      3.300000e+01
50%      6.400000e+01
75%      1.340000e+02
max      6.141000e+03
Name: review_length_words, dtype: float64


In [15]:
# Confirm the two length columns are present
df.head()

Unnamed: 0,product_id,review_body,review_summary,review_length_chars,review_length_words
0,528881469,We got this GPS for my husband who is an (OTR)...,Gotta have GPS!,805,149
1,528881469,"I'm a professional OTR truck driver, and I bou...",Very Disappointed,2175,427
2,528881469,"Well, what can I say. I've had this unit in m...",1st impression,4607,846
3,528881469,"Not going to write a long review, even thought...","Great grafics, POOR GPS",2246,449
4,528881469,I've had mine for a year and here's what we go...,"Major issues, only excuses for support",1076,202


In [16]:
# Reviews whose body is longer than 500 words
df.loc[df["review_length_words"] > 500]

Unnamed: 0,product_id,review_body,review_summary,review_length_chars,review_length_words
2,0528881469,"Well, what can I say. I've had this unit in m...",1st impression,4607,846
237,1400501466,"The Nook Tablet, in both the 16gb version and ...",The BEST Color E-Reader - With Bonus Features,4915,883
248,1400501466,"Months later, I am still happy with this Table...","Outstanding 7"" Tablet",7761,1432
251,1400501466,I pulled the trigger on three of these on pre-...,People shouldn't be surprised about what it is...,6763,1223
252,1400501466,I went to a few places just to try testing the...,GREAT .cbz (comic file) reader an more!,4261,809
...,...,...,...,...,...
1689176,B00L3YHF6O,[Please read this review for full details and ...,Comprehensive review of the new updated SR20A ...,11244,1978
1689177,B00L3YHF6O,Bluetooth speakers have improved a lot over th...,"Oooh, aahh, ROAR!",3172,546
1689180,B00L3YHF6O,My short review:If you have the money to spend...,Best sounding speaker at this price range,8576,1504
1689182,B00L3YHF6O,"Move over Bose, JBL and Klipsch! There's a new...",Why This Will Be Amazon's Top Selling Portable...,5119,900


### Cleaned data

In [17]:
from src.data_curation.preprocess import preprocess_reviews

In [18]:
# Run the project's preprocessing on the raw frame (which at this point also
# carries the two review_length_* columns).
# NOTE(review): judging by the recorded preview, the result gains a
# `word_count` column that appears capped at 200 and some rows are dropped —
# confirm against src/data_curation/preprocess.py.
cleaned_df = preprocess_reviews(df)

In [19]:
# Preview the preprocessed reviews
cleaned_df.head()

Unnamed: 0,product_id,review_body,review_summary,review_length_chars,review_length_words,word_count
0,528881469,We got this GPS for my husband who is an (OTR)...,Gotta have GPS!,805,149,149
1,528881469,"I'm a professional OTR truck driver, and I bou...",Very Disappointed,2175,427,200
2,528881469,"Not going to write a long review, even thought...","Great grafics, POOR GPS",2246,449,200
3,528881469,I've had mine for a year and here's what we go...,"Major issues, only excuses for support",1076,202,200
4,594451647,I am using this with a Nook HD+. It works as d...,HDMI Nook adapter cable,109,22,22


### Grouping Reviews

In [20]:
from src.data_curation.group_reviews import group_reviews_by_product

# Group the cleaned reviews per product using the project helper; the recorded
# preview shows one row per product_id with a `reviews` list and a
# `reviews_count` column.
product_reviews = group_reviews_by_product(cleaned_df)

In [21]:
# Preview the first ten grouped products
product_reviews.head(10)

Unnamed: 0,product_id,reviews,reviews_count
0,0528881469,[We got this GPS for my husband who is an (OTR...,4
1,0594451647,[I am using this with a Nook HD+. It works as ...,5
2,0594481813,[This item is just as was described in the ori...,8
3,0972683275,"[This is a great buy, compared to a $60 or mor...",218
4,1400501466,"[I saw this product on Amazon and thought ""wha...",38
5,1400501520,"[Great Product, Great Sale, I love this Produc...",18
6,1400501776,[The Nook tablet is a great color e-reader wit...,19
7,1400532620,[Highly disappointed. I purchased the new B/W ...,33
8,1400532655,[Not worth saving a few bucks. Go with an iPad...,93
9,140053271X,"[The new Nook ""Simple Touch"" is easy on my eye...",64


In [22]:
# Print every review kept for the first product, separated by blank lines
for review_text in product_reviews["reviews"].iloc[0]:
    print(review_text, end="\n\n")

We got this GPS for my husband who is an (OTR) over the road trucker.  Very Impressed with the shipping time, it arrived a few days earlier than expected...  within a week of use however it started freezing up... could of just been a glitch in that unit.  Worked great when it worked!  Will work great for the normal person as well but does have the "trucker" option. (the big truck routes - tells you when a scale is coming up ect...)  Love the bigger screen, the ease of use, the ease of putting addresses into memory.  Nothing really bad to say about the unit with the exception of it freezing which is probably one in a million and that's just my luck.  I contacted the seller and within minutes of my email I received a email back with instructions for an exchange! VERY impressed all the way around!

I'm a professional OTR truck driver, and I bought a TND 700 at a truck stop hoping to make my life easier. Rand McNally, are you listening?First thing I did after charging it was connect it to 

In [23]:
# Summary stats: distribution of the number of reviews per product after grouping
print(product_reviews["reviews_count"].describe())

count    63000.000000
mean        25.900794
std         74.277488
min          1.000000
25%          6.000000
50%         10.000000
75%         22.000000
max       4833.000000
Name: reviews_count, dtype: float64


### Load and check the final processed product reviews

In [24]:
# Load data: the final processed per-product reviews written by the pipeline.
# NOTE(review): this rebinds `df`, which until now held the raw review frame;
# earlier cells that use `df` will see the processed frame if they are re-run
# after this one.
df = pd.read_parquet("artifacts/data/processed/processed_product_reviews.parquet")

In [25]:
# Preview the processed per-product frame
df.head()

Unnamed: 0,product_id,reviews,reviews_count
0,594451647,[I am using this with a Nook HD+. It works as ...,5
1,594481813,[This item is just as was described in the ori...,8
2,972683275,[This mount works really well once you get it ...,10
3,1400501466,[this is such a great tablet. one time didn't ...,10
4,1400501520,[I bought the Nook primarily for reading but w...,10


In [26]:
# Column dtypes, non-null counts and memory use of the processed frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61457 entries, 0 to 61456
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   product_id     61457 non-null  object
 1   reviews        61457 non-null  object
 2   reviews_count  61457 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [27]:
# Distribution of reviews kept per product in the processed set
df["reviews_count"].describe()

count    61457.000000
mean         8.445303
std          1.959348
min          5.000000
25%          7.000000
50%         10.000000
75%         10.000000
max         10.000000
Name: reviews_count, dtype: float64

In [28]:
# Print every review stored for the second product (positional row 1),
# separated by blank lines
for review_text in df["reviews"].iloc[1]:
    print(review_text, end="\n\n")

This item is just as was described in the original description, works without any issues to be seen. Good product

bought for a spare for my 9&#34; Nook HD and it fit perfectly.  Very satisfied with the price much less than on the BN site

My son crewed my HD charger cord so I needed another one, this is exactly like the one my son destroyed.

This is a good beefy 2 amp charger, but it covers two outlets on a power strip. It's ok in a regular wall outlet. The best thing is it uses a standard USB connector so it can charge more than just a Nook (I have a Kindle Fire HD+).

I lost my B&N original cable.  I looked around for an new one.  I tried  a different, cheaper model but it didn't fit my device properly so back to the drawing board.  I ordered this one.  I am satisfied.  It works exactly as expected and fits perfectly.  I would recommend this product to anyone looking for a spare or in lieu of the original usb cable adapter.

It does 2A and charges a DEAD Nook in a few hours. It doe

### Exploring Instruction Answer dataset for training
##### Pros and cons are generated with GPT-4o-mini from the list of reviews for each product.

In [29]:
import json

# Read the instruction/answer records from the JSON Lines file: one JSON
# object per line. Blank lines (for example an empty line at the end of the
# file) are skipped so they do not abort the load with a JSONDecodeError.
# The path is relative to the current working directory.
with open("instruction_answers.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

In [30]:
# Input field of the first instruction/answer record
print(data[0]["input"])



In [31]:
# Answer field of the first record; the recorded output shows JSON text with
# "pros" and "cons" lists
print(data[0]["answer"])

{"pros": ["Excellent HD picture quality on TV", "Easy to setup and use right out of the box", "Good for traveling, makes hotel TVs smart", "Useful for sharing photos on a bigger screen", "Works well with other HDMI devices", "Allows Nook to be used as a streaming server to TV"], "cons": ["Cable is wobbly and can disconnect easily", "Price is considered unfair", "Only works with Nook HD and HD+ series", "Requires power adapter connected to function", "Loose plug makes it hard to move Nook without holding adapter", "Initial connection may require adjustment", "Incompatible with some TVs, like LG SmartTV 3D"]}


In [32]:
# Inspect the whole second record
data[1]

{'instruction': 'Generate pros and cons from the following product reviews.',
 'input': ['This item is just as was described in the original description, works without any issues to be seen. Good product',
  'bought for a spare for my 9&#34; Nook HD and it fit perfectly.  Very satisfied with the price much less than on the BN site',
  'My son crewed my HD charger cord so I needed another one, this is exactly like the one my son destroyed.',
  "This is a good beefy 2 amp charger, but it covers two outlets on a power strip. It's ok in a regular wall outlet. The best thing is it uses a standard USB connector so it can charge more than just a Nook (I have a Kindle Fire HD+).",
  "I lost my B&N original cable.  I looked around for an new one.  I tried  a different, cheaper model but it didn't fit my device properly so back to the drawing board.  I ordered this one.  I am satisfied.  It works exactly as expected and fits perfectly.  I would recommend this product to anyone looking for a spar