# Data Processing

In [7]:
# Basic Imports

from pathlib import Path

import kagglehub
import matplotlib.pyplot as plt
import pandas as pd

## Load Dataset

In [1]:
# Fetch the Yelp dataset through kagglehub; the returned value is used below
# as the directory that contains the review JSON file.
dataset_path = kagglehub.dataset_download("yelp-dataset/yelp-dataset")

# Loading parameters, named so they are easy to find and tune.
REVIEW_COLUMNS = ["review_id", "stars", "text", "date"]
CHUNK_SIZE = 100_000  # rows parsed per chunk
MAX_CHUNKS = 3        # stop after this many chunks (at most 300,000 reviews)

# NOTE(review): only the first MAX_CHUNKS chunks, in file order, are loaded.
# Any later "most recent" selection is therefore relative to these rows, not
# to the whole dataset -- confirm this is intended.
dfs = []

# The chunked reader keeps the JSON file open while iterating. Using it as a
# context manager guarantees the handle is closed even though we stop early.
with pd.read_json(
    f"{dataset_path}/yelp_academic_dataset_review.json",
    lines=True,
    chunksize=CHUNK_SIZE,
) as chunks:
    for chunk in chunks:
        # Keep only the columns needed downstream to limit memory use.
        dfs.append(chunk[REVIEW_COLUMNS])
        if len(dfs) >= MAX_CHUNKS:
            break

df = pd.concat(dfs, ignore_index=True)

## Sort Dataset and Choose Subset

In [2]:
# Make sure `date` is a datetime column, then order the reviews newest-first.
# `review_id` (ascending) is the secondary key used to order rows that share
# the same timestamp.
df = (
    df.assign(date=pd.to_datetime(df["date"]))
      .sort_values(["date", "review_id"], ascending=[False, True])
)

# Keep the first 20,000 rows of the sorted frame and renumber them from 0.
df_subset = df.head(20000).reset_index(drop=True)

## Create Label for Binary Classification

We label reviews rated 1–3 stars as negative (label 0) and reviews rated 4–5 stars as positive (label 1). In the future, we may try different binary splits to see how the results differ.

In [8]:
# Binary target: 1 when the review has 4 or 5 stars, 0 otherwise.
df_subset["label"] = (df_subset["stars"] >= 4).astype(int)

# Create the output folder first so the export also works on a fresh checkout
# where `data/` does not exist yet (to_csv does not create directories).
output_path = Path("data/yelp_reviews_subset.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write the labelled subset without the positional index column.
df_subset.to_csv(output_path, index=False)

## Subset Summary

In [5]:
# Preview the first five rows of the labelled subset.
df_subset.head(n=5)

Unnamed: 0,review_id,stars,text,date,label
0,igUar2sMmvX7Ps1AbvQXgg,1,This is the worst Walmart ever conceived by hu...,2020-07-07 22:59:20,0
1,peB3hSTWXgpsc2SCaBEPtA,5,I've always loved Outback and the fact that th...,2020-06-05 20:58:29,1
2,wrLFWRBu_JkD6CW5XpLj5w,5,LOVE THIS PLACE!! Such amazing ice cream at a ...,2020-05-13 02:10:17,1
3,_Mgbav5Q7LD1FZfIvueQXw,5,Took both my pups for a way ovetdue grooming. ...,2020-05-11 01:18:16,1
4,j0dc6B6xEXTp_zYMuJ3ysQ,1,"i went to get a chicken sandwich, and being al...",2020-05-04 01:57:38,0


In [6]:
# Report the subset dimensions, then show each distribution under its heading.
print("Subset shape:", df_subset.shape)

# (heading, counts) pairs, displayed in this order. Star counts are ordered by
# rating value; label counts keep value_counts' default ordering.
summaries = (
    ("\nStar rating distribution:", df_subset["stars"].value_counts().sort_index()),
    ("\nLabel distribution (0 = negative, 1 = positive):", df_subset["label"].value_counts()),
)

for heading, counts in summaries:
    print(heading)
    display(counts)

Subset shape: (20000, 5)

Star rating distribution:


stars
1     2381
2     1391
3     1718
4     3668
5    10842
Name: count, dtype: int64


Label distribution (0 = negative, 1 = positive):


label
1    14510
0     5490
Name: count, dtype: int64