In [1]:
import json
import numpy as np
import pandas as pd
from google.colab import drive

In [2]:
# Mount Google Drive so files under /content/drive (including the dataset) are readable
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the Yelp review file on the mounted Drive; it is read one JSON record per line
JSON_FILE = '/content/drive/My Drive/yelp_academic_dataset_review.json'

In [4]:
def get_data(filename):
    """Load the business id and star rating of every record in a JSON Lines file.

    Args:
        filename: Path to a text file holding one JSON object per line. Each
            object must provide the keys 'business_id' and 'stars'.

    Returns:
        pandas.DataFrame with the columns 'business_id' and 'stars', one row
        per non-blank input line, in file order. An empty file yields an
        empty frame that still carries both columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks 'business_id' or 'stars'.
    """
    columns = ['business_id', 'stars']
    filtered_data = []

    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (for example a trailing newline at EOF)
            # instead of letting json.loads fail on them.
            if not line.strip():
                continue
            record = json.loads(line)
            # Keep only the selected attributes; everything else is discarded.
            filtered_data.append({name: record[name] for name in columns})

    # Passing the column list keeps the schema stable even with zero records.
    return pd.DataFrame(filtered_data, columns=columns)

In [5]:
# Parse the review file into a two-column frame (business_id, stars)
df = get_data(JSON_FILE)

In [6]:
# Preview the first rows of the loaded frame: one business_id and star rating per record
print(df.head())

              business_id  stars
0  XQfwVwDr-v0ZS3_CbbE5Xw    3.0
1  7ATYjTIgM3jUlt4UM3IypQ    5.0
2  YjUWPpI6HXG530lwP-fb2A    3.0
3  kxX2SOes4o-D3ZQBkiMRfA    5.0
4  e4Vwtrqf-wpJfwesgvdgxQ    4.0


In [7]:
print(len(df)) # number of records loaded (one row per input line)

6990280


In [8]:
# Shorten the key column's label: 'business_id' becomes 'id'
df = df.rename({'business_id': 'id'}, axis='columns')

In [9]:
# Preview the frame after the rename: the key column is now labelled 'id'
print(df.head())

                       id  stars
0  XQfwVwDr-v0ZS3_CbbE5Xw    3.0
1  7ATYjTIgM3jUlt4UM3IypQ    5.0
2  YjUWPpI6HXG530lwP-fb2A    3.0
3  kxX2SOes4o-D3ZQBkiMRfA    5.0
4  e4Vwtrqf-wpJfwesgvdgxQ    4.0


In [10]:
# Step 1: One-hot encode the rating, giving one indicator column per distinct
# 'stars' value. Column labels are '<prefix>_<value>'; since the ratings are
# floats they come out as 'star_1.0' ... 'star_5.0'.
star_columns = pd.get_dummies(data=df['stars'], prefix='star', prefix_sep='_')

In [11]:
# Preview the indicator columns: exactly one True per row, marking that record's rating
print(star_columns.head())

   star_1.0  star_2.0  star_3.0  star_4.0  star_5.0
0     False     False      True     False     False
1     False     False     False     False      True
2     False     False      True     False     False
3     False     False     False     False      True
4     False     False     False      True     False


In [12]:
# Step 2: Put the 'id' column side by side with the indicator columns
# (rows are matched on the shared index); the raw 'stars' column is left out.
df = pd.concat(objs=[df[['id']], star_columns], axis='columns')

In [13]:
# Preview the combined frame: 'id' followed by the star indicator columns
print(df.head())

                       id  star_1.0  star_2.0  star_3.0  star_4.0  star_5.0
0  XQfwVwDr-v0ZS3_CbbE5Xw     False     False      True     False     False
1  7ATYjTIgM3jUlt4UM3IypQ     False     False     False     False      True
2  YjUWPpI6HXG530lwP-fb2A     False     False      True     False     False
3  kxX2SOes4o-D3ZQBkiMRfA     False     False     False     False      True
4  e4Vwtrqf-wpJfwesgvdgxQ     False     False     False      True     False


In [14]:
print(len(df)) # row count after the concat (still one row per record)

6990280


In [15]:
# Step 3: Collapse to one row per 'id'. Summing the boolean indicators counts
# how many records carried each rating; as_index=False keeps 'id' as a regular
# column instead of moving it into the index.
df = df.groupby('id', as_index=False).sum()

In [16]:
# Preview the aggregated frame: per-id counts of each star rating
print(df.head())

                       id  star_1.0  star_2.0  star_3.0  star_4.0  star_5.0
0  ---kPU91CF4Lq2-WlRu9Lw         1         0         2         4        17
1  --0iUa4sNDFiZFrAdIWhZQ         3         0         4         5         2
2  --30_8IhuyMHbSOcNWd6DQ         3         0         0         1         5
3  --7PUidqRWpRSpXebiyxTg         7         2         2         1         0
4  --7jw19RH9JKXgFohspgQw         2         0         1         0        10


In [17]:
print(len(df)) # number of distinct ids after grouping

150346


In [18]:
# Step 4: Pack the five per-rating counts into one dict per row.
# Built column-wise (select -> relabel -> to_dict) instead of a row-wise
# apply with a Python lambda; the selection order fixes the dict key order
# as '5 stars' down to '1 stars'.
df['starCategoryCount'] = (
    df[['star_5.0', 'star_4.0', 'star_3.0', 'star_2.0', 'star_1.0']]
    .rename(columns={
        'star_5.0': '5 stars',
        'star_4.0': '4 stars',
        'star_3.0': '3 stars',
        'star_2.0': '2 stars',
        'star_1.0': '1 stars',
    })
    .to_dict(orient='records')  # list with one {label: count} dict per row
)

In [19]:
# Preview the frame with the new 'starCategoryCount' dict column appended
print(df.head())

                       id  star_1.0  star_2.0  star_3.0  star_4.0  star_5.0  \
0  ---kPU91CF4Lq2-WlRu9Lw         1         0         2         4        17   
1  --0iUa4sNDFiZFrAdIWhZQ         3         0         4         5         2   
2  --30_8IhuyMHbSOcNWd6DQ         3         0         0         1         5   
3  --7PUidqRWpRSpXebiyxTg         7         2         2         1         0   
4  --7jw19RH9JKXgFohspgQw         2         0         1         0        10   

                                   starCategoryCount  
0  {'5 stars': 17, '4 stars': 4, '3 stars': 2, '2...  
1  {'5 stars': 2, '4 stars': 5, '3 stars': 4, '2 ...  
2  {'5 stars': 5, '4 stars': 1, '3 stars': 0, '2 ...  
3  {'5 stars': 0, '4 stars': 1, '3 stars': 2, '2 ...  
4  {'5 stars': 10, '4 stars': 0, '3 stars': 1, '2...  


In [20]:
# Drop the individual star columns; their counts live on in 'starCategoryCount'.
# Rebind the result rather than using inplace=True, so this cell yields a new
# frame instead of mutating the object that earlier cells displayed.
df = df.drop(columns=['star_1.0', 'star_2.0', 'star_3.0', 'star_4.0', 'star_5.0'])

In [21]:
# Preview the final shape of the data: 'id' plus its 'starCategoryCount' dict
print(df.head())

                       id                                  starCategoryCount
0  ---kPU91CF4Lq2-WlRu9Lw  {'5 stars': 17, '4 stars': 4, '3 stars': 2, '2...
1  --0iUa4sNDFiZFrAdIWhZQ  {'5 stars': 2, '4 stars': 5, '3 stars': 4, '2 ...
2  --30_8IhuyMHbSOcNWd6DQ  {'5 stars': 5, '4 stars': 1, '3 stars': 0, '2 ...
3  --7PUidqRWpRSpXebiyxTg  {'5 stars': 0, '4 stars': 1, '3 stars': 2, '2 ...
4  --7jw19RH9JKXgFohspgQw  {'5 stars': 10, '4 stars': 0, '3 stars': 1, '2...


In [22]:
print(len(df)) # still one row per distinct id

150346


In [23]:
# Persist the result to Drive as JSON Lines: one {"id": ..., "starCategoryCount": {...}}
# object per line.
# NOTE(review): the file is named "Employee_..." although the ids come from the
# reviews' business_id field; confirm the name is intended before relying on it.
df.to_json(
    path_or_buf='/content/drive/My Drive/Employee_Star_Categories.json',
    orient='records',
    lines=True,
)