In [1]:
import json
import random


# Sampling 1000 random business entries from the "yelp_academic_dataset_business" file

In [3]:
# Sample a fixed number of businesses from the Yelp business dump and save a
# trimmed copy (business_id, stars, review_count) as a JSON array.

# How many businesses to draw, and the seed that makes the draw repeatable
# across "Restart & Run All". Change SAMPLE_SEED to obtain a different sample.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42

# Initialize an empty list to store businesses
businesses = []

# Load the data, assuming each line in the file is a separate JSON object
file_path = 'D:\\yelp_dataset\\yelp_academic_dataset_business.json'  # Update this path as necessary

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # A blank line (e.g. a trailing newline at the end of the file) is not
        # a record; json.loads would raise on it, so skip it explicitly.
        if not line.strip():
            continue
        # Load each line as a JSON object and append to the list
        business = json.loads(line)
        businesses.append(business)

# Draw without replacement from a dedicated, seeded generator so the global
# `random` state is left alone. The size is capped at the number of loaded
# businesses because sample() raises ValueError when asked for more.
random_businesses = random.Random(SAMPLE_SEED).sample(
    businesses, min(SAMPLE_SIZE, len(businesses))
)

# Extract only the required fields
filtered_businesses = [
    {
        'business_id': business['business_id'],
        'stars': business['stars'],
        'review_count': business['review_count'],
    }
    for business in random_businesses
]

with open('filtered_1000_businesses.json', 'w') as new_file:
    json.dump(filtered_businesses, new_file, indent=4)

# Report the number actually written; it is below SAMPLE_SIZE for small inputs.
print(f"New JSON file with {len(filtered_businesses)} random businesses created successfully.")


New JSON file with 1000 random businesses created successfully.


# Code to get the oldest year (no need to re-run: the result, 2005, is used as the start year in the cells below)

In [2]:

def find_oldest_year(file_path):
    """Return the smallest year found in the 'date' fields of a JSON-lines file.

    Each line of the file is parsed as one JSON object, and the year is the
    integer before the first '-' of that object's 'date' value. Lines that
    are not valid JSON are skipped.

    Parameters:
        file_path: path of the file to scan; it is opened as UTF-8 text.

    Returns:
        The minimum year as an int, or float('inf') when no line of the file
        yields a year.
    """
    def _years(handle):
        # Yield the year of every line that parses as JSON.
        for raw in handle:
            try:
                record = json.loads(raw)
            except json.JSONDecodeError:
                continue  # bad JSON line: ignore it
            yield int(record['date'].split('-')[0])

    with open(file_path, 'r', encoding='utf-8') as handle:
        return min(_years(handle), default=float('inf'))

# Path to the Yelp review dump; update it to wherever your copy of the file lives
file_path = 'D:\\yelp_dataset\\yelp_academic_dataset_review.json'
# Scan the file once for the earliest year found in the reviews' 'date' fields
oldest_year = find_oldest_year(file_path)
print(f"The oldest year in the review file is: {oldest_year}")


The oldest year in the review file is: 2005


# Code to get a dataframe (Year, total_no_of_users)

In [6]:
import pandas as pd
from collections import defaultdict

# Build `df`: one row per calendar year with the number of users whose
# 'yelping_since' date falls in that year.

filepath = 'D:\\yelp_dataset\\yelp_academic_dataset_user.json'  # Update this path as necessary


# Initialize a dictionary to hold the count of users per year
# (keys are the four-digit year as a string)
user_count_by_year = defaultdict(int)

# Open and process the User.json file (one JSON object per line)
with open(filepath, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            user = json.loads(line)
        except json.JSONDecodeError:
            continue  # Skip lines that are not valid JSON
        year = user['yelping_since'].split('-')[0]  # Extract the year
        user_count_by_year[year] += 1

# Create a DataFrame
df = pd.DataFrame(list(user_count_by_year.items()), columns=['Year', 'Total_Users_by_year'])

# Ensure the DataFrame covers all years from 2005 to the current year.
# The end year is computed at run time instead of being hard-coded, so the
# range stays correct without editing the cell every year.
current_year = pd.Timestamp.now().year
all_years = pd.DataFrame({'Year': [str(year) for year in range(2005, current_year + 1)]})

# Left merge keeps every year of the range; years without users become NaN -> 0
df = pd.merge(all_years, df, on='Year', how='left').fillna(0)

# Convert 'Total_Users_by_year' to integers (fillna leaves the column as float)
df['Total_Users_by_year'] = df['Total_Users_by_year'].astype(int)

print(df)


    Year  Total_Users_by_year
0   2005                  937
1   2006                 5423
2   2007                15340
3   2008                31097
4   2009                64911
5   2010               109054
6   2011               176435
7   2012               195955
8   2013               209762
9   2014               233465
10  2015               247850
11  2016               217620
12  2017               151024
13  2018               133568
14  2019               104655
15  2020                47444
16  2021                40485
17  2022                 2782
18  2023                    0
19  2024                    0


In [9]:
# Sanity check: a single column selected from the frame is a pandas Series
print(type(df['Total_Users_by_year']))

<class 'pandas.core.series.Series'>


In [10]:
# Sum the per-year user counts of `df`, then show the column dtypes followed
# by the grand total
total_users = df['Total_Users_by_year'].sum()
print(df.dtypes)
print(f"Total number of users: {total_users}")

Year                   object
Total_Users_by_year     int32
dtype: object
Total number of users: 1987807


# Code to get dataframe (Year, total no of reviews)

In [11]:
# Build `df_reviews`: one row per calendar year with the number of reviews
# whose 'date' falls in that year.

filepath = 'D:\\yelp_dataset\\yelp_academic_dataset_review.json'

# Tally of reviews keyed by the four-digit year string of each review's 'date'
review_count_by_year = defaultdict(int)

# Stream the Review.json file line by line (one JSON object per line)
with open(filepath, 'r', encoding='utf-8') as review_file:
    for raw_line in review_file:
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            continue  # not valid JSON: ignore the line
        review_count_by_year[record['date'].split('-')[0]] += 1

# Every year from 2005 through the current year, as strings to match the tally keys
current_year = pd.to_datetime("now").year
all_years = pd.DataFrame({'Year': list(map(str, range(2005, current_year + 1)))})

# Left-join the tallies onto the full year range, turn missing years into 0,
# and store the counts as integers
df_reviews = (
    all_years
    .merge(
        pd.DataFrame(list(review_count_by_year.items()), columns=['Year', 'Total_Reviews_by_Year']),
        on='Year',
        how='left',
    )
    .fillna(0)
    .astype({'Total_Reviews_by_Year': int})
)

print(df_reviews)


    Year  Total_Reviews_by_Year
0   2005                    854
1   2006                   3853
2   2007                  15363
3   2008                  48226
4   2009                  74387
5   2010                 138587
6   2011                 230813
7   2012                 286570
8   2013                 383950
9   2014                 522275
10  2015                 688415
11  2016                 758882
12  2017                 820048
13  2018                 906362
14  2019                 907284
15  2020                 554557
16  2021                 618189
17  2022                  31665
18  2023                      0
19  2024                      0


In [13]:
# Sum the per-year review counts of `df_reviews`, then show the column dtypes
# followed by the grand total
total_reviews = df_reviews['Total_Reviews_by_Year'].sum()
print(df_reviews.dtypes)
print(f"Total number of reviews: {total_reviews}")

Year                     object
Total_Reviews_by_Year     int32
dtype: object
Total number of reviews: 6990280


In [18]:
# Combine the per-year user counts (`df`) and review counts (`df_reviews`)
# into one frame with columns 'Year', 'Users' and 'Reviews'.
merged_df = (
    # Outer merge keeps a year even if it appears in only one of the frames
    pd.merge(df, df_reviews, on='Year', how='outer')
    # Rename by column name rather than by position, so a change in column
    # order upstream cannot silently swap the two counts
    .rename(columns={'Total_Users_by_year': 'Users', 'Total_Reviews_by_Year': 'Reviews'})
    # A year missing from one frame means no users / reviews for that year
    .fillna(0)
    .astype({'Users': int, 'Reviews': int})
    # Chronological order with a clean 0..n-1 index for the trimming below
    .sort_values('Year')
    .reset_index(drop=True)
)

# Drop the trailing years that have neither users nor reviews. Trimming by
# activity instead of a fixed `iloc[:-2]` keeps the result correct however
# many empty years follow the last year present in the dataset.
active_rows = merged_df.index[(merged_df['Users'] > 0) | (merged_df['Reviews'] > 0)]
if len(active_rows) > 0:
    # .loc slicing is label-inclusive, so the last active year is kept
    merged_df = merged_df.loc[:active_rows.max()]

print(merged_df)


    Year   Users  Reviews
0   2005     937      854
1   2006    5423     3853
2   2007   15340    15363
3   2008   31097    48226
4   2009   64911    74387
5   2010  109054   138587
6   2011  176435   230813
7   2012  195955   286570
8   2013  209762   383950
9   2014  233465   522275
10  2015  247850   688415
11  2016  217620   758882
12  2017  151024   820048
13  2018  133568   906362
14  2019  104655   907284
15  2020   47444   554557
16  2021   40485   618189
17  2022    2782    31665
