In [1]:
import requests

url = "https://business.yelp.com/external-assets/files/Yelp-JSON.zip"
filename = "Yelp-JSON.zip"

# Adding a User-Agent header to mimic a web browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

response = requests.get(url, stream=True, headers=headers)
response.raise_for_status() # Raise an exception for HTTP errors

with open(filename, 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)

print(f"Downloaded {filename} successfully.")

Downloaded Yelp-JSON.zip successfully.


In [2]:
!unzip Yelp-JSON.zip
print("Extracted Yelp-JSON.zip successfully.")

Archive:  Yelp-JSON.zip
   creating: Yelp JSON/
  inflating: Yelp JSON/Yelp Dataset Documentation & ToS copy.pdf  
  inflating: __MACOSX/Yelp JSON/._Yelp Dataset Documentation & ToS copy.pdf  
  inflating: Yelp JSON/yelp_dataset.tar  
  inflating: __MACOSX/Yelp JSON/._yelp_dataset.tar  
Extracted Yelp-JSON.zip successfully.


In [3]:
!tar -xf "Yelp JSON/yelp_dataset.tar" -C .

In [4]:
!du -sh *

80K	Dataset_User_Agreement.pdf
16K	__MACOSX
55M	sample_data
114M	yelp_academic_dataset_business.json
274M	yelp_academic_dataset_checkin.json
5.0G	yelp_academic_dataset_review.json
173M	yelp_academic_dataset_tip.json
3.2G	yelp_academic_dataset_user.json
4.1G	Yelp JSON
4.1G	Yelp-JSON.zip


In [5]:
import pandas as pd
import os
import random
import io

# Redefine the paths to the JSON files (assuming they are in the current directory)
business_file = 'yelp_academic_dataset_business.json'
review_file = 'yelp_academic_dataset_review.json'
user_file = 'yelp_academic_dataset_user.json'

# Define sampling rates
sample_rate_business = 0.50 # 50% for business file
sample_rate_large = 0.10    # 10% for review and user files

# --- Re-Sample and Load DataFrames ---
print(f"Re-sampling and loading {business_file} with rate {sample_rate_business*100}%...")
sampled_business_lines = []
with open(business_file, 'r', encoding='utf-8') as f:
    for line in f:
        if random.random() < sample_rate_business:
            sampled_business_lines.append(line)
df_business_sampled = pd.read_json(io.StringIO(''.join(sampled_business_lines)), lines=True)
print("df_business_sampled shape:", df_business_sampled.shape)

print(f"Re-sampling and loading {review_file} with rate {sample_rate_large*100}%...")
sampled_review_lines = []
with open(review_file, 'r', encoding='utf-8') as f:
    for line in f:
        if random.random() < sample_rate_large:
            sampled_review_lines.append(line)
df_review_sampled = pd.read_json(io.StringIO(''.join(sampled_review_lines)), lines=True)
print("df_review_sampled shape:", df_review_sampled.shape)

print(f"Re-sampling and loading {user_file} with rate {sample_rate_large*100}%...")
sampled_user_lines = []
with open(user_file, 'r', encoding='utf-8') as f:
    for line in f:
        if random.random() < sample_rate_large:
            sampled_user_lines.append(line)
df_user_sampled = pd.read_json(io.StringIO(''.join(sampled_user_lines)), lines=True)
print("df_user_sampled shape:", df_user_sampled.shape)

print("Sampled Yelp dataset files re-loaded into DataFrames successfully.")

# --- 1. Distribution of business ratings (stars) from df_business_sampled ---
print("\n--- 1. Distribution of business ratings (stars) from df_business_sampled ---")
dist_business_stars_sampled = df_business_sampled['stars'].value_counts().sort_index()
print("Distribution of Sampled Business Ratings:")
print(dist_business_stars_sampled)

# --- 2. Top 10 business categories by count from df_business_sampled ---
print("\n--- 2. Top 10 business categories by count from df_business_sampled ---")
# Filter out rows where 'categories' is None or NaN
df_business_sampled_clean_categories = df_business_sampled.dropna(subset=['categories'])

# Split categories string into a list of categories and then explode them into separate rows
all_categories_sampled = df_business_sampled_clean_categories['categories'].str.split(', ').explode()

# Count the occurrences of each category
category_counts_sampled = all_categories_sampled.value_counts()

# Get the top 10 most frequent categories
top_10_categories_sampled = category_counts_sampled.head(10)

print("Top 10 Sampled Business Categories:")
print(top_10_categories_sampled)

# --- 3. Distribution of review stars from df_review_sampled ---
print("\n--- 3. Distribution of review stars from df_review_sampled ---")
dist_review_stars_sampled = df_review_sampled['stars'].value_counts().sort_index()
print("Distribution of Sampled Review Stars:")
print(dist_review_stars_sampled)

# --- 4. Top 10 users by review count from df_user_sampled ---
print("\n--- 4. Top 10 users by review count from df_user_sampled ---")
top_10_users_by_review_count_sampled = df_user_sampled.sort_values(by='review_count', ascending=False).head(10)
print("Top 10 Sampled Users by Review Count:")
print(top_10_users_by_review_count_sampled[['name', 'review_count', 'average_stars']])

Re-sampling and loading yelp_academic_dataset_business.json with rate 50.0%...
df_business_sampled shape: (75109, 14)
Re-sampling and loading yelp_academic_dataset_review.json with rate 10.0%...
df_review_sampled shape: (699062, 9)
Re-sampling and loading yelp_academic_dataset_user.json with rate 10.0%...
df_user_sampled shape: (198717, 22)
Sampled Yelp dataset files re-loaded into DataFrames successfully.

--- 1. Distribution of business ratings (stars) from df_business_sampled ---
Distribution of Sampled Business Ratings:
stars
1.0     1042
1.5     2477
2.0     4810
2.5     7142
3.0     9221
3.5    13215
4.0    15494
4.5    13579
5.0     8129
Name: count, dtype: int64

--- 2. Top 10 business categories by count from df_business_sampled ---
Top 10 Sampled Business Categories:
categories
Restaurants         26038
Food                13885
Shopping            12273
Home Services        7224
Beauty & Spas        7070
Nightlife            6090
Health & Medical     5930
Local Services     

In [23]:
# Sentiment Analysis on Yelp Reviews  (SpecialTopic_HM02)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [24]:
# فقط از reviewها استفاده می‌کنیم
df_sent = df_review_sampled[['text', 'stars']].copy()

# حذف ردیف‌های ناقص
df_sent = df_sent.dropna(subset=['text', 'stars'])

# ستاره‌ها را به int تبدیل می‌کنیم
df_sent['stars'] = df_sent['stars'].astype(int)

# تعریف برچسب طبق صورت سؤال:
# در review ها، کامنت‌های با 3 و 4 و 5 ستاره → positive
# بقیه → negative
def make_label(stars):
    return 'positive' if stars in [3, 4, 5] else 'negative'

df_sent['label'] = df_sent['stars'].apply(make_label)

print("تعداد هر کلاس:")
print(df_sent['label'].value_counts())
print("\nنسبت هر کلاس:")
print(df_sent['label'].value_counts(normalize=True))

df_sent.head()


تعداد هر کلاس:
label
positive    537350
negative    161712
Name: count, dtype: int64

نسبت هر کلاس:
label
positive    0.768673
negative    0.231327
Name: proportion, dtype: float64


Unnamed: 0,text,stars,label
0,"Yes, this is the only sushi place in town. How...",4,positive
1,I was really between 3 and 4 stars for this on...,4,positive
2,After my ROTD yesterday of a different Sweet ...,4,positive
3,I stopped here because it was by the hotel I w...,5,positive
4,I just started going to Sushi Nara this month ...,5,positive


In [25]:
# برای سریع‌تر شدن آموزش، می‌توانیم یک نمونه تصادفی از داده‌ها برداریم.
# اگر دوست نداری، می‌توانی این سلول را اجرا نکنی یا N را کوچک‌تر/بزرگ‌تر کنی.

N = 50000  # مثلا ۵۰هزار نمونه
if len(df_sent) > N:
    df_sent = df_sent.sample(n=N, random_state=42)
    print(f"بعد از نمونه‌گیری: {len(df_sent)} ردیف")
else:
    print(f"کل داده استفاده می‌شود: {len(df_sent)} ردیف")

df_sent['label'].value_counts()


بعد از نمونه‌گیری: 50000 ردیف


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
positive,38456
negative,11544


In [26]:
X = df_sent['text']
y = df_sent['label']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))


Train size: 40000
Test size: 10000


In [28]:
# Baseline: همیشه کلاس اکثریت را پیش‌بینی می‌کنیم

majority_class = y_train.value_counts().idxmax()
print("Majority class:", majority_class)

y_pred_majority = [majority_class] * len(y_test)

print("=== Baseline: Majority Class ===")
print("Accuracy:", accuracy_score(y_test, y_pred_majority))
print("\nClassification report:\n", classification_report(y_test, y_pred_majority))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred_majority))


Majority class: positive
=== Baseline: Majority Class ===
Accuracy: 0.7691

Classification report:
               precision    recall  f1-score   support

    negative       0.00      0.00      0.00      2309
    positive       0.77      1.00      0.87      7691

    accuracy                           0.77     10000
   macro avg       0.38      0.50      0.43     10000
weighted avg       0.59      0.77      0.67     10000


Confusion matrix:
 [[   0 2309]
 [   0 7691]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
# Simple heuristic sentiment classifier for Yelp
# شبیه Simple Hiuristic method در SpecialTopics_01، فقط اینجا انگلیسی

positive_words = ["good", "great", "excellent", "amazing", "fantastic", "love", "loved",
                  "awesome", "delicious", "tasty", "friendly", "perfect", "nice"]
negative_words = ["bad", "terrible", "awful", "horrible", "worst", "disgusting",
                  "slow", "rude", "cold", "overpriced", "dirty", "poor"]

def heuristic_sentiment(text):
    text_l = str(text).lower()
    if any(w in text_l for w in positive_words):
        return "positive"
    if any(w in text_l for w in negative_words):
        return "negative"
    # اگر هیچ نشانه‌ای نداشت، از baseline کمک می‌گیریم
    return majority_class

y_pred_heuristic = [heuristic_sentiment(t) for t in X_test]

print("=== Heuristic classifier ===")
print("Accuracy:", accuracy_score(y_test, y_pred_heuristic))
print("\nClassification report:\n", classification_report(y_test, y_pred_heuristic))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred_heuristic))


=== Heuristic classifier ===
Accuracy: 0.8235

Classification report:
               precision    recall  f1-score   support

    negative       0.89      0.27      0.41      2309
    positive       0.82      0.99      0.90      7691

    accuracy                           0.82     10000
   macro avg       0.85      0.63      0.65     10000
weighted avg       0.83      0.82      0.78     10000


Confusion matrix:
 [[ 622 1687]
 [  78 7613]]


In [30]:
# Bag of Words + Multinomial Naive Bayes

bow_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=20000,
    # ngram_range=(1,2),
    # min_df=5,
)

X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow  = bow_vectorizer.transform(X_test)

nb_bow = MultinomialNB()
nb_bow.fit(X_train_bow, y_train)

y_pred_bow = nb_bow.predict(X_test_bow)

print("=== Bag of Words + MultinomialNB ===")
print("Accuracy:", accuracy_score(y_test, y_pred_bow))
print("\nClassification report:\n", classification_report(y_test, y_pred_bow))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred_bow))


=== Bag of Words + MultinomialNB ===
Accuracy: 0.8827

Classification report:
               precision    recall  f1-score   support

    negative       0.73      0.77      0.75      2309
    positive       0.93      0.92      0.92      7691

    accuracy                           0.88     10000
   macro avg       0.83      0.84      0.84     10000
weighted avg       0.89      0.88      0.88     10000


Confusion matrix:
 [[1787  522]
 [ 651 7040]]


In [31]:
# TF-IDF + Multinomial Naive Bayes

tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=20000,
    # ngram_range=(1,2),
    # min_df=5,
)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf  = tfidf_vectorizer.transform(X_test)

nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train)

y_pred_tfidf = nb_tfidf.predict(X_test_tfidf)

print("=== TF-IDF + MultinomialNB ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
print("\nClassification report:\n", classification_report(y_test, y_pred_tfidf))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred_tfidf))


=== TF-IDF + MultinomialNB ===
Accuracy: 0.8724

Classification report:
               precision    recall  f1-score   support

    negative       0.88      0.51      0.65      2309
    positive       0.87      0.98      0.92      7691

    accuracy                           0.87     10000
   macro avg       0.88      0.75      0.79     10000
weighted avg       0.87      0.87      0.86     10000


Confusion matrix:
 [[1188 1121]
 [ 155 7536]]


In [32]:
# Boolean Bag of Words + Multinomial Naive Bayes

bool_vectorizer = CountVectorizer(
    binary=True,
    stop_words='english',
    max_features=20000
)

X_train_bool = bool_vectorizer.fit_transform(X_train)
X_test_bool  = bool_vectorizer.transform(X_test)

nb_bool = MultinomialNB()
nb_bool.fit(X_train_bool, y_train)

y_pred_bool = nb_bool.predict(X_test_bool)

print("=== Boolean BOW + MultinomialNB ===")
print("Accuracy:", accuracy_score(y_test, y_pred_bool))
print("\nClassification report:\n", classification_report(y_test, y_pred_bool))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred_bool))


=== Boolean BOW + MultinomialNB ===
Accuracy: 0.8876

Classification report:
               precision    recall  f1-score   support

    negative       0.74      0.79      0.76      2309
    positive       0.93      0.92      0.93      7691

    accuracy                           0.89     10000
   macro avg       0.84      0.85      0.84     10000
weighted avg       0.89      0.89      0.89     10000


Confusion matrix:
 [[1815  494]
 [ 630 7061]]
