In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_predict

# Load the Yelp business records (newline-delimited JSON: one object per line).
# The dataset directory can be overridden through the YELP_DATA_DIR environment
# variable; when it is unset, the original local download location is used.
DATA_DIR = Path(os.environ.get("YELP_DATA_DIR", "/Users/sarthak/Downloads/Yelp JSON/yelp_dataset"))
business_df = pd.read_json(DATA_DIR / "yelp_academic_dataset_business.json", lines=True)


In [None]:
# Load the Yelp review records (newline-delimited JSON: one object per line).
# The dataset directory can be overridden through the YELP_DATA_DIR environment
# variable; when it is unset, the original local download location is used.
DATA_DIR = Path(os.environ.get("YELP_DATA_DIR", "/Users/sarthak/Downloads/Yelp JSON/yelp_dataset"))
reviews_df = pd.read_json(DATA_DIR / "yelp_academic_dataset_review.json", lines=True)

In [None]:
# Report the (rows, columns) size of the business table
business_shape = business_df.shape
print(business_shape)

In [None]:
# Report the (rows, columns) size of the review table
reviews_shape = reviews_df.shape
print(reviews_shape)

In [None]:
# Display initial dataset structure and information
print("\nBusiness Dataset Overview:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
business_df.info()

# describe(include='all') counts distinct values in object columns, which
# requires hashable cells. Columns holding dict/list values are left out of
# the summary so the call cannot fail with "unhashable type".
# NOTE(review): presumably this affects the nested JSON fields of the
# business file — confirm against the loaded columns.
summary_cols = [
    col for col in business_df.columns
    if not business_df[col].map(lambda cell: isinstance(cell, (dict, list))).any()
]
print(business_df[summary_cols].describe(include='all'))

In [None]:
print("\nReview Dataset Overview:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
reviews_df.info()

In [None]:
# Show the distinct values found in the 'city' column
unique_cities = business_df['city'].unique()
print("Unique Cities in Dataset:", unique_cities)



In [None]:
# Keep only the businesses that have a city recorded
business_df = business_df.dropna(axis="index", how="any", subset=["city"])


In [None]:
# List the column labels of the business table
business_columns = business_df.columns
print(business_columns)


In [None]:
# Businesses per city, most frequent city first
city_counts = business_df['city'].value_counts()

# The first ten entries are the ten cities with the most businesses
top_cities = city_counts.iloc[:10]
print("\nTop 10 Cities with the Most Businesses:", top_cities, sep="\n")

In [None]:
# Bar chart: the ten cities with the most businesses
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(x=top_cities.index, y=top_cities.values, palette='coolwarm', ax=ax)
ax.set_xlabel("City")
ax.set_ylabel("Number of Businesses")
ax.set_title("Top 10 Cities with the Most Businesses")
# Tilt the city names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:

# Peek at the spellings present in the data so the target list below can be
# checked against what the dataset actually contains.
print("Sample city names:", business_df['city'].dropna().unique()[:20])  # first 20 unique city names

# Cities to keep; they are compared case-insensitively further down.
selected_cities = ["Philadelphia", "tucson", "Tampa", "Indianapolis", "Nashville"]
selected_cities_lower = [c.lower() for c in selected_cities]

# Normalise the city column: drop missing values, trim surrounding whitespace
# and lowercase. .assign() builds a new frame instead of writing a column into
# a possibly-sliced one, which avoids SettingWithCopyWarning and keeps the
# cell safe to re-run.
business_df = business_df.dropna(subset=['city']).assign(
    city=lambda frame: frame['city'].astype(str).str.strip().str.lower()
)

# Keep only the businesses located in one of the selected cities
filtered_business_df = business_df[business_df['city'].isin(selected_cities_lower)]

# Print results
print("Filtered dataset shape:", filtered_business_df.shape)
print(filtered_business_df['city'].value_counts())

In [None]:
# Define selected cities for analysis (lowercase spellings)
selected_cities = ["philadelphia", "tucson", "tampa", "indianapolis", "nashville"]

# Check available city names before filtering
print("\nUnique city names in dataset:")
print(business_df['city'].unique())

# Filter dataset for selected cities
filtered_business_df = business_df[business_df['city'].isin(selected_cities)]
print("\nFiltered Business Dataset Overview:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
filtered_business_df.info()

In [None]:
# Extract only restaurant businesses: rows whose 'categories' text contains
# 'Restaurant'; rows with a missing 'categories' value are excluded (na=False).
is_restaurant = filtered_business_df['categories'].str.contains('Restaurant', na=False)
restaurant_df = filtered_business_df[is_restaurant]
print("\nFiltered Restaurant Dataset Overview:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
restaurant_df.info()


In [None]:
# Tally how many restaurants each city has, largest first
restaurant_counts = restaurant_df['city'].value_counts()
print("\nNumber of Restaurants per City:", restaurant_counts, sep="\n")

In [None]:
# Merge reviews with restaurant data
print("\nMerging reviews with restaurant data...")
# The result is bound back to `reviews_df`, so the cell must tolerate being
# run more than once: a 'city' column left over from an earlier run is dropped
# first, otherwise the merge would produce 'city_x'/'city_y' and later
# groupby('city') calls would fail.
# validate='many_to_one' raises if a business_id appears more than once on the
# restaurant side, which would otherwise silently duplicate reviews.
reviews_df = reviews_df.drop(columns=['city'], errors='ignore').merge(
    restaurant_df[['business_id', 'city']],
    on='business_id',
    how='inner',  # keep only reviews of the selected restaurants
    validate='many_to_one',
)
print("Filtered Review Dataset Overview:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
reviews_df.info()

In [None]:
# Mean star rating of the reviews, grouped by city
stars_by_city = reviews_df.groupby('city')['stars']
avg_star_ratings = stars_by_city.mean()
print("\nAverage Star Ratings per City:", avg_star_ratings, sep="\n")


In [None]:
# Compute review length statistics
# .str.len() is the vectorised character count; unlike .apply(len) it yields
# NaN for a missing review text instead of raising TypeError.
reviews_df['review_length'] = reviews_df['text'].str.len()
review_length_stats = reviews_df.groupby('city')['review_length'].describe()
print("\nReview Length Statistics per City:")
print(review_length_stats)

In [None]:
# Count reviews per year
reviews_df['year'] = pd.to_datetime(reviews_df['date']).dt.year

# Rows are cities, columns are years. fill_value=0 fills city/year pairs that
# have no reviews while keeping the counts as integers (unstack() followed by
# fillna(0) would turn them into floats).
review_counts_by_year = (
    reviews_df.groupby(['city', 'year'])
    .size()
    .unstack(fill_value=0)
)


In [None]:
# Bar chart: how many restaurants each selected city has
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=restaurant_counts.index, y=restaurant_counts.values, palette='viridis', ax=ax)
ax.set_xlabel("City")
ax.set_ylabel("Number of Restaurants")
ax.set_title("Number of Restaurants per City")
plt.show()


In [None]:
# Bar chart: mean review star rating for each selected city
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=avg_star_ratings.index, y=avg_star_ratings.values, palette='coolwarm', ax=ax)
ax.set_xlabel("City")
ax.set_ylabel("Average Star Rating")
ax.set_title("Average Star Rating per City")
plt.show()

In [None]:
# Box plot: spread of review lengths within each city
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=reviews_df, x='city', y='review_length', palette='Set2', ax=ax)
ax.set_xlabel("City")
ax.set_ylabel("Review Length")
ax.set_title("Distribution of Review Lengths per City")
plt.show()

In [None]:

# Visualization - Review counts over years per city
# DataFrame.plot() opens a new figure of its own unless it is handed an Axes,
# so the Axes is created first and passed in; calling plt.figure() beforehand
# would leave an extra, empty figure in the output.
fig, ax = plt.subplots(figsize=(12, 6))
# Transpose so that years run along the x-axis and each city becomes one line
review_counts_by_year.T.plot(kind='line', marker='o', ax=ax)
ax.set_xlabel("Year")
ax.set_ylabel("Number of Reviews")
ax.set_title("Review Trends Over Time per City")
ax.legend(title="City")
plt.show()

In [None]:
# Persist the filtered tables as CSV files (row index omitted) for later analysis
csv_exports = (
    (filtered_business_df, "filtered_business_data.csv"),
    (reviews_df, "filtered_reviews_data.csv"),
)
for frame, csv_name in csv_exports:
    frame.to_csv(csv_name, index=False)

In [None]:
# Binary sentiment label: 1 when the review has more than 3 stars, otherwise 0
is_positive = reviews_df['stars'] > 3
reviews_df['sentiment'] = is_positive.astype('int64')

# Label array used as the ground truth by the models
y_true = reviews_df['sentiment'].to_numpy()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Baseline model on the review text (before pre-processing).
# CountVectorizer's defaults still lowercase and tokenise the text; what this
# baseline leaves out is stop-word removal and TF-IDF weighting.
vectorizer_raw = CountVectorizer()
X_raw = vectorizer_raw.fit_transform(reviews_df['text'])

# Out-of-fold predictions: every review is labelled by a model that was not
# trained on it, so comparing y_pred_raw with y_true is not inflated by
# predicting on the training data. The result has one prediction per review,
# aligned with y_true. (The vocabulary itself is still fitted on all reviews.)
y_pred_raw = cross_val_predict(MultinomialNB(), X_raw, y_true, cv=5)

# Final baseline model, fitted on every review
model_raw = MultinomialNB()
model_raw.fit(X_raw, y_true)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Model on pre-processed text: lowercased, English stop words removed and
# terms weighted by TF-IDF.
vectorizer_clean = TfidfVectorizer(stop_words='english', lowercase=True)
X_clean = vectorizer_clean.fit_transform(reviews_df['text'])

# Out-of-fold predictions: every review is labelled by a model that was not
# trained on it, so comparing y_pred_clean with y_true is not inflated by
# predicting on the training data. The result has one prediction per review,
# aligned with y_true. (The TF-IDF vocabulary and weights are still fitted on
# all reviews.)
y_pred_clean = cross_val_predict(MultinomialNB(), X_clean, y_true, cv=5)

# Final model, fitted on every review
model_clean = MultinomialNB()
model_clean.fit(X_clean, y_true)
