In [66]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

def _load_json(path):
    """Return the object decoded from the JSON document stored at *path*."""
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default text encoding.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


# Load data from JSON files
# NOTE(review): json.load expects exactly one JSON document per file. If any of
# these files is newline-delimited JSON, it needs a line-oriented reader
# instead -- confirm against the actual data.
business_data = _load_json('/Users/semergy/yelp/yelp_business.json')
checkin_data = _load_json('/Users/semergy/yelp/yelp_checkin.json')
data_data = _load_json('/Users/semergy/yelp/yelp_data.json')
photo_data = _load_json('/Users/semergy/yelp/yelp_photo.json')
review_data = _load_json('/Users/semergy/yelp/yelp_review.json')
tip_data = _load_json('/Users/semergy/yelp/yelp_tip.json')
user_data = _load_json('/Users/semergy/yelp/yelp_user.json')

# Wrap each decoded JSON payload in its own pandas DataFrame.
# The targets on the left line up positionally with the payloads on the right.
(
    business_df,
    checkin_df,
    data_df,
    photo_df,
    review_df,
    tip_df,
    user_df,
) = [
    pd.DataFrame(payload)
    for payload in (
        business_data,
        checkin_data,
        data_data,
        photo_data,
        review_data,
        tip_data,
        user_data,
    )
]

# Left-join every auxiliary table onto the business table, keyed on
# 'business_id'. When an incoming column name already exists on the left, the
# left column keeps its name and the incoming copy is tagged with '_dup'.
merged_df = business_df
for df in (checkin_df, data_df, photo_df, review_df, tip_df, user_df):
    merged_df = merged_df.merge(
        df,
        how='left',
        on='business_id',
        suffixes=('', '_dup'),
    )

# Keep only the columns whose label does not end in '_dup'.
is_duplicate = merged_df.columns.str.endswith('_dup')
merged_df = merged_df.loc[:, ~is_duplicate]

# Columns treated as irrelevant or non-numeric for the analysis below.
columns_to_drop = [
    'address',
    'categories',
    'city',
    'hours',
    'name',
    'neighborhood',
    'postal_code',
    'state',
    'time',
    'business_id',
    'attributes',
]
merged_df = merged_df.drop(labels=columns_to_drop, axis=1)

# Replace every missing value with 0 (stored under a new name; merged_df
# itself keeps its NaNs).
merged_df_filled = merged_df.fillna(value=0)

# Pairwise correlations between the columns of the NaN-filled frame.
correlation_matrix = merged_df_filled.corr()

# The 'stars' column of that matrix: each column's correlation with the target.
correlation_with_stars = correlation_matrix.loc[:, 'stars']

print("Correlations with 'stars':\n", correlation_with_stars)

# Split by sign. Positives are ordered largest first, negatives most negative
# first; values that are exactly 0 (or NaN) land in neither group.
is_positive = correlation_with_stars > 0
is_negative = correlation_with_stars < 0
positive_correlations = correlation_with_stars.loc[is_positive].sort_values(ascending=False)
negative_correlations = correlation_with_stars.loc[is_negative].sort_values(ascending=True)

print("\nPositive correlations:\n", positive_correlations)
print("\nNegative correlations:\n", negative_correlations)

# Keep the columns whose absolute correlation with 'stars' exceeds 0.05
# (an arbitrary cut-off chosen for this example).
exceeds_threshold = correlation_with_stars.abs() > 0.05
significant_features = list(correlation_with_stars.loc[exceeds_threshold].index)

# Take 'stars' out of the list so the target is not used as one of its own
# features.
significant_features.remove('stars')

# Target: a single-column DataFrame holding the Yelp star ratings.
target_df = merged_df_filled.loc[:, ['stars']]

# Features: only the columns that passed the correlation cut-off.
features_df = merged_df_filled.loc[:, significant_features]

print("\nSignificant features selected for training the model:\n", significant_features)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    target_df,
    test_size=0.2,
    random_state=1,
)

# Fit a linear regression on the training rows (fit() returns the estimator).
stars_predict = LinearRegression().fit(X_train, y_train)

# Predict star ratings for the held-out rows.
y_pred = stars_predict.predict(X_test)

# Score the predictions against the held-out targets.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("\nR^2 Score:", r2)

# Take the first five held-out rows as a small spot-check sample.
sample_data = X_test.head(5)

# Run the fitted model on just those rows.
sample_predictions = stars_predict.predict(sample_data)

# Show the predictions next to the true ratings for the same five rows.
print("\nSample Predictions:", sample_predictions)
print("\nActual Values:", y_test.head(5).values)


Correlations with 'stars':
 alcohol?                     -0.043332
good_for_kids                -0.030382
has_bike_parking              0.068084
has_wifi                     -0.039857
is_open                       0.051913
latitude                     -0.093255
longitude                    -0.082265
price_range                  -0.052565
review_count                  0.032413
stars                         1.000000
take_reservations            -0.024486
takes_credit_cards            0.037748
weekday_checkins              0.004130
weekend_checkins              0.007863
pic_count                     0.001727
average_caption_length        0.000040
number_pics                   0.001727
average_review_age           -0.125645
average_review_length        -0.277081
average_review_sentiment      0.782187
number_funny_votes            0.001320
number_cool_votes             0.043375
number_useful_votes          -0.000066
average_tip_length           -0.052899
number_tips                   0.0140