In [2]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp39-cp39-macosx_10_9_x86_64.whl size=507971 sha256=8e160d95a2428f63adb109a3741b542997f34c9ce68ce70b12768f82fd817514
  Stored in directory: /Users/sarthak/Library/Caches/pip/wheels/42/41/d3/a56ae864ad22cc6583ec9312be43fbc611c87e53dc49aac953
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [3]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import pickle

# Load the filtered reviews data
reviews_df = pd.read_csv("filtered_reviews_data.csv")

In [4]:
# Step 1: Prepare data for collaborative filtering using Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(reviews_df[['user_id', 'business_id', 'stars']], reader)

# Step 2: Train-test split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Step 3: Train SVD model
svd = SVD(n_factors=100, biased=True, verbose=True)
svd.fit(trainset)

# Step 4: Evaluate model
predictions = svd.test(testset)
rmse = accuracy.rmse(predictions)
print(f"Model RMSE: {rmse:.4f}")



Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
RMSE: 1.2172
Model RMSE: 1.2172


In [5]:
# Step 5: Save model
with open("svd_model.pkl", "wb") as f:
    pickle.dump(svd, f)

# Step 6: Create prediction function
def recommend_for_user(user_id, top_n=5):
    unique_businesses = reviews_df['business_id'].unique()
    reviewed = set(reviews_df[reviews_df['user_id'] == user_id]['business_id'])
    to_predict = [bid for bid in unique_businesses if bid not in reviewed]
    
    predictions = [
        (bid, svd.predict(user_id, bid).est)
        for bid in to_predict
    ]
    top_predictions = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_n]

    # Map business IDs back to names
    business_df = pd.read_csv("filtered_business_data.csv")
    id_to_name = dict(zip(business_df['business_id'], business_df['name']))
    recommendations = [(id_to_name.get(bid, "Unknown"), score) for bid, score in top_predictions]

    print(f"\nTop {top_n} Recommendations for User {user_id}:")
    for name, score in recommendations:
        print(f"{name} - Predicted Rating: {score:.2f}")

# Example usage
example_user = reviews_df['user_id'].iloc[0]
recommend_for_user(example_user)


Top 5 Recommendations for User 8g_iMtfSiwikVnbP2etR0A:
Milktooth - Predicted Rating: 5.00
Tavern - Predicted Rating: 5.00
Chase's Hop Shop - Predicted Rating: 5.00
Angelina's - Predicted Rating: 5.00
Sips Specialty Coffee House - Predicted Rating: 5.00


In [6]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import pickle

# Load filtered reviews and business data
reviews_df = pd.read_csv("filtered_reviews_data.csv")
business_df = pd.read_csv("filtered_business_data.csv")

In [7]:
# Step 1: Prepare data for collaborative filtering using Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(reviews_df[['user_id', 'business_id', 'stars']], reader)

# Step 2: Train-test split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Step 3: Train SVD model
svd = SVD(n_factors=100, biased=True, verbose=True)
svd.fit(trainset)

# Step 4: Evaluate model
predictions = svd.test(testset)
rmse = accuracy.rmse(predictions)
print(f"Model RMSE: {rmse:.4f}")



Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
RMSE: 1.2170
Model RMSE: 1.2170


In [8]:
# Step 5: Save model
with open("svd_model.pkl", "wb") as f:
    pickle.dump(svd, f)

# Step 6: Precompute average sentiment per business
sentiment_avg = reviews_df.groupby('business_id')['vader_sentiment'].mean().to_dict()


KeyError: 'Column not found: vader_sentiment'

In [None]:
# Step 7: Enhanced recommendation function
def recommend_for_user(user_id, top_n=5, city=None):
    user_reviews = reviews_df[reviews_df['user_id'] == user_id]
    reviewed = set(user_reviews['business_id'])

    # Filter candidate businesses
    candidate_df = business_df[~business_df['business_id'].isin(reviewed)]
    if city:
        candidate_df = candidate_df[candidate_df['city'].str.lower() == city.lower()]

    # Predict and rank
    recommendations = []
    for _, row in candidate_df.iterrows():
        bid = row['business_id']
        name = row['name']
        pred_rating = svd.predict(user_id, bid).est
        sentiment = sentiment_avg.get(bid, 0.0)
        final_score = 0.6 * pred_rating + 0.4 * sentiment  # Weighted combination
        recommendations.append((name, pred_rating, sentiment, final_score))

    top_recs = sorted(recommendations, key=lambda x: x[3], reverse=True)[:top_n]

    print(f"\nTop {top_n} Recommendations for User {user_id}{' in ' + city if city else ''}:")
    for name, rating, sentiment, score in top_recs:
        print(f"{name} | Predicted Rating: {rating:.2f} | Sentiment: {sentiment:.2f} | Final Score: {score:.2f}")

# Example usage
example_user = reviews_df['user_id'].iloc[0]
recommend_for_user(example_user, top_n=5, city='Tucson')
