In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
import pickle
import json
from collections import defaultdict
import warnings
# Suppress every warning for the rest of the session so that warnings do not
# appear in cell output. NOTE(review): this also hides warnings that may point
# at real problems; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Reaching this line means every import above resolved.
print("All imports successful!")

All imports successful!


In [2]:
# Location of the processed dataset. It can be overridden through the
# RESTAURANT_DATA_PATH environment variable so the notebook is not tied to a
# single machine; the original absolute path remains the default so existing
# runs behave exactly as before.
DATA_PATH = os.environ.get(
    'RESTAURANT_DATA_PATH',
    '/Users/sajibhossain/Desktop/RestaurantRecommendationSystem/app/ai_service/src/data/processed/processed_data.csv',
)
df = pd.read_csv(DATA_PATH)

# Basic orientation: size, schema and a few restaurant names.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Sample restaurants: {df['name'].head().tolist()}")

# Check data quality
# Total number of missing cells across the whole frame (a single integer).
print(f"\nMissing values:")
print(df.isnull().sum().sum())
# How many columns there are of each dtype.
print(f"Data types:")
print(df.dtypes.value_counts())

Dataset shape: (51717, 42)
Columns: ['name', 'location', 'cuisines', 'rest_type', 'price_quality_ratio', 'popularity_score', 'quality_score', 'textblob_polarity', 'textblob_subjectivity', 'vote_density', 'votes', 'cluster_distance', 'avg_word_length', 'cuisines_target_encoded', 'review_length', 'cuisine_similarity_mean', 'cuisine_similarity_std', 'restaurant_cluster', 'sentiment_score', 'avg_sentence_length', 'word_count', 'sentence_count', 'positive_words', 'review_count', 'cuisines_count_encoded', 'rest_type_target_encoded', 'cost_clean', 'rest_type_count_encoded', 'cost_per_person', 'negative_words', 'location_similarity', 'has_detailed_review', 'location_count_encoded', 'location_popularity', 'location_target_encoded', 'book_table_binary', 'service_score', 'is_new_restaurant', 'cuisine_count', 'online_order_binary', 'cuisine_similarity_max', 'rating_clean']
Sample restaurants: ['Jalsa', 'Spice Elephant', 'San Churro Cafe', 'Addhuri Udupi Bhojana', 'Grand Village']

Missing values:


In [3]:

def prepare_features_for_similarity(df):
    """Build the standardized numeric matrix used for similarity scoring.

    Every numeric column of ``df`` except ``rating_clean`` and ``votes`` is
    selected, missing values are replaced by the column median, and the
    result is standardized with a freshly fitted ``StandardScaler``.

    Args:
        df: DataFrame holding the processed restaurant data.

    Returns:
        A tuple ``(matrix, feature_names, scaler)`` where ``matrix`` is the
        standardized float32 array, ``feature_names`` lists the selected
        columns in matrix-column order, and ``scaler`` is the fitted
        ``StandardScaler``.
    """
    # Columns that are deliberately left out of the similarity features.
    excluded = {'rating_clean', 'votes'}

    similarity_features = [
        column
        for column in df.select_dtypes(include=[np.number]).columns.tolist()
        if column not in excluded
    ]

    print(f"Selected {len(similarity_features)} features for similarity calculation")

    raw_values = df[similarity_features].copy()

    # Median imputation, then float32 to halve the memory footprint.
    imputed = raw_values.fillna(raw_values.median()).astype(np.float32)

    scaler = StandardScaler()
    standardized = scaler.fit_transform(imputed)

    # Cast the scaler output to float32 before handing it back.
    standardized = standardized.astype(np.float32)

    return standardized, similarity_features, scaler


# Build the standardized feature matrix, the ordered list of feature names
# that label its columns, and the fitted scaler from the loaded dataset.
feature_matrix, similarity_features, scaler = prepare_features_for_similarity(df)

# Report the matrix size (nbytes converted to MiB) and the first ten features.
print(f"Feature matrix shape: {feature_matrix.shape}")
print(f"Memory usage: {feature_matrix.nbytes / 1024 / 1024:.2f} MB")
print(f"Sample features: {similarity_features[:10]}")

Selected 36 features for similarity calculation
Feature matrix shape: (51717, 36)
Memory usage: 7.10 MB
Sample features: ['price_quality_ratio', 'popularity_score', 'quality_score', 'textblob_polarity', 'textblob_subjectivity', 'vote_density', 'cluster_distance', 'avg_word_length', 'cuisines_target_encoded', 'review_length']


In [4]:
def create_feature_weights(similarity_features):
    """Assign a similarity weight to every feature name.

    Features belong to one of three categories, each with a fixed budget that
    is split evenly over the category's full member list (regardless of how
    many members actually appear in ``similarity_features``):

    * restaurant characteristics - 0.40
    * review patterns            - 0.35
    * quality and popularity     - 0.25

    A feature that belongs to no category receives a flat weight of 0.01.

    Args:
        similarity_features: Iterable of feature names.

    Returns:
        Dict mapping each given feature name to its weight.
    """
    # (category budget, members) in priority order: the first category that
    # lists a feature decides its weight.
    category_table = (
        (0.40, (
            'cuisine_similarity_mean', 'cuisine_similarity_max', 'cuisine_similarity_std',
            'location_similarity', 'location_popularity',
            'cost_clean', 'cost_per_person',
            'rest_type_target_encoded', 'rest_type_count_encoded',
            'service_score', 'online_order_binary', 'book_table_binary',
        )),
        (0.35, (
            'sentiment_score', 'textblob_polarity', 'textblob_subjectivity',
            'positive_words', 'negative_words',
            'review_length', 'review_count', 'word_count', 'sentence_count',
            'avg_sentence_length', 'avg_word_length', 'has_detailed_review',
        )),
        (0.25, (
            'quality_score', 'popularity_score', 'vote_density',
            'restaurant_cluster', 'cluster_distance',
            'price_quality_ratio', 'is_new_restaurant',
        )),
    )

    per_feature_share = {}
    for budget, members in category_table:
        share = budget / len(members)
        for member in members:
            per_feature_share.setdefault(member, share)

    # Uncategorised features fall back to the flat 0.01 weight.
    return {
        feature: per_feature_share.get(feature, 0.01)
        for feature in similarity_features
    }

# Weight every selected feature according to its category.
feature_weights = create_feature_weights(similarity_features)

# Print the summed weight per category, using substring matches on the
# feature names to decide which category a feature is counted under.
# NOTE(review): these substring filters do not reproduce the category lists
# inside create_feature_weights. For example 'online_order_binary' and
# 'book_table_binary' are weighted as restaurant characteristics but match
# none of the first filter's substrings, 'cuisines_target_encoded' matches
# 'cuisine' although it only receives the 0.01 fallback weight, and
# 'location_popularity' is counted under both the first and third filters.
# The printed totals therefore differ from the 0.40 / 0.35 / 0.25 budgets;
# confirm and report from the real category lists instead.
print("Feature weights created:")
print(f"Restaurant characteristics weight: {sum([v for k, v in feature_weights.items() if 'cuisine' in k or 'location' in k or 'cost' in k or 'rest_type' in k or 'service' in k]):.2f}")
print(f"User review patterns weight: {sum([v for k, v in feature_weights.items() if 'sentiment' in k or 'textblob' in k or 'positive' in k or 'negative' in k or 'review' in k or 'word' in k or 'sentence' in k]):.2f}")
print(f"Quality & popularity weight: {sum([v for k, v in feature_weights.items() if 'quality' in k or 'popularity' in k or 'cluster' in k or 'price' in k]):.2f}")

Feature weights created:
Restaurant characteristics weight: 0.38
User review patterns weight: 0.35
Quality & popularity weight: 0.21


In [5]:

def create_similarity_matrix_chunked(feature_matrix, feature_weights, similarity_features, chunk_size=1000):
    """Compute the weighted cosine-similarity matrix block by block.

    Each column of ``feature_matrix`` is multiplied by the weight of the
    feature that labels it, every row is scaled to unit length once, and the
    pairwise similarities are then filled in as plain dot products of
    ``chunk_size`` x ``chunk_size`` blocks.

    Note that the full ``n x n`` float32 result is still allocated up front;
    chunking only bounds the size of the temporary blocks.

    Args:
        feature_matrix: 2-D NumPy array, one row per restaurant. It is not
            modified.
        feature_weights: Dict mapping feature name to weight. Features absent
            from the dict keep a weight of 1.
        similarity_features: Feature names in the same order as the columns
            of ``feature_matrix``.
        chunk_size: Number of rows (and columns) per block; must be >= 1.

    Returns:
        ``(n, n)`` float32 array of cosine similarities. An all-zero row has
        similarity 0 to every row, including itself.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative chunk_size used to make both loops empty and silently
    # return an all-zero matrix; reject it explicitly.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    n_restaurants = feature_matrix.shape[0]
    print(f"Creating similarity matrix for {n_restaurants} restaurants...")

    # Work on a floating-point copy so integer input can be weighted in place
    # and the caller's matrix stays untouched.
    float_dtype = np.result_type(feature_matrix.dtype, np.float32)
    weighted_matrix = np.array(feature_matrix, dtype=float_dtype)
    for column, feature in enumerate(similarity_features):
        if feature in feature_weights:
            weighted_matrix[:, column] *= feature_weights[feature]

    # Normalise every row exactly once (previously redone for each pair of
    # chunks). Zero-length rows keep a divisor of 1 so they stay all-zero.
    row_norms = np.linalg.norm(weighted_matrix, axis=1, keepdims=True)
    row_norms[row_norms == 0] = 1
    weighted_matrix /= row_norms

    similarity_matrix = np.zeros((n_restaurants, n_restaurants), dtype=np.float32)
    total_chunks = (n_restaurants + chunk_size - 1) // chunk_size

    for i in range(0, n_restaurants, chunk_size):
        end_i = min(i + chunk_size, n_restaurants)
        print(f"Processing chunk {i//chunk_size + 1}/{total_chunks}")
        row_block = weighted_matrix[i:end_i]

        for j in range(0, n_restaurants, chunk_size):
            end_j = min(j + chunk_size, n_restaurants)

            # Cosine similarity of unit vectors is just their dot product.
            similarity_matrix[i:end_i, j:end_j] = row_block @ weighted_matrix[j:end_j].T

    return similarity_matrix

# Build the full pairwise similarity matrix with the default chunk size.
similarity_matrix = create_similarity_matrix_chunked(feature_matrix, feature_weights, similarity_features)

# Summary statistics; each of min/max/mean scans the entire matrix.
print(f"Similarity matrix shape: {similarity_matrix.shape}")
print(f"Memory usage: {similarity_matrix.nbytes / 1024 / 1024:.2f} MB")
print(f"Similarity range: {similarity_matrix.min():.3f} to {similarity_matrix.max():.3f}")
print(f"Average similarity: {similarity_matrix.mean():.3f}")

Creating similarity matrix for 51717 restaurants...
Processing chunk 1/52
Processing chunk 2/52
Processing chunk 3/52
Processing chunk 4/52
Processing chunk 5/52
Processing chunk 6/52
Processing chunk 7/52
Processing chunk 8/52
Processing chunk 9/52
Processing chunk 10/52
Processing chunk 11/52
Processing chunk 12/52
Processing chunk 13/52
Processing chunk 14/52
Processing chunk 15/52
Processing chunk 16/52
Processing chunk 17/52
Processing chunk 18/52
Processing chunk 19/52
Processing chunk 20/52
Processing chunk 21/52
Processing chunk 22/52
Processing chunk 23/52
Processing chunk 24/52
Processing chunk 25/52
Processing chunk 26/52
Processing chunk 27/52
Processing chunk 28/52
Processing chunk 29/52
Processing chunk 30/52
Processing chunk 31/52
Processing chunk 32/52
Processing chunk 33/52
Processing chunk 34/52
Processing chunk 35/52
Processing chunk 36/52
Processing chunk 37/52
Processing chunk 38/52
Processing chunk 39/52
Processing chunk 40/52
Processing chunk 41/52
Processing chu

In [6]:

class RestaurantRecommender:
    """Content-based recommender backed by a precomputed similarity matrix.

    Row/column ``i`` of ``similarity_matrix`` corresponds to row ``i`` of
    ``df`` (positional, as used by ``df.iloc``).

    Attributes:
        df: Restaurant DataFrame; must contain at least the columns ``name``,
            ``location``, ``cuisines``, ``rest_type``, ``cost_clean`` and
            ``rating_clean``.
        similarity_matrix: Square array of pairwise similarities.
        feature_weights: Weights used to build the matrix (stored only).
        similarity_features: Feature names used to build the matrix (stored only).
        restaurant_names: ``df['name']`` as a list, in row order.
        name_to_index: Name -> row position. When a name occurs on several
            rows the last occurrence wins.
        index_to_name: Row position -> name, for every row.
    """

    def __init__(self, df, similarity_matrix, feature_weights, similarity_features):
        self.df = df
        self.similarity_matrix = similarity_matrix
        self.feature_weights = feature_weights
        self.similarity_features = similarity_features
        self.restaurant_names = df['name'].tolist()

        self.name_to_index = {name: idx for idx, name in enumerate(self.restaurant_names)}
        # Built straight from the row order: inverting name_to_index would
        # drop every row whose name is duplicated later in the data.
        self.index_to_name = dict(enumerate(self.restaurant_names))

    def find_restaurant_index(self, restaurant_name):
        """Find a restaurant's row position by name.

        An exact match is tried first. Otherwise the first dataset name that
        contains the query, or is contained in it (case-insensitive), is used.

        Args:
            restaurant_name: Name to look up.

        Returns:
            The row position, or ``None`` when nothing matches, or when the
            query is blank or not a string.
        """
        if restaurant_name in self.name_to_index:
            return self.name_to_index[restaurant_name]

        # A blank query is a substring of every name and would otherwise
        # "match" the first restaurant in the dataset.
        if not isinstance(restaurant_name, str) or not restaurant_name.strip():
            return None

        query = restaurant_name.lower()
        for name in self.restaurant_names:
            # Skip missing (NaN) or blank names: they cannot be lower-cased
            # or would match every query.
            if not isinstance(name, str) or not name.strip():
                continue
            candidate = name.lower()
            if query in candidate or candidate in query:
                return self.name_to_index[name]

        return None

    def get_similarity_explanation(self, restaurant1_idx, restaurant2_idx):
        """Generate human-readable reasons why two restaurants are similar.

        Args:
            restaurant1_idx: Row position of the first restaurant.
            restaurant2_idx: Row position of the second restaurant.

        Returns:
            List of explanation strings; empty when no rule applies.
        """
        restaurant1 = self.df.iloc[restaurant1_idx]
        restaurant2 = self.df.iloc[restaurant2_idx]

        explanations = []

        if restaurant1['cuisines'] == restaurant2['cuisines']:
            explanations.append(f"Same cuisines: {restaurant1['cuisines']}")

        if restaurant1['location'] == restaurant2['location']:
            explanations.append(f"Same location: {restaurant1['location']}")

        # Costs closer than 100 count as the same price range.
        cost_diff = abs(restaurant1['cost_clean'] - restaurant2['cost_clean'])
        if cost_diff < 100:
            explanations.append(f"Similar cost range: ₹{restaurant1['cost_clean']:.0f} vs ₹{restaurant2['cost_clean']:.0f}")

        # Ratings closer than half a point count as similar.
        rating_diff = abs(restaurant1['rating_clean'] - restaurant2['rating_clean'])
        if rating_diff < 0.5:
            explanations.append(f"Similar ratings: {restaurant1['rating_clean']:.1f} vs {restaurant2['rating_clean']:.1f}")

        if restaurant1['rest_type'] == restaurant2['rest_type']:
            explanations.append(f"Same restaurant type: {restaurant1['rest_type']}")

        return explanations

    @staticmethod
    def _first_available(row, *columns):
        """Return ``row[column]`` for the first listed column present in ``row``, else ``None``."""
        for column in columns:
            if column in row.index:
                return row[column]
        return None

    def _rank_similar(self, restaurant_idx, top_n, min_similarity):
        """Return ``(indices, scores)`` of the best matches, never including the query row."""
        scores = np.asarray(self.similarity_matrix[restaurant_idx])
        ranked = np.argsort(scores)[::-1]
        # Drop the query row by index rather than by position: when another
        # row ties at the top score, argsort may not place the query first.
        ranked = ranked[ranked != restaurant_idx][:max(top_n, 0)]
        ranked_scores = scores[ranked]
        keep = ranked_scores >= min_similarity
        return ranked[keep], ranked_scores[keep]

    def recommend_restaurants(self, restaurant_name, top_n=10, min_similarity=0.3):
        """Recommend the restaurants most similar to ``restaurant_name``.

        Args:
            restaurant_name: Name of the reference restaurant (exact or
                substring match, see ``find_restaurant_index``).
            top_n: Maximum number of recommendations.
            min_similarity: Candidates scoring below this are dropped.

        Returns:
            On success a dict with ``input_restaurant``, ``recommendations``
            (list of dicts) and ``total_recommendations``. Otherwise a dict
            with an ``error`` message, plus ``suggestions`` when the name is
            unknown or ``restaurant_name`` when nothing passes the threshold.
        """
        restaurant_idx = self.find_restaurant_index(restaurant_name)

        if restaurant_idx is None:
            return {
                'error': f'Restaurant "{restaurant_name}" not found in dataset',
                'suggestions': self.restaurant_names[:5]
            }

        valid_indices, valid_scores = self._rank_similar(restaurant_idx, top_n, min_similarity)

        if len(valid_indices) == 0:
            return {
                'error': f'No similar restaurants found for "{restaurant_name}" with similarity >= {min_similarity}',
                'restaurant_name': restaurant_name
            }

        recommendations = []
        for idx, score in zip(valid_indices, valid_scores):
            restaurant_data = self.df.iloc[idx]
            explanations = self.get_similarity_explanation(restaurant_idx, idx)

            recommendation = {
                'name': restaurant_data['name'],
                'similarity_score': float(score),
                'location': restaurant_data['location'],
                'cuisines': restaurant_data['cuisines'],
                'cost_for_two': f"₹{restaurant_data['cost_clean']:.0f}",
                'rating': float(restaurant_data['rating_clean']) if pd.notna(restaurant_data['rating_clean']) else None,
                'restaurant_type': restaurant_data['rest_type'],
                # The processed dataset only carries the *_binary columns;
                # reading the raw names unconditionally raised KeyError.
                'online_order': self._first_available(restaurant_data, 'online_order', 'online_order_binary'),
                'book_table': self._first_available(restaurant_data, 'book_table', 'book_table_binary'),
                'why_similar': explanations
            }
            recommendations.append(recommendation)

        return {
            'input_restaurant': self.df.iloc[restaurant_idx]['name'],
            'recommendations': recommendations,
            'total_recommendations': len(recommendations)
        }


# Instantiate the recommender on the full dataset and the precomputed
# similarity matrix; later cells use this module-level instance.
recommender = RestaurantRecommender(df, similarity_matrix, feature_weights, similarity_features)
print("Restaurant recommender initialized successfully!")

Restaurant recommender initialized successfully!


In [7]:
# Every artifact is written to a 'models' directory relative to the current
# working directory; create it if it does not exist yet.
os.makedirs('models', exist_ok=True)

# Lightweight metadata needed alongside the similarity data: weights, feature
# order, restaurant names, dataset columns, matrix shape and the fitted scaler.
model_data = {
    'feature_weights': feature_weights,
    'similarity_features': similarity_features,
    'restaurant_names': df['name'].tolist(),
    'df_columns': df.columns.tolist(),
    'feature_matrix_shape': feature_matrix.shape,
    'scaler': scaler
}

# NOTE(review): pickle files must only be loaded from trusted sources, since
# unpickling can execute arbitrary code.
with open('models/restaurant_recommender.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("Saving similarity matrix...")

# The full dense matrix is written twice: once uncompressed (.npy) and once
# compressed (.npz). NOTE(review): confirm both copies are really needed.
np.save('models/similarity_matrix.npy', similarity_matrix)

np.savez_compressed('models/similarity_matrix_compressed.npz', similarity_matrix)

def save_top_k_similarities(similarity_matrix, restaurant_names, k=20,
                            output_path='models/top_k_similarities.json'):
    """Save only the top-K most similar restaurants for each restaurant.

    Args:
        similarity_matrix: Square array; row ``i`` holds the similarities of
            ``restaurant_names[i]`` to every restaurant.
        restaurant_names: Names in the same order as the matrix rows.
        k: Maximum number of neighbours kept per restaurant.
        output_path: JSON file to write. Missing parent directories are
            created. Defaults to the location the notebook always used.

    Returns:
        Dict mapping each restaurant name to ``similar_restaurants`` (names)
        and ``similarity_scores`` (floats), best match first. Because the
        dict is keyed by name, rows sharing a name overwrite each other and
        only the last such row is kept.
    """
    top_k_data = {}
    limit = max(k, 0)

    for i, restaurant_name in enumerate(restaurant_names):
        similarities = similarity_matrix[i]

        ranked = np.argsort(similarities)[::-1]
        # Remove the restaurant itself by index. Skipping the first sorted
        # position is wrong whenever another row ties at the top score,
        # because the restaurant may then not be ranked first.
        top_indices = ranked[ranked != i][:limit]
        top_similarities = similarities[top_indices]

        top_k_data[restaurant_name] = {
            'similar_restaurants': [restaurant_names[j] for j in top_indices],
            'similarity_scores': top_similarities.tolist()
        }

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w') as f:
        json.dump(top_k_data, f)

    return top_k_data

# Persist the 20 nearest neighbours of every restaurant as a compact JSON file.
top_k_data = save_top_k_similarities(similarity_matrix, df['name'].tolist(), k=20)
print("Saved top-20 similarities as JSON")

# Copy of the dataset stored next to the model artifacts (row index omitted).
df.to_csv('models/restaurant_data.csv', index=False)

# Human-readable copy of the per-feature weights.
with open('models/feature_weights.json', 'w') as f:
    json.dump(feature_weights, f, indent=2)

# Collect run statistics for models/training_summary.json. This must run after
# the artifacts above are written, because the file sizes are read from disk
# (a missing file is reported as 0).
# NOTE(review): 'feature_categories' counts features by substring match on the
# name, which is not the category membership defined in create_feature_weights
# (e.g. 'online_order_binary' is counted nowhere, while
# 'cuisines_target_encoded' is counted as a restaurant characteristic even
# though it only gets the fallback weight). Confirm and derive the counts from
# the real category lists.
# NOTE(review): min/max/mean/std each scan the full dense similarity matrix.
# The shape tuple is written to JSON as a list.
training_summary = {
    'total_restaurants': len(df),
    'total_features': len(similarity_features),
    'similarity_matrix_shape': similarity_matrix.shape,
    'feature_categories': {
        'restaurant_characteristics': len([f for f in similarity_features if 'cuisine' in f or 'location' in f or 'cost' in f or 'rest_type' in f or 'service' in f]),
        'user_review_patterns': len([f for f in similarity_features if 'sentiment' in f or 'textblob' in f or 'positive' in f or 'negative' in f or 'review' in f or 'word' in f or 'sentence' in f]),
        'quality_popularity': len([f for f in similarity_features if 'quality' in f or 'popularity' in f or 'cluster' in f or 'price' in f])
    },
    'similarity_stats': {
        'min_similarity': float(similarity_matrix.min()),
        'max_similarity': float(similarity_matrix.max()),
        'mean_similarity': float(similarity_matrix.mean()),
        'std_similarity': float(similarity_matrix.std())
    },
    'memory_usage_mb': {
        'feature_matrix': feature_matrix.nbytes / 1024 / 1024,
        'similarity_matrix': similarity_matrix.nbytes / 1024 / 1024
    },
    'file_sizes_mb': {
        'similarity_matrix_npy': os.path.getsize('models/similarity_matrix.npy') / 1024 / 1024 if os.path.exists('models/similarity_matrix.npy') else 0,
        'top_k_similarities_json': os.path.getsize('models/top_k_similarities.json') / 1024 / 1024 if os.path.exists('models/top_k_similarities.json') else 0
    }
}

with open('models/training_summary.json', 'w') as f:
    json.dump(training_summary, f, indent=2)

# Final report: list every artifact written by this cell ...
print("Model training completed and saved successfully!")
print("\nFiles created:")
print("- models/restaurant_recommender.pkl (main model)")
print("- models/similarity_matrix.npy (efficient numpy format)")
print("- models/similarity_matrix_compressed.npz (compressed numpy)")
print("- models/top_k_similarities.json (top-20 similarities - recommended)")
print("- models/restaurant_data.csv (restaurant data)")
print("- models/feature_weights.json (feature weights)")
print("- models/training_summary.json (training summary)")

# ... followed by the headline numbers taken from training_summary.
print(f"\nTraining Summary:")
print(f"Total restaurants: {training_summary['total_restaurants']}")
print(f"Total features: {training_summary['total_features']}")
print(f"Similarity matrix shape: {training_summary['similarity_matrix_shape']}")
print(f"Average similarity: {training_summary['similarity_stats']['mean_similarity']:.3f}")
print(f"Memory usage: {training_summary['memory_usage_mb']['similarity_matrix']:.2f} MB")
print(f"File sizes: .npy={training_summary['file_sizes_mb']['similarity_matrix_npy']:.1f}MB, .json={training_summary['file_sizes_mb']['top_k_similarities_json']:.1f}MB")

Saving similarity matrix...
Saved top-20 similarities as JSON
Model training completed and saved successfully!

Files created:
- models/restaurant_recommender.pkl (main model)
- models/similarity_matrix.npy (efficient numpy format)
- models/similarity_matrix_compressed.npz (compressed numpy)
- models/top_k_similarities.json (top-20 similarities - recommended)
- models/restaurant_data.csv (restaurant data)
- models/feature_weights.json (feature weights)
- models/training_summary.json (training summary)

Training Summary:
Total restaurants: 51717
Total features: 36
Similarity matrix shape: (51717, 51717)
Average similarity: 0.008
Memory usage: 10202.97 MB
File sizes: .npy=10203.0MB, .json=7.5MB


In [8]:

def quick_validation():
    """Smoke-test the module-level ``recommender`` on three well-known chains.

    Requests three recommendations for each test restaurant. Failures (an
    ``error`` key in the result, or any raised exception) are printed as they
    occur; afterwards a summary of the successful lookups is printed together
    with an overall verdict. Nothing is returned.
    """
    outcomes = []

    for candidate in ('Pizza Hut', "McDonald's", 'KFC'):
        try:
            response = recommender.recommend_restaurants(candidate, top_n=3)
            if 'error' in response:
                print(f"Error: {candidate}: {response['error']}")
                continue
            found = response['total_recommendations']
            scores = [item['similarity_score'] for item in response['recommendations']]
            outcomes.append({
                'restaurant': candidate,
                'recommendations_found': found,
                'avg_similarity': np.mean(scores),
            })
        except Exception as exc:
            # Deliberately broad: one failing lookup must not abort the check.
            print(f"Error: {candidate}: {exc}")

    print("Quick Model Validation:")
    print(f"Successfully tested {len(outcomes)} restaurants")
    for outcome in outcomes:
        print(f"- {outcome['restaurant']}: {outcome['recommendations_found']} recommendations, avg similarity: {outcome['avg_similarity']:.3f}")

    # A single successful lookup is enough to call the model usable.
    if outcomes:
        print("Model validation successful!")
    else:
        print("Model validation failed. Check for issues.")

def recommend_restaurants_fixed(self, restaurant_name, top_n=10, min_similarity=0.3):
    """Replacement for ``RestaurantRecommender.recommend_restaurants``.

    Reads the ``online_order_binary`` / ``book_table_binary`` columns that the
    processed dataset actually contains. Meant to be bound to a recommender
    instance, hence the explicit ``self`` parameter.

    Args:
        self: Object providing ``find_restaurant_index``,
            ``get_similarity_explanation``, ``similarity_matrix``, ``df`` and
            ``restaurant_names``.
        restaurant_name: Name of the reference restaurant.
        top_n: Maximum number of recommendations.
        min_similarity: Candidates scoring below this are dropped.

    Returns:
        On success a dict with ``input_restaurant``, ``recommendations`` and
        ``total_recommendations``. Otherwise a dict with an ``error`` message,
        plus ``suggestions`` when the name is unknown or ``restaurant_name``
        when nothing passes the threshold.
    """
    restaurant_idx = self.find_restaurant_index(restaurant_name)

    if restaurant_idx is None:
        return {
            'error': f'Restaurant "{restaurant_name}" not found in dataset',
            'suggestions': self.restaurant_names[:5]
        }

    restaurant_similarities = self.similarity_matrix[restaurant_idx]

    ranked = np.argsort(restaurant_similarities)[::-1]
    # Exclude the query row by index. Dropping the first sorted position is
    # wrong when another row ties at the top score, because the query may
    # then not be ranked first and would be recommended to itself.
    # max(..., 0) keeps a negative top_n from slicing from the end.
    similar_indices = ranked[ranked != restaurant_idx][:max(top_n, 0)]
    similar_scores = restaurant_similarities[similar_indices]

    above_threshold = similar_scores >= min_similarity
    valid_indices = similar_indices[above_threshold]
    valid_scores = similar_scores[above_threshold]

    if len(valid_indices) == 0:
        return {
            'error': f'No similar restaurants found for "{restaurant_name}" with similarity >= {min_similarity}',
            'restaurant_name': restaurant_name
        }

    recommendations = []
    for idx, score in zip(valid_indices, valid_scores):
        restaurant_data = self.df.iloc[idx]

        recommendation = {
            'name': restaurant_data['name'],
            'similarity_score': float(score),
            'location': restaurant_data['location'],
            'cuisines': restaurant_data['cuisines'],
            'cost_for_two': f"₹{restaurant_data['cost_clean']:.0f}",
            'rating': float(restaurant_data['rating_clean']) if pd.notna(restaurant_data['rating_clean']) else None,
            'restaurant_type': restaurant_data['rest_type'],
            'online_order': restaurant_data['online_order_binary'],
            'book_table': restaurant_data['book_table_binary'],
            'why_similar': self.get_similarity_explanation(restaurant_idx, idx)
        }
        recommendations.append(recommendation)

    return {
        'input_restaurant': self.df.iloc[restaurant_idx]['name'],
        'recommendations': recommendations,
        'total_recommendations': len(recommendations)
    }

# Bind the corrected function to this one recommender instance as a method.
# Only the instance attribute is replaced; the RestaurantRecommender class
# itself still carries its original recommend_restaurants.
recommender.recommend_restaurants = recommend_restaurants_fixed.__get__(recommender, RestaurantRecommender)

# Run the smoke test against the patched instance.
quick_validation()

Quick Model Validation:
Successfully tested 3 restaurants
- Pizza Hut: 3 recommendations, avg similarity: 0.950
- McDonald's: 3 recommendations, avg similarity: 0.953
- KFC: 3 recommendations, avg similarity: 0.838
Model validation successful!
