In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import category_encoders as ce
from textblob import TextBlob
import re
import ast
from collections import Counter
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and convergence warnings from the libraries above.
warnings.filterwarnings('ignore')

print("All imports successful!")

All imports successful!


In [None]:
# Load the raw restaurant listings. The path is relative, so the notebook has
# to be executed from the directory that contains `app/`.
df = pd.read_csv('app/ai_service/src/data/raw/zomato.csv')
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
# Last expression of the cell: shows the first rows as the cell output.
df.head()

Dataset shape: (51717, 17)
Columns: ['url', 'address', 'name', 'online_order', 'book_table', 'rate', 'votes', 'phone', 'location', 'rest_type', 'dish_liked', 'cuisines', 'approx_cost(for two people)', 'reviews_list', 'menu_item', 'listed_in(type)', 'listed_in(city)']


Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [4]:
def create_basic_features(df, default_rating=3.5, default_cost=500):
    """Return a copy of ``df`` extended with cleaned and derived numeric columns.

    The input frame is left untouched (matching ``CategoricalEncoder``, which
    also works on a copy); always use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``rate``, ``approx_cost(for two people)``,
        ``cuisines``, ``location``, ``online_order``, ``book_table`` and
        ``votes``.
    default_rating : float, default 3.5
        Rating substituted for missing ``rating_clean`` values, but only inside
        the score formulas; ``rating_clean`` itself keeps its NaNs.
    default_cost : float, default 500
        Cost substituted for missing ``cost_clean`` values inside the score
        formulas; ``cost_clean`` itself keeps its NaNs.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these added columns: ``rating_clean``,
        ``is_new_restaurant``, ``cost_clean``, ``cost_per_person``,
        ``cuisine_list``, ``cuisine_count``, ``location_popularity``,
        ``online_order_binary``, ``book_table_binary``, ``service_score``,
        ``popularity_score``, ``vote_density``, ``quality_score`` and
        ``price_quality_ratio``.
    """

    def parse_rating(value):
        # Only "<number>/<scale>" strings carry a rating; 'NEW', '-' and NaN
        # have no '/' and therefore map to NaN.
        if pd.isna(value):
            return np.nan
        text = str(value)
        if '/' not in text:
            return np.nan
        try:
            return float(text.split('/')[0].strip())
        except ValueError:
            # Malformed numerator: treat as missing rather than aborting.
            return np.nan

    def parse_cost(value):
        # Costs may use thousands separators ("1,200").
        if pd.isna(value):
            return np.nan
        try:
            return float(str(value).replace(',', ''))
        except ValueError:
            return np.nan

    df = df.copy()

    df['rating_clean'] = df['rate'].apply(parse_rating)
    df['is_new_restaurant'] = (df['rate'] == 'NEW').astype(int)

    df['cost_clean'] = df['approx_cost(for two people)'].apply(parse_cost)
    df['cost_per_person'] = df['cost_clean'] / 2

    df['cuisine_list'] = df['cuisines'].apply(
        lambda x: [c.strip() for c in str(x).split(',')] if pd.notna(x) else []
    )
    df['cuisine_count'] = df['cuisine_list'].apply(len)

    # Share of listings in a location relative to the most common location.
    location_counts = df['location'].value_counts()
    df['location_popularity'] = df['location'].map(location_counts) / location_counts.max()

    df['online_order_binary'] = (df['online_order'] == 'Yes').astype(int)
    df['book_table_binary'] = (df['book_table'] == 'Yes').astype(int)
    df['service_score'] = df['online_order_binary'] + df['book_table_binary']

    # Shared terms of the score formulas, computed once.
    rating_filled = df['rating_clean'].fillna(default_rating)
    log_votes = np.log1p(df['votes'])
    cost_in_hundreds = df['cost_clean'].fillna(default_cost) / 100

    # NOTE: the three *_score / *_ratio columns below are functions of
    # rating_clean, so they leak the rating if it is later used as a target.
    df['popularity_score'] = rating_filled * log_votes
    df['vote_density'] = df['votes'] / df['votes'].max()

    df['quality_score'] = rating_filled * log_votes * (1 / cost_in_hundreds)

    df['price_quality_ratio'] = rating_filled / cost_in_hundreds

    return df

# Add the cleaned/derived columns and continue with the returned frame.
df = create_basic_features(df)
print("Basic features created successfully!")

Basic features created successfully!


In [5]:
class TextProcessor:
    """Turn a restaurant's serialized review list into numeric text features.

    ``positive_words`` and ``negative_words`` are fixed lowercase lexicons; a
    token counts toward the lexicon sentiment only on an exact match after
    lowercasing.
    """

    def __init__(self):
        self.positive_words = {
            'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 
            'delicious', 'tasty', 'yummy', 'awesome', 'perfect', 'best', 'love',
            'enjoy', 'satisfied', 'happy', 'pleased', 'outstanding', 'superb',
            'fresh', 'authentic', 'flavorful', 'spicy', 'crispy', 'juicy',
            'tender', 'aromatic', 'savory', 'sweet', 'creamy', 'rich'
        }
        self.negative_words = {
            'bad', 'terrible', 'awful', 'horrible', 'disgusting', 'worst',
            'hate', 'disappointed', 'poor', 'mediocre', 'average', 'bland',
            'overpriced', 'expensive', 'slow', 'rude', 'dirty', 'cold',
            'stale', 'burnt', 'undercooked', 'overcooked', 'greasy', 'dry',
            'tasteless', 'boring', 'unhygienic', 'crowded', 'noisy'
        }

    def extract_text_features(self, reviews_str):
        """Parse one ``reviews_list`` cell and return its feature dict.

        Parameters
        ----------
        reviews_str : str
            Python-literal text of a list of review tuples whose second
            element is the review body, e.g. ``"[('Rated 4.0', 'text')]"``.

        Returns
        -------
        dict
            Same keys as ``_get_default_features``. The defaults are returned
            when the cell cannot be parsed (including NaN), is not a list, is
            an empty list, or holds items that cannot be indexed as reviews.
        """
        try:
            reviews = ast.literal_eval(reviews_str)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Everything literal_eval raises for non-literal or non-string input.
            return self._get_default_features()

        if not isinstance(reviews, list) or not reviews:
            return self._get_default_features()

        try:
            all_text = ' '.join(
                str(review[1]) if len(review) > 1 else '' for review in reviews
            )
        except (TypeError, KeyError, IndexError):
            # Items that are not review tuples (no len() / positional index).
            return self._get_default_features()

        return self._extract_features(all_text, reviews)

    def _extract_features(self, text, reviews):
        """Compute the feature dict for the concatenated review ``text``.

        ``reviews`` is only used for ``review_count``. Never raises for empty
        text: all ratios fall back to 0 in that case.
        """
        sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
        words = re.findall(r'\b\w+\b', text.lower())

        positive_count = sum(1 for word in words if word in self.positive_words)
        negative_count = sum(1 for word in words if word in self.negative_words)

        total_words = len(words) if words else 1
        sentiment_score = (positive_count - negative_count) / total_words

        # Neutral values, identical to _get_default_features, are used when
        # there is no text to score or TextBlob is unavailable/fails.
        textblob_polarity = 0
        textblob_subjectivity = 0.5
        if text.strip():
            try:
                blob = TextBlob(text)
                textblob_polarity = blob.sentiment.polarity
                textblob_subjectivity = blob.sentiment.subjectivity
            except Exception:
                # Best effort: keep the neutral values set above.
                textblob_polarity = 0
                textblob_subjectivity = 0.5

        word_lengths = [len(word) for word in words]
        sentence_count = len(sentences)

        return {
            'positive_words': positive_count,
            'negative_words': negative_count,
            'sentiment_score': sentiment_score,
            'textblob_polarity': textblob_polarity,
            'textblob_subjectivity': textblob_subjectivity,
            'review_length': len(text),
            'review_count': len(reviews),
            'word_count': len(words),
            'sentence_count': sentence_count,
            # Guard on the count of non-empty sentences: re.split always
            # returns at least one (possibly empty) piece.
            'avg_sentence_length': len(words) / sentence_count if sentence_count else 0,
            'avg_word_length': np.mean(word_lengths) if word_lengths else 0,
            'has_detailed_review': 1 if len(text) > 100 else 0
        }

    def _get_default_features(self):
        """Feature dict used when a row has no usable reviews."""
        return {
            'positive_words': 0, 'negative_words': 0, 'sentiment_score': 0,
            'textblob_polarity': 0, 'textblob_subjectivity': 0.5,
            'review_length': 0, 'review_count': 0, 'word_count': 0,
            'sentence_count': 0, 'avg_sentence_length': 0, 'avg_word_length': 0,
            'has_detailed_review': 0
        }

# One feature dict per restaurant, computed from its serialized review list.
text_processor = TextProcessor()
review_features = df['reviews_list'].apply(text_processor.extract_text_features)
# Reuse df's index so the column-wise concat aligns rows by label even when
# df does not carry a default 0..n-1 index.
review_df = pd.DataFrame(review_features.tolist(), index=df.index)
# Drop text-feature columns left over from a previous run of this cell so that
# re-executing it does not create duplicate column names.
df = pd.concat([df.drop(columns=review_df.columns, errors='ignore'), review_df], axis=1)

print(f"Text features extracted: {list(review_df.columns)}")

Text features extracted: ['positive_words', 'negative_words', 'sentiment_score', 'textblob_polarity', 'textblob_subjectivity', 'review_length', 'review_count', 'word_count', 'sentence_count', 'avg_sentence_length', 'avg_word_length', 'has_detailed_review']


In [6]:

class CategoricalEncoder:
    """Add target-encoded and count-encoded versions of categorical columns.

    The fitted ``category_encoders`` objects are kept per column in
    ``target_encoders`` and ``count_encoders``.
    """

    def __init__(self):
        self.target_encoders = {}
        self.count_encoders = {}

    def fit_transform_encoding(self, df, categorical_cols, target_col='rating_clean'):
        """Fit the encoders on ``df`` and return an encoded copy.

        For every name in ``categorical_cols`` that exists in ``df`` two
        columns are added: ``<col>_target_encoded`` and ``<col>_count_encoded``.
        Names that are not columns of ``df`` are skipped silently.

        NOTE(review): when ``target_col`` is present, its missing values are
        replaced by the column median in the *returned* frame as well, so
        later steps no longer see NaN targets. The input ``df`` is not
        modified.
        """
        encoded = df.copy()

        # The target must be complete before it is handed to the encoder.
        if target_col in encoded.columns:
            encoded[target_col] = encoded[target_col].fillna(encoded[target_col].median())

        for col in categorical_cols:
            if col not in df.columns:
                continue

            by_target = ce.TargetEncoder(cols=[col], smoothing=1.0)
            encoded[f'{col}_target_encoded'] = by_target.fit_transform(
                encoded[col], encoded[target_col]
            )

            by_count = ce.CountEncoder(cols=[col])
            encoded[f'{col}_count_encoded'] = by_count.fit_transform(encoded[col])

            self.target_encoders[col] = by_target
            self.count_encoders[col] = by_count

        return encoded

# Target- and count-encode the three categorical columns against the default
# target ('rating_clean'). The returned frame also has the target's missing
# values filled with its median (see fit_transform_encoding).
categorical_cols = ['location', 'rest_type', 'cuisines']
encoder = CategoricalEncoder()
df = encoder.fit_transform_encoding(df, categorical_cols)
print("Categorical encoding completed!")

Categorical encoding completed!


In [7]:

class ClusterGenerator:
    """Assign each restaurant to a KMeans cluster in scaled, PCA-reduced space.

    The fitted ``scaler``, ``pca`` and ``kmeans`` are all kept on the instance
    so the same transformation chain can be applied again after fitting.
    """

    def __init__(self, n_clusters=15):
        self.n_clusters = n_clusters
        self.scaler = StandardScaler()
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        # A float n_components keeps as many components as are needed to
        # explain that fraction of the variance.
        self.pca = PCA(n_components=0.95)

    def create_cluster_features(self, df):
        """Fit the pipeline on ``df`` and return a copy with cluster columns.

        Adds ``restaurant_cluster`` (the KMeans label) and ``cluster_distance``
        (Euclidean distance, in PCA space, from the row to the centre of its
        own cluster). Missing inputs are filled with the column median before
        scaling. The input frame is left untouched; use the returned frame.

        Requires the columns listed in ``cluster_features`` below.
        """
        cluster_features = [
            'rating_clean', 'cost_clean', 'votes', 'cuisine_count',
            'location_popularity', 'sentiment_score', 'textblob_polarity',
            'popularity_score', 'quality_score'
        ]

        df = df.copy()

        cluster_data = df[cluster_features].fillna(df[cluster_features].median())

        cluster_data_scaled = self.scaler.fit_transform(cluster_data)
        cluster_data_pca = self.pca.fit_transform(cluster_data_scaled)

        clusters = self.kmeans.fit_predict(cluster_data_pca)
        df['restaurant_cluster'] = clusters

        # Index the centres by each row's label, then take all row norms at
        # once instead of looping over the rows in Python.
        assigned_centers = self.kmeans.cluster_centers_[clusters]
        df['cluster_distance'] = np.linalg.norm(cluster_data_pca - assigned_centers, axis=1)

        return df

# Fit the clustering pipeline with 15 clusters and continue with the frame
# that carries 'restaurant_cluster' and 'cluster_distance'.
cluster_generator = ClusterGenerator(n_clusters=15)
df = cluster_generator.create_cluster_features(df)
print("Clustering features created!")

Clustering features created!


In [8]:

class SimilarityGenerator:
    """Summarise how similar each restaurant is to all others.

    Cuisine similarity is cosine similarity between TF-IDF vectors of the
    ``cuisines`` text; location similarity is cosine similarity between
    per-location averages of a few numeric columns.
    """

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')

    @staticmethod
    def _row_similarity_stats(matrix):
        """Row-wise mean, max and std of the cosine-similarity matrix of ``matrix``.

        ``matrix`` is a scipy sparse matrix of shape (n, k) with non-negative
        entries. The n x n similarity matrix S is never materialised; with U
        the row-normalised matrix (all-zero rows stay zero) and S = U @ U.T:

        * mean_i   = u_i . (column means of U)
        * mean of S_ij^2 over j = u_i^T (U^T U) u_i / n, which gives the
          population std via sqrt(E[s^2] - E[s]^2)
        * max_i    = S_ii, i.e. 1 for a non-zero row and 0 for an all-zero
          row, because no other entry can exceed the self-similarity.

        Returns three 1-D float arrays of length n: (mean, max, std).
        """
        n_rows = matrix.shape[0]

        squared_norms = np.asarray(matrix.multiply(matrix).sum(axis=1)).ravel()
        has_terms = squared_norms > 0
        inverse_norms = np.zeros(n_rows, dtype=float)
        inverse_norms[has_terms] = 1.0 / np.sqrt(squared_norms[has_terms])
        unit_rows = matrix.multiply(inverse_norms[:, None]).tocsr()

        column_means = np.asarray(unit_rows.mean(axis=0)).ravel()
        sim_mean = np.asarray(unit_rows @ column_means).ravel()

        gram = (unit_rows.T @ unit_rows).toarray()          # k x k
        projected = np.asarray(unit_rows @ gram)            # n x k, dense
        mean_of_squares = np.asarray(
            unit_rows.multiply(projected).sum(axis=1)
        ).ravel() / n_rows
        # Clip tiny negative values caused by floating-point cancellation.
        sim_std = np.sqrt(np.clip(mean_of_squares - sim_mean ** 2, 0.0, None))

        sim_max = has_terms.astype(float)

        return sim_mean, sim_max, sim_std

    def create_similarity_features(self, df):
        """Return a copy of ``df`` with cuisine and location similarity columns.

        Adds ``cuisine_similarity_mean``, ``cuisine_similarity_max``,
        ``cuisine_similarity_std`` and ``location_similarity``. Rows whose
        location is missing get ``location_similarity`` 0. The input frame is
        left untouched; use the returned frame.

        Requires the columns ``cuisines``, ``location``, ``rating_clean``,
        ``cost_clean``, ``votes`` and ``cuisine_count``.
        """
        df = df.copy()

        cuisine_texts = df['cuisines'].fillna('').astype(str)
        cuisine_tfidf = self.tfidf_vectorizer.fit_transform(cuisine_texts)

        # A dense n x n cosine matrix would need roughly 8 * n^2 bytes; the
        # statistics below are computed from the n x k TF-IDF matrix instead.
        sim_mean, sim_max, sim_std = self._row_similarity_stats(cuisine_tfidf)
        df['cuisine_similarity_mean'] = sim_mean
        df['cuisine_similarity_max'] = sim_max
        df['cuisine_similarity_std'] = sim_std

        location_embeddings = df.groupby('location').agg({
            'rating_clean': 'mean',
            'cost_clean': 'mean',
            'votes': 'mean',
            'cuisine_count': 'mean'
        }).fillna(0)

        # One row per location, so this matrix stays small.
        location_similarity = cosine_similarity(location_embeddings)
        mean_similarity_by_location = pd.Series(
            location_similarity.mean(axis=1), index=location_embeddings.index
        )

        # Locations absent from the grouped index (e.g. NaN) fall back to 0.
        df['location_similarity'] = df['location'].map(mean_similarity_by_location).fillna(0)

        return df


# Add the cuisine- and location-similarity columns and continue with the
# returned frame.
similarity_generator = SimilarityGenerator()
df = similarity_generator.create_similarity_features(df)
print("Similarity features created!")

Similarity features created!


In [9]:
class FeatureSelector:
    """Rank numeric columns by their relationship to a target column.

    Only ``target_col`` itself is excluded from the candidates; columns that
    were derived from the target are still scored and have to be removed by
    the caller if leakage is a concern.
    """

    SUPPORTED_METHODS = ('mutual_info', 'f_regression', 'random_forest')

    def __init__(self, target_col='rating_clean'):
        self.target_col = target_col
        self.selected_features = []

    def select_features(self, df, method='mutual_info', k=60):
        """Score every numeric column and keep the ``k`` best.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame containing ``target_col`` and the candidate columns.
        method : {'mutual_info', 'f_regression', 'random_forest'}
            Scoring function: mutual information, univariate F-statistic, or
            random-forest feature importance.
        k : int, default 60
            Number of top-scoring columns to keep (all of them if fewer exist).

        Returns
        -------
        (list of str, pandas.Series)
            The selected column names, best first, and the score of every
            candidate column indexed by column name.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported names.
        """
        # Fail early with a clear message instead of an unbound local below.
        if method not in self.SUPPORTED_METHODS:
            raise ValueError(
                f"Unknown method {method!r}; expected one of {self.SUPPORTED_METHODS}"
            )

        feature_cols = [
            col for col in df.select_dtypes(include=[np.number]).columns
            if col != self.target_col
        ]

        # Missing feature values become 0; missing targets become the median.
        X = df[feature_cols].fillna(0)
        y = df[self.target_col].fillna(df[self.target_col].median())

        if method == 'mutual_info':
            scores = mutual_info_regression(X, y, random_state=42)
        elif method == 'f_regression':
            scores, _ = f_regression(X, y)
        else:  # 'random_forest'
            rf = RandomForestRegressor(n_estimators=100, random_state=42)
            rf.fit(X, y)
            scores = rf.feature_importances_

        feature_scores = pd.Series(scores, index=feature_cols)

        top_features = feature_scores.nlargest(k).index.tolist()
        self.selected_features = top_features

        return top_features, feature_scores


# Rank all numeric columns against 'rating_clean' with mutual information and
# keep the 60 best.
# NOTE(review): price_quality_ratio, popularity_score and quality_score are
# computed from rating_clean in create_basic_features, so their top ranks in
# the output below reflect target leakage rather than predictive signal.
feature_selector = FeatureSelector()
selected_features, feature_scores = feature_selector.select_features(df, method='mutual_info', k=60)

print("Top 20 features by importance:")
print(feature_scores.nlargest(20))

Top 20 features by importance:
price_quality_ratio        2.319179
popularity_score           2.257198
quality_score              1.835570
textblob_polarity          1.208964
textblob_subjectivity      1.118228
vote_density               1.041067
votes                      1.040661
cluster_distance           1.039325
avg_word_length            0.956844
cuisines_target_encoded    0.948857
review_length              0.887392
cuisine_similarity_mean    0.770675
cuisine_similarity_std     0.770400
restaurant_cluster         0.748637
sentiment_score            0.739344
avg_sentence_length        0.732775
word_count                 0.605589
sentence_count             0.378794
positive_words             0.319561
review_count               0.315298
dtype: float64


In [10]:
def create_final_dataset(df, selected_features):
    """Assemble the frame that is written out as the processed dataset.

    The result holds the four identifying columns (``name``, ``location``,
    ``cuisines``, ``rest_type``), then ``selected_features`` in the given
    order, then ``rating_clean`` copied unchanged from ``df``. Missing values
    in the numeric columns are replaced by that column's median before
    ``rating_clean`` is attached, so ``rating_clean`` itself is not imputed
    here. ``df`` is not modified.
    """
    id_columns = ['name', 'location', 'cuisines', 'rest_type']

    dataset = df[id_columns + selected_features].copy()

    # Median-impute every numeric column in one step.
    numeric_cols = dataset.select_dtypes(include=[np.number]).columns
    column_medians = dataset[numeric_cols].median()
    dataset[numeric_cols] = dataset[numeric_cols].fillna(column_medians)

    # Attached last so the imputation above never touches it.
    dataset['rating_clean'] = df['rating_clean']

    return dataset

final_df = create_final_dataset(df, selected_features)

print(f"Final dataset shape: {final_df.shape}")
print(f"Total features: {len(final_df.columns)}")
# Write next to the raw data using a path relative to the project root (the
# same convention as the raw CSV read), not one user's home directory.
processed_path = 'app/ai_service/src/data/processed/processed_data.csv'
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
final_df.to_csv(processed_path, index=False)
print("Processed dataset saved")
numerical_features = final_df.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = final_df.select_dtypes(include=['object']).columns.tolist()

print(f"\nFeature Summary:")
print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")
print(f"Total features: {len(final_df.columns)}")

Final dataset shape: (51717, 42)
Total features: 42
Processed dataset saved

Feature Summary:
Numerical features: 38
Categorical features: 4
Total features: 42
