# In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from ast import literal_eval
import os
from tabulate import tabulate
from termcolor import colored

class ImprovedRecommender:
    def __init__(self, data_dir='../../data/recommendation'):
        self.data_dir = data_dir
        self.encoders = {}
        self.xgb_model = None
        self.locations_df = None
        self.users_df = None
        self.reviews_df = None
        self.rmse = None
        self.r2 = None
        self.weights = {'cbf': 0.4, 'cf': 0.35, 'ml': 0.25}
        self.similar_users = []
        self.activity_matched_users = []
        self.unrated_locations = []
        self.model_features = []

        # Emojis for different location types
        self.location_emojis = {
            'Beaches': '🏖️',
            'Mountains': '⛰️',
            'Waterfalls': '💦',
            'Lakes': '🌊',
            'National Parks': '🌲',
            'Museums': '🏛️',
            'Historical Sites': '🏰',
            'Theme Parks': '🎢',
            'Gardens': '🌷',
            'Wildlife': '🦁',
            'Temples': '🛕',
            'Churches': '⛪',
            'Urban Areas': '🏙️',
            'Markets': '🛍️',
            'Food Spots': '🍽️',
            'Adventure': '🧗',
            'Unknown': '📍'
        }

        # Emojis for activities
        self.activity_emojis = {
            'Swimming': '🏊',
            'Hiking': '🥾',
            'Trekking': '🏔️',
            'Surfing': '🏄',
            'Photography': '📸',
            'Camping': '⛺',
            'Bird Watching': '🦅',
            'Wildlife Spotting': '🦓',
            'Exploring Exhibits': '🖼️',
            'Sightseeing': '👀',
            'Shopping': '🛒',
            'Dining': '🍴',
            'Boating': '🚣',
            'Fishing': '🎣',
            'Cycling': '🚴',
            'Relaxing': '🧘'
        }

    def load_data(self):
        """Read the three CSV files, normalise the activity columns and train.

        After loading, the feature-engineering, encoding, review-enrichment and
        model-training steps are run in that order.
        """
        def read_table(file_name):
            # Every data file lives directly under ``self.data_dir``.
            return pd.read_csv(os.path.join(self.data_dir, file_name))

        def as_activity_list(raw):
            # Non-strings (e.g. missing values) become an empty list, strings
            # that start with '[' are parsed as a Python literal, and any
            # other string is wrapped as a one-item list.
            if not isinstance(raw, str):
                return []
            if raw.startswith('['):
                return literal_eval(raw)
            return [raw]

        self.locations_df = read_table('location_data.csv')
        self.users_df = read_table('user_data.csv')
        self.reviews_df = read_table('review_data.csv')
        print(f"🔄 Loaded {len(self.locations_df)} locations, {len(self.users_df)} users, {len(self.reviews_df)} reviews")

        # Turn the serialized activity columns into real Python lists.
        self.locations_df['Activities'] = self.locations_df['Activities'].apply(as_activity_list)
        self.users_df['Preferred_Activities'] = self.users_df['Preferred_Activities'].apply(as_activity_list)

        # Pipeline: location features -> user features -> categorical codes
        # -> enriched reviews -> trained model.
        for step in (
            self._engineer_location_features,
            self._engineer_user_features,
            self._encode_categorical_features,
            self._enhance_reviews_dataset,
            self._train_enhanced_model,
        ):
            step()

    def _engineer_location_features(self):
        """Create more features for locations"""
        # Extract location features from names and descriptions if available
        if 'Description' in self.locations_df.columns:
            # Create text feature from description
            self.locations_df['Description_Length'] = self.locations_df['Description'].apply(
                lambda x: len(str(x)) if pd.notna(x) else 0)

        # Count number of activities per location
        self.locations_df['Activity_Count'] = self.locations_df['Activities'].apply(len)

        # Create one-hot encoding for common activities
        all_activities = [item for sublist in self.locations_df['Activities'].tolist() for item in sublist]
        top_activities = pd.Series(all_activities).value_counts().head(10).index.tolist()

        for activity in top_activities:
            self.locations_df[f'Has_{activity.replace(" ", "_")}'] = self.locations_df['Activities'].apply(
                lambda x: 1 if activity in x else 0)

        # Categorize locations into more specific types if needed
        if 'Location_Type' in self.locations_df.columns:
            # Count locations by type to identify common categories
            location_type_counts = self.locations_df['Location_Type'].value_counts()
            print(f"Location types: {dict(location_type_counts)}")

    def _engineer_user_features(self):
        """Create more features for users"""
        # Count number of preferred activities per user
        self.users_df['Preferred_Activity_Count'] = self.users_df['Preferred_Activities'].apply(len)

        # Create aggregated user stats from reviews
        if not self.reviews_df.empty:
            # User's average rating
            user_ratings = self.reviews_df.groupby('User_ID')['Rating'].agg(['mean', 'count', 'std']).reset_index()
            user_ratings.columns = ['User_ID', 'Avg_Rating', 'Rating_Count', 'Rating_Std']

            # Add a feature for rating variability - how much does this user vary their ratings?
            user_ratings['Rating_Variability'] = user_ratings['Rating_Std'].fillna(0)

            # Merge with users dataframe
            self.users_df = pd.merge(self.users_df, user_ratings, on='User_ID', how='left')

            # Fill NaN values for users without ratings
            for col in ['Avg_Rating', 'Rating_Count', 'Rating_Std', 'Rating_Variability']:
                self.users_df[col] = self.users_df[col].fillna(0)

            print(f"Added user rating statistics (avg, count, variability)")

    def _encode_categorical_features(self):
        """Label-encode the categorical user and location columns.

        For each demographic column present in ``self.users_df`` a
        ``<column>_encoded`` column is added and the fitted encoder is stored
        in ``self.encoders`` under the original column name. ``Location_Type``
        is handled the same way on ``self.locations_df``. Missing values are
        encoded as the label 'Unknown'.
        """
        for col in ('User_Age_Group', 'Gender', 'Travel_Companion', 'User_Country'):
            # Columns absent from the data set are skipped silently.
            if col not in self.users_df.columns:
                continue
            labels = self.users_df[col].fillna('Unknown')
            encoder = LabelEncoder().fit(labels)
            self.users_df[f'{col}_encoded'] = encoder.transform(labels)
            self.encoders[col] = encoder
            print(f"Encoded {col} with {len(encoder.classes_)} unique values")

        if 'Location_Type' in self.locations_df.columns:
            type_labels = self.locations_df['Location_Type'].fillna('Unknown')
            type_encoder = LabelEncoder().fit(type_labels)
            self.locations_df['Location_Type_encoded'] = type_encoder.transform(type_labels)
            self.encoders['Location_Type'] = type_encoder

    def _enhance_reviews_dataset(self):
        """Add more features to the reviews dataset for ML training"""
        # Add user information
        user_cols = ['User_ID', 'User_Age_Group_encoded', 'Gender_encoded', 'Travel_Companion_encoded']
        if 'User_Country_encoded' in self.users_df.columns:
            user_cols.append('User_Country_encoded')
        if 'Avg_Rating' in self.users_df.columns:
            user_cols.extend(['Avg_Rating', 'Rating_Count', 'Rating_Variability'])

        user_info = self.users_df[user_cols]
        self.reviews_df = pd.merge(self.reviews_df, user_info, on='User_ID', how='left')

        # Add location information
        location_cols = ['Location_Name']
        if 'Location_Type_encoded' in self.locations_df.columns:
            location_cols.append('Location_Type_encoded')
        location_cols.extend([col for col in self.locations_df.columns if col.startswith('Has_')])
        location_cols.append('Activity_Count')

        location_info = self.locations_df[location_cols]
        self.reviews_df = pd.merge(self.reviews_df, location_info, on='Location_Name', how='left')

        # Create activity match feature between user and location
        self._add_activity_match_features()

        print(f"Enhanced reviews dataset with {len(self.reviews_df.columns)} features")

    def _add_activity_match_features(self):
        """Add features that capture the match between user preferences and location activities"""
        # This is computationally expensive, so we'll create a subset of the data
        reviews_sample = self.reviews_df.copy()

        # Create a function to calculate activity match ratio
        def calculate_activity_match(user_id, location_name):
            user_activities = self.users_df.loc[self.users_df['User_ID'] == user_id, 'Preferred_Activities'].values
            if len(user_activities) == 0:
                return 0
            user_activities = user_activities[0]

            location_activities = self.locations_df.loc[self.locations_df['Location_Name'] == location_name, 'Activities'].values
            if len(location_activities) == 0:
                return 0
            location_activities = location_activities[0]

            # Count matches
            matched = set(user_activities).intersection(set(location_activities))
            if not user_activities:
                return 0
            return len(matched) / len(user_activities)

        # Apply to a sample to avoid memory issues
        sample_size = min(5000, len(reviews_sample))
        sample_indices = np.random.choice(reviews_sample.index, sample_size, replace=False)

        # Calculate match ratio for the sample
        match_ratios = {}
        for idx in sample_indices:
            user_id = reviews_sample.loc[idx, 'User_ID']
            location_name = reviews_sample.loc[idx, 'Location_Name']
            match_ratios[idx] = calculate_activity_match(user_id, location_name)

        # Add to reviews dataframe
        reviews_sample['Activity_Match_Ratio'] = pd.Series(match_ratios)

        # Propagate to other reviews with the same user-location pairs
        activity_match_df = reviews_sample.loc[sample_indices, ['User_ID', 'Location_Name', 'Activity_Match_Ratio']]

        # Merge back to full reviews dataframe
        self.reviews_df = pd.merge(
            self.reviews_df,
            activity_match_df,
            on=['User_ID', 'Location_Name'],
            how='left'
        )

        # Fill NaN values
        self.reviews_df['Activity_Match_Ratio'] = self.reviews_df['Activity_Match_Ratio'].fillna(0)

        print(f"Added activity match feature, sampled from {sample_size} reviews")

    def _train_enhanced_model(self):
        """Tune, train and evaluate the XGBoost rating model.

        Builds the feature list from the columns available in
        ``self.reviews_df``, runs a randomized hyperparameter search on an 80%
        training split, keeps the best estimator in ``self.xgb_model`` and
        stores the hold-out RMSE and R² in ``self.rmse`` / ``self.r2``.

        Returns:
            The R² score measured on the 20% hold-out split.
        """
        # Fit the location encoder on names from BOTH frames. Fitting on the
        # reviews alone makes the transform of ``locations_df`` raise
        # ValueError for any location that has no review yet.
        location_encoder = LabelEncoder()
        location_encoder.fit(pd.concat(
            [self.reviews_df['Location_Name'], self.locations_df['Location_Name']],
            ignore_index=True,
        ))
        self.reviews_df['Location_Name_encoded'] = location_encoder.transform(self.reviews_df['Location_Name'])
        self.locations_df['Location_Name_encoded'] = location_encoder.transform(self.locations_df['Location_Name'])

        # Candidate features: demographics and location id first, then the
        # optional columns, then the activity features.
        features = ['User_Age_Group_encoded', 'Gender_encoded', 'Travel_Companion_encoded', 'Location_Name_encoded']

        optional_features = [
            'Location_Type_encoded',
            'User_Country_encoded',
            'Avg_Rating',
            'Rating_Count',
            'Rating_Variability',
        ]
        features.extend(col for col in optional_features if col in self.reviews_df.columns)

        features.append('Activity_Match_Ratio')
        features.append('Activity_Count')
        features.extend(col for col in self.reviews_df.columns if col.startswith('Has_'))

        # Keep only features that exist and carry at least one real value.
        valid_features = [
            feature for feature in features
            if feature in self.reviews_df.columns and not self.reviews_df[feature].isnull().all()
        ]

        self.model_features = valid_features
        print(f"Training with {len(valid_features)} features: {valid_features}")

        # Remaining gaps are filled with 0 so the model receives a dense matrix.
        X = self.reviews_df[valid_features].fillna(0)
        y = self.reviews_df['Rating']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        print(f"XGBoost: Training with {len(X_train)} samples, testing with {len(X_test)} samples (20% split)")

        # Search space for the randomized hyperparameter search.
        param_dist = {
            'n_estimators': [50, 100, 200, 300],
            'max_depth': [3, 4, 5, 6, 7],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'min_child_weight': [1, 3, 5, 7],
            'gamma': [0, 0.1, 0.2, 0.3],
            'reg_alpha': [0, 0.1, 1, 10],
            'reg_lambda': [0, 0.1, 1, 10],
        }

        print("Starting hyperparameter tuning with RandomizedSearchCV...")

        base_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

        random_search = RandomizedSearchCV(
            base_model,
            param_distributions=param_dist,
            n_iter=20,  # Try 20 parameter combinations
            scoring='neg_mean_squared_error',
            cv=3,  # 3-fold cross-validation
            verbose=1,
            random_state=42,
            n_jobs=-1  # Use all cores
        )
        random_search.fit(X_train, y_train)

        # The search's best estimator becomes the production model.
        self.xgb_model = random_search.best_estimator_

        print(f"Best parameters: {random_search.best_params_}")

        # Evaluate on the hold-out split with both RMSE and R².
        y_pred = self.xgb_model.predict(X_test)
        self.rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        self.r2 = r2_score(y_test, y_pred)

        print(f"XGBoost model performance: RMSE={self.rmse:.4f}, R²={self.r2:.4f}")
        print(f"Interpretation:")
        print(f"  - RMSE: Average prediction error is {self.rmse:.2f} stars")

        if self.r2 < 0.3:
            print(f"  - R²: Model explains {self.r2:.2f} of rating variance (below average)")
        elif self.r2 < 0.5:
            print(f"  - R²: Model explains {self.r2:.2f} of rating variance (good)")
        else:
            print(f"  - R²: Model explains {self.r2:.2f} of rating variance (excellent)")

        # Report the ten most influential features.
        importance = dict(zip(valid_features, self.xgb_model.feature_importances_))
        sorted_importance = {k: v for k, v in sorted(importance.items(), key=lambda item: item[1], reverse=True)}
        print(f"Top 10 feature importance: {list(sorted_importance.items())[:10]}")

        return self.r2

    def recommend(self, age_group, gender, travel_companion, activities, country=None, top_n=5):
        """Run the four-phase hybrid pipeline and return the formatted top picks.

        Phases: content-based activity matching, collaborative filtering over
        demographically similar users, ML rating prediction, and a weighted
        blend of the three scores.

        Args:
            age_group: Age-group label, encoded with the fitted encoder.
            gender: Gender label, encoded with the fitted encoder.
            travel_companion: Travel-companion label, encoded likewise.
            activities: Activity names the user is looking for.
            country: Optional country; used only when an encoder for it exists
                and the value is one of that encoder's classes.
            top_n: Number of recommendations to return.

        Returns:
            The frame produced by ``_format_results`` for the ``top_n`` best
            locations, or an empty ``DataFrame`` when no location offers any
            of the requested activities.
        """
        # Start each request with clean tracking state.
        self.similar_users, self.activity_matched_users, self.unrated_locations = [], [], []

        # Translate the profile into encoder codes.
        profile = (
            ('User_Age_Group', age_group),
            ('Gender', gender),
            ('Travel_Companion', travel_companion),
        )
        encoded_params = {
            param: self.encoders[param].transform([value])[0]
            for param, value in profile
        }

        country_encoder = self.encoders.get('User_Country') if country else None
        if country_encoder is not None and country in country_encoder.classes_:
            encoded_params['User_Country'] = country_encoder.transform([country])[0]

        # PHASE 1: locations offering at least one requested activity.
        print(f"\n=== PHASE 1: Content-Based Filtering ===")
        print(f"User Profile: {age_group} {gender} with {travel_companion}")
        print(f"Looking for activities: {activities}")

        cbf_results = self._find_matching_locations(activities)
        if cbf_results.empty:
            print("No locations match your activities.")
            return pd.DataFrame()

        candidate_names = cbf_results['Location_Name']

        # PHASE 2: ratings given by similar users.
        print(f"\n=== PHASE 2: Collaborative Filtering ===")
        self._find_similar_users(encoded_params, activities)
        cf_results = self._get_ratings_for_locations(candidate_names)

        # Remember which candidates no similar user has rated.
        if cf_results.empty:
            rated_locations = set()
        else:
            rated_locations = set(cf_results['Location_Name'])
        self.unrated_locations = [name for name in candidate_names if name not in rated_locations]
        print(f"Found ratings for {len(rated_locations)} locations")
        print(f"{len(self.unrated_locations)} locations have no ratings from similar users")

        # PHASE 3: model predictions for every candidate.
        print(f"\n=== PHASE 3: Machine Learning Predictions ===")
        ml_results = self._get_enhanced_ml_predictions(encoded_params, activities, candidate_names)

        # PHASE 4: join the three score sources on the location name.
        print(f"\n=== PHASE 4: Combining Results ===")
        recommendations = cbf_results.merge(cf_results, on='Location_Name', how='left')
        recommendations = recommendations.merge(ml_results, on='Location_Name', how='left')
        recommendations = recommendations.fillna({'CF_Score': 0, 'ML_Score': 0, 'Review_Count': 0})

        # Shift weight away from a score source that produced nothing.
        has_cf_signal = recommendations['CF_Score'].sum() != 0
        has_ml_signal = recommendations['ML_Score'].sum() != 0
        if not has_cf_signal:
            adjusted_weights = {'cbf': 0.6, 'cf': 0, 'ml': 0.4}
        elif not has_ml_signal:
            adjusted_weights = {'cbf': 0.6, 'cf': 0.4, 'ml': 0}
        else:
            adjusted_weights = self.weights

        # CF and ML scores are divided by 5.0 before blending with the CBF score.
        cbf_part = adjusted_weights['cbf'] * recommendations['CBF_Score']
        cf_part = adjusted_weights['cf'] * recommendations['CF_Score'] / 5.0
        ml_part = adjusted_weights['ml'] * recommendations['ML_Score'] / 5.0
        recommendations['Final_Score'] = cbf_part + cf_part + ml_part

        print(f"Applied weights: CBF={adjusted_weights['cbf']}, CF={adjusted_weights['cf']}, ML={adjusted_weights['ml']}")

        ranked = recommendations.sort_values('Final_Score', ascending=False)
        top_recommendations = ranked.head(top_n)
        print(f"Generated {len(top_recommendations)} top recommendations")

        # Transparency output about the users and unrated locations involved.
        self._print_user_details()
        self._print_unrated_details()

        return self._format_results(top_recommendations, activities)

    def _find_matching_locations(self, activities):
        """Find locations with matching activities"""
        # Similar to original implementation but with enhanced scoring
        cbf_results = []
        for _, loc in self.locations_df.iterrows():
            matched = set(activities).intersection(set(loc['Activities']))
            if matched:
                # Enhanced scoring that considers both match ratio and activity count
                match_ratio = len(matched)/len(activities)
                # Bonus for locations with exactly matching activities (not too many extras)
                activity_focus_score = 1.0
                if len(loc['Activities']) > 0:
                    activity_focus_score = min(len(matched) / len(loc['Activities']) + 0.5, 1.5)

                score = match_ratio * activity_focus_score

                # Boost popular location types if we have that data
                type_boost = 1.0
                if 'Location_Type' in loc and loc['Location_Type'] in ['Beaches', 'Waterfalls', 'National Parks']:
                    # Popular types for outdoor activities get a slight boost
                    type_boost = 1.1

                final_score = score * type_boost

                cbf_results.append({
                    'Location_Name': loc['Location_Name'],
                    'CBF_Score': final_score,
                    'Matched_Activities': list(matched),
                    'Location_Type': loc['Location_Type'] if 'Location_Type' in loc else 'Unknown'
                })

        cbf_df = pd.DataFrame(cbf_results)

        # Print activity match statistics
        if not cbf_df.empty:
            match_counts = cbf_df.groupby(cbf_df.apply(lambda x: len(x['Matched_Activities']), axis=1)).size()
            print("Activity match distribution:")
            for count, num_locations in match_counts.items():
                print(f"  {count}/{len(activities)} activities: {num_locations} locations")

            # Print top matches
            top_matches = cbf_df.sort_values('CBF_Score', ascending=False).head(5)
            print("\nTop activity matches:")
            for i, (_, row) in enumerate(top_matches.iterrows()):
                print(f"  {i+1}. {row['Location_Name']} - Matched: {row['Matched_Activities']}")

        return cbf_df

    def _find_similar_users(self, encoded_params, activities):
        """Find users with similar demographics and preferences"""
        # Start with demographic matching
        query = []
        for param, value in encoded_params.items():
            if param.endswith('_encoded'):
                col = param
            else:
                col = f'{param}_encoded'
            query.append(f"(self.users_df['{col}'] == {value})")

        query_str = " & ".join(query)
        similar_users_df = self.users_df[eval(query_str)].copy()

        print(f"Found {len(similar_users_df)} users with matching demographics")

        # Record similar users for transparency
        for _, user in similar_users_df.iterrows():
            # Check activity overlap
            user_activities = user['Preferred_Activities']
            matched_activities = set(activities).intersection(set(user_activities))
            activity_match_score = len(matched_activities) / len(activities) if activities else 0

            # Simplified boosting: only boost if ALL activities match
            # Only users with 100% match get 1.25x boost, everyone else gets 1.0x
            if activity_match_score == 1.0 and len(matched_activities) == len(activities):
                boost = 1.25  # Perfect match - all requested activities present
            else:
                boost = 1.0  # Any other case - no boost

            self.similar_users.append({
                'id': user['User_ID'],
                'country': user['User_Country'] if 'User_Country' in user else 'Unknown',
                'age': user['User_Age_Group'],
                'gender': user['Gender'],
                'activities': user['Preferred_Activities'],
                'boost': boost,
                'activity_match_score': activity_match_score
            })

            if activity_match_score == 1.0:
                self.activity_matched_users.append(user['User_ID'])

        # Sort by activity match
        self.similar_users.sort(key=lambda x: x['activity_match_score'], reverse=True)

        print(f"Users with all matching activities (1.25x boost): {len(self.activity_matched_users)}")

    def _get_ratings_for_locations(self, locations):
        """Get ratings from similar users for the given locations"""
        # If no similar users, return empty DataFrame
        if not self.similar_users:
            return pd.DataFrame(columns=['Location_Name', 'CF_Score'])

        # Get ratings from similar users for matched locations
        cf_data = []
        user_review_counts = {}

        for user in self.similar_users:
            # Get this user's ratings for matched locations
            user_reviews = self.reviews_df[
                (self.reviews_df['User_ID'] == user['id']) &
                (self.reviews_df['Location_Name'].isin(locations))
                ]

            # Track how many reviews each user has
            user_review_counts[user['id']] = len(user_reviews)

            # Apply activity match boost to ratings
            for _, review in user_reviews.iterrows():
                cf_data.append({
                    'Location_Name': review['Location_Name'],
                    'Rating': review['Rating'] * user['boost'],
                    'Original_Rating': review['Rating'],
                    'User_ID': user['id'],
                    'Boost': user['boost']
                })

        # Calculate average adjusted ratings with confidence weighting
        if cf_data:
            cf_df = pd.DataFrame(cf_data)

            # Apply confidence weighting - locations with more reviews get more weight
            rating_counts = cf_df.groupby('Location_Name').size().reset_index(name='count')
            max_count = rating_counts['count'].max()
            confidence_factor = rating_counts['count'] / max_count if max_count > 0 else 1
            confidence_boost = 0.5 + (0.5 * confidence_factor)
            rating_counts['confidence_boost'] = confidence_boost

            # Calculate weighted scores
            cf_scores = cf_df.groupby('Location_Name').agg({
                'Rating': 'mean',
                'Original_Rating': 'mean',
                'User_ID': 'count',
                'Boost': 'mean'
            }).reset_index()

            cf_scores = pd.merge(cf_scores, rating_counts[['Location_Name', 'confidence_boost']], on='Location_Name')
            cf_scores['CF_Score'] = cf_scores['Rating'] * cf_scores['confidence_boost']

            cf_scores.rename(columns={
                'Original_Rating': 'Avg_Rating',
                'User_ID': 'Review_Count',
                'Boost': 'Avg_Boost'
            }, inplace=True)

            # Print CF stats
            print(f"\nCollaborative Filtering Stats:")
            print(f"  Total reviews used: {len(cf_df)}")
            print(f"  Users who provided reviews: {len([u for u, c in user_review_counts.items() if c > 0])}")
            if len(cf_scores) > 0:
                print(f"  Average reviews per location: {cf_scores['Review_Count'].mean():.1f}")
                print(f"  Average boost applied: +{((cf_scores['Avg_Boost'].mean()-1.0)*100):.1f}%")
                print(f"  Average confidence boost: +{((cf_scores['confidence_boost'].mean()-1.0)*100):.1f}%")

            return cf_scores[['Location_Name', 'CF_Score', 'Avg_Rating', 'Review_Count']]
        else:
            print("  No reviews found from similar users")
            return pd.DataFrame(columns=['Location_Name', 'CF_Score', 'Avg_Rating', 'Review_Count'])

    def _get_enhanced_ml_predictions(self, encoded_params, activities, locations):
        """Generate ML predictions with enhanced feature set"""
        if not self.xgb_model:
            return pd.DataFrame(columns=['Location_Name', 'ML_Score'])

        # Get location data
        loc_data = self.locations_df[self.locations_df['Location_Name'].isin(locations)]

        # Create prediction input with all available features
        pred_data = []
        for _, loc in loc_data.iterrows():
            # Basic user and location features
            prediction_row = {
                'User_Age_Group_encoded': encoded_params.get('User_Age_Group', 0),
                'Gender_encoded': encoded_params.get('Gender', 0),
                'Travel_Companion_encoded': encoded_params.get('Travel_Companion', 0),
                'Location_Name_encoded': loc['Location_Name_encoded'],
                'Location_Name': loc['Location_Name'],
            }

            # Add location type if available
            if 'Location_Type' in loc:
                prediction_row['Location_Type'] = loc['Location_Type']
                if 'Location_Type_encoded' in loc and 'Location_Type_encoded' in self.model_features:
                    prediction_row['Location_Type_encoded'] = loc['Location_Type_encoded']

            # Add user country if available
            if 'User_Country' in encoded_params and 'User_Country_encoded' in self.model_features:
                prediction_row['User_Country_encoded'] = encoded_params['User_Country']

            # Add activity count
            if 'Activity_Count' in loc and 'Activity_Count' in self.model_features:
                prediction_row['Activity_Count'] = loc['Activity_Count']

            # Add activity-specific features
            for col in loc.index:
                if col.startswith('Has_') and col in self.model_features:
                    prediction_row[col] = loc[col]

            # Calculate activity match ratio
            matched = set(activities).intersection(set(loc['Activities']))
            activity_match_ratio = len(matched) / len(activities) if activities else 0
            if 'Activity_Match_Ratio' in self.model_features:
                prediction_row['Activity_Match_Ratio'] = activity_match_ratio

            # Add user rating behavior if needed
            for col in ['Avg_Rating', 'Rating_Count', 'Rating_Variability']:
                if col in self.model_features:
                    # Use average from similar users as a proxy
                    if self.similar_users and col == 'Avg_Rating':
                        avg_rating = np.mean([u.get('avg_rating', 3.5) for u in self.similar_users[:10]])
                        prediction_row[col] = avg_rating
                    elif col == 'Rating_Count':
                        prediction_row[col] = 5  # Assume moderate experience
                    elif col == 'Rating_Variability':
                        prediction_row[col] = 0.5  # Assume moderate variability

            pred_data.append(prediction_row)

        # Make predictions
        if pred_data:
            pred_df = pd.DataFrame(pred_data)

            # Ensure all model features are present
            for feature in self.model_features:
                if feature not in pred_df.columns:
                    pred_df[feature] = 0

            # Make predictions using the model
            predictions = self.xgb_model.predict(pred_df[self.model_features])

            # Add predictions to results
            results = pd.DataFrame({
                'Location_Name': pred_df['Location_Name'],
                'Location_Type': pred_df['Location_Type'] if 'Location_Type' in pred_df.columns else 'Unknown',
                'ML_Score': predictions
            })

            # Cap predictions to valid range
            results['ML_Score'] = results['ML_Score'].clip(1, 5)

            # Print XGBoost stats
            print(f"XGBoost prediction stats:")
            print(f"  Min score: {results['ML_Score'].min():.2f}")
            print(f"  Max score: {results['ML_Score'].max():.2f}")
            print(f"  Average score: {results['ML_Score'].mean():.2f}")

            # Store unrated predictions for transparency
            self.ml_unrated_predictions = results[results['Location_Name'].isin(self.unrated_locations)]

            return results

        return pd.DataFrame(columns=['Location_Name', 'ML_Score'])



    def _format_results(self, recommendations, activities):
        """Format the final recommendation results with emojis and better formatting"""
        results = []
        for _, rec in recommendations.iterrows():
            loc_data = self.locations_df[self.locations_df['Location_Name'] == rec['Location_Name']].iloc[0]
            matched = set(activities).intersection(set(loc_data['Activities']))

            # Get location type and its emoji
            loc_type = loc_data['Location_Type'] if 'Location_Type' in loc_data else 'Unknown'
            loc_emoji = self.location_emojis.get(loc_type, '📍')

            # Get emojis for matched activities (up to 3)
            activity_emojis = []
            for activity in loc_data['Activities'][:3]:
                emoji = self.activity_emojis.get(activity, '🔍')
                activity_emojis.append(f"{emoji} {activity}")

            results.append({
                'Location': rec['Location_Name'],
                'Type': f"{loc_emoji} {loc_type}",
                'Match': f"{len(matched)}/{len(activities)}",
                'Activities': ', '.join(activity_emojis),
                'CBF': round(rec['CBF_Score'], 2),
                'CF': round(rec.get('CF_Score', 0), 2),
                'ML': round(rec.get('ML_Score', 0), 2),
                'Reviews': int(rec.get('Review_Count', 0)),
                'Score': round(rec['Final_Score'], 2)
            })

        return pd.DataFrame(results)

    def _print_user_details(self):
        """Print similar user details with improved formatting"""
        print(f"\n🧑‍🤝‍🧑 SIMILAR USERS ({len(self.similar_users)}) 🧑‍🤝‍🧑")
        print("=" * 70)

        if not self.similar_users:
            print("❌ No similar users found")
            return

        print(f"👥 Users with activity bonus (≥ 1.5x): {len(self.activity_matched_users)}")

        # Show at least 10 users if available
        users_to_show = min(10, len(self.similar_users))
        print(f"\n👁️ Showing {users_to_show} similar users:")
        for i, user in enumerate(self.similar_users[:users_to_show]):
            print(f"\n👤 User {i+1}: {user['age']} {user['gender']} from {user['country']}")

            # Format activities with emojis
            activity_list = []
            for activity in user['activities'][:5]:
                emoji = self.activity_emojis.get(activity, '🔍')
                activity_list.append(f"{emoji} {activity}")

            print(f"  🎯 Activities: {', '.join(activity_list)}")

            # Format match score with color indicator
            match_score = user['activity_match_score']
            if match_score >= 0.8:
                match_indicator = "🟢"  # High match
            elif match_score >= 0.5:
                match_indicator = "🟡"  # Medium match
            else:
                match_indicator = "🔴"  # Low match

            print(f"  {match_indicator} Activity match: {match_score:.2f} (boost: {user['boost']}x)")

            # Show this user's reviews
            user_reviews = self.reviews_df[self.reviews_df['User_ID'] == user['id']]
            if not user_reviews.empty:
                print(f"  📝 Total reviews: {len(user_reviews)}")

                # Show top 3 highest-rated places
                top_reviews = user_reviews.sort_values('Rating', ascending=False).head(3)
                if not top_reviews.empty:
                    print(f"  ⭐ Highly rated places:")
                    for _, review in top_reviews.iterrows():
                        # Add stars based on rating
                        stars = "⭐" * int(review['Rating'])
                        print(f"    - {review['Location_Name']}: {stars} ({review['Rating']}/5)")

    def _print_unrated_details(self):
        """Print ML predictions for unrated locations with better formatting"""
        if not hasattr(self, 'ml_unrated_predictions') or self.ml_unrated_predictions.empty:
            return

        print(f"\n🤖 XGBOOST PREDICTIONS FOR UNRATED LOCATIONS 🤖")
        print("=" * 70)
        print(f"These locations had no ratings from similar users:")

        # Show top ML predictions for unrated locations
        top_unrated = self.ml_unrated_predictions.sort_values('ML_Score', ascending=False).head(5)
        for i, (_, pred) in enumerate(top_unrated.iterrows()):
            loc_type = pred['Location_Type']
            loc_emoji = self.location_emojis.get(loc_type, '📍')

            # Stars based on predicted score
            stars = "⭐" * int(pred['ML_Score'])
            half_star = "✨" if pred['ML_Score'] % 1 >= 0.5 else ""

            print(f"  {i+1}. {pred['Location_Name']} {loc_emoji} {loc_type}: {stars}{half_star} ({pred['ML_Score']:.1f}/5)")

        # Group by location type
        type_avg = self.ml_unrated_predictions.groupby('Location_Type')['ML_Score'].mean()
        if not type_avg.empty:
            print("\n📊 Average predictions by location type:")
            for loc_type, score in type_avg.items():
                loc_emoji = self.location_emojis.get(loc_type, '📍')
                stars = "⭐" * int(score)
                half_star = "✨" if score % 1 >= 0.5 else ""
                print(f"  {loc_emoji} {loc_type}: {stars}{half_star} ({score:.1f}/5)")

    def recommend(self, age_group, gender, travel_companion, activities, country=None, top_n=5):
        """Generate recommendations with beautiful formatted output"""
        # Reset tracking variables
        self.similar_users = []
        self.activity_matched_users = []
        self.unrated_locations = []

        # Encode user parameters
        encoded_params = {}
        for param, value in [('User_Age_Group', age_group), ('Gender', gender),
                             ('Travel_Companion', travel_companion)]:
            encoded_params[param] = self.encoders[param].transform([value])[0]

        if country and 'User_Country' in self.encoders:
            if country in self.encoders['User_Country'].classes_:
                encoded_params['User_Country'] = self.encoders['User_Country'].transform([country])[0]

        # Create a header for the recommendation process
        print("\n" + "=" * 70)
        print("🌟 PERSONALIZED TRAVEL RECOMMENDATIONS 🌟".center(70))
        print("=" * 70)

        # Format profile with emojis
        gender_emoji = "👨" if gender == "Male" else "👩" if gender == "Female" else "🧑"
        companion_emoji = {
            "Friends": "👫",
            "Family": "👨‍👩‍👧",
            "Solo": "🧍",
            "Partner": "💑",
            "Group": "👥"
        }.get(travel_companion, "👥")

        # Format activities with emojis
        activity_list = []
        for activity in activities:
            emoji = self.activity_emojis.get(activity, '🔍')
            activity_list.append(f"{emoji} {activity}")

        # PHASE 1: Find locations with matching activities
        print(f"\n🧩 PHASE 1: CONTENT-BASED FILTERING 🧩")
        print("=" * 70)
        print(f"🧑‍💼 User Profile: {gender_emoji} {age_group} {gender} with {companion_emoji} {travel_companion}")
        print(f"🎯 Looking for activities: {', '.join(activity_list)}")

        cbf_results = self._find_matching_locations(activities)
        if cbf_results.empty:
            print("❌ No locations match your activities.")
            return pd.DataFrame()

        # PHASE 2: Get similar user ratings
        print(f"\n👥 PHASE 2: COLLABORATIVE FILTERING 👥")
        print("=" * 70)
        self._find_similar_users(encoded_params, activities)
        cf_results = self._get_ratings_for_locations(cbf_results['Location_Name'])

        # Track which locations have no ratings from similar users
        rated_locations = set(cf_results['Location_Name']) if not cf_results.empty else set()
        self.unrated_locations = [loc for loc in cbf_results['Location_Name'] if loc not in rated_locations]
        print(f"📊 Found ratings for {len(rated_locations)} locations")
        print(f"⚠️ {len(self.unrated_locations)} locations have no ratings from similar users")

        # PHASE 3: Get ML predictions with enhanced model
        print(f"\n🤖 PHASE 3: MACHINE LEARNING PREDICTIONS 🤖")
        print("=" * 70)
        ml_results = self._get_enhanced_ml_predictions(encoded_params, activities, cbf_results['Location_Name'])

        # PHASE 4: Combine all results
        print(f"\n🔄 PHASE 4: COMBINING RESULTS 🔄")
        print("=" * 70)
        recommendations = pd.merge(cbf_results, cf_results, on='Location_Name', how='left')
        recommendations = pd.merge(recommendations, ml_results, on='Location_Name', how='left')
        recommendations.fillna({'CF_Score': 0, 'ML_Score': 0, 'Review_Count': 0}, inplace=True)

        # Calculate final score - adjust weights based on data availability
        if recommendations['CF_Score'].sum() == 0:
            # No CF data, rely more on ML and CBF
            adjusted_weights = {'cbf': 0.6, 'cf': 0, 'ml': 0.4}
        elif recommendations['ML_Score'].sum() == 0:
            # No ML data, rely more on CF and CBF
            adjusted_weights = {'cbf': 0.6, 'cf': 0.4, 'ml': 0}
        else:
            # Use default weights
            adjusted_weights = self.weights

        recommendations['Final_Score'] = (
                adjusted_weights['cbf'] * recommendations['CBF_Score'] +
                adjusted_weights['cf'] * recommendations['CF_Score'] / 5.0 +
                adjusted_weights['ml'] * recommendations['ML_Score'] / 5.0
        )

        print(f"⚖️ Applied weights: CBF={adjusted_weights['cbf']:.2f}, CF={adjusted_weights['cf']:.2f}, ML={adjusted_weights['ml']:.2f}")

        # Sort and return top recommendations
        top_recommendations = recommendations.sort_values('Final_Score', ascending=False).head(top_n)
        print(f"✅ Generated {len(top_recommendations)} top recommendations")

        # Print detailed user info
        self._print_user_details()
        self._print_unrated_details()

        formatted_results = self._format_results(top_recommendations, activities)

        return formatted_results

In [2]:
def main():
    """Run one example recommendation end-to-end and print a report.

    Creates an ``ImprovedRecommender``, loads its data, requests
    recommendations for a fixed example profile and prints one entry per
    recommended location followed by a short summary of the top pick.

    Returns:
        The DataFrame returned by ``ImprovedRecommender.recommend``.
    """
    # Make a nice header
    print("\n" + "=" * 70)
    print("🌍 TOURISM RECOMMENDATION SYSTEM 🌍".center(70))
    print("=" * 70)

    recommender = ImprovedRecommender()
    print("🔄 Loading data and training models...")
    recommender.load_data()

    # You can customize these parameters based on user input
    results = recommender.recommend(
        age_group='36-50',
        gender='Male',
        travel_companion='Friends',
        activities=['Exploring Exhibits', 'Wildlife Spotting', 'Trekking', 'Bird Watching']
    )

    # Display formatted results
    print("\n" + "=" * 70)
    print("🏆 TOP RECOMMENDED LOCATIONS 🏆".center(70))
    print("=" * 70)

    # Rank by row position rather than by index label, so the output stays
    # correct even if the frame does not carry a 0..n-1 index.
    last_position = len(results) - 1
    for position, (_, row) in enumerate(results.iterrows()):
        # Top recommendation gets special formatting
        if position == 0:
            print(f"\n🥇 TOP PICK: {row['Location']} ({row['Type']})")
        else:
            print(f"\n{position + 1}. {row['Location']} ({row['Type']})")

        print(f"   ⭐ Overall Score: {row['Score']}/1.0")
        print(f"   🎯 Activity Match: {row['Match']} activities")
        print(f"   🏆 Ratings: CBF: {row['CBF']}, CF: {row['CF']}, ML: {row['ML']}")

        # Show activities with emojis. Test the string itself: str.split
        # returns [''] for an empty string, so a length check never fails.
        if row['Activities']:
            shown_activities = row['Activities'].split(', ')[:3]
            print(f"   🎪 Activities: {', '.join(shown_activities)}")

        # Add review count if available
        if row['Reviews'] > 0:
            print(f"   📝 Based on {row['Reviews']} reviews")

        # Add separator between recommendations
        if position < last_position:
            print("   " + "-" * 50)

    # Top recommendation highlight
    if not results.empty:
        top_rec = results.iloc[0]
        print("=" * 70)
        print(f"• 🥇 Top recommendation: {top_rec['Location']} with score {top_rec['Score']}")
        print(f"• 💯 Match quality: {top_rec['Match']} of your requested activities")
        print("-" * 70)

    return results

# Run the example only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


                  🌍 TOURISM RECOMMENDATION SYSTEM 🌍                   
🔄 Loading data and training models...
🔄 Loaded 76 locations, 10673 users, 16156 reviews
Location types: {'Religious Sites': 14, 'Beaches': 11, 'Farms': 8, 'Museums': 7, 'National Parks': 7, 'Nature & Wildlife Areas': 7, 'Historic Sites': 6, 'Waterfalls': 6, 'Gardens': 5, 'Bodies of Water': 4, 'Zoological Gardens': 1}
Added user rating statistics (avg, count, variability)
Encoded User_Age_Group with 4 unique values
Encoded Gender with 2 unique values
Encoded Travel_Companion with 4 unique values
Encoded User_Country with 131 unique values
Added activity match feature, sampled from 5000 reviews
Enhanced reviews dataset with 28 features
Training with 21 features: ['User_Age_Group_encoded', 'Gender_encoded', 'Travel_Companion_encoded', 'Location_Name_encoded', 'Location_Type_encoded', 'User_Country_encoded', 'Avg_Rating', 'Rating_Count', 'Rating_Variability', 'Activity_Match_Ratio', 'Activity_Count', 'Has_Sightseeing',