In [58]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

class RecommendationSystem:
    def __init__(self):
        """Create an empty recommender.

        No data is attached here; the tables arrive through
        ``load_and_split_data`` and the dense feature matrices are produced
        by ``create_features``.
        """
        # Transformers that get fitted during preprocessing.
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=20)
        self.scaler = StandardScaler()
        self.post_encoder = LabelEncoder()
        self.user_encoder = LabelEncoder()

        # Placeholders until create_features() fills them in.
        self.post_features = None
        self.user_features = None

    def load_and_split_data(self, users_df, posts_df, engagements_df, test_size=0.2):
        """Attach the raw tables, split the engagements and preprocess everything.

        Args:
            users_df: User table; copied, the caller's frame is left untouched.
            posts_df: Post table; copied.
            engagements_df: Engagement table; copied. Its ``user_id`` column is
                used to stratify the split.
            test_size: Fraction of engagements held out for evaluation.

        Side effects:
            Sets ``users_df``, ``posts_df``, ``engagements_df``,
            ``engagements_train`` and ``engagements_test`` on the instance,
            prints summary sizes, then runs the three ``_preprocess_*`` steps.
        """
        self.users_df, self.posts_df, self.engagements_df = (
            users_df.copy(),
            posts_df.copy(),
            engagements_df.copy(),
        )

        print("Dataset shapes:")
        print(f"Users: {self.users_df.shape}")
        print(f"Posts: {self.posts_df.shape}")
        print(f"Engagements: {self.engagements_df.shape}")

        # Fixed seed keeps the split reproducible; stratifying on user_id keeps
        # each user's engagements represented on both sides of the split.
        split = train_test_split(
            self.engagements_df,
            test_size=test_size,
            random_state=42,
            stratify=self.engagements_df['user_id'],
        )
        self.engagements_train, self.engagements_test = split

        print(f"Training engagements: {len(self.engagements_train)}")
        print(f"Test engagements: {len(self.engagements_test)}")

        # Order matters only for the printed output; the steps touch
        # different tables.
        for step in (
            self._preprocess_users,
            self._preprocess_posts,
            self._preprocess_engagements,
        ):
            step()

    def _preprocess_users(self):
        """Clean ``self.users_df`` in place and add its derived columns.

        Adds ``interests_list`` (lower-cased, stripped tokens of
        ``top_3_interests``), one 0/1 ``interest_<name>`` column per distinct
        interest, ``gender_encoded``, ``age_normalized`` and
        ``engagement_score_normalized``.
        """
        users = self.users_df  # alias only; every assignment lands on self.users_df

        # Fill gaps first so parsing and scaling never see NaN.
        defaults = {
            'top_3_interests': '',
            'age': users['age'].median(),
            'gender': 'unknown',
            'past_engagement_score': 0,
        }
        for column, default in defaults.items():
            users[column] = users[column].fillna(default)

        def parse_interests(raw):
            # An empty cell means "no interests", not one empty interest.
            if not raw:
                return []
            return [token.strip().lower() for token in str(raw).split(',')]

        users['interests_list'] = users['top_3_interests'].apply(parse_interests)

        all_interests = sorted(
            {name for names in users['interests_list'] for name in names}
        )
        print(f"Found {len(all_interests)} unique interests: {all_interests}")

        # One indicator column per interest.
        for interest in all_interests:
            users[f'interest_{interest}'] = users['interests_list'].apply(
                lambda names, wanted=interest: int(wanted in names)
            )

        users['gender_encoded'] = LabelEncoder().fit_transform(users['gender'])

        # The shared scaler is refitted for each column, so after this method
        # it holds the statistics of past_engagement_score only.
        for source, target in (
            ('age', 'age_normalized'),
            ('past_engagement_score', 'engagement_score_normalized'),
        ):
            users[target] = self.scaler.fit_transform(users[[source]])

    def _preprocess_posts(self):
        """Clean ``self.posts_df`` and add tag and content-type features.

        Adds ``tags_list`` (lower-cased, stripped tokens of ``tags``), the
        TF-IDF columns ``tag_0 .. tag_k`` computed from the space-joined tags
        of each post, and ``content_type_encoded``. ``self.posts_df`` is
        rebound to a new frame when the TF-IDF columns are appended.
        """
        posts = self.posts_df

        for column, default in (('tags', ''), ('content_type', 'unknown')):
            posts[column] = posts[column].fillna(default)

        def parse_tags(raw):
            # An empty cell means "no tags", not one empty tag.
            if not raw:
                return []
            return [token.strip().lower() for token in str(raw).split(',')]

        posts['tags_list'] = posts['tags'].apply(parse_tags)

        # One whitespace-separated "document" per post for the vectorizer.
        all_tags = list(map(' '.join, posts['tags_list']))
        if all_tags:
            weights = self.tfidf_vectorizer.fit_transform(all_tags).toarray()
            tag_columns = [f'tag_{i}' for i in range(weights.shape[1])]
            tag_df = pd.DataFrame(weights, columns=tag_columns, index=posts.index)
            self.posts_df = pd.concat([posts, tag_df], axis=1)

        # Use self.posts_df from here on: it may have been rebound above.
        self.posts_df['content_type_encoded'] = LabelEncoder().fit_transform(
            self.posts_df['content_type']
        )

    def _preprocess_engagements(self):
        """Encode ids and build the binary user x post training matrix.

        The encoders are fitted on the training split only. Test rows are
        encoded with the same mapping; ids that never occur in training get
        the sentinel code ``-1`` instead of raising, so a post or user that
        landed only in the test split no longer aborts loading.

        Side effects:
            Adds ``user_id_encoded`` / ``post_id_encoded`` to both splits and
            sets ``self.engagement_matrix`` (rows: encoded users, columns:
            encoded posts, values: 1 where any training engagement > 0).
        """
        # Explicit copies so the column assignments below always write to
        # frames owned by this object rather than to derived frames.
        self.engagements_train = self.engagements_train.copy()
        self.engagements_test = self.engagements_test.copy()

        self.engagements_train['user_id_encoded'] = self.user_encoder.fit_transform(
            self.engagements_train['user_id']
        )
        self.engagements_train['post_id_encoded'] = self.post_encoder.fit_transform(
            self.engagements_train['post_id']
        )

        def encode_known(encoder, ids):
            # Same codes as encoder.transform for known labels (the code is the
            # label's position in classes_), -1 for labels unseen in training.
            lookup = {label: code for code, label in enumerate(encoder.classes_)}
            return ids.map(lookup).fillna(-1).astype(int)

        self.engagements_test['user_id_encoded'] = encode_known(
            self.user_encoder, self.engagements_test['user_id']
        )
        self.engagements_test['post_id_encoded'] = encode_known(
            self.post_encoder, self.engagements_test['post_id']
        )

        # A user may have several rows for the same post; pivot() refuses
        # duplicate (index, column) pairs, so collapse them first. Taking the
        # max keeps "any engagement > 0" intact for the binarisation below.
        pair_strength = self.engagements_train.groupby(
            ['user_id_encoded', 'post_id_encoded'], as_index=False
        )['engagement'].max()

        self.engagement_matrix = pair_strength.pivot(
            index='user_id_encoded',
            columns='post_id_encoded',
            values='engagement'
        ).fillna(0)

        # Convert to binary (any engagement > 0 means user engaged with post)
        self.engagement_matrix = (self.engagement_matrix > 0).astype(int)

        print(f"Engagement matrix shape: {self.engagement_matrix.shape}")
        print(f"Total training engagements: {self.engagement_matrix.sum().sum()}")

    def create_features(self):

        # User features: demographics + interests
        interest_cols = [col for col in self.users_df.columns if col.startswith('interest_')]
        user_demo_features = ['age_normalized', 'gender_encoded', 'engagement_score_normalized']

        user_interest_features = self.users_df[interest_cols].values if interest_cols else np.zeros((len(self.users_df), 1))
        user_demo_features = self.users_df[user_demo_features].values

        self.user_features = np.hstack([user_demo_features, user_interest_features])

        # Post features: content type + tags
        tag_cols = [col for col in self.posts_df.columns if col.startswith('tag_')]
        post_content_features = self.posts_df[['content_type_encoded']].values
        post_tag_features = self.posts_df[tag_cols].values if tag_cols else np.zeros((len(self.posts_df), 1))

        self.post_features = np.hstack([post_content_features, post_tag_features])

        print(f"User features shape: {self.user_features.shape}")
        print(f"Post features shape: {self.post_features.shape}")

    def collaborative_filtering(self, user_id, top_k=3, n_factors=10):
        try:
            user_idx = self.user_encoder.transform([user_id])[0]

            # Perform SVD on the engagement matrix
            svd = TruncatedSVD(n_components=n_factors, random_state=42)
            user_factors = svd.fit_transform(self.engagement_matrix)
            item_factors = svd.components_.T

            # Reconstruct the matrix
            predicted_scores = user_factors @ item_factors.T

            # Get predictions for this user
            user_predictions = predicted_scores[user_idx]

            # Get posts user has already engaged with in TRAINING data
            user_engaged_train = set(self.engagements_train[
                self.engagements_train['user_id'] == user_id
            ]['post_id'])
            engaged_indices = [self.post_encoder.transform([pid])[0] for pid in user_engaged_train
                             if pid in self.post_encoder.classes_]

            # Set already engaged posts to -inf
            user_predictions[engaged_indices] = -np.inf

            # Get top recommendations
            valid_indices = np.where(user_predictions > -np.inf)[0]
            if len(valid_indices) == 0:
                return self._get_popular_posts(user_engaged_train, top_k), np.zeros(top_k)

            top_indices = np.argsort(user_predictions)[::-1][:top_k]
            recommended_posts = self.post_encoder.inverse_transform(top_indices)

            return recommended_posts, user_predictions[top_indices]

        except Exception as e:
            print(f"Error in collaborative filtering for user {user_id}: {e}")
            user_engaged_train = set(self.engagements_train[
                self.engagements_train['user_id'] == user_id
            ]['post_id'])
            return self._get_popular_posts(user_engaged_train, top_k), np.zeros(top_k)

    def content_based_recommendation(self, user_id, top_k=3):
        try:
            user_idx = self.user_encoder.transform([user_id])[0]
            user_row = self.users_df[self.users_df['user_id'] == user_id].iloc[0]
            user_interests = user_row['interests_list']

            if not user_interests:
                user_engaged_train = set(self.engagements_train[
                    self.engagements_train['user_id'] == user_id
                ]['post_id'])
                return self._get_popular_posts(user_engaged_train, top_k), np.zeros(top_k)

            # Calculate similarity scores for all posts
            scores = []
            for post_idx, post_row in self.posts_df.iterrows():
                post_id = post_row['post_id']
                post_tags = post_row['tags_list']

                # Calculate interest overlap
                overlap = len(set(user_interests) & set(post_tags))
                score = overlap / max(len(user_interests), 1)

                # Add content type preference boost
                if post_row['content_type'] in ['video', 'image']:
                    score *= 1.1

                scores.append((post_id, score))

            # Sort by score
            scores.sort(key=lambda x: x[1], reverse=True)

            # Filter out posts user already engaged with in training
            user_engaged_train = set(self.engagements_train[
                self.engagements_train['user_id'] == user_id
            ]['post_id'])

            recommendations = []
            final_scores = []
            for post_id, score in scores:
                if post_id not in user_engaged_train and len(recommendations) < top_k:
                    recommendations.append(post_id)
                    final_scores.append(score)

            # If not enough recommendations, add popular ones
            if len(recommendations) < top_k:
                additional = self._get_popular_posts(
                    user_engaged_train.union(set(recommendations)),
                    top_k - len(recommendations)
                )
                recommendations.extend(additional)
                final_scores.extend([0.1] * len(additional))

            return np.array(recommendations), np.array(final_scores)

        except Exception as e:
            print(f"Error in content-based recommendation for user {user_id}: {e}")
            user_engaged_train = set(self.engagements_train[
                self.engagements_train['user_id'] == user_id
            ]['post_id'])
            return self._get_popular_posts(user_engaged_train, top_k), np.zeros(top_k)

    def _get_popular_posts(self, exclude_posts, top_k):
        post_engagement_counts = self.engagements_train.groupby('post_id').size()
        popular_posts = post_engagement_counts.sort_values(ascending=False).index.tolist()

        recommendations = []
        for post in popular_posts:
            if post not in exclude_posts and len(recommendations) < top_k:
                recommendations.append(post)

        # If still not enough, use any posts
        if len(recommendations) < top_k:
            all_posts = set(self.posts_df['post_id'])
            remaining_posts = list(all_posts - exclude_posts - set(recommendations))
            if remaining_posts:
                additional = np.random.choice(remaining_posts,
                                           size=min(top_k - len(recommendations), len(remaining_posts)),
                                           replace=False)
                recommendations.extend(additional)

        return recommendations

    def hybrid_recommendation(self, user_id, top_k=3):
        # Get recommendations from both methods
        content_recs, content_scores = self.content_based_recommendation(user_id, top_k * 2)
        collab_recs, collab_scores = self.collaborative_filtering(user_id, top_k * 2)

        # Normalize scores
        if len(content_scores) > 0 and np.max(content_scores) > 0:
            content_scores = content_scores / np.max(content_scores)
        if len(collab_scores) > 0 and np.max(collab_scores) > 0:
            collab_scores = collab_scores / np.max(collab_scores)

        # Combine recommendations
        combined_scores = {}

        # Add content-based scores (weight: 0.6)
        for post, score in zip(content_recs, content_scores):
            combined_scores[post] = score * 0.6

        # Add collaborative scores (weight: 0.4)
        for post, score in zip(collab_recs, collab_scores):
            if post in combined_scores:
                combined_scores[post] += score * 0.4
            else:
                combined_scores[post] = score * 0.4

        # Sort by combined score and get top K
        sorted_posts = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
        final_recommendations = [post for post, score in sorted_posts[:top_k]]
        final_scores = [score for post, score in sorted_posts[:top_k]]

        return np.array(final_recommendations), np.array(final_scores)

    def generate_recommendations(self):

        recommendations = {}

        for user_id in self.users_df['user_id']:
            rec_posts, scores = self.hybrid_recommendation(user_id)
            recommendations[user_id] = {
                'recommended_posts': rec_posts,
                'scores': scores
            }

        return recommendations

    def save_recommendations_to_csv(self, recommendations, filename='recommendations.csv'):
        print(f"\nSaving recommendations to {filename}...")

        # Create a list to store the recommendation data
        recommendation_data = []

        for user_id, rec_data in recommendations.items():
            recommended_posts = rec_data['recommended_posts']
            scores = rec_data['scores']

            # Create a row for this user
            row = {
                'user_id': user_id,
                'recommended_post_1': recommended_posts[0] if len(recommended_posts) > 0 else None,
                'recommended_post_2': recommended_posts[1] if len(recommended_posts) > 1 else None,
                'recommended_post_3': recommended_posts[2] if len(recommended_posts) > 2 else None,
                'score_1': scores[0] if len(scores) > 0 else 0,
                'score_2': scores[1] if len(scores) > 1 else 0,
                'score_3': scores[2] if len(scores) > 2 else 0
            }
            recommendation_data.append(row)

        # Create DataFrame and save to CSV
        recommendations_df = pd.DataFrame(recommendation_data)
        recommendations_df.to_csv(filename, index=False)

        print(f"Successfully saved {len(recommendations_df)} user recommendations to {filename}")

        self.save_detailed_recommendations(recommendations, filename='detailed_recommendations.csv')

        return recommendations_df

    def save_detailed_recommendations(self, recommendations, filename='detailed_recommendations.csv'):
        detailed_data = []

        for user_id, rec_data in recommendations.items():
            user_info = self.users_df[self.users_df['user_id'] == user_id].iloc[0]
            recommended_posts = rec_data['recommended_posts']
            scores = rec_data['scores']

            for i, (post_id, score) in enumerate(zip(recommended_posts, scores)):
                post_info = self.posts_df[self.posts_df['post_id'] == post_id].iloc[0]

                row = {
                    'user_id': user_id,
                    'user_interests': user_info['top_3_interests'],
                    'recommendation_rank': i + 1,
                    'post_id': post_id,
                    'post_content_type': post_info['content_type'],
                    'post_tags': post_info['tags'],
                    'recommendation_score': score,
                    'match_reason': self._get_match_reason(user_info, post_info)
                }
                detailed_data.append(row)

        detailed_df = pd.DataFrame(detailed_data)
        detailed_df.to_csv(filename, index=False)
        print(f"Successfully saved detailed recommendations to {filename}")

        return detailed_df

    def _get_match_reason(self, user_info, post_info):
        user_interests = user_info['interests_list']
        post_tags = post_info['tags_list']

        common_interests = set(user_interests) & set(post_tags)
        if common_interests:
            return f"Common interests: {', '.join(common_interests)}"
        else:
            return "Popular content or collaborative filtering"

    def evaluate_recommendations(self, recommendations, k=3):

        hits = 0
        total_recommendations = 0
        user_coverage = 0

        for user_id, rec_data in recommendations.items():
            recommended_posts = rec_data['recommended_posts'][:k]

            user_test_engagements = set(self.engagements_test[
                self.engagements_test['user_id'] == user_id
            ]['post_id'])

            user_hits = 0
            for post in recommended_posts:
                if post in user_test_engagements:
                    hits += 1
                    user_hits += 1
                total_recommendations += 1

            if user_hits > 0:
                user_coverage += 1

        precision = hits / total_recommendations if total_recommendations > 0 else 0
        coverage = user_coverage / len(self.users_df) if len(self.users_df) > 0 else 0

        print(f"Precision@{k}: {precision:.4f}")
        print(f"User Coverage@{k}: {coverage:.4f}")
        print(f"Total Recommendations: {total_recommendations}")
        print(f"Total Hits: {hits}")
        print(f"Users with at least one hit: {user_coverage}/{len(self.users_df)}")

        return precision, coverage

def main():
    """Run the whole pipeline on the CSV files in the working directory.

    Loads ``Users.csv``, ``Posts.csv`` and ``Engagements.csv``, builds the
    recommender, prints the recommendations of the first ten users with their
    test hits, evaluates the hybrid output, saves the CSV exports and finally
    compares the three recommendation methods.

    Returns:
        ``(rec_system, recommendations, recommendations_df)``.
    """
    def banner(title):
        print("\n" + "=" * 60)
        print(title)
        print("=" * 60)

    users_df = pd.read_csv('Users.csv')
    posts_df = pd.read_csv('Posts.csv')
    engagements_df = pd.read_csv('Engagements.csv')

    rec_system = RecommendationSystem()
    rec_system.load_and_split_data(users_df, posts_df, engagements_df, test_size=0.2)
    rec_system.create_features()

    recommendations = rec_system.generate_recommendations()

    banner("DETAILED RECOMMENDATIONS (First 10 Users)")

    held_out = rec_system.engagements_test
    for user_id, rec_data in list(recommendations.items())[:10]:
        # The raw (unprocessed) tables are used for display.
        user_info = users_df[users_df['user_id'] == user_id].iloc[0]
        print(f"\nUser {user_id} (Interests: {user_info['top_3_interests']}):")
        print("-" * 50)

        user_test_engagements = set(held_out[held_out['user_id'] == user_id]['post_id'])

        print(f"Test engagements to predict: {len(user_test_engagements)} posts")

        ranked = zip(rec_data['recommended_posts'], rec_data['scores'])
        for j, (post_id, score) in enumerate(ranked):
            post_info = posts_df[posts_df['post_id'] == post_id].iloc[0]
            is_hit = " " if post_id not in user_test_engagements else "✓ HIT!"

            print(f"  {is_hit} {j+1}. Post {post_id} (Score: {score:.3f})")
            print(f"     Type: {post_info['content_type']}, Tags: {post_info['tags']}")

    banner("EVALUATION RESULTS (TEST DATA)")
    precision, coverage = rec_system.evaluate_recommendations(recommendations)

    # Save recommendations to CSV
    recommendations_df = rec_system.save_recommendations_to_csv(recommendations)

    banner("METHOD COMPARISON")

    methods = {
        'Content-Based': rec_system.content_based_recommendation,
        'Collaborative': rec_system.collaborative_filtering,
        'Hybrid': rec_system.hybrid_recommendation,
    }

    for method_name, method_func in methods.items():
        method_recs = {}
        for user_id in rec_system.users_df['user_id']:
            rec_posts, scores = method_func(user_id)
            method_recs[user_id] = {'recommended_posts': rec_posts, 'scores': scores}

        precision, coverage = rec_system.evaluate_recommendations(method_recs)
        print(f"{method_name:20} Precision@3: {precision:.4f}, Coverage: {coverage:.4f}")

    return rec_system, recommendations, recommendations_df

# Run the full pipeline when executed as a script; the results are bound at
# module level so they remain available for inspection afterwards.
if __name__ == "__main__":
    rec_system, recommendations, recommendations_df = main()

Dataset shapes:
Users: (50, 5)
Posts: (100, 4)
Engagements: (1000, 3)
Training engagements: 800
Test engagements: 200
Found 10 unique interests: ['art', 'fashion', 'fitness', 'food', 'gaming', 'literature', 'music', 'sports', 'tech', 'travel']
Engagement matrix shape: (50, 100)
Total training engagements: 392
User features shape: (50, 13)
Post features shape: (100, 11)

DETAILED RECOMMENDATIONS (First 10 Users)

User U1 (Interests: sports, art, gaming):
--------------------------------------------------
Test engagements to predict: 4 posts
    1. Post P78 (Score: 0.600)
     Type: video, Tags: sports, art
    2. Post P22 (Score: 0.545)
     Type: audio, Tags: sports, art
    3. Post P12 (Score: 0.400)
     Type: video, Tags: fitness

User U2 (Interests: travel, food, fashion):
--------------------------------------------------
Test engagements to predict: 4 posts
    1. Post P1 (Score: 0.700)
     Type: video, Tags: sports, food
    2. Post P5 (Score: 0.600)
     Type: image, Tags: foo