<a href="https://colab.research.google.com/github/Niharika2475/ADM_project/blob/main/ADM_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
# 1. Data Collection
# (Using a synthetic dataset as described in the methodology)
def generate_synthetic_data(num_users=100, num_products=500, num_interactions=10000):
    """
    Generates synthetic data for a product recommendation system.

    Args:
        num_users: Number of users.
        num_products: Number of products.
        num_interactions: Number of user-product interactions.

    Returns:
        DataFrame: A pandas DataFrame containing synthetic data.
    """
    users = np.random.randint(1, num_users + 1, num_interactions)
    products = np.random.randint(1, num_products + 1, num_interactions)
    # Simulate ratings with a bias towards higher ratings
    ratings = np.random.choice([3, 4, 5], num_interactions, p=[0.2, 0.3, 0.5])
    timestamps = pd.to_datetime(np.random.randint(1609459200, 1640995200, num_interactions), unit='s')  # 2021

    # Create product categories and descriptions
    categories = ['Electronics', 'Clothing', 'Home Goods', 'Books', 'Sports']
    product_categories = np.random.choice(categories, num_products)
    product_names = [f"Product_{i}" for i in range(1, num_products + 1)]
    product_descriptions = [
        "High-quality product with excellent features.",
        "Comfortable and stylish design.",
        "Durable and long-lasting materials.",
        "Interesting and informative read.",
        "Perfect for sports enthusiasts."
    ]
    product_descriptions = np.random.choice(product_descriptions, num_products)

    # Create user follow relationships (simplified)
    user_relationships = {}
    for user_id in range(1, num_users + 1):
        # Each user follows a random number of other users
        followers = np.random.choice(range(1, num_users + 1), np.random.randint(0, 10), replace=False)
        user_relationships[user_id] = list(followers)

    # Create a DataFrame for products
    products_df = pd.DataFrame({
        'product_id': range(1, num_products + 1),
        'category': product_categories,
        'name': product_names,
        'description': product_descriptions
    })

    # Create a DataFrame for the interactions
    df = pd.DataFrame({
        'user_id': users,
        'product_id': products,
        'rating': ratings,
        'timestamp': timestamps
    })

    return df, products_df, user_relationships

# Generate the synthetic data
interactions_df, products_df, user_relationships = generate_synthetic_data()

In [3]:
# 2. Data Preprocessing
def preprocess_text(text):
    """
    Cleans and preprocesses text data.

    Args:
        text: The text to preprocess.

    Returns:
        str: The preprocessed text.
    """
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = text.lower()  # Convert to lowercase
    tokens = text.split()  # Tokenize
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize
    return ' '.join(tokens)

# Apply preprocessing to product descriptions
products_df['description'] = products_df['description'].apply(preprocess_text)


In [4]:
# 3. Feature Engineering
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(products_df['description'])

# Sentiment Analysis (Simplified - in a real application, use a dedicated library)
def get_sentiment(text):
    """
    Assigns a simplified sentiment score to text (Positive, Negative, Neutral).

    Args:
        text: The text to analyze.

    Returns:
        str: The sentiment (Positive, Negative, or Neutral).
    """
    # Very basic sentiment analysis for demonstration
    if 'good' in text or 'great' in text or 'excellent' in text:
        return 'Positive'
    elif 'bad' in text or 'terrible' in text or 'awful' in text:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to product descriptions
products_df['sentiment'] = products_df['description'].apply(get_sentiment)

In [8]:
# 4. Model Implementation

# 4.1 Content-Based Recommender
def get_content_based_recommendations(product_id, tfidf_matrix, products_df, top_n=5):
    """
    Generates content-based recommendations for a given product.

    Args:
        product_id: The ID of the product to find recommendations for.
        tfidf_matrix: The TF-IDF matrix of product descriptions.
        products_df: DataFrame containing product information.
        top_n: The number of top recommendations to return.

    Returns:
        list: A list of recommended product IDs.
    """
    try:
        product_index = products_df[products_df['product_id'] == product_id].index[0]
    except IndexError:
        print(f"Product ID {product_id} not found.")
        return []

    cosine_scores = cosine_similarity(tfidf_matrix[product_index], tfidf_matrix).flatten()
    related_product_indices = cosine_scores.argsort()[-2:-(top_n + 2):-1]  # Exclude the input product itself
    recommended_products = products_df.iloc[related_product_indices]['product_id'].tolist()
    return recommended_products

# 4.2 Collaborative Filtering Recommender
def get_collaborative_filtering_recommendations(user_id, interactions_df, top_n=5):
    """
    Generates collaborative filtering recommendations for a given user.

    Args:
        user_id: The ID of the user to find recommendations for.
        interactions_df: DataFrame containing user-product interactions.
        top_n: The number of top recommendations to return.

    Returns:
        list: A list of recommended product IDs.
    """
    user_ratings = interactions_df[interactions_df['user_id'] == user_id]
    if user_ratings.empty:
        print(f"No ratings found for user {user_id}.")
        return []

    similar_users = interactions_df[interactions_df['product_id'].isin(user_ratings['product_id'])]['user_id'].unique()
    similar_users = similar_users[similar_users != user_id]  # Exclude the current user

    if len(similar_users) == 0:
        print(f"No similar users found for user {user_id}.")
        return []

    similar_user_ratings = interactions_df[interactions_df['user_id'].isin(similar_users)]
    # Products that similar users liked but the current user has not interacted with
    recommended_products = similar_user_ratings[~similar_user_ratings['product_id'].isin(user_ratings['product_id'])]['product_id'].unique()

    if len(recommended_products) == 0:
        return []

    # Calculate average rating by similar users for each product
    product_avg_ratings = similar_user_ratings.groupby('product_id')['rating'].mean().sort_values(ascending=False)
    recommended_products = [product_id for product_id in product_avg_ratings.index if product_id in recommended_products]
    return recommended_products[:top_n] # Limit to top_n

# 4.3 Hybrid Recommender
def get_hybrid_recommendations(user_id, product_id, interactions_df, tfidf_matrix, products_df, top_n=5):
    """
    Generates hybrid recommendations for a given user and product.

    Args:
        user_id: The ID of the user.
        product_id: The ID of the product.
        interactions_df: DataFrame containing user-product interactions.
        tfidf_matrix: The TF-IDF matrix of product descriptions.
        products_df: DataFrame containing product information.
        top_n: The number of top recommendations to return.

    Returns:
        list: A list of recommended product IDs.
    """
    content_based_recs = get_content_based_recommendations(product_id, tfidf_matrix, products_df, top_n=top_n)
    collaborative_recs = get_collaborative_filtering_recommendations(user_id, interactions_df, top_n=top_n)

    # Weight the recommendations (70% collaborative, 30% content-based)
    weighted_recs = {}
    for rec in collaborative_recs:
        weighted_recs[rec] = weighted_recs.get(rec, 0) + 0.7
    for rec in content_based_recs:
        weighted_recs[rec] = weighted_recs.get(rec, 0) + 0.3

    # Sort by weighted score and get top N
    final_recommendations = sorted(weighted_recs.items(), key=lambda x: x[1], reverse=True)
    final_recommendations = [product_id for product_id, _ in final_recommendations[:top_n]]
    return final_recommendations

# Get recommendations for a user and product
user_id = 10
product_id = 25
hybrid_recommendations = get_hybrid_recommendations(user_id, product_id, interactions_df, tfidf_matrix, products_df, top_n=5)
print(f"Hybrid recommendations for user {user_id} and product {product_id}: {hybrid_recommendations}")

# 5. Evaluation and Optimization
# Split data into training and testing sets
train_df, test_df = train_test_split(interactions_df, test_size=0.2, random_state=42)

def evaluate_recommendations(test_df, train_df, products_df, tfidf_matrix, top_n=5):
    """
    Evaluates the performance of the hybrid recommendation system.

    Args:
        test_df: DataFrame containing the test data.
        train_df: DataFrame containing the training data.
        products_df: DataFrame containing product information.
        tfidf_matrix: The TF-IDF matrix of product descriptions.
        top_n: Number of recommendations to generate.

    Returns:
        dict: A dictionary containing precision, recall, and F1 score.
    """
    # We'll use training data for generating recommendations
    evaluation_interactions = train_df.copy()

    total_recommendations = 0
    total_relevant_items = len(test_df)
    total_hits = 0

    # Group test data by user to get all products each user interacted with
    test_user_products = test_df.groupby('user_id')['product_id'].apply(list).to_dict()

    # For each user in the test set
    for user_id, relevant_products in test_user_products.items():
        # Check if user has any ratings in the training set
        user_train_ratings = train_df[train_df['user_id'] == user_id]
        if user_train_ratings.empty:
            continue

        # Use one of their training products to generate recommendations
        sample_product_id = user_train_ratings['product_id'].iloc[0]

        # Get recommendations for this user based on training data
        recommendations = get_hybrid_recommendations(
            user_id,
            sample_product_id,
            evaluation_interactions,
            tfidf_matrix,
            products_df,
            top_n=top_n
        )

        if not recommendations:
            continue

        # Count the recommendations
        total_recommendations += len(recommendations)

        # Count hits (recommended products that the user actually interacted with in the test set)
        hits = len(set(recommendations).intersection(set(relevant_products)))
        total_hits += hits

    # Calculate metrics
    precision = total_hits / total_recommendations if total_recommendations > 0 else 0
    recall = total_hits / total_relevant_items if total_relevant_items > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score
    }

Hybrid recommendations for user 10 and product 25: [336, 139, 484, 240, 133]


In [7]:
# 6. Visualizations

def compare_recommendation_algorithms(user_id, product_id, interactions_df, tfidf_matrix, products_df, top_n=5):
    """
    Compares the performance of different recommendation algorithms.

    Args:
        user_id: The ID of the user.
        product_id: The ID of the product.
        interactions_df: DataFrame containing user-product interactions.
        tfidf_matrix: The TF-IDF matrix of product descriptions.
        products_df: DataFrame containing product information.
        top_n: Number of recommendations to generate.

    Returns:
        A matplotlib figure comparing the performance metrics.
    """
    # Get recommendations from each algorithm
    content_recs = get_content_based_recommendations(product_id, tfidf_matrix, products_df, top_n)
    collab_recs = get_collaborative_filtering_recommendations(user_id, interactions_df, top_n)
    hybrid_recs = get_hybrid_recommendations(user_id, product_id, interactions_df, tfidf_matrix, products_df, top_n)

    # We'll check how many categories are covered by each recommendation method
    content_categories = products_df[products_df['product_id'].isin(content_recs)]['category'].nunique()
    collab_categories = products_df[products_df['product_id'].isin(collab_recs)]['category'].nunique()
    hybrid_categories = products_df[products_df['product_id'].isin(hybrid_recs)]['category'].nunique()

    # Let's check how many of the recommendations are the same between algorithms
    content_collab_overlap = len(set(content_recs).intersection(set(collab_recs)))
    content_hybrid_overlap = len(set(content_recs).intersection(set(hybrid_recs)))
    collab_hybrid_overlap = len(set(collab_recs).intersection(set(hybrid_recs)))

    # Create data for visualization
    algorithm_names = ['Content-Based', 'Collaborative', 'Hybrid']
    category_diversity = [content_categories, collab_categories, hybrid_categories]

    # Create the matplotlib figure
    plt.figure(figsize=(12, 6))

    # Plot category diversity
    plt.subplot(1, 2, 1)
    plt.bar(algorithm_names, category_diversity, color=['#3498db', '#2ecc71', '#e74c3c'])
    plt.title('Category Diversity by Algorithm')
    plt.ylabel('Number of Unique Categories')

    # Plot overlap between algorithms
    plt.subplot(1, 2, 2)
    overlaps = [
        ['Content-Collaborative', content_collab_overlap],
        ['Content-Hybrid', content_hybrid_overlap],
        ['Collaborative-Hybrid', collab_hybrid_overlap]
    ]

    plt.bar([item[0] for item in overlaps], [item[1] for item in overlaps], color='#9b59b6')
    plt.title('Recommendation Overlap Between Algorithms')
    plt.ylabel('Number of Shared Recommendations')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.savefig('algorithm_comparison.png')
    plt.close()

    print("Algorithm comparison visualization saved as 'algorithm_comparison.png'")
    return None

def visualize_user_recommendations(user_id, product_id, interactions_df, tfidf_matrix, products_df, top_n=5):
    """
    Visualizes recommendations for a specific user.

    Args:
        user_id: The ID of the user.
        product_id: The ID of the product.
        interactions_df: DataFrame containing user-product interactions.
        tfidf_matrix: The TF-IDF matrix of product descriptions.
        products_df: DataFrame containing product information.
        top_n: Number of recommendations to generate.

    Returns:
        A matplotlib figure showing user's recommendations.
    """
    # Get the user's history
    user_history = interactions_df[interactions_df['user_id'] == user_id]
    history_products = products_df[products_df['product_id'].isin(user_history['product_id'])]

    # Get recommendations
    recommendations = get_hybrid_recommendations(user_id, product_id, interactions_df, tfidf_matrix, products_df, top_n)
    recommended_products = products_df[products_df['product_id'].isin(recommendations)]

    # Count categories in history and recommendations
    history_categories = history_products['category'].value_counts()
    recommended_categories = recommended_products['category'].value_counts()

    # Create a combined dataframe for plotting
    all_categories = set(history_categories.index) | set(recommended_categories.index)
    comparison_data = []

    for category in all_categories:
        history_count = history_categories.get(category, 0)
        recommended_count = recommended_categories.get(category, 0)
        comparison_data.append({
            'Category': category,
            'History': history_count,
            'Recommended': recommended_count
        })

    comparison_df = pd.DataFrame(comparison_data)

    # Create the visualization
    plt.figure(figsize=(12, 8))

    # Plot the user's history vs recommendations by category
    plt.subplot(2, 1, 1)
    comparison_df.set_index('Category').plot(kind='bar', ax=plt.gca())
    plt.title(f'User {user_id} - History vs Recommendations by Category')
    plt.ylabel('Count')

    # Plot the ratings distribution for this user
    plt.subplot(2, 1, 2)
    user_ratings = user_history['rating'].value_counts().sort_index()
    plt.bar(user_ratings.index, user_ratings.values, color='#f39c12')
    plt.title(f'User {user_id} - Rating Distribution')
    plt.xlabel('Rating')
    plt.ylabel('Count')
    plt.xticks(range(1, 6))  # Ratings from 1 to 5

    plt.tight_layout()
    plt.savefig(f'user_{user_id}_recommendations.png')
    plt.close()

    print(f"User recommendations visualization saved as 'user_{user_id}_recommendations.png'")
    return None

def visualize_category_recommendations(interactions_df, products_df, tfidf_matrix, sample_size=20):
    """
    Analyzes and visualizes the category distribution of recommendations across users.

    Args:
        interactions_df: DataFrame containing user-product interactions.
        products_df: DataFrame containing product information.
        tfidf_matrix: The TF-IDF matrix of product descriptions.
        sample_size: Number of users to sample for the analysis.

    Returns:
        A matplotlib figure showing category distribution in recommendations.
    """
    # Sample some users randomly
    sampled_users = np.random.choice(interactions_df['user_id'].unique(), size=min(sample_size, len(interactions_df['user_id'].unique())), replace=False)

    # Collect recommendations for each user
    all_recommendations = []
    category_counts = {category: 0 for category in products_df['category'].unique()}

    for user_id in sampled_users:
        # Get a sample product for this user
        user_products = interactions_df[interactions_df['user_id'] == user_id]['product_id'].tolist()
        if not user_products:
            continue

        sample_product = user_products[0]

        # Get recommendations
        recommendations = get_hybrid_recommendations(user_id, sample_product, interactions_df, tfidf_matrix, products_df)
        if not recommendations:
            continue

        # Get categories for these recommendations
        rec_categories = products_df[products_df['product_id'].isin(recommendations)]['category'].tolist()
        for category in rec_categories:
            category_counts[category] += 1

        all_recommendations.extend(recommendations)

    # Create category distribution visualization
    plt.figure(figsize=(14, 8))

    # Plot overall category distribution in recommendations
    plt.subplot(1, 2, 1)
    sorted_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)
    categories = [item[0] for item in sorted_categories]
    counts = [item[1] for item in sorted_categories]

    plt.bar(categories, counts, color=['#3498db', '#2ecc71', '#e74c3c', '#f39c12', '#9b59b6'])
    plt.title('Category Distribution in Recommendations')
    plt.ylabel('Count')
    plt.xticks(rotation=45)

    # Plot a heatmap of recommendations vs user ratings
    plt.subplot(1, 2, 2)

    # Get average rating by category
    category_ratings = {}
    for category in products_df['category'].unique():
        category_products = products_df[products_df['category'] == category]['product_id'].tolist()
        category_interactions = interactions_df[interactions_df['product_id'].isin(category_products)]
        if not category_interactions.empty:
            category_ratings[category] = category_interactions['rating'].mean()
        else:
            category_ratings[category] = 0

    # Create a dataframe for the heatmap
    rating_data = [{'Category': cat, 'Avg Rating': rating} for cat, rating in category_ratings.items()]
    rating_df = pd.DataFrame(rating_data).sort_values('Avg Rating', ascending=False)

    # Plot the average ratings
    plt.barh(rating_df['Category'], rating_df['Avg Rating'], color='#e67e22')
    plt.title('Average Rating by Category')
    plt.xlabel('Average Rating')
    plt.xlim(0, 5)

    plt.tight_layout()
    plt.savefig('category_analysis.png')
    plt.close()

    print("Category analysis visualization saved as 'category_analysis.png'")
    return None

# Evaluate the Hybrid model
evaluation_results = evaluate_recommendations(test_df, train_df, products_df, tfidf_matrix)
print("Evaluation Results for Hybrid Recommender:", evaluation_results)

# Generate all three visualizations
compare_recommendation_algorithms(user_id, product_id, interactions_df, tfidf_matrix, products_df)
visualize_user_recommendations(user_id, product_id, interactions_df, tfidf_matrix, products_df)
visualize_category_recommendations(interactions_df, products_df, tfidf_matrix)

print("\nAll visualizations have been generated.")

Evaluation Results for Hybrid Recommender: {'precision': 0.034, 'recall': 0.0085, 'f1_score': 0.013600000000000001}
Algorithm comparison visualization saved as 'algorithm_comparison.png'
User recommendations visualization saved as 'user_10_recommendations.png'
Category analysis visualization saved as 'category_analysis.png'

All visualizations have been generated.
