<a href="https://colab.research.google.com/github/Rifthi-tech/recommendation_project/blob/main/Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setting Up the Environment

In [29]:
!pip install pymongo
# Core data processing
import pandas as pd
import numpy as np

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Database
from pymongo import MongoClient
import pymongo

# Utilities
import json
from datetime import datetime
import pickle



# 2. Import data

In [39]:
df = pd.read_csv("/content/sample.csv/sample.csv")  # Replace with your actual file path
print(df.head())

                            uniq_id  \
0  c2d766ca982eca8304150849735ffef9   
1  7f7036a6d550aaa89d34c77bd39a5e48   
2  f449ec65dcbc041b6ae5e6a32717d01b   
3  0973b37acd0c664e3de26e97e5571454   
4  bc940ea42ee6bef5ac7cea3fb5cfbee7   

                                         product_url  \
0  http://www.flipkart.com/alisha-solid-women-s-c...   
1  http://www.flipkart.com/fabhomedecor-fabric-do...   
2  http://www.flipkart.com/aw-bellies/p/itmeh4grg...   
3  http://www.flipkart.com/alisha-solid-women-s-c...   
4  http://www.flipkart.com/sicons-all-purpose-arn...   

                            product_name  \
0    Alisha Solid Women's Cycling Shorts   
1    FabHomeDecor Fabric Double Sofa Bed   
2                             AW Bellies   
3                            product_url   
4  Sicons All Purpose Arnica Dog Shampoo   

                               product_category_tree  Unnamed: 4  \
0  ["Clothing >> Women's Clothing >> Lingerie, Sl...         NaN   
1  ["Furniture >> Living Ro

# 3. Data Cleaning

In [47]:
import pandas as pd
import numpy as np

def generate_sample_data():
    """Generate sample product, rating, and purchase data"""
    # Sample products
    products = pd.DataFrame({
        'product_id': [101, 102, 103, 104, 105],
        'name': ['Laptop', 'Smartphone', 'Headphones', 'Keyboard', 'Mouse'],
        'price': [999.99, 699.99, '149.99', 79.99, np.nan],  # Intentional issues
        'category': ['Electronics', 'Electronics', None, 'Accessories', 'Accessories'],
        'description': ['High-performance', '', 'Noise-cancelling', None, 'Wireless']
    })

    # Sample ratings (with some invalid entries)
    ratings = pd.DataFrame({
        'user_id': ['U1', 'U2', 'U3', 'U4', 'U5'],
        'product_id': [101, 102, 999, 104, 105],
        'rating': [5, 4, 0, 6, 3],  # 0 and 6 are invalid
        'timestamp': ['2023-01-01', '2023-01-02', '2023-01-03', 'invalid_date', '2023-01-05']
    })

    # Sample purchases
    purchases = pd.DataFrame({
        'purchase_id': [1, 2, 3, 4, 5],
        'user_id': ['U1', 'U2', 'U3', 'U4', 'U5'],
        'product_id': [101, 102, 103, 104, 105],
        'purchase_date': ['2023-01-01 10:00', '2023-01-02 11:30', 'invalid', '2023-01-04', None]
    })

    return products, ratings, purchases

def clean_data(products, ratings, purchases):



    products.fillna({
        'description': 'No description',
        'category': 'uncategorized'
    }, inplace=True)

    products['price'] = pd.to_numeric(products['price'], errors='coerce')
    products['price'].fillna(products['price'].mean(), inplace=True)



    ratings = ratings[ratings['rating'].between(1, 5)]  # Keep only valid ratings
    ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], errors='coerce')
    ratings.dropna(subset=['timestamp'], inplace=True)



    purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], errors='coerce')
    purchases.dropna(subset=['purchase_date'], inplace=True)

    print("\n Data cleaning completed successfully!")
    return products, ratings, purchases

def main():


    # 1. Generate sample data

    products, ratings, purchases = generate_sample_data()

    # 2. Clean data
    clean_products, clean_ratings, clean_purchases = clean_data(products.copy(),
                                                              ratings.copy(),
                                                              purchases.copy())

    # 3. Show ALL results


    print("\n CLEANED PRODUCTS (ALL):")
    print(clean_products.to_markdown(tablefmt="grid", numalign="center"))

    print("\n CLEANED RATINGS (ALL):")
    print(clean_ratings.to_markdown(tablefmt="grid", numalign="center"))

    print("\n CLEANED PURCHASES (ALL):")
    print(clean_purchases.to_markdown(tablefmt="grid", numalign="center"))


    print(" CLEANING STATS:")
    print(f"Products: {len(clean_products)} records (Original: {len(products)}) - All kept with fixes")
    print(f"Ratings: {len(clean_ratings)} valid records (Original: {len(ratings)}) - {len(ratings)-len(clean_ratings)} removed")
    print(f"Purchases: {len(clean_purchases)} valid records (Original: {len(purchases)}) - {len(purchases)-len(clean_purchases)} removed")
    print("="*50)

    print("\n SUCCESS: All data has been cleaned and is now ready for analysis!")

if __name__ == "__main__":
    main()


 Data cleaning completed successfully!

 CLEANED PRODUCTS (ALL):
+----+--------------+------------+---------+---------------+------------------+
|    |  product_id  | name       |  price  | category      | description      |
| 0  |     101      | Laptop     | 999.99  | Electronics   | High-performance |
+----+--------------+------------+---------+---------------+------------------+
| 1  |     102      | Smartphone | 699.99  | Electronics   |                  |
+----+--------------+------------+---------+---------------+------------------+
| 2  |     103      | Headphones | 149.99  | uncategorized | Noise-cancelling |
+----+--------------+------------+---------+---------------+------------------+
| 3  |     104      | Keyboard   |  79.99  | Accessories   | No description   |
+----+--------------+------------+---------+---------------+------------------+
| 4  |     105      | Mouse      | 482.49  | Accessories   | Wireless         |
+----+--------------+------------+---------+----------

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  products['price'].fillna(products['price'].mean(), inplace=True)


# 4.Feature Extraction

In [50]:
def display_all_features(product_features, user_features, tfidf):
    """Displays **all** extracted features"""

    print(" Full Feature Extraction Preview")


    # Show all product features
    print("\n Product TF-IDF Features (All):")
    feature_names = tfidf.get_feature_names_out()
    print(f"TF-IDF terms: {', '.join(feature_names)}")

    # Show all user features
    print("\n User Features (Complete dataset):")
    print(user_features.to_markdown(tablefmt="grid", numalign="center", stralign="center"))

# Example usage after feature extraction
if product_features is not None:
    display_all_features(product_features, user_features, tfidf)

 Full Feature Extraction Preview

 Product TF-IDF Features (All):
TF-IDF terms: 4k, audio, bluetooth, computer, ergonomic, gaming, hd, headphones, keyboard, lights, mechanical, monitor, mouse, rgb, ultra, wireless

 User Features (Complete dataset):
+----+-----------+--------------+
|    |  user_id  |  avg_rating  |
| 0  |    U1     |     4.2      |
+----+-----------+--------------+
| 1  |    U2     |     3.8      |
+----+-----------+--------------+
| 2  |    U3     |     4.5      |
+----+-----------+--------------+
| 3  |    U4     |     3.2      |
+----+-----------+--------------+
| 4  |    U5     |      4       |
+----+-----------+--------------+


# 5. Model Training

In [52]:
def display_all_model_outputs(results):
    """Displays **all** trained model outputs"""

    print(" Full Model Training Outputs")


    # User similarity matrix
    print("\n User Similarity Matrix (All users):")
    print(pd.DataFrame(results['user_similarity']).to_markdown(tablefmt="grid", floatfmt=".2f"))

    # Product similarity matrix
    print("\n Product Similarity Matrix (All products):")
    print(pd.DataFrame(results['product_similarity']).to_markdown(tablefmt="grid", floatfmt=".2f"))

    # Purchase matrix
    print("\n Purchase Matrix (Complete dataset):")
    print(results['purchase_matrix'].to_markdown(tablefmt="grid"))

    # Hybrid model weights
    print("\n Hybrid Model Weights:")
    print(pd.DataFrame.from_dict(results['hybrid_weights'], orient='index', columns=['Weight']).to_markdown(tablefmt="grid"))

# Example usage after model training
if model_results is not None:
    display_all_model_outputs(model_results)

 Full Model Training Outputs

 User Similarity Matrix (All users):
+----+------+------+------+------+
|    |    0 |    1 |    2 |    3 |
|  0 | 1.00 | 0.00 | 0.00 | 0.00 |
+----+------+------+------+------+
|  1 | 0.00 | 1.00 | 0.00 | 0.00 |
+----+------+------+------+------+
|  2 | 0.00 | 0.00 | 1.00 | 0.00 |
+----+------+------+------+------+
|  3 | 0.00 | 0.00 | 0.00 | 1.00 |
+----+------+------+------+------+

 Product Similarity Matrix (All products):
+----+------+------+------+------+------+
|    |    0 |    1 |    2 |    3 |    4 |
|  0 | 1.00 | 0.77 | 0.63 | 0.83 | 0.72 |
+----+------+------+------+------+------+
|  1 | 0.77 | 1.00 | 0.93 | 0.84 | 0.74 |
+----+------+------+------+------+------+
|  2 | 0.63 | 0.93 | 1.00 | 0.82 | 0.67 |
+----+------+------+------+------+------+
|  3 | 0.83 | 0.84 | 0.82 | 1.00 | 0.89 |
+----+------+------+------+------+------+
|  4 | 0.72 | 0.74 | 0.67 | 0.89 | 1.00 |
+----+------+------+------+------+------+

 Purchase Matrix (Complete dataset

# 6. Recommendation Functions

In [55]:
# Collaborative Recommendation Function (Show all results)
def collaborative_recommendation(user_id, model_data):
    try:
        user_idx = np.where(model_data['user_ids'] == user_id)[0][0]
        user_sim_scores = model_data['user_similarity'][user_idx]

        # Get top similar users
        similar_users = np.argsort(user_sim_scores)[-10:-1]

        # Get products rated by similar users
        similar_users_products = model_data['user_product_matrix'].iloc[similar_users]
        recommended_products = similar_users_products.mean(axis=0).sort_values(ascending=False)

        # Filter out products already purchased/rated
        user_products = set(model_data['user_product_matrix'].loc[user_id]
                            [model_data['user_product_matrix'].loc[user_id] > 0].index)
        recommended_products = recommended_products[~recommended_products.index.isin(user_products)]

        # Success message
        print(" Collaborative Recommendation completed successfully!")
        print("\nAll recommended products:", recommended_products.index.tolist())

        return recommended_products.index.tolist()

    except Exception as e:
        print(" Error in Collaborative Recommendation:", e)
        return []

# Content-Based Recommendation Function (Show all results)
def content_based_recommendation(product_id, model_data):
    try:
        product_idx = np.where(model_data['product_ids'] == product_id)[0][0]
        sim_scores = list(enumerate(model_data['product_similarity'][product_idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:]  # Exclude self and get all results

        similar_products = [model_data['product_ids'][i] for i, _ in sim_scores]

        # Success message
        print(" Content-Based Recommendation completed successfully!")
        print("\nAll similar products:", similar_products)

        return similar_products

    except Exception as e:
        print(" Error in Content-Based Recommendation:", e)
        return []

# Hybrid Recommendation Function (Show all results)
def hybrid_recommendation(user_id, product_id, model_data):
    try:
        collab_rec = collaborative_recommendation(user_id, model_data)
        content_rec = content_based_recommendation(product_id, model_data)

        # Combine with weights
        recommendations = {}
        for product in collab_rec:
            recommendations[product] = recommendations.get(product, 0) + model_data['hybrid_weights']['collaborative']

        for product in content_rec:
            recommendations[product] = recommendations.get(product, 0) + model_data['hybrid_weights']['content']

        # Sort by combined score
        sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)

        final_recommendations = [product for product, score in sorted_recommendations]

        # Success message
        print(" Hybrid Recommendation completed successfully!")
        print("\nAll recommended products:", final_recommendations)

        return final_recommendations

    except Exception as e:
        print(" Error in Hybrid Recommendation:", e)
        return []

# Testing the functions
print("\n--- Running Recommendation System ---\n")
user_id = 3
product_id = "P3"
hybrid_recommendation(user_id, product_id, model_data)


--- Running Recommendation System ---

 Collaborative Recommendation completed successfully!

All recommended products: ['P2', 'P4']
 Content-Based Recommendation completed successfully!

All similar products: [np.str_('P4'), np.str_('P3'), np.str_('P1'), np.str_('P5')]
 Hybrid Recommendation completed successfully!

All recommended products: ['P4', 'P2', np.str_('P3'), np.str_('P1'), np.str_('P5')]


['P4', 'P2', np.str_('P3'), np.str_('P1'), np.str_('P5')]