<a href="https://colab.research.google.com/github/Rifthi-tech/recommendation_project/blob/main/Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setting Up the Environment

In [None]:
# Core data processing
import pandas as pd
import numpy as np

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Utilities
from datetime import datetime
import pickle
import os

# 2. Import data

In [None]:
# Read the sample product catalogue from Google Drive.
# Replace the path below with the location of your own copy of the CSV.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Recommendation Project/test_sample_data.csv"
)

# Preview the first five rows to confirm the file parsed as expected.
print(df.head(n=5))

                            uniq_id  \
0  c2d766ca982eca8304150849735ffef9   
1  7f7036a6d550aaa89d34c77bd39a5e48   
2  f449ec65dcbc041b6ae5e6a32717d01b   
3  0973b37acd0c664e3de26e97e5571454   
4  bc940ea42ee6bef5ac7cea3fb5cfbee7   

                                         product_url  \
0  http://www.flipkart.com/alisha-solid-women-s-c...   
1  http://www.flipkart.com/fabhomedecor-fabric-do...   
2  http://www.flipkart.com/aw-bellies/p/itmeh4grg...   
3  http://www.flipkart.com/alisha-solid-women-s-c...   
4  http://www.flipkart.com/sicons-all-purpose-arn...   

                            product_name  \
0    Alisha Solid Women's Cycling Shorts   
1    FabHomeDecor Fabric Double Sofa Bed   
2                             AW Bellies   
3                            product_url   
4  Sicons All Purpose Arnica Dog Shampoo   

                               product_category_tree  Unnamed: 4  \
0  ["Clothing >> Women's Clothing >> Lingerie, Sl...         NaN   
1  ["Furniture >> Living Ro

# 3. Data Cleaning

In [None]:
# Display original dataset info
print("Original Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nOriginal Dataset Shape:", df.shape)

# Drop columns that hold no data at all. Their median is NaN, so the median
# fill below could never repair them and they would stay entirely missing.
empty_cols = [col for col in df.columns if df[col].isna().all()]
if empty_cols:
    print("\nDropping columns with no values:", empty_cols)
    df = df.drop(columns=empty_cols)

# Handle missing values
# Fill missing numerical values with median (covers every numeric dtype,
# not only int64/float64)
numerical_cols = df.select_dtypes(include='number').columns
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill missing categorical values with mode, falling back to "Unknown" when
# the column has no mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    modes = df[col].mode()
    fill_value = modes.iloc[0] if not modes.empty else "Unknown"
    df[col] = df[col].fillna(fill_value)

# Remove duplicate rows (rebinding instead of mutating in place)
df = df.drop_duplicates()

# Ensure rating is between 0-5 (assuming rating scale is 0-5).
# clip() only makes sense for numeric data; a text column is left untouched
# instead of raising.
if 'rating' in df.columns and pd.api.types.is_numeric_dtype(df['rating']):
    df['rating'] = df['rating'].clip(lower=0, upper=5)



print("\nSuccess: Dataset cleaned successfully! Missing values handled and duplicates removed.")


Original Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uniq_id                 20000 non-null  object 
 1   product_url             20000 non-null  object 
 2   product_name            20000 non-null  object 
 3   product_category_tree   20000 non-null  object 
 4   Unnamed: 4              0 non-null      float64
 5   retail_price            20000 non-null  float64
 6   image                   20000 non-null  object 
 7   description             20000 non-null  object 
 8   product_rating          20000 non-null  object 
 9   brand                   20000 non-null  object 
 10  product_specifications  20000 non-null  object 
dtypes: float64(2), object(9)
memory usage: 1.7+ MB
None

Original Dataset Shape: (20000, 11)

Success: Dataset cleaned successfully! Missing values handled and duplicates removed

# 4. Feature Extraction

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

def feature_extraction(products, ratings, purchases):
    """Build TF-IDF product features and normalized per-user activity features.

    Parameters
    ----------
    products : pandas.DataFrame
        Must provide 'description' and 'category' columns.
    ratings : pandas.DataFrame
        Must provide 'user_id', 'product_id' and 'rating' columns.
    purchases : pandas.DataFrame
        Must provide 'user_id', 'product_id' and 'purchase_date' columns.

    Returns
    -------
    tuple
        ``(product_features, user_features, tfidf, scaler)`` where
        ``product_features`` is the result of ``tfidf.fit_transform`` on the
        combined description/category text, ``user_features`` holds one row
        per user seen in either ``ratings`` or ``purchases``, and ``tfidf`` /
        ``scaler`` are the fitted transformer objects.
    """
    # Step 1: TF-IDF on product descriptions + categories.
    # Missing text becomes '' first: NaN + str stays NaN, and a NaN document
    # cannot be vectorized.
    product_text = (
        products['description'].fillna('').astype(str)
        + " "
        + products['category'].fillna('').astype(str)
    )
    tfidf = TfidfVectorizer(stop_words='english')
    product_features = tfidf.fit_transform(product_text)

    # Step 2: User features from ratings (named aggregation keeps each output
    # column tied to its definition instead of relying on positional renaming)
    user_ratings = ratings.groupby('user_id').agg(
        avg_rating=('rating', 'mean'),
        rating_count=('rating', 'count'),
        unique_products_rated=('product_id', 'nunique'),
    ).reset_index()

    # Step 3: User features from purchases
    user_purchases = purchases.groupby('user_id').agg(
        total_purchases=('product_id', 'count'),
        unique_products_purchased=('product_id', 'nunique'),
        last_purchase_date=('purchase_date', 'max'),
    ).reset_index()

    # Step 4: Merge features. The outer join keeps users that appear in only
    # one of the two tables; their missing counters become 0.
    user_features = pd.merge(user_ratings, user_purchases, on='user_id', how='outer')
    cols_to_normalize = ['avg_rating', 'rating_count', 'unique_products_rated',
                         'total_purchases', 'unique_products_purchased']
    # Only the numeric counters are zero-filled. 'last_purchase_date' stays
    # missing for users without purchases rather than being overwritten by 0.
    user_features[cols_to_normalize] = user_features[cols_to_normalize].fillna(0)

    # Step 5: Normalize selected columns
    scaler = MinMaxScaler()
    user_features[cols_to_normalize] = scaler.fit_transform(user_features[cols_to_normalize])

    return product_features, user_features, tfidf, scaler

# ======== Main Execution ========
if __name__ == "__main__":
    # Load the data
    # Small illustrative tables carrying exactly the columns that
    # feature_extraction() reads, so this cell runs on a fresh kernel.
    # Swap them for the real product / rating / purchase tables when available.
    products = pd.DataFrame({
        'product_id': [101, 102, 103],
        'description': [
            "Solid women's cycling shorts",
            "Fabric double sofa bed",
            "All purpose dog shampoo",
        ],
        'category': ['Clothing', 'Furniture', 'Pet Supplies'],
    })
    ratings = pd.DataFrame({
        'user_id': [1, 1, 2, 2, 3],
        'product_id': [101, 102, 101, 103, 102],
        'rating': [5, 4, 3, 3, 3],
    })
    # ISO-formatted date strings compare correctly as text, so 'max' in the
    # aggregation yields the latest purchase date per user.
    purchases = pd.DataFrame({
        'user_id': [1, 1, 2, 3, 3],
        'product_id': [101, 102, 103, 101, 103],
        'purchase_date': ['2023-01-10', '2023-02-01', '2023-01-15',
                          '2023-03-01', '2023-03-15'],
    })

    # Run feature extraction
    product_features, user_features, tfidf, scaler = feature_extraction(products, ratings, purchases)

    # Display the user features (first 5 rows)
    print("\nExtracted User Features:")
    print(user_features.head())



Extracted User Features:
   user_id  avg_rating  rating_count  unique_products_rated  total_purchases  \
0        1         1.0           1.0                    1.0              1.0   
1        2         0.0           1.0                    1.0              0.0   
2        3         0.0           0.0                    0.0              1.0   

   unique_products_purchased last_purchase_date  
0                        1.0         2023-02-01  
1                        0.0         2023-01-15  
2                        1.0         2023-03-15  


# 5. Model Training

In [None]:
def display_all_model_outputs(results):
    """Print each model artefact stored in ``results`` as a formatted table.

    ``results`` is indexed with the keys 'user_similarity',
    'product_similarity', 'purchase_matrix' and 'hybrid_weights', in that
    order. The two similarity entries are wrapped in a DataFrame and their
    top-left 5x5 corner is shown; the purchase matrix is shown via its own
    ``head()``; the weights mapping is shown as a one-column table.

    Nothing is returned. A missing key, or any other exception raised while
    rendering, is reported on stdout instead of propagating.
    """
    rule = "=" * 50

    try:
        # Opening banner
        print("\n" + rule)
        print(" STARTING MODEL OUTPUT DISPLAY".center(50))
        print(rule)

        # --- 1/4: user-to-user similarity ---
        print("\n STEP 1/4: Displaying User Similarity Matrix")
        user_matrix = pd.DataFrame(results['user_similarity'])
        print(f" Success! User similarity matrix shape: {user_matrix.shape}")
        print("Top 5x5 portion of User Similarity Matrix:")
        user_corner = user_matrix.iloc[:5, :5]
        print(user_corner.to_markdown(tablefmt="grid", floatfmt=".2f"))

        # --- 2/4: product-to-product similarity ---
        print("\n STEP 2/4: Displaying Product Similarity Matrix")
        product_matrix = pd.DataFrame(results['product_similarity'])
        print(f" Success! Product similarity matrix shape: {product_matrix.shape}")
        print("Top 5x5 portion of Product Similarity Matrix:")
        product_corner = product_matrix.iloc[:5, :5]
        print(product_corner.to_markdown(tablefmt="grid", floatfmt=".2f"))

        # --- 3/4: purchase matrix (looked up only after the step header,
        # so a missing key is reported after that header is printed) ---
        print("\n STEP 3/4: Displaying Purchase Matrix")
        purchase_matrix = results['purchase_matrix']
        print(f" Success! Purchase matrix shape: {purchase_matrix.shape}")
        print("Sample of Purchase Matrix (first 5 rows):")
        print(purchase_matrix.head().to_markdown(tablefmt="grid"))

        # --- 4/4: hybrid model weights ---
        print("\n STEP 4/4: Displaying Hybrid Model Weights")
        weight_table = pd.DataFrame.from_dict(
            results['hybrid_weights'],
            orient='index',
            columns=['Weight'],
        )
        print(" Success! Retrieved hybrid model weights")
        print("Hybrid Model Weights:")
        print(weight_table.to_markdown(tablefmt="grid", floatfmt=".2f"))

        # Closing banner
        print("\n" + rule)
        print(" ALL MODEL OUTPUTS DISPLAYED SUCCESSFULLY!".center(50))
        print(rule)

    except KeyError as e:
        print(f"\n Error: Missing expected key in results - {str(e)}")
    except Exception as e:
        print(f"\n Unexpected error displaying outputs: {str(e)}")

# Example usage with test data
if __name__ == "__main__":
    import pandas as pd
    import numpy as np

    # Seeded generator: the demo tables are identical on every run, so the
    # saved cell output stays reproducible after Restart & Run All.
    rng = np.random.default_rng(42)

    # Create sample model results (random placeholders, not a trained model)
    model_results = {
        'user_similarity': rng.random((10, 10)),
        'product_similarity': rng.random((15, 15)),
        # integers(0, 2, ...) draws from {0, 1}: the upper bound is exclusive
        'purchase_matrix': pd.DataFrame(rng.integers(0, 2, (10, 15))),
        'hybrid_weights': {
            'content_weight': 0.4,
            'collab_weight': 0.3,
            'popularity_weight': 0.2,
            'recency_weight': 0.1
        }
    }

    # Call the function
    display_all_model_outputs(model_results)



           STARTING MODEL OUTPUT DISPLAY          

 STEP 1/4: Displaying User Similarity Matrix
 Success! User similarity matrix shape: (10, 10)
Top 5x5 portion of User Similarity Matrix:
+----+------+------+------+------+------+
|    |    0 |    1 |    2 |    3 |    4 |
|  0 | 0.77 | 0.42 | 1.00 | 0.14 | 0.42 |
+----+------+------+------+------+------+
|  1 | 0.73 | 0.94 | 0.51 | 0.92 | 0.30 |
+----+------+------+------+------+------+
|  2 | 0.42 | 0.26 | 0.15 | 0.01 | 0.48 |
+----+------+------+------+------+------+
|  3 | 0.09 | 0.85 | 0.21 | 0.34 | 0.09 |
+----+------+------+------+------+------+
|  4 | 0.22 | 0.61 | 0.40 | 0.41 | 0.63 |
+----+------+------+------+------+------+

 STEP 2/4: Displaying Product Similarity Matrix
 Success! Product similarity matrix shape: (15, 15)
Top 5x5 portion of Product Similarity Matrix:
+----+------+------+------+------+------+
|    |    0 |    1 |    2 |    3 |    4 |
|  0 | 0.01 | 0.09 | 0.96 | 0.55 | 0.48 |
+----+------+------+------+------

# 6. Recommendation Functions

In [None]:
# Function to get recommendations based on different approaches
def get_recommendations(method='hybrid', user_id=None, product_id=None):
    """Dispatch a recommendation request to the strategy named by ``method``.

    Parameters
    ----------
    method : str
        One of 'content', 'collaborative', 'hybrid', 'rating' or 'purchase'.
    user_id : optional
        Required for 'collaborative' and 'hybrid'.
    product_id : optional
        Required for 'content' and 'hybrid'.

    Returns
    -------
    The value returned by the selected strategy function, or ``None`` (after
    printing an error message) when ``method`` is unknown or a required id
    is missing.

    Notes
    -----
    The strategy functions (``content_based_recommendations``,
    ``collaborative_user_based``, ``hybrid_recommendations``,
    ``rating_based_recommendations``, ``purchase_based_recommendations``)
    are looked up when the call is made, so they must be defined before this
    function is invoked with the matching ``method``.

    Example usage::

        content_rec = get_recommendations(method='content', product_id=123)
        collab_rec = get_recommendations(method='collaborative', user_id=456)
        hybrid_rec = get_recommendations(method='hybrid', user_id=456, product_id=123)
        top_rated = get_recommendations(method='rating')
        top_purchased = get_recommendations(method='purchase')
    """
    if method == 'content':
        if product_id is None:
            print("Error: product_id required for content-based recommendations")
            return None
        return content_based_recommendations(product_id)
    elif method == 'collaborative':
        if user_id is None:
            print("Error: user_id required for collaborative recommendations")
            return None
        return collaborative_user_based(user_id)
    elif method == 'hybrid':
        if user_id is None or product_id is None:
            print("Error: both user_id and product_id required for hybrid recommendations")
            return None
        return hybrid_recommendations(user_id, product_id)
    elif method == 'rating':
        return rating_based_recommendations()
    elif method == 'purchase':
        return purchase_based_recommendations()
    else:
        print("Error: Invalid method. Choose from: content, collaborative, hybrid, rating, purchase")
        return None

# The example calls that used to follow were indented into the function body,
# where they were unreachable (every branch above returns) and would have
# recursed if ever reached. They now live in the docstring.

print("\nSuccess: Recommendation functions created successfully!")


Success: Recommendation functions created successfully!


# 7. Export Data

In [89]:
import os
import pandas as pd

# Load your dataset
file_path = '/content/drive/MyDrive/Recommendation Project/test_sample_data.csv'
df = pd.read_csv(file_path)

# Normalize column names: lowercase and trim spaces
df.columns = df.columns.str.strip().str.lower()

# Create mapping from your actual columns to our required columns
column_mapping = {
    'product_name': 'name',  # This is the key fix - mapping product_name to name
    # These columns might exist in your data:
    'uniq_id': 'id',
    'retail_price': 'price',
    'product_rating': 'rating',
    'product_category_tree': 'category',
    # These columns already match or don't need renaming:
    'id': 'id',
    'price': 'price',
    'rating': 'rating',
    'category': 'category',
    'image': 'image',
    'description': 'description'
}

# Rename columns - only keep mappings that exist in the dataframe.
# A rename is skipped when its target name is already a column: otherwise two
# columns would share one label and every later df[<label>] lookup would
# return a DataFrame instead of a Series.
final_mapping = {
    source: target
    for source, target in column_mapping.items()
    if source in df.columns and (source == target or target not in df.columns)
}
df = df.rename(columns=final_mapping)

# Output directory in Google Drive
output_dir = "/content/drive/MyDrive/Recommendation Project/Output/"
os.makedirs(output_dir, exist_ok=True)

# Define required columns
required_columns = ['id', 'image', 'name', 'description', 'price', 'rating', 'category']

# Verify which required columns we have
available_columns = [col for col in required_columns if col in df.columns]
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    print(f"⚠️ Warning: The following required columns are missing: {missing_columns}")
    print("Available columns in your data:", list(df.columns))
else:
    print("✅ All required columns are present in the data")

# Only proceed with available columns
if available_columns:
    # Step 1: Drop rows with missing values in available required columns
    processed_df = df.dropna(subset=available_columns)

    # Step 2: Drop rows with empty or whitespace-only values.
    # One boolean mask over all required columns replaces the per-column
    # filtering loop; a row is kept only if every required cell is non-blank.
    is_filled = (
        processed_df[available_columns]
        .astype(str)
        .apply(lambda column: column.str.strip() != '')
        .all(axis=1)
    )
    # .copy() makes processed_df an independent frame, so the column
    # assignment below does not write into a slice of df
    # (which triggers SettingWithCopyWarning).
    processed_df = processed_df[is_filled].copy()

    # Step 3: Clean category column if it exists (remove brackets and quotes),
    # then keep the text before the first comma.
    if 'category' in processed_df.columns:
        processed_df['category'] = (
            processed_df['category']
            .astype(str)  # the .str accessor requires string data
            .str.replace(r'[\[\]"]', '', regex=True)
            .str.split(',').str[0]
            .str.strip()
        )

    # Step 4: Keep only available required columns
    processed_df = processed_df[available_columns]

    # Step 5: Save to CSV
    processed_data_path = os.path.join(output_dir, 'processed_recommendation_data.csv')
    try:
        processed_df.to_csv(processed_data_path, index=False)
        print("\n✅ Processed data saved successfully to:", processed_data_path)
        print("📦 Total valid products saved:", len(processed_df))

        # Preview the result
        print("\n📋 Preview of saved data:")
        print(processed_df.head())
    except Exception as e:
        # Best-effort export: report the failure instead of aborting the notebook
        print("❌ Error saving data:", str(e))
else:
    print("❌ Error: None of the required columns were found in the data")

✅ All required columns are present in the data

✅ Processed data saved successfully to: /content/drive/MyDrive/Recommendation Project/Output/processed_recommendation_data.csv
📦 Total valid products saved: 19917

📋 Preview of saved data:
                                 id  \
0  c2d766ca982eca8304150849735ffef9   
1  7f7036a6d550aaa89d34c77bd39a5e48   
2  f449ec65dcbc041b6ae5e6a32717d01b   
3  0973b37acd0c664e3de26e97e5571454   
4  bc940ea42ee6bef5ac7cea3fb5cfbee7   

                                               image  \
0  ["http://img5a.flixcart.com/image/short/u/4/a/...   
1  ["http://img6a.flixcart.com/image/sofa-bed/j/f...   
2  ["http://img5a.flixcart.com/image/shoe/7/z/z/r...   
3  ["http://img5a.flixcart.com/image/short/6/2/h/...   
4  ["http://img5a.flixcart.com/image/pet-shampoo/...   

                                    name  \
0    Alisha Solid Women's Cycling Shorts   
1    FabHomeDecor Fabric Double Sofa Bed   
2                             AW Bellies   
3              