In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
product_df = pd.read_csv("converted_products.csv")
allowed_values_df = pd.read_csv("dietary_Allowances.csv")
# allowed_values_df.head()
product_df.dropna(axis=0)

Unnamed: 0,Brand,Size,Flavor,Rating,Price,CALORIES,TOTAL FAT,CHOLESTEROL,SODIUM,TOTAL CARBOHYDRATE,...,MAGNESIUM,VITAMIN C,VITAMIN D.2,NIACIN,VITAMIN B6,VITAMIN B12,NIACIN.1,BIOTIN,FOLATE,Category
0,Optimum Nutrition,1 Lb.,Delicious Strawberry,9.3,$19.99,120,1500.0,40.0,50.0,3000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,WHEY PROTEIN
1,Optimum Nutrition,1 Lb.,Double Rich Chocolate,9.3,$19.99,120,1500.0,35.0,50.0,3000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,WHEY PROTEIN
2,Optimum Nutrition,1 Lb.,Vanilla Ice Cream,9.3,$19.99,120,1000.0,35.0,105.0,4000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,WHEY PROTEIN
3,Optimum Nutrition,2 Lbs.,Banana Cream,9.3,$29.99,120,1000.0,40.0,100.0,4000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,WHEY PROTEIN
4,Optimum Nutrition,2 Lbs.,Birthday Cake,9.3,$29.99,130,1500.0,35.0,150.0,5000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,WHEY PROTEIN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1363,Vital Proteins,9 Oz.,Strawberry Lemon,10.0,$25.00,70,0.0,0.0,100.0,3000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEAL REPLACEMENT
1364,Vital Proteins,9 Oz.,Tropical Hibiscus,10.0,$25.00,70,0.0,0.0,75.0,3000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEAL REPLACEMENT
1365,Vital Proteins,9 Oz.,Watermelon Mint,10.0,$25.00,70,0.0,0.0,100.0,3000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MASS GAINERS
1366,Vital Proteins,14 Packets,Lavender Lemon,10.0,$34.00,50,0.0,0.0,55.0,3000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEAL REPLACEMENT


In [36]:
import re
def dollars_to_rupees(dollar_price_str):
    # Extract numeric part from the input string using regex
    match = re.search(r'\d+\.\d+', dollar_price_str)
    
    if match:
        # Convert the extracted numeric part to a float
        dollar_price = float(match.group())
        
        rupee_price = dollar_price * 82.93
        
        return f"₹{rupee_price:.2f}"
    else:
        # Handle the case when no numeric part is found in the input string
        return "Invalid input for dollar_price"


In [37]:

recommended_products = pd.DataFrame()
recommended_products_subset = pd.DataFrame()
def recommend_products(gender,ingredient_list, category, product_df, allowed_values_df) :

    recommended_items = {}
    updated_ingredient_list = {}
    for nutrient, value in ingredient_list.items():
        # Check if nutrient is present in allowed_values_df for the specified gender
        matching_rows = allowed_values_df[(allowed_values_df["Nutrient"] == nutrient) & (allowed_values_df["Gender"] == gender)]

        if not matching_rows.empty: 
            matching_row = matching_rows.iloc[0]
            allowed_range = matching_row["Allowance (mg)"]
            
            if not (value - 0.1 * value <= allowed_range <= value + 0.1 * value):
                print(f"Updating {nutrient} value to allowed range for {gender}.")
                updated_ingredient_list[nutrient] = allowed_range
            else:
                updated_ingredient_list[nutrient] = value
        else:
            print(f"No information found for {nutrient} for {gender}. Keeping the original value.")
            updated_ingredient_list[nutrient] = value

    # print("Updated Ingredient List:")
    # print(updated_ingredient_list)

    ingredient_columns = product_df.columns[5:]
    product_ingredients = product_df[ingredient_columns]

    updated_ingredient_df = pd.DataFrame([updated_ingredient_list])
    common_ingredients = list(set(updated_ingredient_df.columns).intersection(set(product_ingredients.columns)))

    # Ensure the order of columns matches between updated_ingredient_df and product_ingredients
    updated_ingredient_df = updated_ingredient_df[common_ingredients]
    product_ingredients = product_ingredients[common_ingredients]

    updated_ingredient_df = updated_ingredient_df.dropna(axis=0)
    product_ingredients = product_ingredients.dropna(axis=0)

    if updated_ingredient_df.empty or product_ingredients.empty:
        print("Insufficient data after removing NaN values.")
    else:
        cosine_similarities_ingredient = cosine_similarity(updated_ingredient_df, product_ingredients)
        similarity_threshold_ingredient = 0.87
        similar_product_indices_ingredient = [i for i, sim in enumerate(cosine_similarities_ingredient[0]) if sim > similarity_threshold_ingredient]

        vectorizer = CountVectorizer()
        category_matrix = vectorizer.fit_transform([category] + list(product_df['Category']))

        # Calculate cosine similarity for category
        cosine_similarities_category = cosine_similarity(category_matrix[0], category_matrix[1:])[0]
        similarity_threshold_category = 0.8
        similar_product_indices_category = [i for i, sim in enumerate(cosine_similarities_category) if sim > similarity_threshold_category]

        # Find products that are similar in both ingredients and category
        similar_product_indices = list(set(similar_product_indices_ingredient) & set(similar_product_indices_category))

        if similar_product_indices:
            
            recommended_products = product_df.iloc[similar_product_indices]
            recommended_products_subset = recommended_products[["Brand", "Rating", "Flavor", "Price", "Category"]]

            for _, row in recommended_products_subset.iterrows():
                brand = row['Brand']
                if brand not in recommended_items:
                    recommended_items[brand] = {
                        "Brand": row['Brand'],
                        "Rating": row['Rating'],
                        "Flavor": row['Flavor'],
                        "Price": dollars_to_rupees(row['Price']),
                        "Category": row['Category']
                    }
        else:
            print("No similar products found.")

    return recommended_items

In [38]:
# Input: Gender, Category and Ingredient List
gender = "Female"  # gender input
ingredient_list = {"PROTEIN": 23000, "MAGNESIUM": 0, "FOLATE": 32, 'VITAMIN B12':0.60, 'VITAMIN A':1.20}  # Example ingredient input
category = "WHEY PROTEIN" #Category input
recommended_dict = recommend_products(gender, ingredient_list, category, product_df, allowed_values_df)
for brand, details in recommended_dict.items():
    details = {k: v for k, v in details.items() if pd.notna(v) and str(v) != "{{vm.sku.name}}"}
    recommended_dict[brand] = details
recommended_dict

No information found for PROTEIN for Female. Keeping the original value.
Updating MAGNESIUM value to allowed range for Female.
Updating FOLATE value to allowed range for Female.
Updating VITAMIN B12 value to allowed range for Female.
Updating VITAMIN A value to allowed range for Female.


{'Optimum Nutrition': {'Brand': 'Optimum Nutrition',
  'Rating': 9.3,
  'Flavor': 'Delicious Strawberry',
  'Price': '₹1657.77',
  'Category': 'WHEY PROTEIN'},
 'JYM Supplement Science': {'Brand': 'JYM Supplement Science',
  'Rating': 9.2,
  'Flavor': 'Chocolate Cookie Crunch',
  'Price': '₹2901.72',
  'Category': 'WHEY PROTEIN ISOLATE'},
 'Bodybuilding.com Signature': {'Brand': 'Bodybuilding.com Signature',
  'Rating': 8.8,
  'Flavor': 'Chocolate',
  'Price': '₹1678.50',
  'Category': 'WHEY PROTEIN'},
 'BSN': {'Brand': 'BSN',
  'Rating': 9.2,
  'Flavor': 'Berry Berry Berry Good',
  'Price': '₹2487.07',
  'Category': 'WHEY PROTEIN'},
 'Dymatize': {'Brand': 'Dymatize',
  'Rating': 9.3,
  'Flavor': 'Birthday Cake',
  'Price': '₹2735.86',
  'Category': 'WHEY PROTEIN'},
 'MuscleTech': {'Brand': 'MuscleTech',
  'Rating': 9.6,
  'Flavor': 'Chocolate Fudge Brownie',
  'Price': '₹3150.51',
  'Category': 'WHEY PROTEIN ISOLATE'},
 'MusclePharm': {'Brand': 'MusclePharm',
  'Rating': 8.8,
  'Flavo

In [39]:
all_recommended_products = list(zip(recommended_products_subset['Brand'], recommended_products_subset['Flavor']))

# Calculate the novelty as the inverse of the frequency (popularity) of each recommended item
novelty_scores = {product: 1 / all_recommended_products.count(product) for product in set(all_recommended_products)}

# Calculate average novelty
average_novelty = sum(novelty_scores.values()) / len(novelty_scores) if len(novelty_scores) > 0 else 0

# Print the novelty
print(f"Average Novelty: {average_novelty:.2f}")

KeyError: 'Brand'

In [40]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Extract features for clustering
features_for_clustering = recommended_products.drop(["Brand", "Flavor", "Size", "Price", "Rating", "Category"], axis=1)
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters,n_init=10, random_state=42)
predicted_clusters = kmeans.fit_predict(features_for_clustering)

silhouette_avg = silhouette_score(features_for_clustering, predicted_clusters)

# silhouette coefficient
print(f"Silhouette Coefficient: {silhouette_avg:.2f}")

print("Silhouette Coefficient = How well recommended products are similar to each other")

KeyError: "['Brand', 'Flavor', 'Size', 'Price', 'Rating', 'Category'] not found in axis"

In [26]:
product_df.Category.unique()
product_df.Price.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1369 entries, 0 to 1368
Series name: Price
Non-Null Count  Dtype 
--------------  ----- 
1369 non-null   object
dtypes: object(1)
memory usage: 10.8+ KB
