# Step 1: Import library 

In [1]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from scipy.sparse import coo_matrix
from spacy.cli.train import train

# Trending Product Recommendations System

In [27]:
train_data = pd.read_csv('data/clean_data.csv')

In [28]:
def rating_based_recommendation(data, top_n=10):
    # Calculate average ratings, sort, and select top n items
    top_rated_items = (
        data.groupby(['Name', 'ReviewCount', 'Brand', 'ImageURL'])['Rating']
        .mean().reset_index()
        .sort_values(by=['Rating', 'ReviewCount'], ascending=[False, False])
        .head(top_n)
    )

    # Convert to integer and merge to get all columns, then select necessary columns
    top_rated_items[['Rating', 'ReviewCount']] = top_rated_items[['Rating', 'ReviewCount']].astype(int)
    return top_rated_items.merge(data, on=['Name', 'Rating', 'ReviewCount', 'Brand', 'ImageURL'], how='left')[
        ['Name', 'ImageURL', 'Brand', 'Rating', 'ReviewCount', 'Description', 'Price']
    ]

# Example usage
# Assuming `train_data` is your DataFrame with columns: Name, ReviewCount, Brand, ImageURL, Rating, Description, Price
top_rated_items = rating_based_recommendation(train_data, 10)

# save the data to a new file ('data/trending_new.csv')
top_rated_items.to_csv('data/trending_new.csv', index=False)


top_rated_items

  return top_rated_items.merge(data, on=['Name', 'Rating', 'ReviewCount', 'Brand', 'ImageURL'], how='left')[


Unnamed: 0,Name,ImageURL,Brand,Rating,ReviewCount,Description,Price
0,"ACT Braces Care Anticavity Mouthwash (18 Oz, C...",https://i5.walmartimages.com/asr/a7fa6e41-316f...,ACT,5,32,Got braces? Start ACTing to help prevent cavit...,3.98
1,"Versace Man Eau Fraiche Eau De Toilette Spray,...",https://i5.walmartimages.com/asr/edaaeed5-9da0...,Versace,5,24,,30.0
2,Tree Hut Shea Sugar Scrub Passion Fruit & Guav...,https://i5.walmartimages.com/asr/be83d31d-81cf...,Tree Hut,5,22,"Enjoy this gentle exfoliator, the Tree Hut She...",6.48
3,Biolage Hydrasource Conditioning Balm For Dry ...,https://i5.walmartimages.com/asr/4a0904fb-a101...,Matrix,5,21,,18.99
4,"Ardell Double Up False Eyelashes, 203",https://i5.walmartimages.com/asr/7c7534ab-b6ea...,Ardell,5,19,New Ardell Double Up lashes are two pairs of e...,8.28
5,"Hello Kids Fluoride Free and SLS Free Rinse, N...",https://i5.walmartimages.com/asr/6998ff2c-58be...,Hello,5,19,Say hello to naturally friendly kids fluoride ...,13.49
6,"ReNew Life CleanseMore, Veggie Caps, 60 ea",https://i5.walmartimages.com/asr/9f707fe4-9ee3...,Renew Life,5,15,Herbal & Mineral FormulaWorks OvernightBrings ...,15.55
7,"(2 Pack) Pro Styl Vitamin &quotE&quot Oil, 6 Oz",https://i5.walmartimages.com/asr/b31a925f-cb2a...,V.I.P.,5,13,,6.83
8,Aura Cacia Essential Oil Frankincense 0.5 fl o...,https://i5.walmartimages.com/asr/d5261a71-0787...,Aura Cacia,5,13,Frankincense oil is a traditional favorite tha...,7.0
9,Clay-Park Labs Ammonium Lactate Lotion 12% 14 ...,https://i5.walmartimages.com/asr/b3d5462d-d067...,Clay-Park Labs,5,13,Pack of 3 for the UPC: 081642525267 Pearigo Am...,73.55


# Content Base Recommendation system (User Preferences or Items similarities)

In [29]:
train_data['Tags']

0       opi, infinite, shine, nail, lacquer, nail, pol...
1       nice, n, easy, permanent, color, 111, natural,...
2       clairol, nice, n, easy, permanent, color, natu...
3       kokie, professional, matte, lipstick, hot, ber...
4       gillette, trac, ii, plus, razor, blade, refill...
                              ...                        
4953    garden, mint, room, spray, double, strength, 4...
4954    garnier, nutrisse, nourishing, hair, color, cr...
4955    nail, file, electric, drill, 6, 1, professiona...
4956    creed, love, black, hair, body, wash, new, box...
4957    foundation, beauty, makeup, face, makeup, face...
Name: Tags, Length: 4958, dtype: object

# Function To Recommend Products for Content Base

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

"""
Train Data columns: 
['ID', 'ProdID', 'Rating', 'ReviewCount', 'Category', 'Brand', 'Name',
       'Price', 'ImageURL', 'Description', 'Tags']
"""

def content_based_recommendations(data, item_name, top_n=10 ,useful_score = 0.7):
    # Check if the item name exists in the training data
    if item_name not in data['Name'].values:
        print(f"Item '{item_name}' not found in the training data.")
        return pd.DataFrame(), 0.0

    # Create a TF-IDF vectorizer for item descriptions
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')

    # Apply TF-IDF vectorization to item descriptions
    tfidf_matrix_content = tfidf_vectorizer.fit_transform(data['Tags'])

    # Calculate cosine similarity between items based on descriptions
    cosine_similarities_content = cosine_similarity(tfidf_matrix_content, tfidf_matrix_content)

    # Find the index of the item
    item_index = data[data['Name'] == item_name].index[0]

    # Get the cosine similarity scores for the item
    similar_items = list(enumerate(cosine_similarities_content[item_index]))

    # Sort similar items by similarity score in descending order
    similar_items = sorted(similar_items, key=lambda x: x[1], reverse=True)

    # Get the top N most similar items (excluding the item itself)
    top_similar_items = similar_items[1:top_n+1]

    # Get the indices and similarity scores of the top similar items
    recommended_item_indices = [x[0] for x in top_similar_items]
    similarity_scores = [x[1] for x in top_similar_items]

    # Add similarity scores to the recommended items
    recommended_items_details = data.iloc[recommended_item_indices][['Name', 'ImageURL', 'Brand', 'Rating', 'ReviewCount', 'Description', 'Price']].copy()
    recommended_items_details['SimilarityScore'] = similarity_scores

    # Calculate the ratio of useful recommendations (similarity score > 0.5)
    useful_count = sum(score > useful_score for score in similarity_scores)
    useful_ratio = useful_count / top_n
    
    return recommended_items_details, useful_ratio



In [31]:
# Example: Get content-based recommendations for a specific item
item_name = 'BMC Bright and Loud Cream Gel Lacquer Polish Set - Neon Wasteland Collection'
content_based_rec,useful_ratio = content_based_recommendations(train_data, item_name, top_n=8)

content_based_rec


Unnamed: 0,Name,ImageURL,Brand,Rating,ReviewCount,Description,Price,SimilarityScore
1868,OPI Nail GelColor Gel Polish NEON Color .5oz/1...,https://i5.walmartimages.com/asr/cf655dbc-8e90...,OPI,0.0,0.0,,17.24,0.222477
3033,"Sally Hansen Miracle Gel, 051 Peach Please (Ne...",https://i5.walmartimages.com/asr/54de0e9b-5f65...,Sally Hansen,0.0,11336.0,"&quotGet an up to 8 day mani*. Miracle Gel, ou...",7.46,0.184668
2751,"Sally Hansen Miracle Gel, 053 Miami Ice (Neon)...",https://i5.walmartimages.com/asr/8ecd2ae5-fb6c...,Sally Hansen,0.0,11336.0,"&quotGet an up to 8 day mani*. Miracle Gel, ou...",7.46,0.183538
3094,"Sensationail Express Gel Nail Polish (White), ...",https://i5.walmartimages.com/asr/a6ebc6b8-0fc9...,SensatioNail,0.0,23.0,Bring the nail salon experience to your home w...,11.67,0.177481
4890,Just Gel SLURPLE PURPLE,https://i5.walmartimages.com/asr/e3867ec3-97b2...,ibd,0.0,0.0,"IBD introduces just gel, the strong 100% pure ...",9.95,0.163421
1450,Just Gel BLUE HAVEN,https://i5.walmartimages.com/asr/aa40f091-849d...,ibd,0.0,0.0,"IBD introduces just gel, the strong 100% pure ...",9.99,0.1633
1959,Just Gel INGNUE,https://i5.walmartimages.com/asr/6fb32bc9-0ffd...,ibd,0.0,0.0,"IBD introduces just gel, the strong 100% pure ...",9.95,0.160268
771,"Sally Hansen Miracle Gel Nail Polish, Leaf Me ...",https://i5.walmartimages.com/asr/13c7e57c-526b...,Sally Hansen,4.2,11272.0,"Sally Hansen Miracle Gel Nail Polish, Leaf Me ...",7.46,0.155635


In [32]:
print(f"Useful recommendations:  ({useful_ratio:.2%})")

Useful recommendations:  (0.00%)


In [33]:
# Example: Get content-based recommendations for a specific item
item_name = 'Kokie Professional Matte Lipstick, Hot Berry, 0.14 fl oz'
content_based_rec = content_based_recommendations(train_data, item_name, top_n=8)

content_based_rec

(                                                   Name  \
 3379  Kokie Professional Matte Lipstick, Firecracker...   
 542   Kokie Professional Matte Lipstick, Kiss Me, 0....   
 4015  Kokie Professional Lip Poudre Liquid Matte Liq...   
 2383           L.A. Colors Matte Lipstick, Tender Matte   
 2847  Kokie Professional Lip Poudre Liquid Matte Liq...   
 2997                           Be Matte Lipstick - Pink   
 4503                      Be Matte Lipstick - Soft Pink   
 1306                      Be Matte Lipstick - Baby Pink   
 
                                                ImageURL            Brand  \
 3379  https://i5.walmartimages.com/asr/8312221b-ed22...  Kokie Cosmetics   
 542   https://i5.walmartimages.com/asr/27dd82a2-2b9c...  Kokie Cosmetics   
 4015  https://i5.walmartimages.com/asr/fdd7498c-319f...  Kokie Cosmetics   
 2383  https://i5.walmartimages.com/asr/271264fb-e8c3...      L.A. Colors   
 2847  https://i5.walmartimages.com/asr/31c99d9b-ea11...  Kokie Cosmetics

In [34]:
def calculate_average_useful_ratio(train_data, top_n=10, useful_score=0.7, count_num=50):
    # Initialize a list to store useful ratios
    useful_ratios = []

    # Get unique item names and limit to the first `count_num` items
    unique_items = train_data['Name'].unique()[:count_num]

    # Loop through the selected unique item names in the training data
    for item_name in unique_items:
        # Get recommendations and useful ratio for the current item
        _, useful_ratio = content_based_recommendations(train_data, item_name, top_n, useful_score)

        # Append the useful ratio to the list
        useful_ratios.append(useful_ratio)

    # Convert the list of useful ratios to a pandas Series
    ratios_series = pd.Series(useful_ratios)

    # Print descriptive statistics
    print("\nDescriptive Statistics for Useful Ratios:")
    print(ratios_series.describe())

    # Return the list of ratios and descriptive statistics
    return ratios_series

In [35]:
count_num = len(train_data['Name'].unique()) - 1

In [36]:
average_useful_ratio = calculate_average_useful_ratio(train_data, top_n=5, useful_score = 0.7,count_num = 500)


Descriptive Statistics for Useful Ratios:
count    500.000000
mean       0.167600
std        0.284521
min        0.000000
25%        0.000000
50%        0.000000
75%        0.200000
max        1.000000
dtype: float64


In [37]:
average_useful_ratio53 = calculate_average_useful_ratio(train_data, top_n=5, useful_score = 0.3,count_num = 500)


Descriptive Statistics for Useful Ratios:
count    500.000000
mean       0.760000
std        0.371963
min        0.000000
25%        0.400000
50%        1.000000
75%        1.000000
max        1.000000
dtype: float64


In [38]:
average_useful_ratio55 = calculate_average_useful_ratio(train_data, top_n=5, useful_score = 0.5,count_num = 500)


Descriptive Statistics for Useful Ratios:
count    500.000000
mean       0.372800
std        0.398268
min        0.000000
25%        0.000000
50%        0.200000
75%        0.800000
max        1.000000
dtype: float64


In [39]:
average_useful_ratio58 = calculate_average_useful_ratio(train_data, top_n=5, useful_score = 0.8,count_num = 500)


Descriptive Statistics for Useful Ratios:
count    500.000000
mean       0.109200
std        0.233805
min        0.000000
25%        0.000000
50%        0.000000
75%        0.200000
max        1.000000
dtype: float64


In [40]:
average_useful_ratio103 = calculate_average_useful_ratio(train_data, top_n=10, useful_score = 0.3,count_num = 500)


Descriptive Statistics for Useful Ratios:
count    500.000000
mean       0.648200
std        0.392996
min        0.000000
25%        0.200000
50%        0.800000
75%        1.000000
max        1.000000
dtype: float64


In [41]:
average_useful_ratio105 = calculate_average_useful_ratio(train_data, top_n=10, useful_score = 0.5,count_num = 500)


Descriptive Statistics for Useful Ratios:
count    500.000000
mean       0.242800
std        0.307436
min        0.000000
25%        0.000000
50%        0.100000
75%        0.400000
max        1.000000
dtype: float64


In [42]:
average_useful_ratio107 = calculate_average_useful_ratio(train_data, top_n=10, useful_score = 0.7,count_num = 500)


Descriptive Statistics for Useful Ratios:
count    500.000000
mean       0.092600
std        0.176259
min        0.000000
25%        0.000000
50%        0.000000
75%        0.100000
max        1.000000
dtype: float64


In [43]:
average_useful_ratio108 = calculate_average_useful_ratio(train_data, top_n=10, useful_score = 0.8,count_num = 500)


Descriptive Statistics for Useful Ratios:
count    500.000000
mean       0.057200
std        0.130546
min        0.000000
25%        0.000000
50%        0.000000
75%        0.100000
max        1.000000
dtype: float64


In [36]:
!pip install streamlit
!streamlit run app2.py





[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


^C
