In [1]:
import pandas as pd
import numpy as np
import nltk
import gensim

from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from rapidfuzz import fuzz, process, utils

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet') 

# Importing dataset
dataset_1 = pd.read_csv("../data/AmazonData.csv")
dataset_2 = pd.read_csv("../data/AmazonData.csv")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nytoy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nytoy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nytoy\AppData\Roaming\nltk_data...


In [2]:
# Dataset process function
def dataset_process(dataset):

    # Excluding columns that we dont use
    cols = [0,2,3,5,6,8,9,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27]
    dataset.drop(dataset.columns[cols], axis =1, inplace=True)
    dataset.dropna(inplace = True)

    # Splitting Category in 3 parts
    new = dataset["Category"].str.split("|", n = 3, expand = True)
    
    # making the first category called Main Category
    dataset["Main Category"]= new[0] 
    
    # making the second category called sub_category 
    dataset["Sub Category"]= new[1]

    # making the third category called side_category 
    dataset["Side Category"]= new[2]

    # making the last column consist of the remaining categories
    dataset["Other Category"]= new[3]

    # Dropping old category columns and the remaining categories 
    dataset.drop(columns =["Category"], inplace = True)

    # Setting Column Selling Price as float value
    # Database Price and weight treatment
    dataset.rename(columns = {'Uniq Id':'Id','Shipping Weight':'Shipping Weight(Pounds)', 'Selling Price':'Selling Price($)'}, inplace = True)

    # Removing units from Price and Weight
    dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.strip('ounces')
    dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.strip('pounds')
    dataset['Selling Price($)'] = dataset['Selling Price($)'].str.replace('$', '')

    # Removing rows with Total Price invalid
    indexes = dataset[dataset['Selling Price($)'] == 'Total price:'].index
    dataset.drop(indexes, inplace=True)

    # Removing rows with '-' character
    dataset['Selling Price($)'] = dataset['Selling Price($)'].str.replace(',', '', regex=False)
    indexes = dataset[dataset['Selling Price($)'].str.contains('-', na=False)].index
    dataset.drop(indexes, inplace=True)

    # Removing rows with '&' character
    indexes = dataset[dataset['Selling Price($)'].str.contains('&', na=False)].index
    dataset.drop(indexes, inplace=True)

    # Removing rows with 'Currently' character
    indexes = dataset[dataset['Selling Price($)'].str.contains('Currently', na=False)].index
    dataset.drop(indexes, inplace=True)

    # Removing rows with 'from' character
    indexes = dataset[dataset['Selling Price($)'].str.contains('from', na=False)].index
    dataset.drop(indexes, inplace=True)

    # Adjusting values with wrong format
    dataset['Selling Price($)'] = dataset['Selling Price($)'].str.split(' ').str[0]
    dataset['Selling Price($)'] = dataset['Selling Price($)'].astype(float)

    # Setting Column Shipping Weight as float value
    indexes = dataset[dataset['Shipping Weight(Pounds)'].str.contains(r'\. ', na=False)].index

    dataset.at[1619, 'Shipping Weight(Pounds)']
    dataset.drop(1619, inplace=True)
    dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.replace(',', '', regex=False)
    dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].astype(float)

    return dataset

In [3]:
# Processing text function
def preprocess_text(text):
    text = text.replace('[^a-zA-Z]',' ').lower()
    stop_re = '\\b'+'\\b|\\b'.join(nltk.corpus.stopwords.words('english'))+'\\b'
    text = text.replace(stop_re, '')
    text = text.split()

    # Add lemmatization using WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in text]
    return lemmatized_text

In [4]:
# Creating a Word2Vec model with a vector size of 100, using the 'Processed Product Name' column.
def train_word2vec_model(dataset):
    model = Word2Vec(sentences=dataset["Processed Product Name"], vector_size=100, window=50, min_count=1, workers=4)
    return model

In [5]:
# vectorizing text function
def vectorize_product(product_name, model):

    words = [word for word in product_name if word in model.wv]
    if len(words) > 0:
        return np.mean([model.wv[word] for word in words], axis=0)
    else:
        return np.zeros(model.wv.vector_size)

In [6]:
def prototype_rapid_fuzz_filter(user_input, products, number_of_rec):
    list_of_rec = []
    token_set_ratio_match = process.extract(user_input, products, scorer=fuzz.token_set_ratio, limit=1, processor=utils.default_process)
    partial_ratio_matches = process.extract(user_input, products, scorer=fuzz.partial_ratio, limit=number_of_rec, processor=utils.default_process)

    if token_set_ratio_match[0][1] == 100:
        list_of_rec.extend(token_set_ratio_match)
        for match in partial_ratio_matches:
            if match[0] != token_set_ratio_match[0][0]:
                list_of_rec.append(match)
    
    else:
        for match in partial_ratio_matches:
            list_of_rec.append(match)

    return list_of_rec

def rapid_fuzz_rec_to_df(recommendations, dataframe):
    sorted_indeces = [match[2] for match in recommendations]
    reordered_df = dataframe.iloc[sorted_indeces].reset_index(drop=True)
    return reordered_df

In [7]:
# Product Recommendation function
def product_recommendation(product_vector, dataset, top_n=5):
    
    # Calcular similaridades cosseno
    similarities = dataset["Product Vector"].apply(lambda x: cosine_similarity([product_vector], [x])[0][0])
    
    # Ordenar por similaridade e pegar os top_n produtos
    top_indices = similarities.nlargest(top_n).index
    
    # Retornar o DataFrame com os produtos recomendados, mas mantendo os nomes originais
    return dataset.loc[top_indices, dataset.columns != 'Product Vector']

In [8]:
# Category Filter 
def category_filter(dataset, selected_product):

    main_category_input = selected_product['Main Category']
    sub_category_input = selected_product['Sub Category']
    side_category_input = selected_product['Side Category']
    other_category_input = selected_product['Other Category']

    # Create a new column to calculate the score
    dataset['score'] = 0

    # Raises the score if categories match
    dataset.loc[dataset['Main Category'] == main_category_input, 'score'] += 1
    dataset.loc[dataset['Sub Category'] == sub_category_input, 'score'] += 1
    dataset.loc[dataset['Side Category'] == side_category_input, 'score'] += 1
    dataset.loc[dataset['Other Category'] == other_category_input, 'score'] += 1

    # Sort the database based on the score
    category_filter = dataset.sort_values(by='score', ascending=False)

    max_score = category_filter['score'].max()

    # Filter rows with the maximum score
    category_filter = category_filter[category_filter['score'] == max_score]

    # Removes the new column
    category_filter = category_filter.drop(columns='score')

    # return the sorted database
    if category_filter.empty:
        print('No recommendaation found for this product.')
    else:
        return category_filter

In [9]:

def name_based_filter(dataset, product_name):

    # Applying text preprocess in dataset
    dataset["Processed Product Name"] = dataset["Product Name"].apply(preprocess_text)

    model = train_word2vec_model(dataset)

    # Applying vectorizing function in dataset
    dataset["Product Vector"] = dataset["Processed Product Name"].apply(lambda x: vectorize_product(x, model))

    # Pré-processando o nome do produto fornecido pelo usuário
    processed_product_name = preprocess_text(product_name)

    # Vetorizando o nome do produto fornecido (passar o modelo aqui também)
    product_vector = vectorize_product(processed_product_name, model)

    recommendation = product_recommendation(product_vector, dataset, dataset.shape[0])
    return recommendation

In [10]:
dataset_1 = dataset_process(dataset_1)
dataset_2 = dataset_process(dataset_2)

In [13]:
print("Please select option:")
print("1- Select input from database")
print("2- Type product name")
user_input = int(input("Enter 1 or 2: "))

main_category_input = None
sub_category_input = None
side_category_input = None
other_category_input = None

list_word2vec = []

if user_input == 1:
    product_line = int(input("Type the product's number: "))
    product_line = product_line - 2

    if 0 <= product_line < len(dataset_1):
        selected_product = dataset_1.iloc[product_line]  # Selects the product by its line in the database
        print("\nSelected Product:")
        print(selected_product['Product Name'])
    else:
        print("Invalid line Number.")
    
    product_name = selected_product['Product Name']
    category_filter = category_filter(dataset_1, selected_product)
    # print(category_filter)

    # Word2Vec Filter
    name_based_filter = name_based_filter(category_filter, product_name)
    print("\n")
    print("Word2Vec with Category Filter: ")
    print(name_based_filter['Product Name'][:5])
    list_word2vec = name_based_filter['Product Name'].tolist()

    # RapidFuzz Filter
    print("\n")
    print("RapidFuzz with no Category Filter: ")
    for i in range(5):
        print(prototype_rapid_fuzz_filter(user_input=product_name, products=dataset_2["Product Name"], number_of_rec=5)[i])
    list_rapidfuzz = (prototype_rapid_fuzz_filter(user_input=product_name, products=dataset_2["Product Name"], number_of_rec=dataset_2.shape[0]))
    
elif user_input == 2:
    product_name = input("Type the product's name: ")

    # Word2Vec Filter
    name_based_filter = name_based_filter(dataset_1, product_name)
    print("Word2Vec: ")
    print(name_based_filter['Product Name'][:5])
    list_word2vec = name_based_filter['Product Name'].tolist()

    # RapidFuzz Filter
    print("\n")
    print("RapidFuzz: ")
    for i in range(5):
        print(prototype_rapid_fuzz_filter(user_input=product_name, products=dataset_2["Product Name"], number_of_rec=5)[i])
    list_rapidfuzz = (prototype_rapid_fuzz_filter(user_input=product_name, products=dataset_2["Product Name"], number_of_rec=dataset_2.shape[0]))

else:
    print("Not a valid number")

final_list = []
for i in range(len(list_word2vec)):
    for j in range(len(list_rapidfuzz)):
        if (list_word2vec[i]==list_rapidfuzz[j][0]):
            final_list.append([list_word2vec[i], i + j])
        
final_list.sort(key=lambda x: x[1])
print("\nFinal Recommendation:")
for idx, item in enumerate(final_list[:10], start=1):
    print(f"{idx} - Produto: {item[0]}, Valor: {item[1]}")



Please select option:
1- Select input from database
2- Type product name

Selected Product:
Waving Flag Party Table Cover, 54" x 96"


TypeError: 'DataFrame' object is not callable