In [None]:
## Developed algorithm libraries
import pandas as pd
import numpy as np
import nltk
import gensim
from gensim.models import Word2Vec

from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from rapidfuzz import fuzz, process, utils


## References libraries
# import turicreate as tc
# from sklearn.model_selection import train_test_split
# import sys
# sys.path.append("..")

# Download the NLTK data packages used by this notebook (punkt, stopwords, wordnet)
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Function to load the dataset
def load_database():
    """Read ``../data/AmazonData.csv`` and return its contents as a DataFrame."""
    return pd.read_csv("../data/AmazonData.csv")

# Function to process the dataset
def data_manipulation(dataset):
    """Clean the raw product table in place and return it.

    Steps, in order:
      1. Drop the unused columns (selected by position) and every row that
         still has a missing value.
      2. Split ``Category`` on ``|`` into ``Main Category``, ``Sub Category``,
         ``Side Category`` and ``Other Category`` (everything after the third
         separator), then drop ``Category``.
      3. Rename the price/weight columns and convert both to ``float``.

    Args:
        dataset (pandas.DataFrame): Raw table with at least 28 columns; after
            the positional drop it must still contain ``Category``,
            ``Selling Price`` and ``Shipping Weight`` as string columns.

    Returns:
        pandas.DataFrame: The same object that was passed in (it is modified
        in place, so calling this twice on one frame is not supported).

    Raises:
        ValueError: If a selling price that survived the filters below is not
            a valid number.

    Notes:
        * Weights expressed in ounces are converted to pounds (divided by 16)
          so the ``Shipping Weight(Pounds)`` column holds a single unit.
        * Rows whose weight cannot be parsed as a number are dropped. This
          replaces a hard-coded ``drop(1619)`` that only worked for one
          specific file.
    """
    price_col = 'Selling Price($)'
    weight_col = 'Shipping Weight(Pounds)'

    # Excluding columns that we dont use (positions refer to the raw column order)
    cols = [0,2,3,5,6,8,9,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27]
    dataset.drop(dataset.columns[cols], axis=1, inplace=True)
    dataset.dropna(inplace=True)

    # Splitting Category into at most 4 parts; reindex guarantees all four
    # columns exist even when no row has that many category levels
    new = dataset["Category"].str.split("|", n=3, expand=True)
    new = new.reindex(columns=range(4))

    dataset["Main Category"] = new[0]
    dataset["Sub Category"] = new[1]
    dataset["Side Category"] = new[2]
    # Last column holds whatever categories remain after the third separator
    dataset["Other Category"] = new[3]

    # Dropping the original combined category column
    dataset.drop(columns=["Category"], inplace=True)

    # 'Uniq Id' is renamed only if it is still present after the drop above
    dataset.rename(columns={'Uniq Id': 'Id',
                            'Shipping Weight': weight_col,
                            'Selling Price': price_col}, inplace=True)

    # --- Selling price -> float ---------------------------------------------
    # regex=False: '$' must be treated as a literal character, not an anchor
    dataset[price_col] = dataset[price_col].str.replace('$', '', regex=False)

    # Removing rows with Total Price invalid
    dataset.drop(dataset[dataset[price_col] == 'Total price:'].index, inplace=True)

    # Thousands separators
    dataset[price_col] = dataset[price_col].str.replace(',', '', regex=False)

    # Removing rows whose price text contains any of these markers
    for marker in ('-', '&', 'Currently', 'from'):
        invalid = dataset[dataset[price_col].str.contains(marker, na=False, regex=False)].index
        dataset.drop(invalid, inplace=True)

    # Adjusting values with wrong format: keep only the first space-separated token
    dataset[price_col] = dataset[price_col].str.split(' ').str[0]
    dataset[price_col] = dataset[price_col].astype(float)

    # --- Shipping weight -> float (pounds) ----------------------------------
    weight_text = dataset[weight_col]
    # Remember which rows are in ounces before the unit text is removed
    in_ounces = weight_text.str.contains('ounce', case=False, na=False)
    weight_text = (
        weight_text
        .str.replace(r'(?i)\s*(?:ounces?|pounds?)\s*$', '', regex=True)
        .str.replace(',', '', regex=False)
        .str.strip()
    )
    # Unparseable values (e.g. "1. 2") become NaN and are dropped below
    weights = pd.to_numeric(weight_text, errors='coerce').astype(float)
    weights = weights.where(~in_ounces, weights / 16)
    dataset[weight_col] = weights
    dataset.drop(dataset[dataset[weight_col].isna()].index, inplace=True)

    return dataset
def save_data_manipulation(dataset):
    """Write ``dataset`` to ``../data/CleanData.csv`` without the index column."""
    output_path = '../data/CleanData.csv'
    dataset.to_csv(output_path, index=False)


[nltk_data] Downloading package punkt to C:\Users\ricar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ricar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\ricar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [81]:
# Requirement #02 (Data loading and train/test split)
def load_data():
    """Load the raw product table and return it after ``data_manipulation``."""
    return data_manipulation(load_database())

# Data split to sklearn
from sklearn.model_selection import train_test_split
def split_data_sets(dataset, test_size=0.2, random_state=None):
    """Split ``dataset`` into train and test parts with ``train_test_split``.

    Args:
        dataset: The data to split (passed straight to ``train_test_split``).
        test_size: Fraction of rows placed in the test part. Defaults to 0.2,
            the value that used to be hard-coded.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the previous unseeded behaviour; pass an integer
            to make the split reproducible across notebook runs.

    Returns:
        tuple: ``(train, test)``.
    """
    train, test = train_test_split(dataset, test_size=test_size, random_state=random_state)
    return train, test

# Save data (optional)
def save_data_sets(train, test):
    """Write the train and test splits to CSV files under ``../data`` (index omitted)."""
    for frame, destination in ((train, "../data/train.csv"), (test, "../data/test.csv")):
        frame.to_csv(destination, index=False)

In [82]:
# Load and clean the product dataset, then print its column / dtype summary
dataset = load_data()
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7136 entries, 0 to 10001
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Product Name             7136 non-null   object 
 1   Selling Price($)         7136 non-null   float64
 2   About Product            7136 non-null   object 
 3   Product Specification    7136 non-null   object 
 4   Shipping Weight(Pounds)  7136 non-null   float64
 5   Main Category            7136 non-null   object 
 6   Sub Category             7136 non-null   object 
 7   Side Category            6155 non-null   object 
 8   Other Category           2724 non-null   object 
dtypes: float64(2), object(7)
memory usage: 557.5+ KB


In [None]:
# Requirement #03 (Our Recommendation Algorithm)

# Load data for training model
def load_data_parameters():
    """Return the Word2Vec hyper-parameters used when training the model."""
    # Reading these parameters from an external source could be implemented here
    parameters = dict(vector_size=100, window=5, min_count=1, workers=4)
    return parameters

# Processing text function
def preprocess_text(text):
    """Turn a product name into a list of lemmatized, lower-case word tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    removed and each remaining token is lemmatized with ``WordNetLemmatizer``.

    Args:
        text (str): Raw text, e.g. a product name.

    Returns:
        list[str]: The processed tokens (possibly empty).
    """
    import re  # local import so this function does not depend on another cell

    # ``str.replace`` matches its first argument literally, so the previous
    # ``text.replace('[^a-zA-Z]', ' ')`` never removed anything; ``re.sub``
    # performs the intended regular-expression substitution.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()

    # Filtering tokens against the stop-word set is the working equivalent of
    # the former (also literal, hence ineffective) stop-word pattern replace.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Add lemmatization using WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in letters_only.split() if word not in stop_words]

# Train the base Word2Vec model
def train_word2vec_model(data_parameters, dataset):
    """Fit a Word2Vec model on the ``Processed Product Name`` column.

    ``data_parameters`` must provide ``vector_size``, ``window``,
    ``min_count`` and ``workers``; they are forwarded to ``Word2Vec``.
    Returns the trained model.
    """
    sentences = dataset["Processed Product Name"]
    hyper_parameters = {
        key: data_parameters[key]
        for key in ('vector_size', 'window', 'min_count', 'workers')
    }
    return Word2Vec(sentences=sentences, **hyper_parameters)

# vectorizing text function
def vectorize_product(product_name, model):
    """Average the model vectors of the known tokens in ``product_name``.

    Tokens missing from ``model.wv`` are ignored. When none of the tokens is
    known, a zero vector of length ``model.wv.vector_size`` is returned.
    """
    keyed_vectors = model.wv
    known_vectors = [keyed_vectors[token] for token in product_name if token in keyed_vectors]
    if not known_vectors:
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(known_vectors, axis=0)
    
# Morfologic and Syntatic Filters
def prototype_rapid_fuzz_filter(user_input, products, number_of_rec):
    """Rank ``products`` against ``user_input`` with RapidFuzz ``token_set_ratio``.

    Args:
        user_input (str): Query text.
        products: Choices handed to ``process.extract``.
        number_of_rec: ``limit`` passed to ``process.extract`` for the ranked list.

    Returns:
        list: Match tuples as returned by ``process.extract`` (the notebook
        output shows ``(choice, score, key)``). When the single best match
        scores exactly 100 it is placed first and any other match carrying
        the same choice text is removed; otherwise the ranked list is
        returned unchanged. An empty list is returned when there is nothing
        to match (previously this raised ``IndexError``).
    """
    # NOTE(review): both look-ups use token_set_ratio, although the second
    # result used to be named "partial_ratio_matches" -- confirm which scorer
    # was intended before changing it.
    best_match = process.extract(user_input, products, scorer=fuzz.token_set_ratio, limit=1, processor=utils.default_process)
    ranked_matches = process.extract(user_input, products, scorer=fuzz.token_set_ratio, limit=number_of_rec, processor=utils.default_process)

    # No best match at all, or no perfect hit: return the ranking as it is
    if not best_match or best_match[0][1] != 100:
        return list(ranked_matches)

    # Perfect hit: keep it first and skip other entries with the same text
    best_name = best_match[0][0]
    return list(best_match) + [match for match in ranked_matches if match[0] != best_name]

def rapid_fuzz_rec_to_df(recommendations, dataframe):
    """Return the rows of ``dataframe`` named by ``recommendations``, in that order.

    Args:
        recommendations: Iterable of match tuples whose third element is the
            key of the matched choice. When the choices were a pandas Series
            (as in this notebook) that key is the Series index *label*.
        dataframe (pandas.DataFrame): Frame sharing that index.

    Returns:
        pandas.DataFrame: Selected rows with a fresh 0..n-1 index.
    """
    # Label-based lookup: the cleaned dataset has gaps in its index, so the
    # keys are not valid positions for ``.iloc``.
    labels = [match[2] for match in recommendations]
    return dataframe.loc[labels].reset_index(drop=True)


# Product Recommendation function
def product_recommendation(product_vector, dataset, top_n=5):
    """Return the ``top_n`` rows most similar to ``product_vector``.

    Similarity is the cosine similarity between ``product_vector`` and each
    entry of the ``Product Vector`` column. The result keeps every column
    except ``Product Vector`` and is ordered from most to least similar.
    """
    def _similarity_to_query(candidate_vector):
        return cosine_similarity([product_vector], [candidate_vector])[0][0]

    # Cosine similarity of every product against the query vector
    scores = dataset["Product Vector"].apply(_similarity_to_query)

    # Index labels of the best-scoring products
    best_labels = scores.nlargest(top_n).index

    # Hide the vector column but keep the original product fields
    keep_columns = dataset.columns != 'Product Vector'
    return dataset.loc[best_labels, keep_columns]


# Category Filter 
def category_filter(dataset, selected_product):
    """Keep the products that share the most category levels with ``selected_product``.

    Each row scores one point for every category column (``Main``, ``Sub``,
    ``Side``, ``Other Category``) whose value equals the selected product's
    value; only the rows with the highest score are returned. Missing
    category values never count as a match.

    Args:
        dataset (pandas.DataFrame): Products with the four category columns.
            It is not modified (no temporary ``score`` column is left behind).
        selected_product: Mapping / Series providing the four category values.

    Returns:
        pandas.DataFrame | None: Best-matching rows in their original order,
        without a ``score`` column; ``None`` (after printing a message) when
        ``dataset`` has no rows.
    """
    category_columns = ('Main Category', 'Sub Category', 'Side Category', 'Other Category')

    if len(dataset) == 0:
        print('No recommendaation found for this product.')
        return None

    # Accumulate the score outside the frame so the caller's data is untouched
    scores = np.zeros(len(dataset), dtype=int)
    for column in category_columns:
        scores += (dataset[column] == selected_product[column]).astype(int).to_numpy()

    # Filter rows with the maximum score
    best_rows = dataset[scores == scores.max()]

    # A 'score' column can only be present if the caller's frame already had one
    return best_rows.drop(columns='score', errors='ignore')

def name_based_filter(dataset, product_name):
    """Rank every product in ``dataset`` by Word2Vec similarity to ``product_name``.

    A Word2Vec model is trained on the processed product names on every call.
    Two columns, ``Processed Product Name`` and ``Product Vector``, are added
    to ``dataset`` itself. Returns all rows ordered by similarity, without the
    ``Product Vector`` column.
    """
    # Tokenize every product name, then fit the model on those token lists
    dataset["Processed Product Name"] = dataset["Product Name"].apply(preprocess_text)
    model = train_word2vec_model(load_data_parameters(), dataset)

    # One averaged embedding per product
    dataset["Product Vector"] = dataset["Processed Product Name"].apply(
        lambda tokens: vectorize_product(tokens, model)
    )

    # Pre-process and embed the name supplied by the user with the same model
    query_vector = vectorize_product(preprocess_text(product_name), model)

    # Rank the complete dataset (top_n equals the number of rows)
    return product_recommendation(query_vector, dataset, dataset.shape[0])

def save_main_model(model):
    """Save ``model`` to ``../data/main_model.model`` through its ``save`` method."""
    destination = "../data/main_model.model"
    model.save(destination)

def load_main_model():
    """Load and return the Word2Vec model stored at ``../data/main_model.model``."""
    saved_model = Word2Vec.load("../data/main_model.model")
    return saved_model

In [None]:
## MAIN TO RUN CODE
# TODO: think of a way to test this code against the dataset
#
# Interactive driver: asks for a product (by number or by name), ranks the
# catalogue with Word2Vec and with RapidFuzz, then merges both rankings.
dataset_1 = load_data()
dataset_2 = load_data()
print("Please select option:")
print("1- Select input from database")
print("2- Type product name")

# Non-numeric input is handled like any other invalid option
try:
    user_input = int(input("Enter 1 or 2: "))
except ValueError:
    user_input = None

product_name = None       # stays None when the input was invalid
word2vec_source = None    # frame handed to the Word2Vec ranking
word2vec_headers = []
rapidfuzz_title = ""
list_word2vec = []
list_rapidfuzz = []
final_list = []

if user_input == 1:
    try:
        product_line = int(input("Type the product's number: "))
        product_line = product_line - 2
    except ValueError:
        product_line = -1

    # NOTE(review): the "- 2" offset looks like a CSV line number, but rows
    # have been dropped from dataset_1, so this positional lookup may not
    # select that CSV line -- confirm the intended numbering.
    if 0 <= product_line < len(dataset_1):
        selected_product = dataset_1.iloc[product_line]  # Selects the product by its line in the database
        print("\nSelected Product:")
        print(selected_product['Product Name'])

        product_name = selected_product['Product Name']
        word2vec_source = category_filter(dataset_1, selected_product)
        word2vec_headers = ["\n", "Word2Vec with Category Filter: "]
        rapidfuzz_title = "RapidFuzz with no Category Filter: "
    else:
        print("Invalid line Number.")

elif user_input == 2:
    product_name = input("Type the product's name: ")
    word2vec_source = dataset_1
    word2vec_headers = ["Word2Vec: "]
    rapidfuzz_title = "RapidFuzz: "

else:
    print("Not a valid number")

# Only rank when a product was selected (invalid input used to end in a NameError)
if product_name is not None and word2vec_source is not None:
    # Word2Vec Filter -- the result gets its own name so the
    # name_based_filter function is not overwritten and the cell can be re-run
    word2vec_recommendations = name_based_filter(word2vec_source, product_name)
    for header in word2vec_headers:
        print(header)
    print(word2vec_recommendations['Product Name'][:5])
    list_word2vec = word2vec_recommendations['Product Name'].tolist()

    # RapidFuzz Filter -- computed once; the first five entries are displayed
    print("\n")
    print(rapidfuzz_title)
    list_rapidfuzz = prototype_rapid_fuzz_filter(user_input=product_name, products=dataset_2["Product Name"], number_of_rec=dataset_2.shape[0])
    for match in list_rapidfuzz[:5]:
        print(match)

    # Merge both rankings: every name present in both lists gets the sum of
    # its two positions (lower is better). Positions of each name in the
    # RapidFuzz list are indexed first so the merge avoids a nested scan.
    rapidfuzz_positions = {}
    for j, match in enumerate(list_rapidfuzz):
        rapidfuzz_positions.setdefault(match[0], []).append(j)

    for i, name in enumerate(list_word2vec):
        for j in rapidfuzz_positions.get(name, ()):
            final_list.append([name, i + j])

    final_list.sort(key=lambda x: x[1])
    print("\nFinal Recommendation:")
    for idx, item in enumerate(final_list[:10], start=1):
        print(f"{idx} - Produto: {item[0]}, Valor: {item[1]}")


Please select option:
1- Select input from database
2- Type product name
Word2Vec: 
560     Heroes of Goo Jit Zu 41012 S1 Ultimate Hero Pk...
61                  Domez My Hero Academia 4-Piece Bundle
3668    Avengers Marvel Endgame Titan Hero Power Fx Ir...
5408    Icon Heroes DC Heroes Black Adam 1: 9 Scale Po...
5072                       Wildkin Heroes Rolling Luggage
Name: Product Name, dtype: object


RapidFuzz: 
('Domez My Hero Academia 4-Piece Bundle', 100.0, 61)
('Banpresto 35783 My Hero Academia Enter The Hero Izuku Midoriya Figure', 100.0, 117)
('Avengers Marvel Black Panther 6"-Scale Marvel Super Hero Action Figure Toy', 100.0, 262)
('Neon Super Hero Mask, Party Accessory', 100.0, 460)
('Heroes of Goo Jit Zu 41012 S1 Ultimate Hero Pk, Multicolor', 100.0, 560)

Final Recommendation:
1 - Produto: Domez My Hero Academia 4-Piece Bundle, Valor: 1
2 - Produto: Heroes of Goo Jit Zu 41012 S1 Ultimate Hero Pk, Multicolor, Valor: 4
3 - Produto: Avengers Marvel Black Panther 6"-Scale M

In [None]:
# Requirement #04 (Reference Recommendation Algorithm - Medium)
def load_reference_database():
    """Read the customers and transactions CSV files and return both frames.

    Returns ``(customers, transactions)`` read from ``data/recommend_1.csv``
    and ``data/trx_data.csv`` respectively.
    """
    # NOTE(review): these paths are relative to ``data/`` while the other
    # loaders in this notebook use ``../data/`` -- confirm the location.
    source_files = ('data/recommend_1.csv', 'data/trx_data.csv')
    customers, transactions = (pd.read_csv(path) for path in source_files)
    return customers, transactions

def data_reference_manipulation(customers,transactions):
    """Count how many times each customer bought each product.

    Args:
        customers: Unused; kept so existing callers keep working.
        transactions (pandas.DataFrame): Needs a ``customerId`` column and a
            ``products`` column holding ``|``-separated integer product ids.
            The frame is left unmodified, so the function can be called
            repeatedly on the same input.

    Returns:
        pandas.DataFrame: Columns ``customerId``, ``productId`` (int64) and
        ``purchase_count``, one row per (customer, product) pair, sorted by
        those two keys.
    """
    # One (customer, product) record per purchased item
    purchases = [
        (customer_id, int(product_id))
        for customer_id, product_text in zip(transactions['customerId'], transactions['products'])
        for product_id in product_text.split('|')
    ]
    data = (
        pd.DataFrame(purchases, columns=['customerId', 'productId'])
        .groupby(['customerId', 'productId'])
        .size()
        .reset_index(name='purchase_count')
    )
    data['productId'] = data['productId'].astype(np.int64)
    return data

def create_data_dummy(data):
    """Return a copy of ``data`` with an extra ``purchase_dummy`` column set to 1."""
    return data.assign(purchase_dummy=1)

def data_model_input():
    """Build the three modelling inputs from the reference CSV files.

    Returns:
        tuple: ``(data, data_dummy, data_norm)`` where ``data`` holds the
        purchase counts per customer/product, ``data_dummy`` is ``data`` plus
        a constant ``purchase_dummy`` column and ``data_norm`` is the min-max
        scaled purchase frequency produced by ``normalize_data``.
    """
    customers,transactions = load_reference_database()
    data = data_reference_manipulation(customers,transactions)
    data_dummy = create_data_dummy(data)

    # Same pivot / min-max / melt steps that used to be duplicated inline here
    data_norm = normalize_data(data)

    return data,data_dummy,data_norm
#function that combines steps above
def normalize_data(data):
    """Min-max scale purchase counts per product and return them in long form.

    ``data`` is pivoted to a customer x product matrix, every product column
    is scaled with ``(x - min) / (max - min)``, and the matrix is melted back
    to one row per customer/product with the value in
    ``scaled_purchase_freq``. Rows whose scaled value is NaN are dropped.
    """
    purchase_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')
    column_min = purchase_matrix.min()
    column_range = purchase_matrix.max() - column_min
    scaled = ((purchase_matrix - column_min) / column_range).reset_index()
    scaled.index.names = ['scaled_purchase_freq']
    long_form = pd.melt(scaled, id_vars=['customerId'], value_name='scaled_purchase_freq')
    return long_form.dropna()

def split_data(data):
    '''
    Split ``data`` 80/20 into a training and a test SFrame.

    NOTE(review): relies on ``tc`` (turicreate), whose import is commented
    out at the top of this notebook -- it must be imported before use.

    Args:
        data (pandas.DataFrame)

    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train_part, test_part = train_test_split(data, test_size = .2)
    return tc.SFrame(train_part), tc.SFrame(test_part)

def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    """Train a turicreate recommender, print its recommendations and return it.

    Args:
        train_data: Training data passed to the recommender's ``create``.
        name (str): ``'popularity'``, ``'cosine'`` or ``'pearson'``.
        user_id (str): Name of the user-id column.
        item_id (str): Name of the item-id column.
        target (str): Name of the target column.
        users_to_recommend: Users passed to ``recommend``.
        n_rec (int): Number of recommendations per user (``k``).
        n_display (int): Number of recommendation rows to print.

    Returns:
        The trained recommender.

    Raises:
        ValueError: If ``name`` is not one of the supported model names
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if name == 'popularity':
        recommender = tc.popularity_recommender.create(train_data,
                                                       user_id=user_id,
                                                       item_id=item_id,
                                                       target=target)
    elif name in ('cosine', 'pearson'):
        # Both item-similarity variants differ only in the similarity type
        recommender = tc.item_similarity_recommender.create(train_data,
                                                            user_id=user_id,
                                                            item_id=item_id,
                                                            target=target,
                                                            similarity_type=name)
    else:
        raise ValueError(
            f"Unknown model name {name!r}; expected 'popularity', 'cosine' or 'pearson'"
        )

    recom = recommender.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return recommender

# To Run: example driver for the reference recommender, kept inside a string
# literal so that it is not executed.
# NOTE(review): the snippet reads `customers`, but data_model_input() only
# returns data, data_dummy and data_norm -- `customers` must come from
# load_reference_database() before this can run.
"""
data,data_dummy,data_norm = data_model_input()
train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

# constant variables to define field names include:
name = 'cosine'
user_id = 'customerId'
item_id = 'productId'
target = 'purchase_count'
users_to_recommend = list(customers[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)
"""


In [None]:
# Requirement #05 (Comparison between our recommendation and the reference)


In [87]:
# Requirement #06 (Recommendation Algorithm)