In [27]:
import pandas as pd
import numpy as np
import nltk
import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function to load the dataset
def load_database():
    dataset = pd.read_csv("../data/AmazonData.csv")
    return dataset

# Function to process the dataset
def pandas_conversion(dataset):
    # Excluding columns that we don't use
    cols = [0, 2, 3, 5, 6, 8, 9, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
    dataset.drop(dataset.columns[cols], axis=1, inplace=True)
    dataset.dropna(inplace=True)
    return dataset

def data_manipulation(dataset):
    # Split categories from "Category" column and assign them to new columns
    new = dataset["Category"].str.split("|", n=3, expand=True)
    dataset["Main Category"] = new[0]
    dataset["Sub-Category"] = new[1]
    dataset["Side Category"] = new[2]
    dataset["Other Categories"] = new[3]
    # Drop unnecessary columns and rename columns
    dataset.drop(columns=["Category", "Other Categories"], inplace=True)
    dataset.rename(columns={
        'Uniq Id': 'Id',
        'Shipping Weight': 'Shipping Weight(Pounds)',
        'Selling Price': 'Selling Price($)'
    }, inplace=True)
    # Convert weight and price values
    indexes_ounces = dataset[dataset['Shipping Weight(Pounds)'].str.contains('ounces', na=False)].index
    dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.replace('ounces', '').str.strip()
    dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.replace('pounds', '').str.strip()
    dataset['Selling Price($)'] = dataset['Selling Price($)'].str.replace('$', '')
    # Handling inconsistent values in price and remove records with invalid values in "Shipping Weight(Pounds)"
    dataset['Selling Price($)'] = dataset['Selling Price($)'].str.replace(',', '', regex=False)
    indexes = dataset[dataset['Shipping Weight(Pounds)'].str.contains(r'\. ', na=False)].index
    dataset.drop(indexes, inplace=True)
    # Handling inconsistent values in weight and remove records with invalid values in "Selling Price($)"
    dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.replace(',', '', regex=False)
    invalid_price_terms = ['Total price:', '&', 'Currently', 'from', '-', ' ']
    for term in invalid_price_terms:
        indexes = dataset[dataset['Selling Price($)'].str.contains(term, na=False)].index
        if term == ' ':
            dataset.loc[indexes, 'Selling Price($)'] = dataset.loc[indexes, 'Selling Price($)'].str.split(' ').str[0]
        else:
            dataset.drop(indexes, inplace=True)
    # Removing rows with non-numeric values in weight and price columns
    dataset = dataset[pd.to_numeric(dataset['Selling Price($)'], errors='coerce').notnull()]
    dataset = dataset[pd.to_numeric(dataset['Shipping Weight(Pounds)'], errors='coerce').notnull()]
    # Convert data types of price and weight columns to float
    dataset['Selling Price($)'] = dataset['Selling Price($)'].astype(float)
    dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].astype(float)
    # Convert weight values from ounces to pounds when necessary
    indexes_ounces_updated = dataset.index.intersection(indexes_ounces)
    dataset.loc[indexes_ounces_updated, 'Shipping Weight(Pounds)'] = dataset.loc[indexes_ounces_updated, 'Shipping Weight(Pounds)'] / 16
    return dataset

def save_data_manipulation(dataset):
    dataset.to_csv('../data/CleanData.csv', index=False)


[nltk_data] Downloading package punkt to C:\Users\ricar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ricar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\ricar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
# Requisitos #02
def load_data():
    dataset = load_database()
    dataset = pandas_conversion(dataset)
    dataset = data_manipulation(dataset)
    return dataset

# Data split to sklearn
from sklearn.model_selection import train_test_split
def split_data_sets(dataset):
    train, test = train_test_split(dataset, test_size=0.2)
    return train, test

# Save data (optional)
def save_data_sets(train, test):
    train.to_csv("./data/train.csv", index=False)
    test.to_csv("./data/test.csv", index=False)

In [None]:
# Requisitos #03 (Our Recommendation Algorithm )

# Load data for training model
def load_data_parameters():
    # Aqui seria possível implementar a leitura dos parâmetros para o modelo
    return {'vector_size': 100, 'window': 5, 'min_count': 1,'workers': 4}

# Processing text function
def preprocess_text(text):
    text = text.replace('[^a-zA-Z]',' ').lower()
    stop_re = '\\b'+'\\b|\\b'.join(nltk.corpus.stopwords.words('english'))+'\\b'
    text = text.replace(stop_re, '')
    text = text.split()

    # Add lemmatization using WordNetLemmatizer to handle singular-plural cases
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in text]
    return lemmatized_text

# Treinamento do modelo base com Word2Vec
def train_model_create(data_parameters, dataset):
    model = Word2Vec(sentences=dataset["Processed Product Name"], vector_size=data_parameters['vector_size'],window=data_parameters['window'], min_count=data_parameters['min_count'], workers=data_parameters['workers'])
    return model

# vectorizing text function
def vectorize_product(product_name, model):
    words = [word for word in product_name if word in model.wv]
    if len(words) > 0:
        return np.mean([model.wv[word] for word in words], axis=0)
    else:
        return np.zeros(model.wv.vector_size)

# Product Recommendation function
def product_recommendation(product_vector, dataset, top_n=5):
    
    # Calcular similaridades cosseno
    similarities = dataset["Product Vector"].apply(lambda x: cosine_similarity([product_vector], [x])[0][0])
    
    # Ordenar por similaridade e pegar os top_n produtos
    top_indices = similarities.nlargest(top_n).index
    
    # Retornar o DataFrame com os produtos recomendados, mas mantendo os nomes originais
    return dataset.loc[top_indices, dataset.columns != 'Product Vector']


def name_based_filter(dataset, product_name):

    # Applying text preprocess in dataset
    dataset["Processed Product Name"] = dataset["Product Name"].apply(preprocess_text)

    parameter = load_data_parameters()
    model = train_model_create(parameter,dataset)

    # Applying vectorizing function in dataset
    dataset["Product Vector"] = dataset["Processed Product Name"].apply(lambda x: vectorize_product(x, model))

    # Pré-processando o nome do produto fornecido pelo usuário
    processed_product_name = preprocess_text(product_name)

    # Vetorizando o nome do produto fornecido (passar o modelo aqui também)
    product_vector = vectorize_product(processed_product_name, model)

    recommendation = product_recommendation(product_vector, dataset, top_n=5)
    return recommendation


# pensar em forma de testar código no dataset
def run_model(dataset):
    print("Please select option:")
    print("1- Select input from database")
    print("2- Type product name")
    user_input = int(input("Enter 1 or 2: "))

    main_category_input = None
    sub_category_input = None
    side_category_input = None
    other_category_input = None

    if user_input == 1:
        product_line = int(input("Type the product's number: "))
        product_line = product_line - 2

        if 0 <= product_line < len(dataset):
            selected_product = dataset.iloc[product_line]  # Selects the product by its line in the database
            print("\nSelected Product:")
            print(selected_product)
        else:
            print("Invalid line Number.")
        
        product_name = selected_product['Product Name']

        category_filter = category_filter(dataset, selected_product)
        # print(category_filter)

        name_based_filter = name_based_filter(category_filter, product_name)
        print(name_based_filter)
        
    elif user_input == 2:
        product_name = input("Type the product's name: ")
        name_based_filter = name_based_filter(dataset, product_name)
        print(name_based_filter)

    else:
        print("Not a valid number")

    print("\n")
    print(product_name)
    # Checking if categories are valid
    if main_category_input not in dataset['Main Category'].unique():
        print('Main Category not found in the database.')
    else:
        print(main_category_input)
    if sub_category_input not in dataset['Sub Category'].unique():
        print('Subcategoria not found in the database.')
    else:
        print(sub_category_input)
    if side_category_input not in dataset['Side Category'].unique():
        print('Side Category not found in the database.')
    else:
        print(side_category_input)
    if other_category_input not in dataset['Other Category'].unique():
        print('Other Category not found in the database.')
    else:
        print(other_category_input)

def save_main_model(model):
    # Create model pasta
    model.save("../data/main_model.model")

def load_main_model():
    # Create model pasta
    return Word2Vec.load("../data/main_model.model")

In [None]:
## MAAIN TO RUN CODE
dataset = load_data()
run_model(dataset)

Please select option:
1- Select input from database
2- Type product name

Selected Product:
Product Name                                                      Herbaceous
Selling Price($)                                                       14.01
About Product              Make sure this fits by entering your model num...
Product Specification      ProductDimensions:5.5x1.5x7.5inches|ItemWeight...
Shipping Weight(Pounds)                                                  0.7
Main Category                                                  Toys & Games 
Sub-Category                                            Games & Accessories 
Side Category                                                    Board Games
Name: 83, dtype: object


UnboundLocalError: cannot access local variable 'category_filter' where it is not associated with a value

In [31]:
# Requisitos #04 (Reference Recommendation Algorithm - Medium)

In [32]:
# Requisitos #05 (Comparison between our recommendation and the reference)

In [33]:
# Requisitos #06 (Recomendation Algorithm)