In [11]:
import re

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK data packages (tokenizer models, stop-word lists, WordNet)
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Requirements #00 and #01
# Functions to load and process the dataset

def load_database(path="../data/AmazonData.csv"):
    """Read the raw dataset CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the path this notebook has
        always used, so calls without arguments behave as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv``.
    """
    return pd.read_csv(path)

# Dataset processing for requirements #00 and #01
def pandas_conversion(dataset):
    """Remove the unused columns and every row that has a missing value.

    Columns are selected by position, so the frame must have at least 28
    columns laid out like the raw file. The input frame is left untouched:
    dropping by position in place made the function non-repeatable, because
    a second call on the same frame would hit different (or missing)
    positions.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw frame as returned by ``load_database``.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the columns at positions 1, 4, 7, 10, 11 and
        13, without rows containing NaN in any of them.

    Raises
    ------
    IndexError
        If the frame has fewer than 28 columns.
    """
    # Positions of the columns that are not used
    unused_positions = [0, 2, 3, 5, 6, 8, 9, 12, 14, 15, 16, 17, 18, 19,
                        20, 21, 22, 23, 24, 25, 26, 27]
    trimmed = dataset.drop(columns=dataset.columns[unused_positions])
    return trimmed.dropna()

def data_manipulation(dataset):
    """Split the categories and convert price and weight to floats.

    The work is done on a new frame; the frame passed in is not modified.

    Steps:
      1. Split "Category" on "|" into "Main Category", "Sub-Category" and
         "Side Category" (deeper levels are discarded) and drop "Category".
      2. Rename "Uniq Id" -> "Id", "Shipping Weight" ->
         "Shipping Weight(Pounds)", "Selling Price" -> "Selling Price($)".
      3. Drop rows whose weight text contains ". " and rows whose price
         contains one of ``invalid_price_terms``.
      4. Parse the price: "$" and "," are removed and, when several
         whitespace-separated values remain, the first one is used. A price
         with no value left becomes NaN.
      5. Parse the weight: "," and the trailing unit word are removed and
         values given in ounces are divided by 16, so the whole column is
         expressed in pounds.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the string columns "Category", "Selling Price" and
        "Shipping Weight".

    Returns
    -------
    pandas.DataFrame
        New frame; surviving rows keep their original index labels.

    Raises
    ------
    ValueError
        If a remaining price or weight cannot be parsed as a float.
    """
    price_col = 'Selling Price($)'
    weight_col = 'Shipping Weight(Pounds)'

    # reindex guarantees columns 0-2 exist even if no row has that many levels
    levels = (dataset["Category"]
              .str.split("|", n=3, expand=True)
              .reindex(columns=range(3)))

    cleaned = dataset.drop(columns=["Category"]).rename(columns={
        'Uniq Id': 'Id',
        'Shipping Weight': weight_col,
        'Selling Price': price_col,
    })
    cleaned["Main Category"] = levels[0]
    cleaned["Sub-Category"] = levels[1]
    cleaned["Side Category"] = levels[2]

    # regex=False: "$" must be treated as a literal character, not an anchor
    price_text = (cleaned[price_col]
                  .str.replace('$', '', regex=False)
                  .str.replace(',', '', regex=False))
    weight_text = cleaned[weight_col]

    # Rows that cannot be reduced to a single price are removed
    invalid_price_terms = ['Total price:', '&', 'Currently', 'from', '-']
    bad_price = pd.Series(False, index=cleaned.index)
    for term in invalid_price_terms:
        bad_price |= price_text.str.contains(term, na=False, regex=False)

    # A weight containing ". " is treated as invalid
    bad_weight = weight_text.str.contains(r'\. ', na=False)

    keep = ~(bad_price | bad_weight)
    cleaned = cleaned.loc[keep].copy()
    price_text = price_text.loc[keep]
    weight_text = weight_text.loc[keep]

    # When several prices are listed, keep the first one
    cleaned[price_col] = price_text.str.split().str[0].astype(float)

    # The unit must be read BEFORE it is removed from the text
    in_ounces = weight_text.str.contains('ounce', na=False, regex=False)
    weight_value = (weight_text
                    .str.replace(',', '', regex=False)
                    .str.replace(r'\s*(?:ounces?|pounds?)\s*$', '', regex=True)
                    .str.strip()
                    .astype(float))
    # 16 ounces = 1 pound
    cleaned[weight_col] = weight_value.where(~in_ounces, weight_value / 16)

    return cleaned

def save_data_manipulation(dataset, path='../data/CleanData.csv'):
    """Write the cleaned dataset to a CSV file without the index column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to persist.
    path : str or path-like, optional
        Destination file. Defaults to the location this notebook has always
        written to, so existing one-argument calls are unaffected.
    """
    dataset.to_csv(path, index=False)


[nltk_data] Downloading package punkt to /home/leo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/leo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/leo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Requirement #02
def load_data():
    """Run the full preparation pipeline and return the resulting dataset.

    Chains ``load_database`` -> ``pandas_conversion`` -> ``data_manipulation``.
    """
    return data_manipulation(pandas_conversion(load_database()))

# Data split to sklearn
from sklearn.model_selection import train_test_split
def split_data_sets(dataset, test_size=0.2, random_state=42):
    """Split the dataset into a training part and a test part.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to split.
    test_size : float, optional
        Share of the rows assigned to the test part (default 0.2).
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. A fixed default keeps the
        split identical across kernel restarts; pass ``None`` for a
        different shuffle on every call.

    Returns
    -------
    tuple
        ``(train, test)``.
    """
    train, test = train_test_split(
        dataset, test_size=test_size, random_state=random_state
    )
    return train, test

# Save data (optional)
def save_data_sets(train, test, output_dir="../data"):
    """Write the train and test splits as ``train.csv`` and ``test.csv``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        The two splits to persist; the index is not written.
    output_dir : str or path-like, optional
        Existing folder that receives both files. Defaults to ``../data``,
        the folder every other load/save function in this notebook uses
        (the previous ``./data`` target did not match them).
    """
    train.to_csv(f"{output_dir}/train.csv", index=False)
    test.to_csv(f"{output_dir}/test.csv", index=False)

In [None]:
# Requirement #03 (Our Recommendation Algorithm)

# Load data for training model
def load_data_parameters():
    """Return the hyperparameters used to train the Word2Vec model.

    Returns
    -------
    dict
        Keys ``vector_size``, ``window``, ``min_count`` and ``workers``.
    """
    # Reading the model parameters from an external source could be added here
    return dict(vector_size=100, window=5, min_count=1, workers=4)

# Processing text function
def preprocess_text(text):
    """Turn a product name into a list of lower-case, lemmatized tokens.

    Non-letter characters are replaced by spaces, English stop words are
    removed and the remaining words are lemmatized so that singular and
    plural forms map to the same token.

    The earlier version passed regular-expression patterns to ``str.replace``,
    which matches its argument literally, so neither punctuation/digits nor
    stop words were actually removed.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN) yields an empty list.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    if not isinstance(text, str):
        return []

    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()

    # Token-level filtering is equivalent to removing \bword\b matches here,
    # because the text now consists only of letters and spaces.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    ]

# Training of the base model with Word2Vec
def create_base_model(data_parameters, dataset):
    """Train a Word2Vec model on the "Processed Product Name" column.

    Parameters
    ----------
    data_parameters : dict
        Must provide ``vector_size``, ``window``, ``min_count`` and
        ``workers``; each is forwarded to ``Word2Vec`` under the same name.
    dataset : pandas.DataFrame
        Frame whose "Processed Product Name" column supplies the sentences.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    sentences = dataset["Processed Product Name"]
    hyperparameters = {
        key: data_parameters[key]
        for key in ('vector_size', 'window', 'min_count', 'workers')
    }
    return Word2Vec(sentences=sentences, **hyperparameters)

# vectorizing text function
def vectorize_product(product_name, model):
    """Represent a tokenized product name as the mean of its word vectors.

    Parameters
    ----------
    product_name : iterable of str
        Tokens of the product name.
    model : object
        Model exposing ``wv`` with membership test, item lookup and
        ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the tokens found in ``model.wv``,
        or a zero vector of length ``model.wv.vector_size`` when none of the
        tokens is known.
    """
    keyed_vectors = model.wv
    known_tokens = [token for token in product_name if token in keyed_vectors]
    if not known_tokens:
        return np.zeros(keyed_vectors.vector_size)
    return np.mean([keyed_vectors[token] for token in known_tokens], axis=0)

def save_main_model(model):
    """Persist the trained model to ``../data/main_model.model``.

    Delegates to ``model.save``; the target folder is not created here.
    """
    target_path = "../data/main_model.model"
    model.save(target_path)

def load_main_model():
    """Load the Word2Vec model stored at ``../data/main_model.model``."""
    source_path = "../data/main_model.model"
    return Word2Vec.load(source_path)

In [None]:
# Requirement #04 (Reference Recommendation Algorithm - Medium)

In [None]:
# Requirement #05 (Comparison between our recommendation and the reference)

In [None]:
# Requirement #06 (Recommendation Algorithm)