In [19]:
from rapidfuzz import fuzz, process, utils
import pandas as pd

In [20]:
dataset = pd.read_csv('../data/AmazonData.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Uniq Id                10002 non-null  object 
 1   Product Name           10002 non-null  object 
 2   Brand Name             0 non-null      float64
 3   Asin                   0 non-null      float64
 4   Category               9172 non-null   object 
 5   Upc Ean Code           34 non-null     object 
 6   List Price             0 non-null      float64
 7   Selling Price          9895 non-null   object 
 8   Quantity               0 non-null      float64
 9   Model Number           8232 non-null   object 
 10  About Product          9729 non-null   object 
 11  Product Specification  8370 non-null   object 
 12  Technical Details      9212 non-null   object 
 13  Shipping Weight        8864 non-null   object 
 14  Product Dimensions     479 non-null    object 
 15  Im

In [21]:
def prototype_rapid_fuzz_filter(user_input, dataset, number_of_rec):
    list_of_rec = []
    products = dataset["Product Name"]
    token_set_ratio_match = process.extract(user_input, products, scorer=fuzz.token_set_ratio, limit=1, processor=utils.default_process)
    partial_ratio_matches = process.extract(user_input, products, scorer=fuzz.token_set_ratio, limit=number_of_rec, processor=utils.default_process)

    if token_set_ratio_match[0][1] == 100:
        list_of_rec.extend(token_set_ratio_match)
        for match in partial_ratio_matches:
            if match[0] != token_set_ratio_match[0][0]:
                list_of_rec.append(match)
    
    else:
        for match in partial_ratio_matches:
            list_of_rec.append(match)

    
    sorted_indeces = [match[2] for match in list_of_rec]
    
    # Garantindo que os índices são válidos
    sorted_indeces = [i for i in sorted_indeces if i < len(dataset)]
    
    # Resetando o índice do DataFrame original antes de acessar .iloc
    dataset = dataset.reset_index(drop=True)
    
    # Reordenando DataFrame com os índices válidos
    reordered_df = dataset.iloc[sorted_indeces].reset_index(drop=True)

    return reordered_df


In [22]:

# Dataset process function
def dataset_process(dataset):

    # Excluindo colunas desnecessárias
    cols = [0, 2, 3, 5, 6, 8, 9, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
    dataset.drop(dataset.columns[cols], axis=1, inplace=True)
    
    # Removendo linhas com valores nulos antes do processamento das categorias
    dataset.dropna(subset=["Category"], inplace=True)
    
    # Separando a coluna "Category" em subcategorias
    new = dataset["Category"].str.split("|", n=3, expand=True)
    dataset["Main Category"] = new[0]
    dataset["Sub Category"] = new[1]
    dataset["Side Category"] = new[2]
    dataset["Other Category"] = new[3]
    
    # Excluindo coluna antiga "Category"
    dataset.drop(columns=["Category"], inplace=True)
    
    # Renomeando colunas
    dataset.rename(columns={'Uniq Id': 'Id', 'Shipping Weight': 'Shipping Weight(Pounds)', 'Selling Price': 'Selling Price($)'}, inplace=True)
    
    # Removendo unidades de peso e preço
    dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.replace('ounces', '').str.replace('pounds', '').str.strip()
    dataset['Selling Price($)'] = dataset['Selling Price($)'].str.replace('$', '').str.replace(',', '', regex=False)
    
    # Removendo linhas com caracteres inválidos na coluna de preço
    invalid_patterns = ['Total price:', '-', '&', 'Currently', 'from']
    for pattern in invalid_patterns:
        indexes = dataset[dataset['Selling Price($)'].str.contains(pattern, na=False)].index
        dataset.drop(indexes, inplace=True)
    
    # Ajustando formato do preço e removendo linhas inválidas
    dataset['Selling Price($)'] = dataset['Selling Price($)'].str.split(' ').str[0]
    dataset['Selling Price($)'] = pd.to_numeric(dataset['Selling Price($)'], errors='coerce')
    
    # Removendo linhas onde 'Shipping Weight(Pounds)' ainda contém caracteres indesejados
    dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.replace(',', '', regex=False)
    dataset['Shipping Weight(Pounds)'] = pd.to_numeric(dataset['Shipping Weight(Pounds)'], errors='coerce')
    
    # Removendo linhas com valores nulos após o processamento
    dataset.dropna(inplace=True)

    return dataset


In [23]:
dataset = dataset_process(dataset)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2724 entries, 0 to 10001
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Product Name             2724 non-null   object 
 1   Selling Price($)         2724 non-null   float64
 2   About Product            2724 non-null   object 
 3   Product Specification    2724 non-null   object 
 4   Shipping Weight(Pounds)  2724 non-null   float64
 5   Main Category            2724 non-null   object 
 6   Sub Category             2724 non-null   object 
 7   Side Category            2724 non-null   object 
 8   Other Category           2724 non-null   object 
dtypes: float64(2), object(7)
memory usage: 212.8+ KB




In [24]:
product_name = input("Type the product's name: ")
new_df = prototype_rapid_fuzz_filter(user_input=product_name, dataset=dataset, number_of_rec=dataset.shape[0])

print(new_df)

                                          Product Name  Selling Price($)  \
0    B. spaces by Battat – Totes Tidy Toy Organizer...             49.22   
1    The Riddler-Poison DC Comics Originals Ivy Str...             11.99   
2                                Janod Confetti Guitar             41.99   
3    Corolle - Mon Grand Poupon Outfits Set - Tropi...             23.23   
4    Pitsco Cut-And-Fold Cover SunEzoon Solar Car (...            125.00   
..                                                 ...               ...   
656  Xenon 2.0 Women's Fitness Inline Skate, White/...            173.83   
657  Bright Pink Trifle Container, Medium, Party Favor              9.76   
658                Hitec RCD 57345S 12 Servo Extension              4.93   
659  Wildkin Embroidered Backpack for Toddler Boys ...             29.99   
660  Melissa & Doug Sunny Patch Cutie Pie Butterfly...             21.59   

                                         About Product  \
0    Make sure this fits by e