In [2]:
import pandas as pd
import numpy as np
import nltk
import gensim


from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet') 

dataset = pd.read_csv("../data/AmazonData.csv")

# Excluding columns with null and not usefull values
cols = [0,2,3,5,6,8,9,12,14,15,16,17,18,19,20,21,22,23,24,25,26,27]
dataset.drop(dataset.columns[cols], axis =1, inplace=True)
dataset.dropna(inplace = True)


# Splitting Category in 3 parts
new = dataset["Category"].str.split("|", n = 2, expand = True)
  
# making the first category called Main Category
dataset["Main Category"]= new[0] 
  
# making the second category called sub_category 
dataset["Sub-Category"]= new[1]

# making the third category called side_category 
dataset["Side Category"]= new[2]


# Database Price and weight treatment
dataset.rename(columns = {'Uniq Id':'Id','Shipping Weight':'Shipping Weight(Pounds)', 'Selling Price':'Selling Price($)'}, inplace = True)

# Removing units from Price and Weight
dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.strip('ounces')
dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.strip('pounds')
dataset['Selling Price($)'] = dataset['Selling Price($)'].str.replace('$', '')

# Setting Column Selling Price as float value
indexes = dataset[dataset['Selling Price($)'] == 'Total price:'].index
dataset.drop(indexes, inplace=True)

#
dataset['Selling Price($)'] = dataset['Selling Price($)'].str.replace(',', '', regex=False)
indexes = dataset[dataset['Selling Price($)'].str.contains('-', na=False)].index
dataset.drop(indexes, inplace=True)

#
indexes = dataset[dataset['Selling Price($)'].str.contains('&', na=False)].index
dataset.drop(indexes, inplace=True)

#
indexes = dataset[dataset['Selling Price($)'].str.contains('Currently', na=False)].index
dataset.drop(indexes, inplace=True)

#
indexes = dataset[dataset['Selling Price($)'].str.contains('from', na=False)].index
dataset.drop(indexes, inplace=True)

#
dataset['Selling Price($)'] = dataset['Selling Price($)'].str.split(' ').str[0]
dataset['Selling Price($)'] = dataset['Selling Price($)'].astype(float)

# Setting Column Shipping Weight as float value
indexes = dataset[dataset['Shipping Weight(Pounds)'].str.contains(r'\. ', na=False)].index

dataset.at[1619, 'Shipping Weight(Pounds)']
dataset.drop(1619, inplace=True)
dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].str.replace(',', '', regex=False)
dataset['Shipping Weight(Pounds)'] = dataset['Shipping Weight(Pounds)'].astype(float)

dataset.info()

[nltk_data] Downloading package punkt to C:\Users\ricar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ricar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\ricar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<class 'pandas.core.frame.DataFrame'>
Index: 7136 entries, 0 to 10001
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Product Name             7136 non-null   object 
 1   Category                 7136 non-null   object 
 2   Selling Price($)         7136 non-null   float64
 3   About Product            7136 non-null   object 
 4   Product Specification    7136 non-null   object 
 5   Shipping Weight(Pounds)  7136 non-null   float64
 6   Main Category            7136 non-null   object 
 7   Sub-Category             7136 non-null   object 
 8   Side Category            6155 non-null   object 
dtypes: float64(2), object(7)
memory usage: 557.5+ KB


In [3]:
def preprocessamento(texto):
    texto = texto.replace('[^a-zA-Z]',' ').lower()
    stop_re = '\\b'+'\\b|\\b'.join(nltk.corpus.stopwords.words('english'))+'\\b'
    texto = texto.replace(stop_re, '')
    texto = texto.split()
    return texto

# Function for preprocessing with stemming
def preprocess_stemming(texto):
    texto = texto.replace('[^a-zA-Z]', ' ').lower()
    stop_re = '\\b' + '\\b|\\b'.join(nltk.corpus.stopwords.words('english')) + '\\b'
    texto = texto.replace(stop_re, '')
    texto = texto.split()

    # Add stemming using PorterStemmer
    stemmer = PorterStemmer()
    stemmed_text = [stemmer.stem(word) for word in texto]

    return stemmed_text

# Function for preprocessing with lemmatization
def preprocess_lemmatization(texto):
    texto = texto.replace('[^a-zA-Z]', ' ').lower()
    stop_re = '\\b' + '\\b|\\b'.join(nltk.corpus.stopwords.words('english')) + '\\b'
    texto = texto.replace(stop_re, '')
    texto = texto.split()

    # Add lemmatization using WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in texto]

    return lemmatized_text

In [4]:
# Original text without preprocessing
dataset["Processed Product Name"] = dataset["Product Name"].apply(preprocessamento)
dataset["Processed Product Name"]

0        [db, longboards, coreflex, crossbow, 41", bamb...
1        [electronic, snap, circuits, mini, kits, class...
2        [3doodler, create, flexy, 3d, printing, filame...
3        [guillow, airplane, design, studio, with, trav...
4                   [woodstock-, collage, 500, pc, puzzle]
                               ...                        
9995     [cozy, line, home, fashions, size, 2, piece, o...
9996           [lego, 8-brick, storage, box,, bright, red]
9998     [trends, international, nfl, la, chargers, hg,...
9999     [newpath, learning, 10, piece, science, owls, ...
10001    [hasegawa, ladders, lucano, step, ladder,, ora...
Name: Processed Product Name, Length: 7136, dtype: object

In [5]:
# Preprocess with stemming
dataset['Processed (Stemming)'] = dataset["Product Name"].apply(preprocess_stemming)
dataset['Processed (Stemming)']

0        [db, longboard, coreflex, crossbow, 41", bambo...
1        [electron, snap, circuit, mini, kit, classpack...
2        [3doodler, creat, flexi, 3d, print, filament, ...
3        [guillow, airplan, design, studio, with, trave...
4                     [woodstock-, collag, 500, pc, puzzl]
                               ...                        
9995     [cozi, line, home, fashion, size, 2, piec, oce...
9996            [lego, 8-brick, storag, box,, bright, red]
9998     [trend, intern, nfl, la, charger, hg, -, mobil...
9999     [newpath, learn, 10, piec, scienc, owl, and, o...
10001     [hasegawa, ladder, lucano, step, ladder,, orang]
Name: Processed (Stemming), Length: 7136, dtype: object

In [6]:
# Preprocess with lemmatization
dataset['Processed (Lemmatization)'] = dataset["Product Name"].apply(preprocess_lemmatization)
dataset['Processed (Lemmatization)']

0        [db, longboards, coreflex, crossbow, 41", bamb...
1        [electronic, snap, circuit, mini, kit, classpa...
2        [3doodler, create, flexy, 3d, printing, filame...
3        [guillow, airplane, design, studio, with, trav...
4                   [woodstock-, collage, 500, pc, puzzle]
                               ...                        
9995     [cozy, line, home, fashion, size, 2, piece, oc...
9996           [lego, 8-brick, storage, box,, bright, red]
9998     [trend, international, nfl, la, charger, hg, -...
9999     [newpath, learning, 10, piece, science, owl, a...
10001    [hasegawa, ladder, lucano, step, ladder,, orange]
Name: Processed (Lemmatization), Length: 7136, dtype: object