In [151]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import plotly.express as px

#Libraries for preprocessing
from gensim.parsing.preprocessing import remove_stopwords
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import webcolors

#Download once if using NLTK for preprocessing
# NOTE: nltk.download() is attempted on every run of this cell. The recorded
# cell output shows both downloads failing with an SSL certificate error, so
# presumably the corpora were already installed locally -- confirm before
# relying on this cell in a fresh environment.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# Spanish stop words as a set. `stops` is assigned again (as a plain list) in
# the cell that defines normalize().
stops = set(stopwords.words('spanish'))


#Libraries for vectorisation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from fuzzywuzzy import fuzz

#Libraries for clustering
from sklearn.cluster import KMeans

import spacy

# Load the Spanish transformer pipeline, downloading it only when it is not
# installed yet. spacy.cli.download() returns None, so its result must never
# be bound to `nlp`, and calling it unconditionally re-fetches the package
# (about 410 MB in the recorded output) on every run.
try:
    nlp = spacy.load("es_dep_news_trf")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download("es_dep_news_trf")
    nlp = spacy.load("es_dep_news_trf")

#Load the product catalogue (Excel export) and force product names to strings
#CSV alternative kept for reference:
#df = pd.read_csv('Productos con Data Extra.csv', delimiter=';', encoding="utf-8")
df = pd.read_excel('Productos con Data Extra.xlsx').astype({'nombre': str})
text1 = df['nombre']

#Silence every warning for the rest of the session
import warnings
warnings.filterwarnings(action='ignore')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting es-dep-news-trf==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_dep_news_trf-3.4.0/es_dep_news_trf-3.4.0-py3-none-any.whl (410.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.2/410.2 MB 3.4 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_dep_news_trf')


In [153]:
from spacy.lang.es.stop_words import STOP_WORDS
# Alias for spaCy's Spanish stop-word collection. No visible cell reads `stop`;
# normalize() filters against the NLTK-based `stops` instead.
stop = STOP_WORDS

In [154]:
# Reload the catalogue and keep only the product-name column for cleaning.
df = pd.read_excel('Productos con Data Extra.xlsx')
df['nombre'] = df['nombre'].astype(str)
# .copy() makes the one-column frame independent, so the assignments below do
# not write into a slice of the full table.
df = df[['nombre']].copy()
# Drop every digit character from the names.
df['nombre'] = df['nombre'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
# regex=True is required for the character class to be treated as a pattern;
# with a literal match nothing would ever be removed. The commas inside the
# class are literal, so ',' is stripped together with the listed symbols.
df['nombre'] = df['nombre'].str.replace('[#,@,&,°,%,º,/]', '', regex=True)
# Series.replace() only swaps cells whose whole value equals 'lí-quido';
# .str.replace() repairs the hyphenated word wherever it occurs inside a name.
df['nombre'] = df['nombre'].str.replace('lí-quido', 'líquido', regex=False)

In [None]:
# Spanish stop words as a set: normalize() tests every lemma against this
# collection, and set membership is constant-time (the NLTK call returns a list).
stops = set(stopwords.words("spanish"))

def normalize(comment, lowercase, remove_stopwords):
    """Lemmatise `comment` with the module-level spaCy pipeline `nlp`.

    Parameters
    ----------
    comment : str
        Text to process.
    lowercase : bool
        When truthy, the text is lower-cased before it is parsed.
    remove_stopwords : bool
        When truthy, lemmas found in the module-level `stops` are dropped.

    Returns
    -------
    str
        Kept lemmas joined by single spaces. A token is skipped when its
        length is 3 or less, when it is punctuation, or when its stripped
        lemma is empty.
    """
    text = comment.lower() if lowercase else comment
    kept = []
    for token in nlp(text):
        # Short tokens and punctuation never contribute a lemma.
        if len(token) <= 3 or token.is_punct:
            continue
        lemma = token.lemma_.strip()
        if not lemma:
            continue
        if remove_stopwords and lemma in stops:
            continue
        kept.append(lemma)
    return " ".join(kept)


# Lower-cased, stop-word-free lemmas of every product name.
df['texto'] = df['nombre'].apply(
    lambda name: normalize(name, lowercase=True, remove_stopwords=True)
)

In [None]:
# Re-point text1 from the raw 'nombre' column (bound in the first cell) to the
# lemmatised 'texto' column; the later cells read this binding.
text1 = df['texto']

In [None]:
#Tokenise and re-join (the Snowball stemming step is currently switched off)
def stemSentence(sentence):
    """Split `sentence` with NLTK's word_tokenize and join the tokens with single spaces.

    Despite the name, no stemming or lower-casing happens here: every token is
    passed through unchanged. To re-enable stemming, map
    SnowballStemmer('spanish').stem over the tokens before joining.
    """
    return ' '.join(word_tokenize(sentence))
# Re-tokenised names as a fresh Series (new default index).
text3 = pd.Series(list(map(stemSentence, text1)))

In [None]:
#Remove colours
#Despite the name, the list also holds flavour, fruit and packaging words.
#(An earlier, disabled variant built the list from webcolors.CSS3_NAMES_TO_HEX.)
colors = [
    'naranja', 'frutilla', 'chocolate', 'vainilla', 'limón',
    'oliva', 'manzana', 'zero', 'pera', 'color',
    'collection', 'collecion', 'extra', 'pack', 'oregano',
    'li', 'lí', 'piña', 'fruta', 'frambuesa',
    'mango', 'durazno',
]
#Drop every whitespace-separated token that appears in the list above.
text4 = [
    ' '.join(token for token in name.split() if token not in colors)
    for name in text3
]

In [None]:
#Bag of words
# Word-level count vectoriser fitted on the cleaned names in text4.
# X_cv is the matrix reused by the LDA and KMeans cells below.
vectorizer_cv = CountVectorizer(analyzer='word')
X_cv = vectorizer_cv.fit_transform(text4)

In [None]:
#TF-IDF (word level)
# Same input as the bag of words, weighted by TF-IDF. No visible cell reads
# X_wtf afterwards.
vectorizer_wtf = TfidfVectorizer(analyzer='word')
X_wtf = vectorizer_wtf.fit_transform(text4)

In [None]:
# Inspect the cleaned names. text4 is a plain Python list, so the whole list
# is echoed as the cell output.
text4

In [None]:
# Side-by-side table of the lemmatised names and their token counts.
# get_feature_names_out() replaces get_feature_names(), which was deprecated
# and later removed from scikit-learn's vectorisers.
matrix = pd.concat(
    [text1, pd.DataFrame(X_cv.toarray(), columns=vectorizer_cv.get_feature_names_out())],
    axis=1,
)
# Spot check: the name next to the count of the token 'cerveza'.
matrix[['texto', 'cerveza']]

In [None]:
# Show the full names-plus-token-counts table built in the previous cell.
matrix

In [None]:
#TF-IDF (n-gram level)
# ngram_range=(1,2) adds two-word sequences to the single-word features.
# No visible cell reads X_ntf afterwards.
vectorizer_ntf = TfidfVectorizer(analyzer='word',ngram_range=(1,2))
X_ntf = vectorizer_ntf.fit_transform(text4)

In [None]:
#LDA
# 30 topics on the bag-of-words counts. random_state pins the otherwise random
# initialisation so the topics are the same on every re-run.
lda = LatentDirichletAllocation(n_components=30, learning_decay=0.9, random_state=0)
# fit() returns the fitted estimator, so X_lda is the model object itself
# (the same object as `lda`), not a transformed matrix.
X_lda = lda.fit(X_cv)

#Plot topics function. Code from: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic with its highest-weighted words.

    Parameters
    ----------
    model : fitted estimator exposing `components_` (one row of weights per topic)
    feature_names : sequence of str, indexable by the column positions of `components_`
    n_top_words : int
        Number of words shown per topic.
    title : str
        Figure-level title.

    The grid has 5 columns and as many rows as the topic count needs, so 30
    topics give the 6x5 / 30x30-inch layout of the original example while any
    other topic count also fits. Surplus axes are hidden.
    """
    n_topics = len(model.components_)
    n_cols = 5
    n_rows = max(1, -(-n_topics // n_cols))  # ceiling division, at least one row
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 5 * n_rows), sharex=True, squeeze=False)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # Positions of the n_top_words largest weights, largest first.
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for i in 'top right left'.split():
            ax.spines[i].set_visible(False)
    # Hide grid cells that received no topic.
    for spare_ax in axes[n_topics:]:
        spare_ax.set_visible(False)
    # The title is the same for the whole figure, so it is set once.
    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

#Show topics
n_top_words = 5
# get_feature_names_out() replaces get_feature_names(), which was deprecated
# and later removed from scikit-learn's vectorisers.
feature_names = vectorizer_cv.get_feature_names_out()
plot_top_words(X_lda, feature_names, n_top_words, '')

In [None]:
     #Fuzzywuzzy
# text4 is a plain list, whose `.index` is a method rather than a row index.
# Wrapping it in a Series provides the positional index crosstab needs.
names_series = pd.Series(text4)
# Rows are (position, name) pairs and columns are the distinct names; every
# cell is then overwritten with the token_sort_ratio between the column's name
# and the row's name.
X_fuzz = pd.crosstab([names_series.index, names_series], names_series).apply(
    lambda col: [fuzz.token_sort_ratio(col.name, x)
                 for x in col.index.get_level_values(1)]
)

In [None]:
#Test increments of 100 clusters using elbow method
# sse maps each tested k to the inertia_ of the model fitted with that k.
sse={}
for k in np.arange(100,900,100):
    # Fixed seed: without it every re-run draws different initial centroids
    # and the curve changes from run to run.
    kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=0).fit(X_cv)
    sse[k] = kmeans.inertia_
plt.plot(list(sse.keys()),list(sse.values()))
plt.xlabel('Values for K')
plt.ylabel('SSE')
plt.show();

In [None]:
#Create 200 clusters
# Seeded so cluster ids (and the labels derived from them) survive a re-run.
kmeans = KMeans(n_clusters=200, random_state=0)
kmeans.fit(X_cv)
# Names plus token counts. get_feature_names_out() replaces
# get_feature_names(), which was deprecated and later removed from scikit-learn.
result = pd.concat(
    [text1, pd.DataFrame(X_cv.toarray(), columns=vectorizer_cv.get_feature_names_out())],
    axis=1,
)
result['cluster'] = kmeans.predict(X_cv)

In [None]:
#Label each cluster with the word(s) that all of its food names have in common
clusters = result['cluster'].unique()
labels = []
for cluster_id in clusters:
    members = result[result['cluster'] == cluster_id]
    # Column name where every row of the cluster is truthy in that column, else None.
    shared = np.where(members.all()!=0, members.columns, None)
    # Keep single-word column names only, skipping the two bookkeeping columns.
    labels.append(' '.join(
        col for col in shared
        if col and col not in ('texto', 'cluster') and len(col.split()) == 1
    ))
labels_table = pd.DataFrame(zip(clusters,labels),columns=['cluster','label'])
# Attach each row's cluster label; rows keep the order of `result`.
result_labelled = pd.merge(result,labels_table,on='cluster',how='left')

In [None]:
# Show the cluster id -> common-word label mapping (one row per cluster).
labels_table

In [None]:
# The merge names the label column 'label_y' only when the vocabulary already
# contributes a 'label' column; otherwise it stays 'label'. Pick whichever exists.
label_col = 'label_y' if 'label_y' in result_labelled.columns else 'label'
result_labelled[['texto', label_col]]

In [None]:
#Visualise sizes of supermarket categories (manually added to result_labelled) and clean clusters
# NOTE(review): 'category' and 'nombre' are not created by the visible cells --
# presumably both are added to result_labelled by hand; confirm.
count_source = 'nombre'
# The merge yields 'label_y' instead of 'label' when the vocabulary already
# contains a 'label' column; use whichever is present.
label_col = 'label_y' if 'label_y' in result_labelled.columns else 'label'
result_summary = (
    pd.pivot_table(result_labelled, index=[label_col, 'category'], values=[count_source], aggfunc='count')
    .reset_index()
    # Rename the counted column by its real name: renaming 'Name' matched
    # nothing, so 'count' never existed and the filter below raised KeyError.
    .rename(columns={count_source: 'count', label_col: 'label'})
)
# Keep labelled clusters that hold more than one product.
result_treemap = result_summary[(result_summary['label'] != '') & (result_summary['count'] > 1)]
fig = px.treemap(result_treemap,path=['category','label'],values='count')
fig.show();