In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import pickle

# Download every NLTK resource this notebook relies on, so that a fresh
# environment survives "Restart Kernel -> Run All":
#   - 'stopwords': the English stop-word list used by process_text.
#   - 'punkt' / 'punkt_tab': tokenizer data needed by word_tokenize. Which of the
#     two is looked up depends on the installed NLTK release, so both are requested.
# nltk.download returns False (it does not raise) when a resource is unavailable.
for recurso_nltk in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(recurso_nltk)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def _text_tools():
    """Return the (stop-word set, stemmer) pair, building it only on first use."""
    # Cached on the function object: process_text is applied row by row to a
    # dataframe, and re-reading the stop-word corpus on every call is wasted work.
    if not hasattr(_text_tools, 'cached'):
        _text_tools.cached = (frozenset(stopwords.words('english')), PorterStemmer())
    return _text_tools.cached


def process_text(text):
    """
    Normalize a text into a space-separated string of stemmed tokens.

    The text is tokenized with NLTK's word_tokenize; tokens that are not purely
    alphanumeric or that are English stop words are discarded, and the remaining
    tokens are lower-cased and stemmed with the Porter stemmer.

    Args:
        text (str): Word or sentence to process.

    Returns:
        str: The surviving stems joined by single spaces. An empty string is
        returned when nothing survives or when `text` is not a string
        (e.g. None or NaN coming from a dataframe column).
    """
    # Missing values cannot be tokenized; treat them as "no text".
    if not isinstance(text, str):
        return ''

    stop_words, ps = _text_tools()
    words = word_tokenize(text)
    words = [
        ps.stem(word.lower())
        for word in words
        if word.isalnum() and word.lower() not in stop_words
    ]
    return ' '.join(words)

def obtener_palabras_similares(palabra, modelo, topn=3):
    """
    Look up the words most similar to `palabra` according to `modelo`.

    Args:
        palabra (str): Word to look up.
        modelo: Object exposing `similar_by_word(word, topn=...)` that returns
            (word, score) pairs.
        topn (int): How many similar words to request from the model.

    Returns:
        list: The similar words without their scores, in the order given by the
        model. An empty list when the lookup raises KeyError.
    """
    try:
        vecinos = modelo.similar_by_word(palabra, topn=topn)
    except KeyError:
        return []

    resultado = []
    for vecino, _puntaje in vecinos:
        resultado.append(vecino)
    return resultado

In [None]:

# Build one dataframe with the categories of every business from both Yelp and
# Google, expand each category text with word2vec neighbours, export it, and
# vectorize the expanded text with TF-IDF.

from gensim.models import KeyedVectors

# TODO: replace the three parquet reads below with reads from the database.
local_categories_google = pd.read_parquet('./datasets/processed/bd/7_categories_google.parquet.gz')
local_categories_yelp = pd.read_parquet('./datasets/processed/bd/8_categories_yelp.parquet.gz')
categoires = pd.read_parquet('./datasets/processed/bd/2_categories.parquet.gz')

# Rename Google's 'gmap_id' column to 'business_id' before stacking it with the
# Yelp frame. (When reading from the database the column is already 'business_id'.)
local_categories_google = local_categories_google.rename(columns={'gmap_id': 'business_id'})
local_categories = pd.concat([local_categories_google, local_categories_yelp])

# Attach the category columns (including 'name') through 'categories_id'.
local_categories = pd.merge(local_categories, categoires, on='categories_id', how='inner')

# Stem and clean the category name. The intermediate column keeps its original
# 'procceced' spelling; it is dropped before the dataset is exported.
local_categories['procceced'] = local_categories['name'].apply(process_text)

# When the text has more than just 'restaur' (e.g. 'pizza restaur'), remove
# 'restaur'; when 'restaur' is the whole text, keep it unchanged.
local_categories['procceced'] = local_categories['procceced'].apply(
    lambda x: x.replace('restaur', '') if x != 'restaur' else x
)
local_categories['procceced'] = local_categories['procceced'].astype(str)

# Path to the GoogleNews-vectors-negative300.bin file.
ruta_modelo = './datasets/extras/model/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'

# Load the model, reading at most the first 1,000,000 vectors of the file.
modelo = KeyedVectors.load_word2vec_format(ruta_modelo, binary=True, limit=1000000)

# Expansion of each distinct word, computed once: the same words appear in many
# rows and every model lookup is costly.
_expansion_por_palabra = {}


def _expandir_texto(texto):
    """Replace every word of `texto` by its similar words, keeping words without any."""
    partes = []
    for palabra in texto.split():
        if palabra not in _expansion_por_palabra:
            similares = obtener_palabras_similares(palabra, modelo)
            # The original condition (`palabra in text`) was always true, so a word
            # with no similar words was silently dropped. Keep the word itself in
            # that case, as the unreachable `else palabra` branch intended.
            _expansion_por_palabra[palabra] = ' '.join(similares) if similares else palabra
        partes.append(_expansion_por_palabra[palabra])
    return ' '.join(partes)


local_categories['processed'] = local_categories['procceced'].apply(_expandir_texto)

# Texts that ended up empty fall back to the generic 'restaur' token.
local_categories['processed'] = local_categories['processed'].apply(lambda x: 'restaur' if x == '' else x)

# Keep only the useful columns and save the dataset.
local_categories = local_categories[['business_id', 'name', 'processed']]
local_categories.to_parquet('./app/ml/datasets/locales_categories.parquet')

# Build a TF-IDF matrix to measure content similarity.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(local_categories['processed'])



In [None]:

# Persist the TF-IDF matrix so it can be loaded without recomputing it.
with open('./app/ml/tfidf_matrix.pkl', 'wb') as archivo_matriz:
    pickle.dump(tfidf_matrix, archivo_matriz)

# Recommendation model based on nearest-neighbour similarity: brute-force search
# with cosine distance and 30 neighbours by default. fit() returns the estimator
# itself, so definition and training are chained.
knn_model = NearestNeighbors(
    n_neighbors=30,
    algorithm='brute',
    metric='cosine',
).fit(tfidf_matrix)

# Save the trained model as a pickle file.
with open('./app/ml/modelo_knn.pkl', 'wb') as archivo_modelo:
    pickle.dump(knn_model, archivo_modelo)

In [4]:
# Load the category dataset for inspection.
# NOTE(review): the export cell writes './app/ml/datasets/locales_categories.parquet'
# while this cell reads './datasets/locales_categories.parquet' - confirm both
# refer to the same file (e.g. a different working directory).
df_categories = pd.read_parquet(path='./datasets/locales_categories.parquet')

In [5]:
# Rows whose category name is exactly 'fast food'.
df_categories.loc[df_categories['name'].eq('fast food')]

Unnamed: 0,business_id,name,processed
103796,aNtKyc2rr-uK5cqzY9TVQQ,fast food,quick rapidly Fast foods Food foodstuffs
103797,QjV4v7q_pt7tt3K1US7IHg,fast food,quick rapidly Fast foods Food foodstuffs
103798,CtMEJxpVMlNzFpB4PtFjfA,fast food,quick rapidly Fast foods Food foodstuffs
103799,SRexiuEsx1d9-dZVUsssQA,fast food,quick rapidly Fast foods Food foodstuffs
103800,A1lAqN3SOaBLoo6NqVcy4Q,fast food,quick rapidly Fast foods Food foodstuffs
...,...,...,...
105454,btYJ5G9Vf5j9V3XpLhkMDw,fast food,quick rapidly Fast foods Food foodstuffs
105455,J4YNKiI-NakcGbO1r5curQ,fast food,quick rapidly Fast foods Food foodstuffs
105456,BJzgkme_rfJeTgZ773pZsw,fast food,quick rapidly Fast foods Food foodstuffs
105457,qjtELTt9fdIwoi_xGNN21g,fast food,quick rapidly Fast foods Food foodstuffs


In [6]:
# Rows whose category name contains 'fast food', ignoring case.
# regex=False: the search term is a literal substring, not a pattern.
# na=False: a missing name counts as "no match" instead of producing NaN in the
# mask, which boolean indexing rejects.
df_categories[df_categories['name'].str.lower().str.contains('fast food', regex=False, na=False)]

Unnamed: 0,business_id,name,processed
15335,0x88e76652cd84272f:0x548abb9935d912ff,fast food restaurant,quick rapidly Fast foods Food foodstuffs
15336,0x88e76652cd84272f:0x548abb9935d912ff,fast food restaurant,quick rapidly Fast foods Food foodstuffs
15337,0x889381fc9fb6b7a9:0x1462d721c1f99d63,fast food restaurant,quick rapidly Fast foods Food foodstuffs
15338,0x88e8a6974147d007:0xc51b74b265ef108c,fast food restaurant,quick rapidly Fast foods Food foodstuffs
15339,0x88d9b9c2a7b16fbb:0x960bb156a4532c53,fast food restaurant,quick rapidly Fast foods Food foodstuffs
...,...,...,...
105454,btYJ5G9Vf5j9V3XpLhkMDw,fast food,quick rapidly Fast foods Food foodstuffs
105455,J4YNKiI-NakcGbO1r5curQ,fast food,quick rapidly Fast foods Food foodstuffs
105456,BJzgkme_rfJeTgZ773pZsw,fast food,quick rapidly Fast foods Food foodstuffs
105457,qjtELTt9fdIwoi_xGNN21g,fast food,quick rapidly Fast foods Food foodstuffs
