# Grupo de categorías de los restaurantes

In [10]:
import pandas as pd

In [11]:
def get_groups(df):
    """Label every category row with a coarse restaurant group.

    The word 'restaurant' is removed from each entry of ``df['name']`` (a name
    that is exactly 'restaurant' is kept as is) and a ``'group'`` column is
    added. Groups are chosen by case-sensitive substring matches on the
    cleaned name. Rules are applied in a fixed order and a later rule
    overwrites an earlier one, so 'veggie & vegetarian' wins over everything
    else and 'general' is the fallback when nothing matches.

    Args:
        df: DataFrame with a ``'name'`` column holding category names. It is
            modified in place: ``'name'`` is rewritten and ``'group'`` is
            added (or overwritten).

    Returns:
        The same DataFrame object that was passed in.
    """
    # Drop the word 'restaurant' from names, except when it is the whole name.
    # Non-string entries (e.g. missing values) are passed through untouched.
    df['name'] = df['name'].apply(
        lambda x: x.replace('restaurant', '')
        if isinstance(x, str) and x != 'restaurant' else x
    )

    # Default group for every row; the rules below overwrite it on a match.
    df['group'] = 'general'

    names = df['name']

    def has_any(*fragments):
        # True where the name contains at least one of the literal fragments.
        # Missing names never match.
        mask = pd.Series(False, index=names.index)
        for fragment in fragments:
            mask |= names.str.contains(fragment, regex=False, na=False)
        return mask

    # Ordered (label, mask) rules: later entries take precedence.
    # NOTE(review): 'coffess & breakfast' looks misspelled, but the label is
    # kept unchanged because it ends up in the stored group tables; confirm
    # with consumers before renaming it.
    rules = [
        ('coffess & breakfast', has_any('caf', 'cof', 'brea', 'tea')),
        # 'bar' must not be part of 'barb' (barbecue, barber, ...).
        ('bars & nightlife',
         (has_any('bar') & ~has_any('barb')) | has_any('nigh', 'pub')),
        # 'fast' must not come from 'breakfast'.
        ('fast food',
         (has_any('fast') & ~has_any('break'))
         | has_any('burg', 'pizza', 'sandw', 'hot dog', 'takeou')),
        # 'sushi' was previously spelled 'suhi' and therefore never matched.
        ('foreign',
         has_any('sushi', 'asian', 'japa', 'kore', 'mexi', 'eth', 'falafel',
                 'chilean', 'mongolian', 'polish', 'italian', 'british')),
        ('veggie & vegetarian', has_any('veg')),
    ]
    for label, mask in rules:
        df.loc[mask, 'group'] = label

    return df

In [12]:
# Load the category catalogue and tag every category with its group.
# get_groups rewrites 'name' and adds the 'group' column on the same frame.
categories = pd.read_parquet('../datasets/processed/bd/2_categories.parquet.gz')
categories = get_groups(categories)
# Bare expression: displays the resulting table as the cell output.
categories

Unnamed: 0,categories_id,name,group
0,0,restaurant,general
1,1,sandwich shop,fast food
2,2,caterer,general
3,3,fast food,fast food
4,4,takeout,fast food
...,...,...,...
1292,1292,botanical gardens,general
1293,1293,newspapers & magazines,general
1294,1294,parking,general
1295,1295,georgian,general


# Yelp

In [13]:
# Load the processed Yelp tables: the business table and the table linking
# each business_id to its categories_id values.
# NOTE(review): df_by is not used by the cells visible here; confirm whether it is still needed.
df_by = pd.read_parquet('../datasets/processed/bd/6_business_yelp.parquet.gz')
df_cy = pd.read_parquet('../datasets/processed/bd/8_categories_yelp.parquet.gz')

In [14]:
# Attach the group of every category to the Yelp business/category pairs,
# then keep only the business id and its group.
aux = pd.merge(df_cy, categories, on='categories_id')
aux = aux.drop(columns=['categories_id', 'name'])
aux

Unnamed: 0,business_id,group
0,eEOYSgkmpB90uNA7lDOMRA,general
1,Yh_KhyVD6ZBwsIQQ1wSUpw,general
2,7EhTT4iEuA7JaaWA-eI3Qw,general
3,wXvbqjS9g9461rnxSCqcag,general
4,o2sdqQ7e4IzeBWxUo_-tFQ,general
...,...,...
61864,vV57YWbrHqm1iylWmIdwVA,general
61865,y7P06O7ypUgdU5fgV4s87Q,general
61866,jchZL8NJP9YBwgFmrSdpfQ,general
61867,9295K2aKOltQSSTpm_7DBQ,general


In [15]:
# Build one (business_id, grupos) row per distinct group of each business.
# Businesses are visited in order of first appearance in `aux` and the groups
# of a business keep their first-appearance order. A single groupby pass
# replaces the per-business filter + pd.concat loop, which rescanned the
# whole frame and re-copied the result on every iteration.
filas = []
for negocio, grupos in aux.groupby('business_id', sort=False)['group']:
    # Distinct groups of this business, order preserved.
    unicos = list(dict.fromkeys(grupos))

    # 'general' is only a fallback: drop it when the business has any
    # more specific group.
    if len(unicos) != 1 and 'general' in unicos:
        unicos.remove('general')

    filas.extend((negocio, grupo) for grupo in unicos)

df_grupos_categoria = pd.DataFrame(filas, columns=['business_id', 'grupos'])

df_grupos_categoria

Unnamed: 0,business_id,grupos
0,eEOYSgkmpB90uNA7lDOMRA,general
1,Yh_KhyVD6ZBwsIQQ1wSUpw,general
2,7EhTT4iEuA7JaaWA-eI3Qw,fast food
3,7EhTT4iEuA7JaaWA-eI3Qw,coffess & breakfast
4,7EhTT4iEuA7JaaWA-eI3Qw,foreign
...,...,...
17814,FBTKjIHyMk8V4frov04ClQ,fast food
17815,FBTKjIHyMk8V4frov04ClQ,foreign
17816,6E3Z4yiajuHAd_NKBPiROA,general
17817,A93xUmG7sz4o6XewUnrO5A,general


In [16]:
# Persist the Yelp business-to-group table as a gzip-compressed parquet file.
df_grupos_categoria.to_parquet('../datasets/processed/bd/12_grupo_de_categorias_yelp.parquet.gz', compression='gzip')

# Google

In [17]:
# Load the processed Google tables: the business table and the table linking
# each gmap_id to its categories_id values.
# NOTE(review): df_bg is not used by the cells visible here; confirm whether it is still needed.
df_bg = pd.read_parquet('../datasets/processed/bd/5_business_google.parquet.gz')
df_cg = pd.read_parquet('../datasets/processed/bd/7_categories_google.parquet.gz')

In [18]:
# Attach the group of every category to the Google business/category pairs,
# then keep only the business id (gmap_id) and its group.
aux = pd.merge(df_cg, categories, on='categories_id')
aux = aux.drop(columns=['categories_id', 'name'])
aux

Unnamed: 0,gmap_id,group
0,0x88d9ba5d65937567:0xbc27649cf513cc89,general
1,0x88d900575f0dd065:0x9b3638d2a80be4d,general
2,0x88d9b719170a9f61:0x30d454a980f76ad3,general
3,0x88e76652cd84272f:0x548abb9935d912ff,general
4,0x88d9b395c6179c89:0x914b0aafb453b3b5,general
...,...,...
70302,0x89c0bc82a804da0b:0xe9466fa5a11862a3,general
70303,0x89c1846c5f430859:0x69ba082723f8b32d,general
70304,0x89c1026295b55cb7:0x24756dd82d799a5d,bars & nightlife
70305,0x89c3ccd5de4abc17:0x93a5267ffe05b932,general


In [19]:
# Build one (gmap_id, grupos) row per distinct group of each business.
# Businesses are visited in order of first appearance in `aux` and the groups
# of a business keep their first-appearance order. A single groupby pass
# replaces the per-business filter + pd.concat loop, which rescanned the
# whole frame and re-copied the result on every iteration.
filas = []
for negocio, grupos in aux.groupby('gmap_id', sort=False)['group']:
    # Distinct groups of this business, order preserved.
    unicos = list(dict.fromkeys(grupos))

    # 'general' is only a fallback: drop it when the business has any
    # more specific group.
    if len(unicos) != 1 and 'general' in unicos:
        unicos.remove('general')

    filas.extend((negocio, grupo) for grupo in unicos)

df_grupos_categoria = pd.DataFrame(filas, columns=['gmap_id', 'grupos'])

df_grupos_categoria

Unnamed: 0,gmap_id,grupos
0,0x88d9ba5d65937567:0xbc27649cf513cc89,general
1,0x88d900575f0dd065:0x9b3638d2a80be4d,general
2,0x88d9b719170a9f61:0x30d454a980f76ad3,general
3,0x88e76652cd84272f:0x548abb9935d912ff,fast food
4,0x88d9b395c6179c89:0x914b0aafb453b3b5,general
...,...,...
29178,0x89c24d40ddc47bbb:0x3de9669d68298809,general
29179,0x89c2fc8ab9397b95:0xe80c4188d7096d17,general
29180,0x89c3bf9cdff6bf5b:0x26dd7524a7f175f2,general
29181,0x89c3b34b43564f0b:0x6e1327c338d488bf,general


In [20]:
# Persist the Google business-to-group table as a gzip-compressed parquet file.
df_grupos_categoria.to_parquet('../datasets/processed/bd/11_grupo_de_categorias_google.parquet.gz', compression='gzip')