# Dependências

In [2]:
#!pip install psycopg2

# Bibliotecas

In [3]:
import pandas as pd
import numpy as np
import psycopg2 as postgres
import psycopg2.extras
import math

# Funções

In [4]:
def connect():
    """Open a PostgreSQL connection for the ITDL queries.

    Connection settings can be overridden with the standard libpq
    environment variables PGHOST, PGDATABASE, PGUSER, PGPASSWORD and PGPORT,
    so credentials do not have to live in the notebook. When a variable is
    not set, the previous local development value is used.

    Returns:
        The open psycopg2 connection, or None when psycopg2 raised an error
        while connecting (the error is printed).
    """
    import os  # local import keeps this cell runnable on its own

    conn = None
    try:
        conn = postgres.connect(
            host=os.environ.get("PGHOST", "localhost"),
            database=os.environ.get("PGDATABASE", "austin_test"),
            user=os.environ.get("PGUSER", "postgres"),
            password=os.environ.get("PGPASSWORD", "root"),
            # 8000 - LACINA PC
            # 5432 - LOCALHOST
            port=int(os.environ.get("PGPORT", 5432)))
    except postgres.Error as e:
        print(e)
    return conn

In [5]:
def closeConnection(conn):
    """Close a database connection.

    Args:
        conn: Connection returned by connect(); may be None when the
            connection attempt failed.

    Returns:
        True when the connection was closed, False when there was nothing to
        close or psycopg2 raised an error while closing (the error is printed).
    """
    # connect() returns None on failure; calling .close() on it would raise
    # an AttributeError that the psycopg2 handler below does not catch.
    if conn is None:
        return False

    sucess = False
    try:
        conn.close()
        sucess = True
    except postgres.Error as e:
        print(e)

    return sucess

In [6]:
def executeQuery(conn, sql):
    """Run a query and return all of its rows.

    Args:
        conn: Open psycopg2 connection.
        sql: Query to execute.

    Returns:
        The list produced by fetchall() on a RealDictCursor (one dict-like
        record per row), or None when psycopg2 raised an error. In that case
        the error is printed and the transaction is rolled back.
    """
    record = None
    cur = None  # stays None if cursor creation itself fails
    try:
        cur = conn.cursor(cursor_factory = psycopg2.extras.RealDictCursor)
        cur.execute(sql)
        record = cur.fetchall()
    except postgres.Error as e:
        print(e)
        # Roll back through the connection: this also works when the cursor
        # could not be created or is no longer usable.
        try:
            conn.rollback()
        except postgres.Error as rollback_error:
            print(rollback_error)
    finally:
        if cur is not None:
            cur.close()
    return record

In [7]:
def executeInsert(conn, sql):
    """Execute a data-modifying statement and commit it.

    Args:
        conn: Open psycopg2 connection.
        sql: Statement to execute.

    Returns:
        True when the statement was executed and committed, False when
        psycopg2 raised an error. In that case the error is printed and the
        transaction is rolled back.
    """
    sucess = False
    cur = None  # stays None if cursor creation itself fails

    try:
        cur = conn.cursor()
        cur.execute(sql)
        conn.commit()
        sucess = True
    except postgres.Error as e:
        print(e)
        # Roll back through the connection: this also works when the cursor
        # could not be created or is no longer usable.
        try:
            conn.rollback()
        except postgres.Error as rollback_error:
            print(rollback_error)
    finally:
        if cur is not None:
            cur.close()

    return sucess

In [8]:
def getPOIInformation(conn, business_id):
    """Fetch the check-in count and name rows stored for one POI.

    Args:
        conn: Open psycopg2 connection.
        business_id: Identifier matched against pois_information.id.

    Returns:
        Whatever executeQuery returns for the query: the rows holding
        `checkin_count` and `name`, or None when the query failed.
    """
    sql = """
        SELECT checkin_count, name FROM pois_information WHERE id  = %s
    ;"""

    # Let the driver quote the value instead of concatenating it into the
    # statement; mogrify returns the bound query as bytes, which
    # cursor.execute accepts.
    cur = conn.cursor()
    try:
        bound_sql = cur.mogrify(sql, (str(business_id),))
    finally:
        cur.close()

    result = executeQuery(conn, bound_sql)

    return result

In [9]:
def getBinPoisInformation(conn, business_id, bin_number):
    """Fetch the context POIs of one bin centred on a POI.

    Args:
        conn: Open psycopg2 connection.
        business_id: Centre POI, matched against fk_poi_id_center.
        bin_number: Bin index, matched against fk_bin_number.

    Returns:
        Whatever executeQuery returns for the query: rows holding
        `fk_poi_id_context`, `name`, `checkin_count` and `distance_m`, or
        None when the query failed.
    """
    sql = """
        SELECT fk_poi_id_context, name, checkin_count, distance_m 
        FROM bins_pois_information 
        WHERE fk_poi_id_center = %s AND fk_bin_number = %s;"""

    # Bind the values through the driver rather than concatenating them;
    # int() also turns numpy integers into a type psycopg2 can adapt.
    cur = conn.cursor()
    try:
        bound_sql = cur.mogrify(sql, (str(business_id), int(bin_number)))
    finally:
        cur.close()

    result = executeQuery(conn, bound_sql)

    return result



In [10]:
def getBinOSMInformation(conn, business_id, bin_number, materialized_view):
    """Fetch every row of one bin from an OSM bins relation.

    Args:
        conn: Open psycopg2 connection.
        business_id: Centre POI, matched against the relation's `id` column.
        bin_number: Bin index, matched against the relation's `number` column.
        materialized_view: Name of the relation to read (optionally
            schema-qualified).

    Returns:
        Whatever executeQuery returns for the query (all columns of the
        matching rows), or None when the query failed.

    Raises:
        ValueError: If `materialized_view` is not a plain SQL identifier.
    """
    import re  # local import keeps this cell runnable on its own

    # An identifier cannot be passed as a bound parameter, so it is validated
    # before being placed into the statement.
    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?", str(materialized_view)):
        raise ValueError("invalid relation name: %r" % (materialized_view,))

    sql = """
        SELECT *
        FROM """+materialized_view+"""
        WHERE id = %s AND number = %s;"""

    # Values are bound by the driver; int() also turns numpy integers into a
    # type psycopg2 can adapt.
    cur = conn.cursor()
    try:
        bound_sql = cur.mogrify(sql, (str(business_id), int(bin_number)))
    finally:
        cur.close()

    result = executeQuery(conn, bound_sql)

    return result



In [11]:
def getBinStats(conn, business_id, binRange):
    """Count the POIs, and sum their check-ins, within a distance range of a POI.

    Args:
        conn: Open psycopg2 connection.
        business_id: Reference POI, matched against
            has_distance.fk_poi_business_id_02.
        binRange: Two-element sequence; distances are selected with
            binRange[0] <= distance_m < binRange[1].

    Returns:
        Whatever executeQuery returns for the query: a single row holding
        `quantity` and `total_checkin`, or None when the query failed.
    """
    sql = """
    SELECT count(business_id) as quantity, sum(checkin_count) as total_checkin FROM poi WHERE business_id IN (
        SELECT fk_poi_business_id_01 FROM has_distance WHERE distance_m >= %s AND distance_m < %s
                                            AND fk_poi_business_id_02 = %s
    )
    """

    # Bind the values through the driver rather than concatenating them;
    # float() also turns numpy scalars into a type psycopg2 can adapt.
    cur = conn.cursor()
    try:
        bound_sql = cur.mogrify(sql, (float(binRange[0]), float(binRange[1]), str(business_id)))
    finally:
        cur.close()

    result = executeQuery(conn, bound_sql)

    return result

In [12]:
def getCategoryInformation(data, category):
    """Total the check-ins and count the entries of `data` labelled `category`.

    Args:
        data: Iterable of records laid out as [business_id, checkin, category].
        category: Label compared against the third field of each record.

    Returns:
        [checkin, occurences]: the summed check-ins and the number of
        matching records.
    """
    # Collect the check-in value of every record carrying the label
    matching_checkins = [record[1] for record in data if record[2] == category]

    return [sum(matching_checkins), len(matching_checkins)]



In [13]:
def calculateBin(df, bin_number, w=0.5):
    """Build the check-in weighted ITDL tuples of one bin and store them as parquet.

    For a centre POI taken from `df`, every context row of bin `bin_number`
    is replicated `aug` times, with

        a   = 1 - cc/sc   (cc: check-ins of the rows sharing the context label,
                           sc: check-ins of the distinct POIs in the bin)
        u   = cp/sp       (cp: rows sharing the context label,
                           sp: distinct POIs in the bin)
        aug = ceil(w * -log2(a) + (1 - w) * -log2(u))

    Rows with a <= 0 or u <= 0 are skipped because the logarithm is undefined.

    Args:
        df: DataFrame with a 'business_id' column (the centre POIs).
        bin_number: Bin index, used for the query and in the output file name.
        w: Weight of the check-in term; (1 - w) weights the occurrence term.

    Returns:
        None. When a connection could be opened, the tuples are written to
        'austin-sl-tuple-n-itdl-<bin_number>bin-wgt<w>-p.parquet'.
    """
    # Use the parameter, not a notebook-level loop variable, so the function
    # also works when called outside a `for n in ...` cell.
    print("executing bin:", bin_number, "\tweight:", w)

    # Open the communication channel with the database
    connection = connect()

    if connection is None:
        return None

    try:
        # ITDL rows keyed by a running integer
        scITDL = {}
        i = 0

        # centres
        for id_01, poi in df.iterrows():

            # Label rows (checkin_count, name) of the centre POI
            poi_information = getPOIInformation(connection, poi['business_id'])

            # Context rows (fk_poi_id_context, name, checkin_count, distance_m)
            bin_information = getBinPoisInformation(connection, poi['business_id'], bin_number)

            # A failed query (None) or an empty bin contributes nothing
            if poi_information and bin_information:

                columns = list(dict(bin_information[0]).keys())
                bin_information = pd.DataFrame(bin_information, columns = columns)

                # sc - total check-ins in the bin
                # sp - total POIs in the bin
                sp = len(bin_information['fk_poi_id_context'].unique())
                sc = bin_information.drop_duplicates(subset = 'fk_poi_id_context')['checkin_count'].sum()

                # Avoid division by zero
                if(sc != 0 and sp != 0):
                    for center_poi in poi_information: # for each centre label
                        for id_02, row in bin_information.iterrows(): # for each context row

                            # cc = all check-ins of the context label
                            # cp = all occurrences of the context label
                            same_label = bin_information['name'] == row['name']
                            cc = bin_information[same_label]['checkin_count'].sum()
                            cp = bin_information[same_label]['name'].count()

                            a = (1 - (cc/sc)) # can be 0
                            u = (cp/sp)       # can be 0

                            if((a > 0) and (u > 0)):

                                A = -np.log2(a)
                                U = -np.log2(u)

                                aug = int(math.ceil((w*A) + ((1 - w)*U)))

                                # Replicate the tuple `aug` times
                                for b in range(aug):

                                    scITDL[i] = {   'poi_id_center': poi['business_id'],
                                                    'center_poi': center_poi['name'],
                                                    'poi_id_context': row['fk_poi_id_context'],
                                                    'context_poi': row['name'], 
                                                    'distance-m': row['distance_m']}
                                    i = i + 1

            # NOTE(review): this stops after the first row of `df`, so only one
            # centre POI is processed. It looks like a debugging leftover (the
            # OSM variants have it commented out) - confirm before removing.
            break

        scITDL = pd.DataFrame.from_dict(scITDL, 'index')
        name = 'austin-sl-tuple-n-itdl-' + str(bin_number) + 'bin-wgt'+str(w)+'-p.parquet'
        scITDL.to_parquet(name, compression='brotli',  index = False)
    finally:
        # Release the connection even if a query or the write fails
        closeConnection(connection)

    return None

In [14]:
import csv
def calculateBin_Disco(df, bin_number, w=0.5):
    """Build the check-in weighted ITDL tuples of one bin, streaming them to CSV.

    Same computation as calculateBin, but each tuple is written straight to
    disk instead of being accumulated in memory. For every context row:

        a   = 1 - cc/sc   (cc: check-ins of the rows sharing the context label,
                           sc: check-ins of the distinct POIs in the bin)
        u   = cp/sp       (cp: rows sharing the context label,
                           sp: distinct POIs in the bin)
        aug = ceil(w * -log2(a) + (1 - w) * -log2(u))

    and the row is written `aug` times when a > 0 and u > 0.

    Args:
        df: DataFrame with a 'business_id' column (the centre POIs).
        bin_number: Bin index, used for the query and in the output file name.
        w: Weight of the check-in term; (1 - w) weights the occurrence term.

    Returns:
        None. Writes 'austin-sl-tuple-n-itdl-<bin_number>bin-wgt<w>-p.csv';
        the file holds only the header when no connection could be opened.
    """
    # Use the parameter, not a notebook-level loop variable, so the function
    # also works when called outside a `for n in ...` cell.
    print("executing bin:", bin_number, "\tweight:", w)

    # File the tuples are streamed to
    name = 'austin-sl-tuple-n-itdl-' + str(bin_number) + 'bin-wgt'+str(w)+'-p.csv'

    # The context manager closes the file on every exit path
    with open(name, "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["poi_id_center","center_poi","poi_id_context","context_poi","distance-m"])

        # Open the communication channel with the database
        connection = connect()

        if connection is not None:
            try:
                # centres
                for id_01, poi in df.iterrows():

                    # Label rows (checkin_count, name) of the centre POI
                    poi_information = getPOIInformation(connection, poi['business_id'])

                    # Context rows (fk_poi_id_context, name, checkin_count, distance_m)
                    bin_information = getBinPoisInformation(connection, poi['business_id'], bin_number)

                    # A failed query (None) or an empty bin contributes nothing
                    if poi_information and bin_information:

                        columns = list(dict(bin_information[0]).keys())
                        bin_information = pd.DataFrame(bin_information, columns = columns)

                        # sc - total check-ins in the bin
                        # sp - total POIs in the bin
                        sp = len(bin_information['fk_poi_id_context'].unique())
                        sc = bin_information.drop_duplicates(subset = 'fk_poi_id_context')['checkin_count'].sum()

                        # Avoid division by zero
                        if(sc != 0 and sp != 0):
                            for center_poi in poi_information: # for each centre label
                                for id_02, row in bin_information.iterrows(): # for each context row

                                    # cc = all check-ins of the context label
                                    # cp = all occurrences of the context label
                                    same_label = bin_information['name'] == row['name']
                                    cc = bin_information[same_label]['checkin_count'].sum()
                                    cp = bin_information[same_label]['name'].count()

                                    a = (1 - (cc/sc)) # can be 0
                                    u = (cp/sp)       # can be 0

                                    if((a > 0) and (u > 0)):

                                        A = -np.log2(a)
                                        U = -np.log2(u)

                                        aug = int(math.ceil((w*A) + ((1 - w)*U)))

                                        # Replicate the tuple `aug` times
                                        for b in range(aug):

                                            line = [str(poi['business_id']), str(center_poi['name']), str(row['fk_poi_id_context']), str(row['name']), str(row['distance_m'])]
                                            writer.writerow(line)

                    # NOTE(review): this stops after the first row of `df`, so
                    # only one centre POI is processed. It looks like a
                    # debugging leftover (the OSM variants have it commented
                    # out) - confirm before removing.
                    break
            finally:
                # Release the connection even if a query or a write fails
                closeConnection(connection)

    return None

In [19]:
import csv
def calculateBinOSM_Disco(df, n, materialized_view, w=0.5):
    """Build the geographic (OSM) ITDL tuples of one bin, streaming them to CSV.

    For every centre POI in `df`, the OSM features of bin `n` are read from
    `materialized_view`. Each non-empty tag value of each feature is written
    `aug` times, with

        a   = 1 - cc/sc   (cc: metric summed over the features sharing the
                           tag value, sc: metric summed over the bin)
        u   = cp/sp       (cp: features sharing the tag value,
                           sp: features in the bin)
        aug = ceil(w * -log2(a) + (1 - w) * -log2(u))

    and nothing written when a <= 0 or u <= 0. The metric is the area for
    polygons and the length for roads/lines; points have no metric, so only
    the occurrence term is used for them.

    The two polygon views are always processed together: tagged polygons from
    'bins_polygons_information' plus the aggregated building row from
    'bins_polygons_building_information'.

    Args:
        df: DataFrame with a 'business_id' column (the centre POIs).
        n: Bin index.
        materialized_view: One of the keys of `mview_to_metric` below.
        w: Weight of the metric term; (1 - w) weights the occurrence term.

    Returns:
        None. Writes the tuples to a CSV file under
        './Austin/w05/ITDL Osm Partial Bins/'; the file holds only the header
        when no connection could be opened.
    """

    def augmentation(cc, sc, cp, sp, weight):
        # Replication factor of one context label; 0 when a log term is undefined.
        a = (1 - (cc/sc)) # can be 0
        u = (cp/sp)       # can be 0
        if((a > 0) and (u > 0)):
            A = -np.log2(a)
            U = -np.log2(u)
            return int(math.ceil((weight*A) + ((1 - weight)*U)))
        return 0

    def write_tag_rows(writer, poi, center_poi, frame, tags, metric, sc, sp, weight):
        # One CSV line per (feature, tag) pair, repeated by the augmentation factor.
        for id_02, row in frame.iterrows():
            for tag in tags:
                value = row[tag]
                if (value != 'None'): # skip empty tags
                    same_value = frame[tag] == value
                    # cc = metric summed over the features sharing the tag value
                    cc = frame[same_value][metric].sum() if metric is not None else 0
                    # cp = number of features sharing the tag value
                    cp = frame[same_value][tag].count()
                    for b in range(augmentation(cc, sc, cp, sp, weight)):
                        # f-string so a non-string tag value cannot break the label
                        line = [str(poi['business_id']), str(center_poi['name']), str(row['osm_id']), f"{tag}_{value}", str(row['distance_m'])]
                        writer.writerow(line)

    print("executing bin:", n, "\tweight:", w)

    mview_to_metric = {'bins_polygons_information': 'way_area_m',
                    'bins_roads_information': 'length',
                    'bins_lines_information': 'length',
                    'bins_points_information': 'none',
                    'bins_polygons_building_information': 'area_total',
                    }

    # Presumably the bin width in metres; used to place the aggregated
    # building row at the centre of the bin - TODO confirm.
    bin_width_m = 100

    # File the tuples are streamed to
    name = './Austin/w05/ITDL Osm Partial Bins/austin-sl-tuple-n-itdl-' + str(n) + 'bin-wgt'+str(w)+"-"+materialized_view+'-p.csv'

    # The context manager closes the file on every exit path
    with open(name, "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["poi_id_center","center_poi","osm_id_context","context_osm","distance-m"])

        # Open the communication channel with the database
        connection = connect()

        if connection is None:
            return None

        try:
            if (materialized_view == 'bins_polygons_information' or materialized_view == 'bins_polygons_building_information'):
                for id_01, poi in df.iterrows():

                    # A failed query returns None; treat it like "no rows"
                    poi_information = getPOIInformation(connection, poi['business_id']) or []
                    polygons = getBinOSMInformation(connection, poi['business_id'], n, 'bins_polygons_information') or []
                    buildings = getBinOSMInformation(connection, poi['business_id'], n, 'bins_polygons_building_information') or []

                    # sc - total area in the bin
                    # sp - total features in the bin
                    sp = 0
                    sc = 0
                    tags_polygons = []

                    if (len(polygons) > 0):
                        tags_polygons = list(dict(polygons[0]).keys())
                        polygons = pd.DataFrame(polygons, columns = tags_polygons)
                        # Keep only the tag columns (drops the first two and last three)
                        tags_polygons = tags_polygons[2:len(tags_polygons)-3]
                        sp = sp + len(polygons['osm_id'].unique())
                        # The polygon frame always carries its area in
                        # 'way_area_m', whichever of the two polygon views was
                        # requested ('area_total' belongs to the building view).
                        sc = sc + polygons['way_area_m'].sum()

                    if (len(buildings) > 0):
                        buildings = pd.DataFrame(buildings, columns = list(dict(buildings[0]).keys()))
                        sp = sp + buildings.iloc[0]['building_count']
                        sc = sc + buildings.iloc[0]['area_total']

                    # Avoid division by zero
                    if(sc == 0 or sp == 0):
                        continue

                    for center_poi in poi_information: # for each centre label
                        if(len(polygons) > 0):
                            write_tag_rows(writer, poi, center_poi, polygons, tags_polygons, 'way_area_m', sc, sp, w)

                        if (len(buildings) > 0):
                            for id_02, row in buildings.iterrows(): # aggregated buildings
                                aug = augmentation(row['area_total'], sc, row['building_count'], sp, w)

                                # Aggregated buildings have no id or distance of
                                # their own: use '-1' and the centre of the bin.
                                distance = n*bin_width_m + bin_width_m//2
                                building_id = '-1'

                                for b in range(aug):
                                    line = [str(poi['business_id']), str(center_poi['name']), building_id, 'building_yes', str(distance)]
                                    writer.writerow(line)

            # Remaining views (roads, lines, points)
            else:
                for id_01, poi in df.iterrows():

                    # A failed query returns None; treat it like "no rows"
                    poi_information = getPOIInformation(connection, poi['business_id']) or []
                    bin_osm_information = getBinOSMInformation(connection, poi['business_id'], n, materialized_view) or []

                    # Nothing to do when the bin is empty
                    if (len(bin_osm_information) == 0):
                        continue

                    tags = list(dict(bin_osm_information[0]).keys())
                    bin_osm_information = pd.DataFrame(bin_osm_information, columns = tags)

                    # sp - total features in the bin
                    sp = len(bin_osm_information['osm_id'].unique())

                    if(materialized_view != 'bins_points_information'):
                        # sc - total length in the bin
                        metric = mview_to_metric[materialized_view]
                        sc = bin_osm_information[metric].sum()
                        tags = tags[2:len(tags)-3]
                        weight = w
                    else:
                        # Points have no metric: drop the metric term of the
                        # formula without touching the caller's `w`.
                        metric = None
                        sc = 1
                        weight = 0
                        tags = tags[2:len(tags)-2]

                    # Avoid division by zero
                    if(sc != 0 and sp != 0):
                        for center_poi in poi_information: # for each centre label
                            write_tag_rows(writer, poi, center_poi, bin_osm_information, tags, metric, sc, sp, weight)
        finally:
            # Release the connection even if a query or a write fails
            closeConnection(connection)

    return None

ITDL Geográfico [Todos abaixo]

In [None]:
import csv
def calculateBinOSMPolygon_Disco(df, bin_number, w=0.5):
    """Build the OSM polygon ITDL tuples of one bin, streaming them to CSV.

    For every centre POI in `df`, two result sets are combined: the tagged
    polygons of 'bins_polygons_information' and the aggregated building row
    of 'bins_polygons_building_information'. Each non-empty tag value (and
    the aggregated 'building_yes' label) is written `aug` times, with

        a   = 1 - cc/sc   (cc: area of the features sharing the label,
                           sc: total area in the bin)
        u   = cp/sp       (cp: features sharing the label,
                           sp: features in the bin)
        aug = ceil(w * -log2(a) + (1 - w) * -log2(u))

    and nothing written when a <= 0 or u <= 0.

    Args:
        df: DataFrame with a 'business_id' column (the centre POIs).
        bin_number: Bin index, used for the queries and the output file name.
        w: Weight of the area term; (1 - w) weights the occurrence term.

    Returns:
        None. Writes the tuples to a CSV file under
        './Austin/w05/ITDL Osm Partial Bins/'; the file holds only the header
        when no connection could be opened.
    """
    # Use the parameter, not a notebook-level loop variable, so the function
    # also works when called outside a `for n in ...` cell.
    print("executing bin:", bin_number, "\tweight:", w)

    # Presumably the bin width in metres; used to place the aggregated
    # building row at the centre of the bin - TODO confirm.
    h = 100

    # File the tuples are streamed to
    name = './Austin/w05/ITDL Osm Partial Bins/austin-sl-tuple-n-itdl-' + str(bin_number) + 'bin-wgt'+str(w)+'-bins_polygons_information-p.csv'

    # The context manager closes the file on every exit path
    with open(name, "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["poi_id_center","center_poi","osm_id_context","context_osm","distance-m"])

        # Open the communication channel with the database
        connection = connect()

        if connection is None:
            return None

        try:
            for id_01, poi in df.iterrows():

                # A failed query returns None; treat it like "no rows"
                poi_information = getPOIInformation(connection, poi['business_id']) or []
                polygons = getBinOSMInformation(connection, poi['business_id'], bin_number, 'bins_polygons_information') or []
                buildings = getBinOSMInformation(connection, poi['business_id'], bin_number, 'bins_polygons_building_information') or []

                # sc - total area
                # sp - total polygons in the bin
                sp = 0
                sc = 0
                tags_polygons = []

                if (len(polygons) > 0):
                    tags_polygons = list(dict(polygons[0]).keys())
                    polygons = pd.DataFrame(polygons, columns = tags_polygons)
                    # Keep only the tag columns (drops the first two and last three)
                    tags_polygons = tags_polygons[2:len(tags_polygons)-3]
                    sp = sp + len(polygons['osm_id'].unique())
                    sc = sc + polygons['way_area_m'].sum()

                if (len(buildings) > 0):
                    buildings = pd.DataFrame(buildings, columns = list(dict(buildings[0]).keys()))
                    sp = sp + buildings.iloc[0]['building_count']
                    sc = sc + buildings.iloc[0]['area_total']

                # Avoid division by zero
                if(sc == 0 or sp == 0):
                    continue

                for center_poi in poi_information: # for each centre label

                    # Co-occurrence with polygons that are not plain buildings
                    if(len(polygons) > 0):
                        for id_02, row in polygons.iterrows():
                            for tag in tags_polygons:

                                value = row[tag]
                                if (value != 'None'): # skip empty tags

                                    # cc = total area of the tag value
                                    # cp = occurrences of the tag value
                                    same_value = polygons[tag] == value
                                    cc = polygons[same_value]['way_area_m'].sum()
                                    cp = polygons[same_value][tag].count()

                                    a = (1 - (cc/sc)) # can be 0
                                    u = (cp/sp)       # can be 0

                                    if((a > 0) and (u > 0)):

                                        A = -np.log2(a)
                                        U = -np.log2(u)

                                        aug = int(math.ceil((w*A) + ((1 - w)*U)))

                                        # f-string so a non-string tag value cannot break the label
                                        label = f"{tag}_{value}"

                                        # Replicate the tuple `aug` times
                                        for b in range(aug):
                                            line = [str(poi['business_id']), str(center_poi['name']), str(row['osm_id']), label, str(row['distance_m'])]
                                            writer.writerow(line)

                    # Co-occurrence with polygons that are plain buildings
                    if (len(buildings) > 0):
                        for id_02, row in buildings.iterrows():

                            cc = row['area_total']
                            cp = row['building_count']

                            a = (1 - (cc/sc)) # can be 0
                            u = (cp/sp)       # can be 0

                            if((a > 0) and (u > 0)):

                                A = -np.log2(a)
                                U = -np.log2(u)

                                aug = int(math.ceil((w*A) + ((1 - w)*U)))

                                # Aggregated buildings have no id or distance of
                                # their own: use '-1' and the centre of the bin.
                                distance = bin_number*h + (h/2)
                                building_id = '-1'

                                # Replicate the tuple `aug` times
                                for b in range(aug):
                                    line = [str(poi['business_id']), str(center_poi['name']), building_id, 'building_yes', str(distance)]
                                    writer.writerow(line)
        finally:
            # Release the connection even if a query or a write fails
            closeConnection(connection)

    return None

In [None]:
import csv
def calculateBinOSMRoadsLines_Disco(df, bin_number, roads=True, w=0.5):
    """Build the OSM road or line ITDL tuples of one bin, streaming them to CSV.

    For every centre POI in `df`, the features of bin `bin_number` are read
    from 'bins_roads_information' (roads=True) or 'bins_lines_information'
    (roads=False). Each non-empty tag value of each feature is written `aug`
    times, with

        a   = 1 - cc/sc   (cc: length of the features sharing the tag value,
                           sc: total length in the bin)
        u   = cp/sp       (cp: features sharing the tag value,
                           sp: features in the bin)
        aug = ceil(w * -log2(a) + (1 - w) * -log2(u))

    and nothing written when a <= 0 or u <= 0.

    Args:
        df: DataFrame with a 'business_id' column (the centre POIs).
        bin_number: Bin index, used for the query and the output file name.
        roads: Selects the roads view when True, the lines view otherwise.
        w: Weight of the length term; (1 - w) weights the occurrence term.

    Returns:
        None. Writes the tuples to a CSV file under
        './Austin/w05/ITDL Osm Partial Bins/'; the file holds only the header
        when no connection could be opened.
    """
    if(roads):
        materialized_view = 'bins_roads_information'
    else:
        materialized_view = 'bins_lines_information'

    # Use the parameter, not a notebook-level loop variable, so the function
    # also works when called outside a `for n in ...` cell.
    print("executing bin:", bin_number, "\tweight:", w)

    # File the tuples are streamed to
    name = './Austin/w05/ITDL Osm Partial Bins/austin-sl-tuple-n-itdl-' + str(bin_number) + 'bin-wgt'+str(w)+"-"+materialized_view+'-p.csv'

    # The context manager closes the file on every exit path
    with open(name, "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["poi_id_center","center_poi","osm_id_context","context_osm","distance-m"])

        # Open the communication channel with the database
        connection = connect()

        if connection is None:
            return None

        try:
            # centres
            for id_01, poi in df.iterrows():

                # A failed query returns None; treat it like "no rows"
                poi_information = getPOIInformation(connection, poi['business_id']) or []
                bin_osm_information = getBinOSMInformation(connection, poi['business_id'], bin_number, materialized_view) or []

                # Nothing to do when the bin is empty
                if (len(bin_osm_information) == 0):
                    continue

                tags = list(dict(bin_osm_information[0]).keys())
                bin_osm_information = pd.DataFrame(bin_osm_information, columns = tags)

                # Keep only the tag columns (drops the first two and last three)
                tags = tags[2:len(tags)-3]

                # sc - total length
                # sp - total roads/lines in the bin
                sp = len(bin_osm_information['osm_id'].unique())
                sc = bin_osm_information['length'].sum()

                # Avoid division by zero
                if(sc == 0 or sp == 0):
                    continue

                for center_poi in poi_information: # for each centre label
                    for id_02, row in bin_osm_information.iterrows(): # for each feature
                        for tag in tags:

                            value = row[tag]
                            if (value != 'None'): # skip empty tags

                                # cc = total length of the tag value
                                # cp = occurrences of the tag value
                                same_value = bin_osm_information[tag] == value
                                cc = bin_osm_information[same_value]['length'].sum()
                                cp = bin_osm_information[same_value][tag].count()

                                a = (1 - (cc/sc)) # can be 0
                                u = (cp/sp)       # can be 0

                                if((a > 0) and (u > 0)):

                                    A = -np.log2(a)
                                    U = -np.log2(u)

                                    aug = int(math.ceil((w*A) + ((1 - w)*U)))

                                    # f-string so a non-string tag value cannot break the label
                                    label = f"{tag}_{value}"

                                    # Replicate the tuple `aug` times
                                    for b in range(aug):
                                        line = [str(poi['business_id']), str(center_poi['name']), str(row['osm_id']), label, str(row['distance_m'])]
                                        writer.writerow(line)
        finally:
            # Release the connection even if a query or a write fails
            closeConnection(connection)

    return None

In [None]:
import csv
def calculateBinOSMPoints_Disco(df, bin_number):
    """Build the OSM point ITDL tuples of one bin, streaming them to CSV.

    For every centre POI in `df`, the point features of bin `bin_number` are
    read from 'bins_points_information'. Points carry no area or length, so
    only the occurrence term is used: each non-empty tag value is written
    `aug` times, with

        u   = cp/sp       (cp: features sharing the tag value,
                           sp: features in the bin)
        aug = ceil(-log2(u))

    and nothing written when u <= 0.

    Args:
        df: DataFrame with a 'business_id' column (the centre POIs).
        bin_number: Bin index, used for the query and the output file name.

    Returns:
        None. Writes the tuples to a CSV file; the file holds only the header
        when no connection could be opened.
    """
    # Use the parameter, not a notebook-level loop variable, so the function
    # also works when called outside a `for n in ...` cell.
    print("executing bin:", bin_number)

    # File the tuples are streamed to.
    # NOTE(review): the directory 'ITDL Osm Partial Bins' appears twice in this
    # path, unlike the sibling functions - confirm this is the intended folder.
    name = './Austin/w05/ITDL Osm Partial Bins/ITDL Osm Partial Bins/austin-sl-tuple-n-itdl-' + str(bin_number) + '-bins_points_information-p.csv'

    # The context manager closes the file on every exit path
    with open(name, "w", newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        writer.writerow(["poi_id_center","center_poi","osm_id_context","context_osm","distance-m"])

        # Open the communication channel with the database
        connection = connect()

        if connection is None:
            return None

        try:
            for id_01, poi in df.iterrows():

                # A failed query returns None; treat it like "no rows"
                poi_information = getPOIInformation(connection, poi['business_id']) or []
                bin_osm_information = getBinOSMInformation(connection, poi['business_id'], bin_number, 'bins_points_information') or []

                # Nothing to do when the bin is empty
                if (len(bin_osm_information) == 0):
                    continue

                tags = list(dict(bin_osm_information[0]).keys())
                bin_osm_information = pd.DataFrame(bin_osm_information, columns = tags)

                # Keep only the tag columns (drops the first two and last two)
                tags = tags[2:len(tags)-2]

                # sp - total points in the bin
                sp = len(bin_osm_information['osm_id'].unique())

                # Avoid division by zero
                if(sp == 0):
                    continue

                for center_poi in poi_information: # for each centre label
                    for id_02, row in bin_osm_information.iterrows(): # for each feature
                        for tag in tags:

                            value = row[tag]
                            if (value != 'None'): # skip empty tags

                                cp = bin_osm_information[bin_osm_information[tag] == value][tag].count()
                                u = (cp/sp)       # can be 0

                                if(u > 0):

                                    U = -np.log2(u)
                                    aug = int(math.ceil(U))

                                    # f-string so a non-string tag value cannot break the label
                                    label = f"{tag}_{value}"

                                    # Replicate the tuple `aug` times
                                    for b in range(aug):
                                        line = [str(poi['business_id']), str(center_poi['name']), str(row['osm_id']), label, str(row['distance_m'])]
                                        writer.writerow(line)
        finally:
            # Release the connection even if a query or a write fails
            closeConnection(connection)

    return None

# Geração do ITDL

In [15]:
# Load the Austin POIs (Yelp data) used as centres, reporting the shape
# before and after rows with missing values are dropped.
pois_file_name = './Austin/austin-ml-updated.csv'
pois_raw = pd.read_csv(pois_file_name)
print(pois_raw.shape)

# `df` is the cleaned frame every later cell works on
df = pois_raw.dropna()
print(df.shape)
df.head()

(22399, 7)
(22399, 7)


Unnamed: 0,business_id,city,state,latitude,longitude,categories,checkin_count
0,N3_Gs3DnX4k9SgpwJxdEfw,Austin,TX,30.346169,-97.711458,"Shopping, Jewelry Repair, Appraisal Services, ...",14
1,tXvdYGvlEceDljN8gt2_3Q,Austin,TX,30.172706,-97.79992,"Barbers, Beauty & Spas",1
2,nTIhpR7MhsALPwg_Hh14EA,Austin,TX,30.326377,-97.704543,"Hotels, Hotels & Travel, Event Planning & Serv...",475
3,8XyEpVdAO0o6iVkVxkWosQ,Austin,TX,30.246465,-97.778738,"Home Services, Real Estate, Property Management",0
4,NVfOn7TdnHbaGH97CVB_Qg,Austin,TX,30.244902,-97.857409,"Chiropractors, Health & Medical",33


Iterative ITDL

In [21]:
# Sequential run: one calculateBin call per bin index.
# NOTE(review): keep the loop variable named `n` unless every calculate*
# helper has been checked - some versions read a notebook-level `n`.
for n in range(1):
    res = calculateBin(df, n, w=0.5)

executing bin: 0 	weight: 0.5


In [None]:
# Sequential run of the OSM polygon ITDL for bins 0..10.
# NOTE(review): keep the loop variable named `n` unless every calculate*
# helper has been checked - some versions read a notebook-level `n`.
for n in range(11):
    calculateBinOSMPolygon_Disco(df, n, w=0.5)
    # Alternative views, enable as needed:
    #calculateBinOSM_Disco(df, n, 'bins_points_information', 0.5)
    #calculateBinOSM_Disco(df, n, 'bins_lines_information', 0.5)
    #calculateBinOSM_Disco(df, n, 'bins_roads_information', 0.5)

executing bin: 0 	weight: 0.5
executing bin: 1 	weight: 0.5


In [88]:
# Sanity check: reload one generated parquet file and look at its first rows
parquet_path = 'austin-sl-tuple-n-itdl-0bin-wgt0.7-p.parquet'
test = pd.read_parquet(parquet_path)
print(test.shape)
test.head()

(2291, 3)


Unnamed: 0,centerPoI,contextPoI,distance-m
0,Shopping,Screen Printing,13.1
1,Shopping,Screen Printing,13.1
2,Shopping,Screen Printing,13.1
3,Shopping,Screen Printing/T-Shirt Printing,13.1
4,Shopping,Screen Printing/T-Shirt Printing,13.1


Parallel ITDL

In [None]:
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

# Bins to process; each one becomes an independent task
bins = range(6, 11)
#bins = [0]

# One worker process per CPU. starmap blocks until every bin is finished, and
# the context manager tears the pool down afterwards even if a worker raises,
# so no worker processes are left behind.
with mp.Pool(mp.cpu_count()) as pool:
    pool.starmap(calculateBinOSM_Disco, [(df, n, 'bins_polygons_information', 0.5) for n in bins])

Number of processors:  16
executing bin: 6 	weight: 0.5
executing bin: 7 	weight: 0.5
executing bin: 8 	weight: 0.5
executing bin: 9 	weight: 0.5
executing bin: 10 	weight: 0.5


In [16]:
import multiprocessing as mp

# Report how many worker processes this machine can run in parallel
cpu_total = mp.cpu_count()
print("Number of processors: ", cpu_total)

Number of processors:  8
