In [52]:
import polars as pl

In [53]:

def OpenFile(file_path,file_specification, nrows_dtype_check):
    '''
    Lazily open a CSV file using dtypes taken from a small preliminary scan.

    A first, row-limited ``scan_csv`` is used only to obtain a schema. In
    that schema the 'QTD_UNIDADE_FARMACOTECNICA' column is forced to
    ``pl.Utf8``. The resulting column -> dtype mapping is then passed as
    ``dtypes`` to a second ``scan_csv`` over the same file, which is returned.

    Parameters
    ----------
    file_path : str
        Leading part of the path.
    file_specification : str
        Trailing part of the path. It is appended to ``file_path`` verbatim
        (no separator is inserted), so callers can vary it inside a loop.
    nrows_dtype_check : int
        Row limit of the preliminary scan. It is passed both as ``n_rows``
        and as ``infer_schema_length`` so that the dtype inference really
        looks at this many rows instead of the library default.

    Returns
    -------
    LazyFrame
        Lazy scan of the file with the dtypes found by the preliminary scan.
    '''
    full_path = file_path + file_specification

    sample_scan = pl.scan_csv(
        full_path,
        n_rows=nrows_dtype_check,
        infer_schema_length=nrows_dtype_check,
    )

    # `with_columns` (already used elsewhere in this notebook) replaces the
    # deprecated singular `with_column`.
    dictionary = (
        sample_scan
        .with_columns(
            [
                pl.col('QTD_UNIDADE_FARMACOTECNICA').cast(pl.Utf8)
            ]
        )
        .schema
    )

    return pl.scan_csv(full_path, dtypes=dictionary)

In [54]:
def PrincipioAtivo(df,line_limite=1000000000):
    '''
    Count rows per ('PRINCIPIO_ATIVO', 'UF_VENDA', 'DATA') and return the
    largest groups.

    Steps, in order:
      1. drop the columns that are not needed for the aggregation;
      2. build 'DATA' by concatenating 'MES_VENDA' and 'ANO_VENDA'
         (month followed by year, with no separator);
      3. group by 'PRINCIPIO_ATIVO', 'UF_VENDA' and 'DATA' and count the
         rows of each group;
      4. sort by 'count' and then 'PRINCIPIO_ATIVO', both descending;
      5. keep the first ``line_limite`` rows and collect the query.

    ``df`` must support ``.collect()`` at the end of the chain (a lazy
    frame); the collected result is returned.
    '''
    unused_columns = [
        'CID10', 'TIPO_RECEITUARIO', 'SEXO', 'IDADE', 'UNIDADE_IDADE',
        'CONSELHO_PRESCRITOR', 'UF_CONSELHO_PRESCRITOR',
    ]
    group_keys = ['PRINCIPIO_ATIVO', 'UF_VENDA', 'DATA']
    sort_keys = ['count', 'PRINCIPIO_ATIVO']

    trimmed = df.drop(unused_columns)
    dated = trimmed.with_columns(
        [pl.concat_str(['MES_VENDA', 'ANO_VENDA']).alias('DATA')]
    )
    counted = dated.groupby(group_keys).agg([pl.count()])
    ranked = counted.sort(sort_keys, reverse=True)

    return ranked.limit(line_limite).collect()


In [76]:
def PrincipioAtivoPorSubstancias(df, substance,line_limite=1000000000):
    '''
    Same aggregation as ``PrincipioAtivo``, restricted to selected substances.

    Only the rows whose 'PRINCIPIO_ATIVO' value contains a match for
    ``substance`` are kept; the remaining work (dropping unused columns,
    building 'DATA', grouping, counting, sorting and limiting) is delegated
    to ``PrincipioAtivo`` so the two functions cannot drift apart.

    Parameters
    ----------
    df : lazy frame
        Input accepted by ``PrincipioAtivo``.
    substance : str
        Pattern handed to ``str.contains`` with ``literal=False``, i.e. it is
        interpreted as a regular expression (for example several names joined
        with '|'). A match anywhere inside the value is enough.
    line_limite : int
        Maximum number of rows returned.

    Returns
    -------
    The collected frame produced by ``PrincipioAtivo`` for the filtered rows.
    '''
    # The filter only reads 'PRINCIPIO_ATIVO', which PrincipioAtivo does not
    # drop, so filtering before the drop gives the same rows as after it.
    matching_rows = df.filter(
        pl.col('PRINCIPIO_ATIVO').str.contains(substance, literal=False)
    )
    return PrincipioAtivo(matching_rows, line_limite)

# Generate the DataFrame for all substances

In [56]:
# Build the aggregated frame for all substances, one file per year.
years = list(range(2014,2022))  # 2014 through 2021 inclusive

appended_data = list()

for year in years:

    # Raw strings: '\D' and '\B' are not valid escape sequences, so the
    # non-raw literals only worked by accident and trigger a warning on
    # recent Python versions. The resulting text is unchanged.
    file = r'\DB-Anvisa-'+str(year)+'.csv'

    aux = OpenFile(r'D:\Base_de_dados_anvisa',file,100)

    # Keep only the 400 largest groups of each year.
    df_substance = PrincipioAtivo(aux,400)

    appended_data.append(df_substance)


# Stack the yearly results into a single frame.
df = pl.concat(appended_data,how='vertical')
df


PRINCIPIO_ATIVO,UF_VENDA,DATA,count
str,str,str,u32
"""ESTOLATO DE ER...","""SP""","""102014""",5948
"""CLORIDRATO DE ...","""SP""","""102014""",5885
"""ESTOLATO DE ER...","""SP""","""72014""",5809
"""CLORIDRATO DE ...","""SP""","""92014""",5696
"""CLORIDRATO DE ...","""SP""","""72014""",5618
"""ESTOLATO DE ER...","""SP""","""92014""",5588
"""CLORIDRATO DE ...","""SP""","""122014""",5549
"""CLORIDRATO DE ...","""SP""","""112014""",5518
"""CLORIDRATO DE ...","""SP""","""82014""",5505
"""ESTOLATO DE ER...","""SP""","""52014""",5494


In [57]:
# Raw string: '\B' and '\y' are not valid escape sequences in a normal literal.
df.write_csv(r'D:\Base_de_dados_anvisa\years_active principle.csv')

# Generate the DataFrame filtered by the top substances

In [77]:
def generate_unique_string(strings):
    """Join the given strings into a single string separated by '|'."""
    separator = "|"
    joined = separator.join(strings)
    return joined

In [80]:

# Total count per substance over the stacked yearly frame, largest first.
df_total = df.groupby(['PRINCIPIO_ATIVO']).agg(pl.sum('count')).sort(by='count', reverse=True)

# After the groupby every substance appears exactly once, so the first ten
# rows are already distinct. Taking them directly keeps the rank order; the
# previous extra `.unique()` was redundant and does not guarantee any order.
top_substances = df_total.head(10)['PRINCIPIO_ATIVO'].to_list()

# NOTE(review): the joined string is later used as a regular expression;
# a name containing regex metacharacters would need escaping first.
substances = generate_unique_string(top_substances)
substances

'CLORIDRATO DE FLUOXETINA|TOPIRAMATO|CLORIDRATO DE TRAMADOL|GABAPENTINA|CLORIDRATO DE AMITRIPTILINA|ESTOLATO DE ERITROMICINA|CLORIDRATO DE SERTRALINA|TESTOSTERONA|CLORIDRATO DE BUPROPIONA|CLORIDRATO DE PAROXETINA'

In [81]:
# Build the aggregated frame restricted to the selected substances,
# one file per year.
years = list(range(2014,2022))  # 2014 through 2021 inclusive

appended_data = list()

for year in years:

    # Raw strings: '\D' and '\B' are not valid escape sequences, so the
    # non-raw literals only worked by accident and trigger a warning on
    # recent Python versions. The resulting text is unchanged.
    file = r'\DB-Anvisa-'+str(year)+'.csv'

    aux = OpenFile(r'D:\Base_de_dados_anvisa',file,100)

    # No explicit row limit here: the function default is used.
    df_substance = PrincipioAtivoPorSubstancias(aux,substances)

    appended_data.append(df_substance)


# Stack the yearly results into a single frame.
df_sub = pl.concat(appended_data,how='vertical')
df_sub

PRINCIPIO_ATIVO,UF_VENDA,DATA,count
str,str,str,u32
"""ESTOLATO DE ER...","""SP""","""102014""",5948
"""CLORIDRATO DE ...","""SP""","""102014""",5885
"""ESTOLATO DE ER...","""SP""","""72014""",5809
"""CLORIDRATO DE ...","""SP""","""92014""",5696
"""CLORIDRATO DE ...","""SP""","""72014""",5618
"""ESTOLATO DE ER...","""SP""","""92014""",5588
"""CLORIDRATO DE ...","""SP""","""122014""",5549
"""CLORIDRATO DE ...","""SP""","""112014""",5518
"""CLORIDRATO DE ...","""SP""","""82014""",5505
"""ESTOLATO DE ER...","""SP""","""52014""",5494


In [82]:
# Raw string: '\B' and '\T' are not valid escape sequences in a normal literal.
df_sub.write_csv(r'D:\Base_de_dados_anvisa\Top10_substances.csv')