In [38]:
import pandas as pd
import numpy as np
import aiohttp
import asyncio
import base64
import time
from bs4 import BeautifulSoup
from urllib.parse import quote

In [61]:
ano = 2021

df_edicao = pd.read_csv(f'./concept_drift/tabelas_finais/tabela_final_{ano}.csv')
codigo_cursos = df_edicao["Codigo_do_Curso"].unique().tolist()

dict_codigo_ce = {
    codigo: 0
    for codigo
    in codigo_cursos
}

In [62]:
async def fetch(session, codigo, timeout):
    utf8_codigo_curso = str(codigo).encode("utf-8")
    base64_codigo_curso = base64.b64encode(utf8_codigo_curso).decode("utf-8")
    base64_utf8_codigo_curso = quote(base64_codigo_curso, encoding="utf-8")
    #print(base64_utf8_codigo_curso)
        
    async with session.get(f'https://emec.mec.gov.br/emec/consulta-curso/listar-historico-indicadores-curso/c1999930082674af6577f0c513f05a96/{base64_utf8_codigo_curso}/list/10', 
                           timeout=timeout) as response:
        html_table = await response.text()
        soup = BeautifulSoup(html_table, 'html.parser')

        rows = soup.find('tbody').find_all('tr')

        row_edicao = None
        for row in rows:
            if row.find('td', text=f'{ano}'):
                row_edicao = row
                break

        if row_edicao:
            enade_edicao = row_edicao.find_all('td')[1].text
            dict_codigo_ce[codigo] = enade_edicao

async def fetch_all():
    result = []
    timeout = aiohttp.ClientTimeout(total=150)
    conn = aiohttp.TCPConnector(limit=10, limit_per_host=10)
    
    #codigo_cursos = lista_codigo_curso[1384:1605]
    # codigo_cursos = [73438, 1166493, 1169549, 1170053, 1170480, 1174506, 1179304, 1180251, 1180846]

    async with aiohttp.ClientSession(timeout=timeout, connector=conn) as session:
        await asyncio.gather(*[fetch(session, codigo, timeout) for codigo in codigo_cursos], return_exceptions=True)
        #return result

In [63]:
await fetch_all()

In [64]:
len(dict_codigo_ce)

1505

In [65]:
print(dict_codigo_ce)

{36: '3', 127: '4', 317: '4', 513: '4', 594: '4', 614: '4', 703: '5', 778: '4', 865: '3', 968: '3', 1110: '4', 1261: '4', 1452: '4', 1721: '4', 1904: '4', 2090: '3', 2209: '3', 2310: '3', 2388: '4', 2717: '5', 3190: '4', 3193: '4', 3197: '5', 3211: '3', 3419: '4', 3531: '2', 3676: '3', 3858: '2', 3859: '3', 3935: '3', 4547: '3', 4675: '2', 4963: '2', 5079: '3', 5238: '3', 5485: '2', 5841: '2', 6491: '4', 6668: '2', 6756: '3', 6847: '2', 7142: '2', 7292: '2', 7301: '2', 7511: '3', 7523: '4', 7839: '4', 8340: '3', 8913: '2', 9302: '2', 9311: '3', 9414: '2', 9425: '2', 9440: '4', 9470: '3', 10059: '3', 10159: '2', 10515: '3', 10814: '3', 11238: '3', 11452: '4', 11554: '4', 11730: '3', 12313: '5', 12581: '5', 12710: '5', 12837: '5', 12946: '5', 13216: '5', 13277: '4', 13401: '4', 13446: '4', 13595: '4', 13717: '5', 13881: '4', 13980: '4', 14098: '4', 14137: '3', 14217: '5', 14681: '2', 15002: '4', 15103: '3', 15127: '3', 15262: '3', 15555: '2', 15869: '4', 17077: '2', 17266: '2', 17307: '3

In [12]:
remove = []
for key in dict_codigo_ce.keys():
    if dict_codigo_ce[key] == 0 or dict_codigo_ce[key] == '-':
        remove.append(key)
        
for key in remove:
    del dict_codigo_ce[key]

In [66]:
len(dict_codigo_ce.keys())

1505

In [67]:
lista_ce = list(dict_codigo_ce.values())
lista_codigo_curso = list(dict_codigo_ce.keys())

In [None]:
lista_codigo_curso

In [68]:
data = {
    "Codigo_Curso": lista_codigo_curso,
    "Nota_Conceito_Faixa": lista_ce
}

In [69]:
df_ce_edicao = pd.DataFrame(data)
df_ce_edicao = df_ce_edicao.astype({'Nota_Conceito_Faixa':'int64'})

In [70]:
df_ce_edicao

Unnamed: 0,Codigo_Curso,Nota_Conceito_Faixa
0,36,3
1,127,4
2,317,4
3,513,4
4,594,4
...,...,...
1500,5001296,3
1501,5001383,4
1502,5001409,3
1503,5001434,5


In [71]:
df_ce_edicao.to_csv(f'./conceito_enade_{ano}_webscrap.csv',index=False)

###########################################
#### Para testes dos resultados

In [88]:
utf8_codigo_curso = str(94260).encode("utf-8")
base64_codigo_curso = base64.b64encode(utf8_codigo_curso).decode("utf-8")
base64_utf8_codigo_curso = quote(base64_codigo_curso, encoding="utf-8")
base64_utf8_codigo_curso

'OTQyNjA%3D'

In [18]:
df_ce_edicao[df_ce_edicao["Nota_Conceito_Faixa"] == 0]["Codigo_Curso"].to_list()

[]

In [19]:
# Edição de 2017
# df_ce_edicao = df_ce_edicao[df_ce_edicao["Codigo_Curso"] != 73438]

# Edição de 2011
df_ce_edicao = df_ce_edicao[df_ce_edicao["Codigo_Curso"] != 94260]

# df_ce_edicao.sort_values("Codigo_Curso", inplace = True)
# df_ce_edicao.reset_index(drop=True, inplace = True)

In [20]:
df_ce_edicao

Unnamed: 0,Codigo_Curso,Nota_Conceito_Faixa
0,36,3
1,127,4
2,317,3
3,513,3
4,594,3
...,...,...
1286,1126641,3
1287,1126655,3
1288,1129605,3
1289,1133236,3


In [21]:
df_ce_edicao = df_ce_edicao.sort_values(by="Codigo_Curso")

In [24]:
df_ce_edicao.reset_index(drop=True)

Unnamed: 0,Codigo_Curso,Nota_Conceito_Faixa
0,36,3
1,127,4
2,317,3
3,513,3
4,594,3
...,...,...
1286,5000356,3
1287,5000448,2
1288,5000456,3
1289,5000457,5


In [26]:
set(df_ce_edicao["Nota_Conceito_Faixa"].values)

{1, 2, 3, 4, 5}