In [1]:
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Initialization

In [2]:
# Input spreadsheet; later cells read its 'CNPJ', 'Razão Social' and
# 'Nome fantasia' columns.
df = pd.read_excel("Empresas.xlsx")
# Driver service passed to every webdriver.Chrome(...) created in this notebook.
service = Service(ChromeDriverManager().install())
# Substrings removed (in this order) from 'Razão Social' when it is used as the
# fallback name; longer variants are listed before their prefixes
# ('LTDA.' before 'LTDA', 'S.A.' before 'S.A').
companies_sufixes = ['LTDA.', 'LTDA', 'S.A.', 'S.A', 'S/A', ' S A', 'LIMITADA']
# HTTP session and request headers used by find_cadastro_empresas.
session = requests.Session()
# NOTE(review): 'Sec-Ch-Ua-Platform' says Linux while the User-Agent claims
# Windows — confirm the mismatch is intentional.
headers = {
'Sec-Ch-Ua-Platform': "Linux",
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.5249.62 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Language': 'en-US,en;q=0.9',
'Connection': 'close',
}

# Fill DataSet

In [3]:
def get_name_reclame_aqui(cnpj: str, driver: webdriver.Chrome):
    """Search Reclame Aqui for a CNPJ and return the matching company slug.

    Args:
        cnpj: CNPJ, with or without the '.', '/' and '-' separators.
        driver: Selenium Chrome driver used to load the search page.

    Returns:
        (True, slug) where slug is the last path segment of the first company
        card's link, or (False, None) when the CNPJ is not numeric or no
        company card could be read from the page.
    """
    digits = str(cnpj).strip()
    for separator in ('/', '-', '.'):
        digits = digits.replace(separator, '')
    # Keep the number as text: converting it to int would drop the leading
    # zeros of a CNPJ and raise on malformed input.
    if not digits.isdigit():
        return False, None
    url = "https://www.reclameaqui.com.br/busca/?q=" + digits
    driver.get(url)
    try:
        tag = driver.find_element(By.XPATH, '//div[@class="title-company-card"]//a')
        name = tag.get_attribute('href').split('/')[-1]
        return True, name
    except Exception:
        # A missing card or link means "not found". Exception (rather than a
        # bare except) lets KeyboardInterrupt/SystemExit stop the notebook.
        return False, None

In [4]:
def find_cadastro_empresas(cnpj: str):
    """Fetch a company's trade name ("Nome de Fantasia") from cadastroempresa.com.br.

    Args:
        cnpj: CNPJ appended as-is to the site's /cnpj/ URL.

    Returns:
        (True, name) with surrounding whitespace removed, or (False, None) when
        the request fails, the label is not on the page, or the name is blank.
    """
    url = 'https://cadastroempresa.com.br/cnpj/' + str(cnpj)
    try:
        # Without a timeout a stalled connection would hang the whole run.
        r = session.get(url=url, headers=headers, timeout=30)
    except requests.RequestException:
        # Treat a network failure as "not found" so the caller can fall back.
        return False, None
    soup = BeautifulSoup(r.text, features="html.parser")
    # `string=` / `find_next` are the current spellings of `text=` / `findNext`.
    label = soup.find('dt', string=' Nome de Fantasia: ')
    if label is None:
        return False, None
    paragraph = label.find_next('p')
    if paragraph is None:
        return False, None
    # Strip first so a whitespace-only name is not reported as a valid one.
    name = paragraph.text.strip()
    if not name:
        return False, None
    return True, name

In [5]:
def get_name(row):
    """Resolve a display name for one spreadsheet row.

    Tries find_cadastro_empresas first; if that fails and the row has no
    'Nome fantasia' (its string form is 'nan'), falls back to a Reclame Aqui
    search in a freshly started Chrome instance.

    Args:
        row: Mapping/Series with 'Razão Social', 'CNPJ' and 'Nome fantasia'.

    Returns:
        The resolved name, or '' when no lookup succeeded.
    """
    print('Company| ' + str(row['Razão Social']))
    cnpj = str(row['CNPJ'])
    valid, name = find_cadastro_empresas(cnpj)
    if not valid and str(row['Nome fantasia']) == 'nan':
        driver = webdriver.Chrome(service=service)
        try:
            valid, name = get_name_reclame_aqui(cnpj, driver)
        finally:
            # Always close the browser, even if the lookup raises, so failed
            # rows do not leave Chrome processes behind.
            driver.quit()
    return name if valid else ''

In [6]:
def fill_names(df: pd.DataFrame):
    """Fill the name column of every row, mutating and returning ``df``.

    For each row the name comes from get_name; when that returns '', the
    'Razão Social' with every entry of ``companies_sufixes`` removed is used
    instead. The stripped result is written to the column at position 2
    (presumably 'Nome fantasia' — confirm against the spreadsheet layout).

    Args:
        df: Frame with at least 'Razão Social' plus the columns get_name reads.

    Returns:
        The same DataFrame object, updated in place.
    """
    # enumerate() supplies the row *position* that iloc needs; the label that
    # iterrows() yields only matches the position for a default RangeIndex.
    for position, (_, row) in enumerate(df.iterrows()):
        name = get_name(row)
        if name == '':
            name = row['Razão Social']
            for word in companies_sufixes:
                name = name.replace(word, '')
        print('Name   | ' + name)
        df.iloc[position, 2] = name.strip()
    return df

In [7]:
# fill_names updates df in place and returns it, so the result can be saved
# directly to the output spreadsheet.
fill_names(df).to_excel('updated.xlsx')

Company| MERCADOPAGO.COM REPRESENTACOES LTDA.
Name   | mercado-pago
Company| PAGSMILE INTERMEDIACAO E AGENCIAMENTO DE NEGOCIOS LTDA
Name   | Pagsmile
Company| ADYEN DO BRASIL INSTITUICAO DE PAGAMENTO LTDA.
Name   | Adyen Latin America
Company| GOETTEN ADMINISTRADORA DE CREDIARIO LTDA.
Name   | Tidas Crediario
Company| HAVAN S.A.
Name   | havan-loja-fisica
Company| TIDAS TECNOLOGIA LTDA
Name   | Tidas Tecnologia
Company| IFOOD.COM AGENCIA DE RESTAURANTES ONLINE S.A.
Name   | ifood
Company| JEITTO MEIOS DE PAGAMENTO LTDA.
Name   | jeitto
Company| PAY BROKERS COBRANCA E SERVICOS EM TECNOLOGIA LTDA.
Name   | pay-brockers
Company| DEMERGE BRASIL FACILITADORA DE PAGAMENTOS LTDA
Name   | DEMERGE BRASIL FACILITADORA DE PAGAMENTOS 
Company| TIM S A
Name   | TIM
Company| EBANX LTDA
Name   | Ebanx
Company| CELCOIN INSTITUICAO DE PAGAMENTO S.A.
Name   | Celcoin
Company| SHPP BRASIL INSTITUICAO DE PAGAMENTO E SERVICOS DE PAGAMENTOS LTDA
Name   | shopee
Company| RCS GESTAO DE COBRANCAS LTDA.
Name   

# Get Logos URLs

In [8]:
def get_google_images(company):
    """Return the first Google Images thumbnail for "<company> logo".

    Args:
        company: Company name used as the search term.

    Returns:
        (True, src) with the ``src`` of the second <img> on the result page
        (index 1, as in the original lookup), or (False, None) when the
        request fails or no such image is present.
    """
    # quote_plus turns spaces into '+' and escapes characters such as '&', '#'
    # or accents that would otherwise corrupt the query string.
    query = quote_plus(company.strip() + ' logo')
    url = "https://www.google.com/search?q=" + query + "&source=lnms&tbm=isch&sa=X"
    try:
        # Without a timeout a stalled connection would hang the whole run.
        r = requests.get(url, allow_redirects=True, timeout=30)
    except requests.RequestException:
        return False, None
    soup = BeautifulSoup(r.text, features="html.parser")
    try:
        return True, soup.find_all('img')[1]['src']
    except (IndexError, KeyError):
        # Fewer than two images, or the image has no src attribute.
        return False, None

In [9]:
# Map each company's name (as text) to the logo URL found for it; companies
# without a result are reported and left out of the mapping.
dic = {}
for _, record in df.iterrows():
    company = str(record['Nome fantasia'])
    print(company)
    found, logo_url = get_google_images(company)
    if not found:
        print("Not Founded")
        continue
    print(logo_url)
    dic[company] = logo_url

mercado-pago
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS9GpUCk_ubGBR0bCl2qGSJp6ao6O__oh5shkJwisHoGCPP3A687how8piQMgY&s
Pagsmile
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSNpNzkWRJspMYkQemAqiybwKXfbW_KIKCdMW595sBNho4Pi2Nr9qHnc1geEg&s
Adyen Latin America
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR6JLjPsOZ8T8-CBFl3MhqnOCxIj0k-lq_W_6reNXFeqe5isvZp5rzRX2VwFsM&s
Tidas Crediario
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSd5T-Ie1L_RuMGH-xiag5sak93UyR8rKLAxXuaZWjaUQ8b81U0J7xNyqd90DM&s
havan-loja-fisica
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSgTA-Wewnn9K5ZsOd9n0_koKpYGUxPoXZ6KC64PtqFv0IbAXBp55NEHN3XrA&s
Tidas Tecnologia
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRTtScN3hNxkFVwn0snwTKWrQqIs4kLazFIiEnYGPAlKZ5_e1Q4Emxm5hfngA&s
ifood
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRdd7PMqIAUgzSW-SwncrMm-RNqOjxs-FdW1Yxtx9z1ybtdpmxvkSUQBF0IxOY&s
jeitto
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQmyFs6DLv2S7DE0ym

# Validate

If the displayed logo does not correspond to the company, enter __e__.

Any other input marks the logo as correct.

In [10]:
# Manual review: open each logo URL in Chrome and wait for the reviewer's
# input. Typing 'e' records the company in erros.txt; every reviewed company
# and URL is also logged to output.txt.
driver = webdriver.Chrome(service=service)
try:
    with open("erros.txt", 'w') as erros_file, open("output.txt", 'w') as output_file:
        for company, url in dic.items():
            print(company)
            if url is None:
                # Nothing to display or concatenate: record the missing logo
                # and move on without prompting.
                erros_file.write(company + '-> None\n')
                continue
            output_file.write(company + '-> "' + url + '"\n')
            driver.get(url)
            inp = input().strip().lower()
            if inp == 'e':
                print('Error\n')
                erros_file.write(company + '-> "' + url + '"\n')
finally:
    # Close the browser even if the review is interrupted or a page fails
    # to load.
    driver.quit()

mercado-pago
Pagsmile
Adyen Latin America
Tidas Crediario
havan-loja-fisica
Tidas Tecnologia
ifood
jeitto
pay-brockers
DEMERGE BRASIL FACILITADORA DE PAGAMENTOS
Error

TIM
Ebanx
Celcoin
shopee
RCS GESTAO DE COBRANCAS
TELEFONICA BRASIL
CLARO
pernambucanas-cartoes
paypal
Zippi
LIQUIDO BRL PAGAMENTOS DIGITAIS
OKTO PAGAMENTOS BRASIL
uber
Magalu Pagamentos
Hotmart
PAY BROKERS COBRANCA E SERVICOS EM TECNOLOGIA
Luxpag
Error

On-Line Games Diversoes e Entretenimento
Error

qi-sociedade-de-credito
CARLOS ALBERTO DE LIMA CARDOSO DE BRITO 38412612809
Error

Capitual
FUNDO DE INVESTIMENTO EM DIREITOS CREDITORIOS SUPERSIM INCLUSAO FINANCEIRA
Error

americanas-marketplace
UNIFIQUE TELECOMUNICACOES
Boacompra.com
PAYMENTEZ DO BRASIL
ORION TECNOLOGIA DA INFORMACAO E SOLUCOES DIGITAIS
Pin4Pay
Ebanx Pagamentos
COMPANHIA SECURITIZADORA DE CREDITOS FINANCEIROS VERT-ZIPPI
Supermercados Koch
LIDERANCA CAPITALIZACAO
We Pay Out
PIN4PAY MEIOS DE PAGAMENTO
Braip Pagamentos
eduzz
Aarin
banco-mercantil-do-brasil
S