# Professor Fedyk URAP Interview Task: Glassdoor Scraper

## Step 1: Build GlassDoor Seed URL list.

In [20]:
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

import pandas as pd
import numpy as np
import nltk

import jsonlines
import json
import csv

# Seconds passed to time.sleep() after each browser action in the helpers below.
# Raise it if pop-ups start blocking the scraper for going too fast.
wait=.05

In [21]:
# Get all data we need to search the company from files

# Three parallel lists: position k in each one describes the k-th line of the seeds file
name, longName, ticker = [], [], []

with open('gvkey_salary_company_seeds.jsonl') as reader:
    for line in reader:
        company = json.loads(line)

        # Route each JSON field into its matching list (same order as the fields are listed)
        for target, field in ((name, 'name'), (longName, 'longname'), (ticker, 'capiq-ticker')):
            target.append(company[field])
        

In [22]:
# Process out the legal words from the 'long name' to get common names

common = pd.Series(longName)

with open('legal.csv', newline='') as legal:
    # Only the first CSV row is used and its first cell is skipped.
    # Blank cells are dropped so they can never act as a "suffix".
    legalTerms = [term for term in list(csv.reader(legal))[0][1:] if term]


def _lead_with_the(value):
    '''Turn a "(The)" marker into a leading "The "; non-strings pass through unchanged.'''
    if isinstance(value, str) and '(The)' in value:
        # strip() removes the space left behind by the marker; without it the
        # endswith() checks on the legal terms below could never match.
        return 'The ' + value.replace('(The)', '').strip()
    return value


def _drop_suffix(value, suffix):
    '''Remove `suffix` from the end of `value` only; earlier occurrences are kept.'''
    if isinstance(value, str) and value.endswith(suffix):
        return value[:len(value) - len(suffix)]
    return value


#Special case for words with '(The)'
common = common.apply(_lead_with_the)

# Terms are applied in file order, so a later term can strip what an earlier one exposed
for term in legalTerms:
    common = common.apply(lambda x: _drop_suffix(x, term))

print(common)
    


0                                AAR
1                  American Airlines
2                 CECO Environmental
3       ASA Gold and Precious Metals
4                                AVX
                    ...             
5805                  nVent Electric
5806               Grindrod Shipping
5807      Navios Maritime Containers
5808                          Arcosa
5809                 Castor Maritime
Length: 5810, dtype: object


In [23]:
#Now we begin the search for seeds. below are helper functions to help divide up the task.

def sign_in(args):
    '''Log in to Glassdoor and then open the page given in args.

    args: dict with the keys 'username', 'password' and 'url'.
          'url' is loaded once the login form has been submitted.

    Uses the module-level `browser` and pauses `wait` seconds after each
    page load. Element lookups raise if the login form is not present.
    '''
    url = 'https://www.glassdoor.com/profile/login_input.htm'
    browser.get(url)
    time.sleep(wait)

    # Same By-based locator style as every other helper in this notebook
    email_field = browser.find_element(By.NAME, 'username')
    password_field = browser.find_element(By.NAME, 'password')
    submit_btn = browser.find_element(By.XPATH, '//button[@type="submit"]')

    email_field.send_keys(args['username'])
    password_field.send_keys(args['password'])
    submit_btn.click()

    time.sleep(wait)
    browser.get(args['url'])

In [24]:
def search_word(c):
    '''Type c into the search bar, switch the search category,
    empty the location box and submit the search.
    '''
    def pick(xpath):
        # every element on the search bar is located by XPath
        return browser.find_element(By.XPATH, xpath)

    box = pick('//*[@id="sc.keyword"]')
    box.clear()
    box.send_keys(c)
    time.sleep(wait)

    # open the category dropdown, then choose its option_1 entry
    for xpath in ('//*[@id="scBar"]/div/div[2]/div', '//*[@id="option_1"]'):
        pick(xpath).click()
        time.sleep(wait)

    # .clear() does not work on the location box, so erase it with 50 backspaces
    where = pick('//*[@id="sc.location"]')
    remaining = 50
    while remaining:
        where.send_keys(Keys.BACK_SPACE)
        remaining -= 1

    pick('//*[@id="scBar"]/div/button/span').click()
    time.sleep(wait)

    # Sometimes random pop-ups block the scraper if it goes too fast.
    # Optionally uncomment below if this happens.
    # try:
    #     browser.find_element_by_class_name("selected").click()
    # except ElementClickInterceptedException:
    #     pass

    # time.sleep(wait)

    # try:
    #     browser.find_element_by_class_name("ModalStyle__xBtn___29PT9").click()  # click the X
    # except NoSuchElementException:
    #     pass

In [25]:
def find_match(i, c):
    ''' Pick the search result that best fits company number i and return its page URL.

        Detail:
        every result name is scored with checkAnyInLowerStrip(i, <name>), i.e. how many of
        the ticker, common name and long name it matches. Among the top-scoring results
        the one with the highest getStat(<result>) is clicked.

        Returns '' when there are no results, when nothing matches at all, or when
        check_public rejects the page that was opened. c is not used here.
    '''
    def name_link(result):
        # the anchor holding the company name inside one result card
        return result.find_element(By.CLASS_NAME, 'col-9.pr-0').find_element(By.TAG_NAME, 'a')

    results = browser.find_elements(By.CLASS_NAME, 'single-company-result.module')

    # one match count per result
    matches = np.array([checkAnyInLowerStrip(i, name_link(entry).text) for entry in results])
    if matches.size == 0:
        return ''
    maxMatch = max(matches)
    if maxMatch == 0:
        return ''

    # among the best matches, prefer the profile with the strongest stats
    similarNames = []
    similarStat = []
    for position in np.where(matches == maxMatch)[0]:
        candidate = results[position]
        similarNames.append(name_link(candidate))
        similarStat.append(getStat(candidate))
    similarNames[np.argmax(similarStat)].click()

    direct_name = browser.find_element(By.XPATH, '//*[@id="DivisionsDropdownComponent"]').text
    if not check_public(direct_name):
        return ''
    time.sleep(wait)
    return browser.current_url

In [26]:
def getStat(oneResult):
    '''Return the mean of the numeric stats shown on one search result.

    The mean is used so that a single strong metric (possibly due to spamming)
    is discounted by the other values.

    oneResult: a result element; its 'num.h2' children hold the stat texts.
    Stat texts may be '--' (counted as 0) or carry a 'k' suffix (thousands).

    Returns 0.0 when the result shows no stats at all, so that a profile
    without stats can never outrank one that has them.
    Raises ValueError if a stat text is not a number after clean-up.
    '''
    stats = oneResult.find_elements(By.CLASS_NAME, 'num.h2')
    nums = []
    for s in stats:
        text = s.text.strip().replace('--', '0').replace(',', '').lower()
        # The multiplier is decided per stat. Carrying it over from an earlier
        # "k" value would inflate every later stat by 1000.
        multiplier = 1
        if 'k' in text:
            text = text.replace('k', '')
            multiplier = 1000 # we need to multiply by 1k = 1000
        nums.append(float(text or 0) * multiplier)

    if not nums:
        return 0.0
    return np.mean(nums)
            
    

In [27]:
def check_redirect(i, c):
    '''Occasionally the exact name of the company is entered and we are taken straight
    to its page. The displayed name may differ from the search term, so it is compared
    with checkAnyInLowerStrip (containment in either direction).

    i: index of the company in the ticker / common / longName lists
    c: the text that was searched (only used in the progress message)

    Returns:
        'not redirect' - the company-page header was not found on the current page
        the current URL - the header name matches company i and check_public accepts it
        'fail'          - we were redirected, but the page was rejected
    '''
    try:
        direct_name = browser.find_element(By.XPATH, '//*[@id="DivisionsDropdownComponent"]').text
    except Exception:
        # Exception rather than a bare except, so Ctrl-C / SystemExit still stop the run
        return 'not redirect'
    print('redirected to: ', direct_name, ' when searching: ', c)
    # a blank or whitespace-only header cannot be matched against anything
    if direct_name and direct_name.strip():
        if checkAnyInLowerStrip(i, direct_name) and check_public(direct_name):
            return browser.current_url
    return 'fail'


Logic for private words:
- Non-profits may issue stock publicly in some states
- Some franchises are publicly traded
- Hospitals can be publicly traded
- Contracting firms may be publicly traded
- Subsidiaries may be publicly traded
- Other is ambiguous
    
    Thus everything else in the list must be private

https://www.glassdoor.com/mz-survey/start_input.htm?cr=&c=&showSurvey=Reviews

In [28]:
def check_public(direct_name, enforce=False):
    '''All entries in the JSONL files should be publicly traded.
    Return False if the company page currently open is not publicly traded,
    otherwise return True.

    direct_name: company name shown on the page (only used in the message)
    enforce: the check is switched off by default because many public companies
             are listed as private on the site; pass enforce=True to run it.

    Callers only test the result for truthiness.
    Might need to adjust the private words later.
    '''
    if not enforce:
        return True #many public companies are listed as private so turn off

    typeText = browser.find_element(By.XPATH, "//*[@data-test='employer-type']").text
    privateWords = ['Private', 'Government', 'College', 'School', 'Self']
    if any(p in typeText for p in privateWords):
        print('Private Company found and ignored: ', direct_name)
        return False
    return True

In [29]:
def get_seed_link(i, c):
    '''Return the seed link find_match picks for the i-th company (search word c),
    or '' when find_match comes back with nothing.
    '''
    # any empty result from find_match is reported as ''
    return find_match(i, c) or ''

In [30]:
def search(i, c):
    '''Search for c and return the link found for company i, or '' if none was found.

    Steps:
    - submit the search via search_word(c)
    - check_redirect tells us whether we landed directly on a company page
    - if there was no redirect, pick a result via get_seed_link

    Any error raised along the way is reported and treated as "not found",
    so that one bad page does not stop the whole run.
    '''
    try:
        search_word(c)
        result = check_redirect(i, c)
        if result == 'not redirect':
            return get_seed_link(i, c)
        if result == 'fail':
            return ''
        return result

    except Exception as e:
        # Still best-effort, but say so: a silent '' hides things like being blocked by the site
        print('search failed for: ', c, ' -> ', type(e).__name__)
        return ''

In [31]:
def gather_seeds(limit):
    '''Search for the seed link of every company in the seeds file.

    Each company is searched by its long name first and by its common name second.
    The result ('' if not found) is stored under 'gd-url' and the company is added
    to taskOne. If a link was found, a copy of the company is passed to
    collectTaskTwo, which adds the page data and appends it to taskTwo.

    limit: NOTE(review) - currently has no effect. The index skip below is
           commented out, so the run always starts at the first company.

    Returns (taskOne, taskTwo).
    '''
    taskOne = []
    taskTwo = []

    # the with-block closes the file even if a search or collection step raises
    with jsonlines.open('gvkey_salary_company_seeds.jsonl', mode='r') as reader:
        for index, company in enumerate(reader):
            #if index < limit:
            #    continue
            seed = search(index, longName[index])
            if not seed:
                seed = search(index, common[index])
                #give up only after common name and long name does not work.
            company['gd-url'] = seed
            taskOne.append(company)

            if (seed != ''):
                collectTaskTwo(taskTwo, company.copy())

    return taskOne, taskTwo

In [32]:
def checkAnyInLowerStrip(i, word):
    ''' Count how many of the ticker, common name and long name of company i match word.

    Two names match when one contains the other after both are lower-cased and
    stripped of spaces and punctuation. The result is an int from 0 to 3.

    Note:
    After the first successful pass there were many instances where one search led to
    another version of the name popping up, but it was not caught as valid. This happened
    for redirects and for normal searches.

    Case differences were hurting accuracy, so everything is compared in lower case.
    The search itself still uses the original case because that tends to bring the most
    relevant result to the top. Punctuation is removed as well.

    Spaces are removed because some companies concatenate their names on Glassdoor.

    Names that are not strings, or that are empty after normalising, never match.
    An empty string is contained in every string and would otherwise count as a hit.
    '''

    #strip all punctuation
    tokenizer = nltk.RegexpTokenizer(r"\w+")

    def normalize(text):
        return ''.join(tokenizer.tokenize(text)).lower()

    if not isinstance(word, str):
        return 0
    word = normalize(word)
    if not word:
        return 0

    hits = 0
    for item in (ticker[i], common[i], longName[i]):
        if not isinstance(item, str):
            continue
        candidate = normalize(item)
        if candidate and (candidate in word or word in candidate):
            hits += 1

    return hits
    
    

In [33]:
### Now we write the URLs out to a JSON file
def writeToJson(writeList, dest):
    ''' Serialise writeList with a 4-space indent and save it as <dest>.json
    '''
    # serialise first, so nothing is opened if writeList cannot be encoded
    payload = json.dumps(writeList, indent = 4)
    target = dest + ".json"
    with open(target, "w") as outfile:
        outfile.write(payload)

In [34]:
def collectTaskTwo(taskTwo, company):
    ''' Fill the company dict with the Overview data and then the top bar data,
    and append it to the taskTwo output list.
    '''
    for collect in (collectOverview, collectTopBar):
        collect(company)
    taskTwo.append(company)


In [35]:
def collectOverview(company):
    ''' Read the Overview pane of the page that is currently open and store it
    in company['Overview Data'] as a dict of label -> value.
    '''
    panel = browser.find_element(By.CLASS_NAME, 'css-155za0w.row.px-0.m-0')
    entries = panel.find_elements(By.TAG_NAME, 'li')
    first, rest = entries[0], entries[1:]

    # the first row holds the website as a link, so it is read differently
    details = {'website': first.find_element(By.TAG_NAME, 'a').text}

    # every other row is a label with its value
    details.update(
        (entry.find_element(By.TAG_NAME, 'label').text,
         entry.find_element(By.TAG_NAME, 'div').text)
        for entry in rest
    )

    company['Overview Data'] = details

In [36]:
def collectTopBar(company):
    ''' Collect the links (and counts) from the top bar of the company page
    and store them in company['Top bar Data'].

    For the FAQ, Location and Affiliated links only the link is stored. Every other
    link is expected to contain two spans, a count and a label, which are stored as
    'Number of <label> Posted' along with '<label> Link'.

    If one of the links points to the reviews page, reviewCase is called with it.
    That call navigates the browser to that page. If there is no such link a
    message is printed and the review step is skipped.
    '''
    topBarData = {}
    review = None  # stays None when the bar has no reviews link

    infoBar = browser.find_element(By.ID, 'EIProductHeaders')
    links = infoBar.find_elements(By.TAG_NAME, 'a')

    # skip first link = overview page
    for data in links[1:]:
        link = data.get_attribute("href")
        if not link:
            # an anchor without an href has nothing to record
            continue

        if 'Reviews' in link:
            review = link

        if 'FAQ' in link:
            name = 'FAQ'
        elif 'Location' in link:
            name = 'Location'
        elif 'Affiliated' in link:
            name = 'Affiliated'
        else:
            textData = data.find_elements(By.TAG_NAME, 'span')
            if len(textData) < 2:
                # no count/label pair: skip it rather than file the link under a stale name
                continue
            value = textData[0].text
            name = textData[1].text
            topBarData['Number of ' +name+ ' Posted'] = value

        topBarData[name + ' Link'] = link

    if review is not None:
        reviewCase(company, review)
    else:
        print(company.get('name'), ' does not have a reviews link')

    company['Top bar Data'] = topBarData

In [37]:
def reviewCase(company, link):
    ''' Open the reviews page at link and read a few simple statistics from it.
    The result is stored in company['Review Section Details']; any value that
    could not be read stays ''.
    '''
    reviews = dict.fromkeys(
        ('Star score', 'Ceo Approval Rate', 'Recommend to Friend Rate', 'CEO Name'), '')
    browser.get(link)

    # overall star rating
    try:
        reviews['Star score'] = browser.find_element(
            By.XPATH, '//*[@id="EmpStats"]/div/div[1]/div/div/div').text
    except Exception:
        print(company['name'], ' does not have star score')

    # CEO name plus the two donut charts (recommend first, CEO approval second)
    try:
        reviews['CEO Name'] = browser.find_element(
            By.XPATH, '//*[@id="EmpStats"]/div/div[2]/div[3]/div/div[2]/div[1]').text

        donuts = browser.find_elements(By.CLASS_NAME, 'donut__DonutStyle__donutchart_text_val')
        reviews['Recommend to Friend Rate'] = donuts[0].text
        reviews['Ceo Approval Rate'] = donuts[1].text
    except Exception:
        print(company['name'], ' does not have donut values')

    company['Review Section Details'] = reviews

### And now we get the seeds!!!!

Some searches seem to tend towards the same result. 

For example:

"BANK OF NEW YORK MELLON CORP" on line 76 and 

"BNY Mellon Municipal Bond Infrastructure Fund Inc" on line 1258

both go to the same "BNY Mellon" page. In this case, both entries get the same link.

Created for Mac OS Chrome Ver. 88.0.4324.96
chromedriver.exe 


Downloaded from here:
https://chromedriver.storage.googleapis.com/index.html?path=88.0.4324.96/

Follow this tutorial to put chromedriver.exe into PATH:
https://www.edureka.co/community/52315/how-to-setup-chrome-driver-with-selenium-on-macos

In [39]:
browser = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")

#enter username and password 
args = {'username':'', 
        'password':'', 
        'url':'https://www.glassdoor.com/member/home/index.htm'}

# try/finally: the Chrome window is closed even if the run is blocked or crashes part-way
try:
    sign_in(args)
    # NOTE(review): gather_seeds currently ignores its argument (the index skip inside it
    # is commented out), so this still starts from the first company.
    taskOne, taskTwo = gather_seeds(635) #blocked from 635 so start from here
    #started at deltic timber after entering captcha info
    writeToJson(taskOne, 'taskOne')
    writeToJson(taskTwo, 'taskTwo')
finally:
    browser.quit()




redirected to:  Monster Worldwide  when searching:  Monster Worldwide Inc.
DPW HOLDINGS INC  does not have donut values
redirected to:  Quest Diagnostics  when searching:  Quest Diagnostics Inc
redirected to:  Emclaire Financial  when searching:  Emclaire Financial Corp
EMCLAIRE FINANCIAL CORP  does not have donut values
redirected to:  DXP Enterprises  when searching:  DXP Enterprises Inc
redirected to:  Cal-Maine Foods  when searching:  Cal Maine Foods Inc
redirected to:  Eltek (Israel)  when searching:  Eltek Ltd
ELTEK LTD  does not have donut values
redirected to:  Insperity  when searching:  Insperity Inc.
redirected to:  Kilroy Realty  when searching:  Kilroy Realty Corp
redirected to:  Cerus Corporation  when searching:  Cerus Corp
redirected to:  Vericel Corporation  when searching:  Vericel Corp
redirected to:  China Eastern Airlines  when searching:  China Eastern Airlines Corp Ltd
redirected to:  Vail Resorts  when searching:  Vail Resorts Inc.
redirected to:  Epiq  when sea

redirected to:  NICE inContact  when searching:  inContact Inc
redirected to:  Heritage Financial (MD)  when searching:  Heritage Financial Corp
HERITAGE FINANCIAL CORP  does not have donut values
redirected to:  Timberland Bancorp  when searching:  Timberland Bancorp Inc
redirected to:  Mid Penn Bancorp  when searching:  Mid Penn Bancorp Inc
redirected to:  Steelcase  when searching:  Steelcase Inc.
redirected to:  AeroCentury  when searching:  AeroCentury Corp
AEROCENTURY CORP  does not have donut values
redirected to:  Old Point Financial  when searching:  Old Point Financial Corp
redirected to:  Descartes Systems  when searching:  Descartes Systems Group Inc (The)
redirected to:  W. P. Carey  when searching:  W. P. Carey Inc
redirected to:  First National  when searching:  First National Corp
FIRST NATIONAL CORP  does not have donut values
redirected to:  Rockwell Medical  when searching:  Rockwell Medical Inc
redirected to:  Verisign  when searching:  Verisign Inc
redirected to:  

redirected to:  Cemtrex  when searching:  Cemtrex Inc
redirected to:  IXYS  when searching:  IXYS Corp
redirected to:  RiceBran Technologies  when searching:  RiceBran Technologies
RICEBRAN TECHNOLOGIES  does not have donut values
redirected to:  eBay  when searching:  eBay Inc.
redirected to:  Goldman Sachs  when searching:  Goldman Sachs Group Inc (The)
redirected to:  eMagin  when searching:  eMagin Corp
EMAGIN CORP  does not have donut values
redirected to:  Mackinac Financial  when searching:  MacKinac Financial Corp
redirected to:  Xenith Bankshares  when searching:  Xenith Bankshares Inc
redirected to:  Algonquin Power & Utilities  when searching:  Algonquin Power & Utilities Corp
redirected to:  WebMD Health  when searching:  Webmd Health Corp
redirected to:  Eaton Vance  when searching:  Eaton Vance Senior Income
redirected to:  American Outdoor Brands  when searching:  American Outdoor Brands Corp
redirected to:  FedNat  when searching:  FedNat Holding Co
NETWORK-1 TECHNOLOGI

redirected to:  GigaMedia  when searching:  GigaMedia Ltd
GIGAMEDIA LTD  does not have donut values
AMERICAS SILVER CORP  does not have donut values
redirected to:  Onvia  when searching:  Onvia Inc
redirected to:  UTStarcom  when searching:  Utstarcom Holdings Corp
redirected to:  Silicon Labs  when searching:  Silicon Laboratories Inc
redirected to:  Silicon Labs  when searching:  Silicon Laboratories
redirected to:  Edwards Lifesciences  when searching:  Edwards Lifesciences Corp
redirected to:  Edgewell Personal Care  when searching:  Edgewell Personal Care Co
redirected to:  Rigel Pharmaceuticals  when searching:  Rigel Pharmaceuticals Inc
redirected to:  Global Sources  when searching:  Global Sources Ltd
redirected to:  Camtek  when searching:  Camtek Ltd
redirected to:  CMC Materials  when searching:  Cabot Microelectronics Corp
redirected to:  CMC Materials  when searching:  Cabot Microelectronics
redirected to:  HealthStream  when searching:  HealthStream Inc
redirected to:  

redirected to:  Northern Oil and Gas  when searching:  Northern Oil and Gas Inc
NORTHERN OIL & GAS INC  does not have donut values
redirected to:  TransAtlantic Petroleum  when searching:  TransAtlantic Petroleum Ltd
redirected to:  Peabody  when searching:  Peabody Energy Corp
redirected to:  CNOOC  when searching:  Cnooc Ltd
MARINE PRODUCTS CORP  does not have donut values
redirected to:  PGMC  when searching:  Platinum Group Metals Ltd
redirected to:  PGMC  when searching:  Platinum Group Metals
redirected to:  Senomyx  when searching:  Senomyx Inc
redirected to:  Synaptics  when searching:  Synaptics Inc
redirected to:  Omnicell  when searching:  Omnicell Inc
redirected to:  Shore United Bank  when searching:  Shore Bancshares Inc
redirected to:  Prudential  when searching:  Prudential Financial Inc
redirected to:  Accenture  when searching:  Accenture PLC
redirected to:  Wright Medical  when searching:  Wright Medical Group NV
redirected to:  Retractable Technologies  when searchi

PARKE BANCORP INC  does not have donut values
redirected to:  Halozyme  when searching:  Halozyme Therapeutics Inc
redirected to:  CombiMatrix  when searching:  CombiMatrix Corp
COMBIMATRIX CORP  does not have donut values
redirected to:  Tenaris  when searching:  Tenaris SA, Luxembourg
redirected to:  Pluristem Life Systems  when searching:  Pluristem Therapeutics Inc
redirected to:  Pluristem Life Systems  when searching:  Pluristem Therapeutics
NF ENERGY SAVING CORP  does not have donut values
redirected to:  Molina Healthcare  when searching:  Molina Healthcare Inc.
redirected to:  AXIS Capital  when searching:  AXIS Capital Holdings Ltd
redirected to:  OptimumBank  when searching:  OptimumBank Holdings Inc
OPTIMUMBANK HOLDINGS INC  does not have donut values
redirected to:  NETGEAR  when searching:  NETGEAR Inc
redirected to:  QEP Resources  when searching:  QEP Resources Inc
redirected to:  Monarch Financial Holdings Inc  when searching:  Monarch Financial Holdings Inc
MONARCH FI

redirected to:  Viad  when searching:  Viad Corp
redirected to:  Cyclacel Pharmaceuticals  when searching:  Cyclacel Pharmaceuticals Inc
redirected to:  TOP Ships  when searching:  Top Ships Inc
redirected to:  Monolithic Power Systems  when searching:  Monolithic Power Systems Inc
redirected to:  Syneron Candela  when searching:  Syneron Medical Ltd
redirected to:  Syneron Candela  when searching:  Syneron Medical
redirected to:  Ormat  when searching:  Ormat Technologies Inc
redirected to:  Calamos Investments  when searching:  Calamos Asset Management Inc
redirected to:  Calamos Investments  when searching:  Calamos Asset Management
redirected to:  CubeSmart  when searching:  CubeSmart
redirected to:  Digital Realty  when searching:  Digital Realty Trust Inc
redirected to:  NanoViricides  when searching:  NanoViricides Inc
NANOVIRICIDES INC  does not have donut values
redirected to:  BAYCOM  when searching:  BayCom Corp
redirected to:  Mattress Firm  when searching:  Mattress Firm H

redirected to:  CytoSorbents  when searching:  Cytosorbents Corp
redirected to:  TreeHouse Foods  when searching:  TreeHouse Foods Inc
redirected to:  Gran Tierra Energy  when searching:  Gran Tierra Energy Inc
redirected to:  Toyota Motor Corporation  when searching:  CU Bancorp
redirected to:  Cimpress  when searching:  Cimpress NV
redirected to:  API Technologies  when searching:  API Technologies Corp
redirected to:  Omega Flex  when searching:  Omega Flex Inc
redirected to:  Retail Properties of America  when searching:  Retail Properties of America Inc
redirected to:  Gladstone Investment  when searching:  Gladstone Investment Corp
GLADSTONE INVESTMENT CORP  does not have donut values
redirected to:  Jamba  when searching:  Jamba Inc
redirected to:  Medivation  when searching:  Medivation Inc
redirected to:  Chanticleer Holdings  when searching:  Chanticleer Holdings Inc
CHANTICLEER HOLDINGS INC  does not have donut values
redirected to:  STRATA Skin Sciences  when searching:  ST

#### Notes from full drive #1

###### Main points:
- May need to compare all companies with the same name and choose whichever company has the most reviews. Ask the research people.
- If I don't have a sure answer as to which company I am looking for and I'm not sure of my link, should I omit it? I.e., how certain do I have to be about the company I select?
- Searching with the ticker is too unreliable, so it's better not to use it.
- Perhaps I could train a model that moves words around in different ways, but at this point I feel like I should continue building manually.

### Key Limitation:
- Sometimes the company/job/salary/interview dropdown menu disappears. The results page is then completely different, so handling it would mean writing an entirely new scraper. Continually refreshing will therefore probably be the only option, as long as that doesn't make the website think we are a bot.

<details>
<summary>Detailed Points (too many so in drop down mode)</summary>
<br>

asa - ticker misleading
    
UHL u-haul?
    
aep

aflac afl ?

aig - ticker?

ametek - ticker?

amrep - ticker?

trec

adm - archer daniels (ADM)

ash

rex stores redirect bc lower case

automatic data processing

compare lower to lower

avon

azz

british american tobacco

brt aaparetments - brt - ticker cant see images and detect related words... could use ML to choose related words...

Ball corp

popular


bank of mellon BK 

Bard (C.R) B Barnes

Barnes group

Beckton dickinson and Co

bemis bms

bkh black hills

block H&R reverse order listed

!@ensco rowasn is now ....

robert half international include in in search, not equality.

Bristol Myers Squibb

BP

liveramp - lower all and take away spaces

CDI 

cna

csp

csx

cts

campbell sroup - lower strip combine search

Carisle v Carisle IT - CSL

Caterpillar or CAT?

CENTURY LINK ctl TOO DIFFERENT. lowercase and strip would be same

what do we do when there are only iffy otions? better to not select data?

i e multiple similar name but ?

JP Morgan (Chase & is un necessary) also same name company but dif bc logo exists

Coca Cola Consolidated

cohu

use number of views to choose most legit company

commerce bank in in serarch

citi long v name?

Avis in in search - CAR

ca CA ?? CA INC? LOWER AND IN IN SEARCH

COMPUTER TASK CTG TASK?

rave restaurant rave totally wrong ;(

consolidated editonm ed 

Biglari steak and shake?

consolodated tomoka

cooper cos inc The cooper company?

TAP

CRANE

ONEMAIN FIN IN IN SEARCH

CROWN HOLDINGS IN IN SEARCH

DST

DATAO DAIO DATA I in in search for data i/o

deere in in search de to get john deere

deluxe in in searchDLX delux corp works best but thuis may not be the case forfr american airlines

Nortech systems

dover v dover crop - user review numbers to compare

Dupont de noumers

</details>

56 possible errors

#### Notes from full drive #2

- No longer searching by ticker, using nltk to normalize the text and do a greedy comparison against the long name, common name, and ticker, and using a heuristic to predict which link is the most legitimate all seem to have worked well. Testing on rows 11 to 111, we only had 8 sketchy search results.

- I was thinking of using whether a company has an image or not as some sort of bonus point to the heuristic, but this may mess up some of the results. if I want to be safe, I could make the image a requirement. what kind of company would be so lazy to not upload their logo?

- there seems to be a lot of parameters i can adjust to make the search better. Sounds like a machine learning model...

- Looks like I need to somehow predict acronyms. But at this point predicting acronyms really depends on the context of the company name. For example, if I search for A V Homes and get Avatar Holdings, I would guess it's similar enough. But is it even right? I'm not a professional at determining companies.

- seems like there are some companies that exist and probably have an account on glassdoor but there are exact replicas with sketchy names and dangerous links. It looks like I'm going to have to set a SAFETY FACTOR to cut out some weird redirects.

- looks like long names are usually a better bet than the common name.

- When one of the searches went through, I was redirected to a completely unrelated company. How do I prevent this while allowing for acronym changes?

- some names just seem off compared to the ticker. There are [API solutions to this](https://www.marketplace.spglobal.com/en/solutions/api-solutions-(61953ac7-ea64-4fac-926a-feb7f846c2be)) and there seems to have been python wrappers, but they cost money. Another method I could take is [download the capiq SPREAD SHEETS AND CONVERT THE CAPIQ to names there. ](https://nickderobertis.github.io/capiq-excel-downloader-py/) However this would be very memory intensive and would take long. [this seems to be common issue within the last year as well.](https://www.reddit.com/r/investing/comments/bjqlf7/capital_iq_api/) There also seems to be [academic resources](https://www.library.hbs.edu/Find/Databases/Capital-IQ-Identifiers) as well  but this also requires special permission.

<details>
<summary>Detailed Points</summary>
<br>
j ALEXANDER ACTUALLY CORRECT!

american vanguard - legitimate but looks sketchy. Maybe have no safety filter

amrep - longname better

apogee? - longname works better

A V Homes Avatar holdings redirected but too different

Brt Apartments - probably british apartments? - >redirects to apartments.com but different from brtapartments.com. not listed?

Barnes - Barnes and nobles or barns education? - barns group inc exists. longname first is better

</details>
    
8/100 errors

BRITISH AMERICAN TOBACCO - ticker not BAT, is BTE - too risky to allow head concat automatically

BNY Mellon skipped? - could tokenize to match just words, but then becomes sketchy. add more to legal? added Municipal,Bond,Infrastructure,Fund to legal list

AV Homes - AV Holdings add Homes to legal list

Perhaps having a minimal list to compare to might be advantageous, but sometimes misleading

Blocked after DIGITAL TURBINE 615

also much faster when task 2 not needed.

for tomorrow, just make python executable file and turn in

### Now we input the URLs into the jsonl file