In [30]:
from bs4 import BeautifulSoup
from pprint import pprint
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.headless = True
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

# uses webdriver object to execute javascript code and get dynamically loaded webcontent
def get_js_soup(url, driver):
    driver.get(url)
    res_html = driver.execute_script("return document.body.innerHTML")
    soup = BeautifulSoup(res_html, "html.parser")
    return clean_soup(soup)

def clean_soup(soup):
    for scr in soup.find_all(['script', 'img', 'style']):
        scr.decompose()
    return soup

def get_tags_with_matching_classes(key_classes_set, soup):
    def is_any_class_match(css_class):
        if css_class is None:
            return False
        
        class_tokens =  set(re.split(r'\s|-', css_class.lower())) # sdap, profile, field, links, research, etc
        return len(class_tokens.intersection(key_classes_set)) > 0
    
    matches = []
    for match in soup.find_all(class_=is_any_class_match):
        matches = [m for m in matches if match not in m.descendants]
        matches.append(match)

    return matches

def get_faculty_html_tags_from_url(url, key_classes):
    soup = get_js_soup(url, driver)
    matching_tags = get_tags_with_matching_classes(key_classes, soup)
    return matching_tags

def save_faculty_html_tags(filename, tags):
    with open(filename, "w",encoding='utf-8') as f:
        for t in tags:
            f.write(str(t))

def get_faculty_text_from_tags(tags):
    faculty_page_text = ' '.join([res.get_text(" ", strip=True) for res in tags])
    utf8_page_text = faculty_page_text.encode("ascii", errors="ignore").decode("utf-8")
    cleaned_faculty_page_text = re.sub("\s+", " ", utf8_page_text).strip()
    return cleaned_faculty_page_text

def save_faculty_text(filename, text):
    with open(filename, "w") as f:
        f.write(text)
        
# test gensim's LDA with chbe prof bio content
# construct rules for each dept in engr by having a prototype faculty member page HTML for each dept

[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [C:\Users\Isaac\.wdm\drivers\chromedriver\win32\87.0.4280.20\chromedriver.exe] found in cache


 


In [31]:
chbe_key_classes = set([
    "profile",
    "biography",
    "research",
    "education",
    "email",
    "phone",
    "title"
])

save_folder = "data/chbe/"
chbe_urls = "chbe_faculty_page_urls.txt"

with open(chbe_urls) as f:
    chbe_faculty_urls = [s.strip() for s in f.readlines()]
    
chbe_text_filenames = []
    
for url in chbe_faculty_urls:
    tags = get_faculty_html_tags_from_url(url, chbe_key_classes)
    faculty_id = url[url.rindex('/')+1:]
    text = get_faculty_text_from_tags(tags)
    
    html_filename = save_folder + "chbe_" + faculty_id + ".html"
    text_filename = save_folder + "chbe_" + faculty_id + ".txt"
    
    save_faculty_html_tags(html_filename, tags)
    save_faculty_text(text_filename, text)
    
    chbe_text_filenames.append(text_filename)

In [102]:
import pandas as pd
import spacy

def identity_tokenizer(text):
    return text

class SpacyTokenizer:
    def __init__(self):
        self.nlp = spacy.load('en_core_web_md')
    def __call__(self, doc):
        sp_text = self.nlp(doc)
        # use the following code for NER on new docs
#         with sp_text.retokenize() as retokenizer:
#             for ent in sp_text.ents:
#                 retokenizer.merge(sp_text[ent.start:ent.end], attrs={"LEMMA": ent.text})

        # also detect email (and maybe phone # as well?) using t.is_email later
        return [t.lemma_.lower() for t in sp_text if len(t.text) > 1 and t.is_alpha and not t.is_punct and not t.is_stop]    

In [103]:
docs = []
spacy_tokenizer = SpacyTokenizer()

for chbe_f in chbe_text_filenames:
    with open(chbe_f) as f:
        docs.append(spacy_tokenizer(f.read()))
        
pprint(docs)

[['cookie',
  'notice',
  'cookies',
  'ying',
  'diao',
  'chemical',
  'biomolecular',
  'engineering',
  'assistant',
  'professor',
  'leap',
  'scholar',
  'research',
  'statement',
  'pursue',
  'fundamental',
  'understanding',
  'control',
  'multiscale',
  'molecular',
  'assembly',
  'process',
  'achieve',
  'sustainable',
  'manufacturing',
  'electronic',
  'material',
  'energy',
  'device',
  'therapeutic',
  'product',
  'information',
  'diao',
  'research',
  'group',
  'website',
  'research',
  'synopsis',
  'education',
  'postdoctoral',
  'scholar',
  'stanford',
  'university',
  'slac',
  'national',
  'accelerator',
  'laboratory',
  'massachusetts',
  'institute',
  'technology',
  'massachusetts',
  'institute',
  'technology',
  'tsinghua',
  'university',
  'honors',
  'beckman',
  'fellow',
  'center',
  'advanced',
  'studies',
  'university',
  'illinois',
  'urbana',
  'champaign',
  'early',
  'career',
  'award',
  'avs',
  'prairie',
  'chapter',
  

  'organ',
  'body',
  'member',
  'lab',
  'use',
  'approach',
  'generate',
  'new',
  'insight',
  'biomaterial',
  'cue',
  'instruct',
  'cell',
  'response',
  'context',
  'development',
  'disease',
  'regeneration',
  'harley',
  'co',
  'author',
  'book',
  'cellular',
  'material',
  'nature',
  'medicine',
  'cambridge',
  'university',
  'press',
  'peer',
  'review',
  'manuscript',
  'harley',
  'receive',
  'funding',
  'nsf',
  'nih',
  'american',
  'cancer',
  'society',
  'army',
  'ao',
  'foundation',
  'recognize',
  'recipient',
  'nsf',
  'career',
  'award',
  'young',
  'investigator',
  'award',
  'society',
  'biomaterials',
  'elect',
  'fellow',
  'american',
  'association',
  'advancement',
  'science',
  'co',
  'found',
  'uk',
  'base',
  'orthomimetics',
  'acquire',
  'tigenix',
  'currently',
  'perform',
  'phase',
  'clinical',
  'trial',
  'material',
  'repair',
  'osteochondral',
  'defect',
  'knee',
  'professor',
  'harley',
  'join',
  

  'university',
  'postdoctorate',
  'university',
  'michigan',
  'university',
  'michigan',
  'hanyang',
  'university',
  'korea',
  'hanyang',
  'university',
  'korea',
  'honors',
  'american',
  'institute',
  'medical',
  'biological',
  'engineering',
  'aimbe',
  'fellow',
  'centennial',
  'scholar',
  'campus',
  'distinguished',
  'promotion',
  'award',
  'engineering',
  'dean',
  'award',
  'research',
  'achievement',
  'university',
  'illinois',
  'center',
  'advanced',
  'study',
  'fellow',
  'engineering',
  'dean',
  'award',
  'research',
  'achievement',
  'korean',
  'institute',
  'chemical',
  'engineers',
  'president',
  'young',
  'investigator',
  'award',
  'career',
  'award',
  'national',
  'science',
  'foundation',
  'scientist',
  'development',
  'grant',
  'american',
  'heart',
  'association',
  'fellowship',
  'hanyang',
  'university',
  'samnam',
  'foundation',
  'fellowship',
  'lotte',
  'korea',
  'selected',
  'articles',
  'journals

  'measurement',
  'biochemistry',
  'shi',
  'maruthamuthu',
  'leckband',
  'allosteric',
  'couple',
  'cadherin',
  'extracellular',
  'domain',
  'biophys',
  'li',
  'zhou',
  'wang',
  'shin',
  'su',
  'lei',
  'kuang',
  'guo',
  'yang',
  'tanaka',
  'ts',
  'leckband',
  'reynolds',
  'ab',
  'duan',
  'wang',
  'integrate',
  'biochemical',
  'mechanical',
  'signal',
  'regulate',
  'multifaceted',
  'human',
  'embryonic',
  'stem',
  'cell',
  'function',
  'cell',
  'biol',
  'mann',
  'leckband',
  'measure',
  'traction',
  'forces',
  'long',
  'term',
  'cell',
  'cultures',
  'cell',
  'mol',
  'bioeng',
  'le',
  'duc',
  'shi',
  'blonk',
  'sonnenberg',
  'wang',
  'leckband',
  'de',
  'rooij',
  'vinculin',
  'potentiate',
  'cadherin',
  'mechanosensing',
  'recruit',
  'actin',
  'anchor',
  'site',
  'adheren',
  'junction',
  'myosin',
  'ii',
  'dependent',
  'manner',
  'cell',
  'biol',
  'leckband',
  'design',
  'rule',
  'biological',
  'adhesion',
 

  'research',
  'university',
  'illinois',
  'arthur',
  'metzner',
  'early',
  'career',
  'award',
  'society',
  'rheology',
  'frontiers',
  'engineering',
  'national',
  'academy',
  'engineering',
  'packard',
  'fellowship',
  'science',
  'engineering',
  'tomorrow',
  'pi',
  'genome',
  'technology',
  'list',
  'teachers',
  'rank',
  'excellent',
  'university',
  'illinois',
  'spring',
  'fall',
  'pathway',
  'independence',
  'award',
  'national',
  'institute',
  'health',
  'postdoctoral',
  'fellowship',
  'jane',
  'coffin',
  'childs',
  'fund',
  'medical',
  'research',
  'gerald',
  'lieberman',
  'fellowship',
  'stanford',
  'university',
  'nsf',
  'graduate',
  'fellowship',
  'national',
  'science',
  'foundation',
  'stanford',
  'graduate',
  'fellow',
  'stanford',
  'university',
  'university',
  'honors',
  'carnegie',
  'mellon',
  'university',
  'andrew',
  'carnegie',
  'scholar',
  'carnegie',
  'mellon',
  'university',
  'phi',
  'kappa',


  'fundamental',
  'problem',
  'polymer',
  'physics',
  'develop',
  'design',
  'principle',
  'bio',
  'inspire',
  'soft',
  'material',
  'information',
  'sing',
  'research',
  'group',
  'biography',
  'charles',
  'sing',
  'assistant',
  'professor',
  'chemical',
  'biomolecular',
  'engineering',
  'sings',
  'background',
  'charge',
  'polymer',
  'polymer',
  'dynamic',
  'biophysic',
  'research',
  'group',
  'seek',
  'use',
  'coarse',
  'grain',
  'model',
  'understand',
  'emergent',
  'physics',
  'polymer',
  'biophysical',
  'system',
  'use',
  'result',
  'insight',
  'guide',
  'design',
  'new',
  'material',
  'current',
  'research',
  'effort',
  'focus',
  'problem',
  'challenging',
  'span',
  'large',
  'length',
  'time',
  'scale',
  'new',
  'theory',
  'simulation',
  'method',
  'necessary',
  'yield',
  'new',
  'fundamental',
  'physical',
  'principle',
  'sing',
  'recognize',
  'number',
  'honor',
  'include',
  'forbes',
  'science',
  '

  'guo',
  'tian',
  'yeh',
  'lan',
  'zhang',
  'tasan',
  'jain',
  'zhao',
  'efficient',
  'gene',
  'knock',
  'strategy',
  'double',
  'strand',
  'dna',
  'donor',
  'short',
  'homology',
  'arms',
  'nature',
  'chemical',
  'biology',
  'lian',
  'shultz',
  'cao',
  'hamedirad',
  'zhao',
  'multi',
  'functional',
  'genome',
  'wide',
  'crispr',
  'system',
  'high',
  'throughput',
  'genotype',
  'phenotype',
  'mapping',
  'nature',
  'communications',
  'hamedirad',
  'chao',
  'weisberg',
  'lian',
  'sinha',
  'zhao',
  'fully',
  'automate',
  'algorithm',
  'drive',
  'platform',
  'biosystems',
  'design',
  'nature',
  'communications',
  'wang',
  'guo',
  'dong',
  'zhao',
  'activation',
  'silent',
  'biosynthetic',
  'gene',
  'clusters',
  'transcription',
  'factor',
  'decoys',
  'nature',
  'chemical',
  'biology',
  'si',
  'tian',
  'min',
  'zhang',
  'sweedler',
  'van',
  'der',
  'donk',
  'zhao',
  'rapid',
  'structure',
  'activity',
  'scree

  'staining',
  'compute',
  'molecular',
  'histopathology',
  'technology',
  'yeh',
  'kenkel',
  'liu',
  'bhargava',
  'fast',
  'infrared',
  'chemical',
  'imaging',
  'quantum',
  'cascade',
  'laser',
  'anal',
  'chem',
  'van',
  'dijk',
  'mayerich',
  'carney',
  'optics',
  'sample',
  'interaction',
  'infrared',
  'spectroscopic',
  'devetter',
  'sivapalan',
  'patel',
  'schulmerich',
  'murphy',
  'bhargava',
  'observation',
  'molecular',
  'diffusion',
  'polyelectrolyte',
  'wrap',
  'sers',
  'nanoprobes',
  'langmuir',
  'liu',
  'schulmerich',
  'bhargava',
  'cunningham',
  'sculpting',
  'narrowband',
  'fano',
  'resonance',
  'inherent',
  'large',
  'area',
  'mid',
  'infrared',
  'photonic',
  'crystal',
  'microresonator',
  'spectroscopic',
  'image',
  'opt',
  'express',
  'rosenberg',
  'surya',
  'liu',
  'streyer',
  'law',
  'leslie',
  'bhargava',
  'wasserman',
  'flat',
  'mid',
  'infrared',
  'composite',
  'plasmonic',
  'material',
  'lat

  'effect',
  'liquid',
  'solid',
  'phase',
  'coexistence',
  'estimation',
  'crystal',
  'nucleation',
  'barrier',
  'physical',
  'review',
  'letter',
  'vol',
  'pp',
  'american',
  'physical',
  'society',
  'schmitz',
  'statt',
  'virnau',
  'binder',
  'investigation',
  'finite',
  'size',
  'effect',
  'determination',
  'interfacial',
  'tensions',
  'high',
  'performance',
  'computing',
  'science',
  'engineering',
  'pp',
  'springer',
  'cham',
  'rios',
  'de',
  'anda',
  'statt',
  'turci',
  'royall',
  'cp',
  'low',
  'density',
  'crystal',
  'charge',
  'colloid',
  'comparison',
  'yukawa',
  'theory',
  'contributions',
  'plasma',
  'physics',
  'statt',
  'ko',
  'virnau',
  'binder',
  'estimation',
  'nucleation',
  'barriers',
  'simulations',
  'crystal',
  'nuclei',
  'surround',
  'fluid',
  'equilibrium',
  'high',
  'performance',
  'computing',
  'science',
  'engineering',
  'pp',
  'springer',
  'cham',
  'statt',
  'schmitz',
  'virnau',
 

  'journals',
  'hollinger',
  'maloney',
  'jayashree',
  'natarajan',
  'markoski',
  'kenis',
  'nanoporous',
  'separator',
  'low',
  'fuel',
  'concentration',
  'minimize',
  'crossover',
  'direct',
  'methanol',
  'laminar',
  'flow',
  'fuel',
  'cell',
  'power',
  'sorc',
  'brushett',
  'jayashree',
  'zhou',
  'kenis',
  'investigation',
  'fuel',
  'media',
  'flexible',
  'laminar',
  'flow',
  'base',
  'fuel',
  'cells',
  'elect',
  'acta',
  'jayashree',
  'yoon',
  'brushett',
  'lopez',
  'montesinos',
  'natarajan',
  'markoski',
  'kenis',
  'performance',
  'membraneless',
  'laminar',
  'flow',
  'base',
  'fuel',
  'cell',
  'pow',
  'sorc',
  'press',
  'kolossov',
  'sprin',
  'sokolowski',
  'conour',
  'clegg',
  'kenis',
  'gaskins',
  'engineering',
  'redox',
  'sensitive',
  'linkers',
  'genetically',
  'encode',
  'fret',
  'base',
  'biosensor',
  'experimental',
  'biology',
  'medicine',
  'talreja',
  'kenis',
  'zukoski',
  'kinetic',
  'model'

In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer(sublinear_tf=True, decode_error="ignore", tokenizer=identity_tokenizer, lowercase=False, max_df=0.9, min_df=0.1)
tf_idf = tf_idf_vectorizer.fit_transform(docs)

In [105]:
df = pd.DataFrame(tf_idf[0].T.todense(), index=tf_idf_vectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print(df.head(25))

                 TF-IDF
doi            0.237508
zhao           0.208201
zhang          0.167906
interfaces     0.163942
solution       0.163942
materials      0.161053
crystal        0.146143
polymers       0.144009
highlight      0.140131
polymer        0.132743
invite         0.129451
charge         0.124561
understanding  0.123238
multiscale     0.123238
assembly       0.118381
issue          0.118381
applied        0.118159
transport      0.115818
influence      0.109544
advanced       0.108199
li             0.108055
acs            0.106545
design         0.104464
matrix         0.104110
transition     0.104110


In [109]:
from sklearn.decomposition import LatentDirichletAllocation

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

lda = LatentDirichletAllocation(n_components=10, learning_method='online')
lda.fit(tf_idf)

print_top_words(lda, tf_idf_vectorizer.get_feature_names(), 10)

Topic #0: university award science illinois professor
Topic #1: cellular kenis inspire uc co
Topic #2: colloidal physical cell macromolecules state
Topic #3: academic researcher assistant teaching teach
Topic #4: zukoski richard electrochemical medal evolution
Topic #5: stem pt formation image miller
Topic #6: teach undergraduate equilibrium biotechnology rapid
Topic #7: program undergraduate graduate assistant science
Topic #8: director associate academic carbon renewable
Topic #9: lecturer chbe new recent news



In [112]:
res = lda.transform(tf_idf[-2:])
print(res)

[[0.05000552 0.05000051 0.05000107 0.54997866 0.05000064 0.05000084
  0.05000121 0.05000981 0.05000064 0.05000108]
 [0.04143402 0.04143342 0.04143351 0.62709662 0.04143358 0.04143376
  0.04143407 0.04143373 0.0414338  0.04143349]]


In [None]:

# TODO regex matching for email, phone
# look through tags for name, title/role HTML classes

# dept_htmls = soup.find_all("h3", "list-expand-header")
# dept_names = [dh.get_text() for dh in dept_htmls]
# pprint(dept_names)

# engr_dept_faculty = {
#     "Agricultural and Biological Engineering": [],
#     "Aerospace Engineering": [],
#     "Bioengineering": [],
#     "Civil and Environmental Engineering": "",
#     "Chemical & Biomolecular Engineering": "",
#     "Computer Science",
#     "Electrical and Computer Engineering",
#     "Industrial and Enterprise Systems Engineering",
#     "Materials Science and Engineering",
#     "Mechanical Science and Engineering",
#     "Nuclear, Plasma and Radiological Engineering",
#     "Physics"
# }