In [1]:
import requests
from bs4 import BeautifulSoup

In [24]:
def scrape_website(url, timeout=10):
    """Download a web page and return the text content of its <body>.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Requests
            applies no timeout by default, so an unresponsive host would
            otherwise block the notebook indefinitely.

    Returns:
        The text of every <body> element found, joined with single spaces
        (an empty string if the parsed document has no <body>).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently scraping the text of an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Take the whole <body> rather than individual <p>/<h*> tags; the page's
    # original whitespace and newlines are kept as-is in the returned text.
    text_data = [element.get_text() for element in soup.find_all(['body'])]

    return " ".join(text_data)


# Page to scrape; `data` holds its body text and is read by the later cells.
url = "https://caesar.web.engr.illinois.edu/"
data = scrape_website(url)

In [25]:
# Display the raw scraped text as the cell output for inspection.
data

'\n\n\n\nMatthew Caesar\nProfessor \nDepartment of Computer Science\nUniversity of Illinois at Urbana-Champaign\nUrbana, IL, 61801\n\nEmail: caesar (at) cs (dot) illinois (dot) edu\nOffice: Room 3118, Siebel Center \nPhone: 847-323-2968\n\n\nLinks: \n[ Publications ]\n[ Bio ]\n\n\n\n\n\nI am a Professor in the Department of Computer Science at UIUC.\nI am also an Affiliate Professor in the  Department of Electrical and Computer Engineering, an Affiliate Research Professor in the Coordinated Science Laboratory, Affiliate Professor in the School of Information Sciences, and a member of the Information Trust Institute. \nI currently serve as the Vice Chair of ACM SIGCOMM, and the co-chair of The Networking Channel, an online community talk series for the computer systems and networking community. \nI co-founded and previously served as the Chief Science Officer and President of Veriflow (sold to VMware in 2019). \nI received my Ph.D. in Computer Science from UC Berkeley.  \n\n\nMy researc

In [26]:
import spacy

# Load the SpaCy 'en_core_web_sm' pipeline once; process_text() below reads
# this module-level `nlp` object.
nlp = spacy.load('en_core_web_sm')


def process_text(text):
    """Run the loaded SpaCy pipeline on ``text`` and collect its named entities.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry of
        ``doc.ents`` and in the same order.
    """
    entity_pairs = []
    # Relies on the module-level `nlp` pipeline being loaded beforehand.
    for entity in nlp(text).ents:
        entity_pairs.append((entity.text, entity.label_))
    return entity_pairs

# Extract (text, label) named-entity pairs from the scraped page text
# and print them to inspect what SpaCy recognised.
structured_data = process_text(data)
print(structured_data)

[('Matthew Caesar', 'PERSON'), ('Professor \nDepartment of Computer Science\nUniversity of Illinois', 'ORG'), ('Urbana-Champaign', 'ORG'), ('Urbana', 'GPE'), ('IL', 'ORG'), ('61801', 'DATE'), ('Siebel Center \nPhone', 'WORK_OF_ART'), ('847-323-2968', 'CARDINAL'), ('the Department of Computer Science at UIUC', 'ORG'), ('Affiliate', 'ORG'), ('the  Department of Electrical and Computer Engineering', 'ORG'), ('Affiliate Research Professor', 'ORG'), ('the Coordinated Science Laboratory', 'ORG'), ('Affiliate Professor', 'ORG'), ('the School of Information Sciences', 'ORG'), ('the Information Trust Institute', 'ORG'), ('The Networking Channel', 'ORG'), ('Veriflow', 'PERSON'), ('VMware', 'ORG'), ('2019', 'DATE'), ('Ph.D. in Computer Science', 'WORK_OF_ART'), ('UC Berkeley', 'ORG'), ('one', 'CARDINAL'), ('PhD', 'WORK_OF_ART'), ('Yu-Ju Chang', 'PERSON'), ('Kuan-Yen', 'ORG'), ('Chou', 'PERSON'), ('PhD', 'WORK_OF_ART'), ('Shivram', 'PERSON'), ('Xin Jin', 'PERSON'), ('Deepti Kalasapura', 'PERSON'),

In [27]:
def clean_and_split_data(data):
    """Split text on newlines and drop lines that are empty after stripping.

    Args:
        data: The text to split.

    Returns:
        A list of the non-blank lines, each with leading and trailing
        whitespace removed, in their original order.
    """
    # Strip every line exactly once, then filter out the ones left empty.
    stripped_lines = (line.strip() for line in data.split('\n'))
    return [line for line in stripped_lines if line]


# Turn the scraped text into a list of non-empty, whitespace-stripped lines
clean_data = clean_and_split_data(data)

# Print one entry per line to verify the result
for info in clean_data:
    print(info)



Matthew Caesar
Professor
Department of Computer Science
University of Illinois at Urbana-Champaign
Urbana, IL, 61801
Email: caesar (at) cs (dot) illinois (dot) edu
Office: Room 3118, Siebel Center
Phone: 847-323-2968
Links:
[ Publications ]
[ Bio ]
I am a Professor in the Department of Computer Science at UIUC.
I am also an Affiliate Professor in the  Department of Electrical and Computer Engineering, an Affiliate Research Professor in the Coordinated Science Laboratory, Affiliate Professor in the School of Information Sciences, and a member of the Information Trust Institute.
I currently serve as the Vice Chair of ACM SIGCOMM, and the co-chair of The Networking Channel, an online community talk series for the computer systems and networking community.
I co-founded and previously served as the Chief Science Officer and President of Veriflow (sold to VMware in 2019).
I received my Ph.D. in Computer Science from UC Berkeley.
My research focuses on the design, analysis, and implementation

In [None]:
def add_course(knowledge_base, course_code, course_name, semesters):
    """Append a course record to ``knowledge_base["teaching"]["courses"]``.

    The ``"teaching"`` dict and its ``"courses"`` list are created when
    missing, so the function also works on an empty knowledge base instead
    of raising ``KeyError``.

    Args:
        knowledge_base: Dict that is updated in place.
        course_code: Value stored under the record's ``"code"`` key.
        course_name: Value stored under the record's ``"name"`` key.
        semesters: Value stored under the record's ``"semesters"`` key
            (stored by reference, not copied).
    """
    new_course = {
        "code": course_code,
        "name": course_name,
        "semesters": semesters,
    }
    # setdefault leaves existing containers untouched and only fills in gaps.
    teaching = knowledge_base.setdefault("teaching", {})
    teaching.setdefault("courses", []).append(new_course)