# Code to web-scrape information from the MSU ComArtSci website

## Import the necessary Python libraries

In [1]:
import requests
from bs4 import BeautifulSoup

### Define the URL and get the list of people in the directory

In [6]:
# Address of the directory listing page; handed to get_list_faculty_members()
# to collect the names shown there. (URL is rebound later in the file to the
# base address used for the individual profile pages.)
URL = "https://comartsci.msu.edu/directory"

def get_list_faculty_members(url):
    """Download the directory page and return the names listed on it.

    Args:
        url: Address of the directory page to fetch.

    Returns:
        A list with the text of every ``<h4 class="explore-entity__title">``
        element found on the page. An empty list is returned (and a message
        printed) when the server does not answer with status code 200.
    """
    headers = {
        'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5',
    }

    # The browser-like headers have to be passed as ``headers=``. Passing
    # them as ``data=`` sends them as a form-encoded request body and the
    # server never sees the custom User-Agent.
    response = requests.get(url, headers=headers)

    # Bail out early so a failed request yields an empty result instead of
    # an UnboundLocalError further down.
    if response.status_code != 200:
        print(f"Failed to retrieve the directory. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    title_tags = soup.find_all("h4", {"class": "explore-entity__title"})
    return [tag.text for tag in title_tags]
    
# Scrape the directory page once; the resulting names drive the per-person
# crawl further down. Printed so the cell output shows what was found.
list_faculty = get_list_faculty_members(URL)
print(list_faculty)

['Teresa Mastin', 'Iasmim Admiden', 'Suhwoo Ahn', 'Fashina Aladé', 'Saleem E. Alhabash', 'Amanda Allard', 'Jeana-Dee Allen', 'Tracy Anderson', 'Marialina Côgo Antolini', 'Sofia Aparicio', 'Jason Archer', 'Young Anna Argyris', 'Megan Arney', 'Alexander Atwell', 'Elisavet Averkiadi', 'Russell Banks', 'Marge Barkman', 'Ava Francesca Battocchio', 'Johannes M. Bauer', 'Sevgi Baykaldi', 'Julie Beaty', 'Ken Beer', 'Eric Benderoff', 'Connie Benedict', 'Alexandra Benitez', 'Chad Bennett', 'Gary Bente', 'Daniel E. Bergan', 'John C. Besley', 'Subhalakshmi Bezbaruah', 'Christopher Bilski', 'Jia Bin', 'Jeremy Gibson Bond', 'Nicole Bond', 'Susan Bonner', 'Howard Bossen', 'Jess Brandt', 'Andrew Bredland', 'Mary Bresnahan', 'Danielle K. Brown', 'Heather Brown', 'Janice Bukovac-Phelps', 'Kristin Butler', 'Lisabeth Bylina', 'Celeste Campos-Castillo', 'Huajie Cao', 'Dustin Carnahan', 'Sue Carter']


### Crawl through each faculty member and get the information

In [22]:
def get_faculty_directory_results(url, faculty_name):
    """Fetch one person's page and return it parsed.

    Args:
        url: Base address under which the individual pages live.
        faculty_name: Path segment identifying the person; it is appended
            to ``url`` after a ``/``.

    Returns:
        A ``BeautifulSoup`` object for the page when the server answers with
        status code 200, otherwise ``None`` (after printing the status code).
    """
    headers = {
        'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5',
    }

    # ``headers=`` (not ``data=``) is what actually sets the request headers;
    # ``data=`` would turn the dict into a request body instead.
    response = requests.get(f"{url}/{faculty_name}", headers=headers)

    if response.status_code != 200:
        print(f"Failed to retrieve search results. Status code: {response.status_code}")
        return None

    return BeautifulSoup(response.text, 'html.parser')


def extract_name(soup):
    """Return the text of the first ``<h1>`` in *soup*.

    When the page has no ``<h1>`` the falsy lookup result (``None``) is
    returned unchanged.
    """
    heading = soup.find("h1")
    return heading.text if heading else heading

def extract_pos(soup):
    """Return the text of the first ``<h4>`` in *soup*.

    The caller stores this value as the person's position. When the page
    has no ``<h4>`` the falsy lookup result (``None``) is returned unchanged.
    """
    position_tag = soup.find("h4")
    if not position_tag:
        return position_tag
    return position_tag.text

def extract_dept(soup):
    """Return the departments listed on a person's page.

    Looks up the ``<ul class="single-person__department">`` element and
    collects the text of each ``<li>`` inside it.

    Returns:
        A list of strings, one per ``<li>``; empty when the ``<ul>`` is
        missing.
    """
    department_list = soup.find("ul", {"class": "single-person__department"})
    if not department_list:
        return []

    # ``find_all`` is the current Beautiful Soup name (``findAll`` is the
    # legacy alias) and matches the calls used elsewhere in this file.
    return [item.text for item in department_list.find_all("li")]

def extract_email(soup):
    """Return the address of the first ``mailto:`` link in *soup*.

    Only links whose href actually uses the ``mailto`` scheme are
    considered, so an ordinary URL that merely contains the word "mailto"
    is ignored. Any ``?subject=...`` style query after the address is
    dropped.

    Returns:
        The e-mail address as a string, or ``None`` when the page has no
        usable ``mailto:`` link.
    """
    for link in soup.find_all("a", href=True):
        scheme, colon, remainder = link.get("href").strip().partition(":")
        if not colon or scheme.lower() != "mailto":
            continue

        # Keep only the address part; mailto links may carry header fields
        # such as "?subject=..." after it.
        address = remainder.partition("?")[0].strip()
        if address:
            return address

    return None

def extract_bio(soup):
    """Return the text of the ``<div class="single-person__content">`` block.

    When the page has no such ``<div>`` the falsy lookup result (``None``)
    is returned unchanged.
    """
    content = soup.find("div", {"class": "single-person__content"})
    return content.text if content else content

def extract_info(soup):
    """Collect every scraped field for one person into a dict.

    Returns:
        A dict with the keys ``"name"``, ``"pos"``, ``"dept"``, ``"email"``
        and ``"bio"``, each filled by the matching ``extract_*`` helper.
    """
    return {
        "name": extract_name(soup),
        "pos": extract_pos(soup),
        "dept": extract_dept(soup),
        "email": extract_email(soup),
        "bio": extract_bio(soup),
    }

# Individual pages are requested under this base address.
URL = "https://comartsci.msu.edu/our-people"
list_faculty_dict = []
for faculty in list_faculty:
    # Page identifier: the lower-cased name with each space turned into "-".
    # NOTE(review): the recorded output shows a few 404s; presumably names
    # with punctuation or accents do not map onto a valid page this way.
    faculty_id = "-".join(faculty.lower().split(" "))
    soup = get_faculty_directory_results(URL, faculty_name=faculty_id)
    if not soup:
        # The fetch helper has already printed the failure; skip this person.
        continue
    list_faculty_dict.append(extract_info(soup))

Failed to retrieve search results. Status code: 404
Failed to retrieve search results. Status code: 404
Failed to retrieve search results. Status code: 404
Failed to retrieve search results. Status code: 404
Failed to retrieve search results. Status code: 404


### Create a CSV file from the information

In [29]:
import pandas as pd

# Fields to export, in column order.
list_info = ["name", "pos", "dept", "email", "bio"]

# Turn the per-person dicts into one list of values per column.
df = pd.DataFrame(
    {
        column: [record[column] for record in list_faculty_dict]
        for column in list_info
    }
)

# Show a preview, then write the table without the index column.
print(df.head(5))
df.to_csv("webscraped_data.csv", index=False)

                    name                    pos  \
0   Teresa  Mastin Ph.D.           Interim Dean   
1       Iasmim  Admiden           Ph.D. Student   
2           Suhwoo  Ahn         Ph.D. Candidate   
3   Fashina  Aladé,  PhD    Assistant Professor   
4        Amanda  Allard   Ph.D. Candidate (ABD)   

                               dept             email  \
0  [Advertising + Public Relations]  mastinte@msu.edu   
1                      [Journalism]  amidenia@msu.edu   
2                   [Communication]  ahnsuhwo@msu.edu   
3  [Advertising + Public Relations]  aladefas@msu.edu   
4                   [Communication]  allardam@msu.edu   

                                                 bio  
0  Teresa Mastin, PhD (1998), Mass Media—Michigan...  
1                                               None  
2  Suhwoo Ahn is a fifth-year doctoral student in...  
3  Fashina (Shina) Aladé is an Assistant Professo...  
4  Amanda Allard is projected to graduate in the ...  
