In [1]:
#imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import sys
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
import pandas as pd
import csv
import re
import os
from datetime import datetime

In [2]:
# Ask for a run label; it prefixes the name of every file this notebook writes
filename_prefix = input("Enter your file name followed by the current semester: ")
# Today's date (YYYY-MM-DD) is appended to each filename
date_str = datetime.today().strftime('%Y-%m-%d')

# Make sure every output folder exists (exist_ok avoids an error on re-runs)
for folder in ("webscrapedInfo", "intermediateFiles", "finalFiles"):
    os.makedirs(folder, exist_ok=True)

# Raw scrape and the parsed faculty table
output_filename = f"webscrapedInfo/{filename_prefix}_Output_{date_str}.csv"
all_faculty_info_filename = f"webscrapedInfo/{filename_prefix}_AllFacultyInfo_{date_str}.csv"
# Cleaned intermediate table
contactinfo = f"intermediateFiles/{filename_prefix}_contactinfo_{date_str}.csv"
# Final export
contactinfo_Final = f"finalFiles/{filename_prefix}_contactinfo_Final_{date_str}.csv"

In [3]:
# Start a Firefox WebDriver session and set a 10-second implicit wait for element lookups
driver = webdriver.Firefox()
driver.implicitly_wait(10)

# Open the Eastern faculty directory page
driver.get("https://www.easternct.edu/faculty-directory/index.html")

In [4]:
# Locate the directory search button by its element id and click it
# NOTE(review): presumably clicking with no search terms lists every entry — confirm on the page
Search = driver.find_element(by=By.ID, value='directorySearchButton')
Search.click()

In [5]:
# Collect every element that carries the CSS class "mix"; their text is read in the next cell
table = driver.find_elements(by=By.CLASS_NAME, value="mix")

In [6]:
# Read the visible text of each scraped element exactly once.
# (The previous warm-up loop fetched every element's text and threw it away,
# doubling the number of WebDriver round trips.)
# NOTE(review): the first "mix" element is skipped, as before — presumably it is
# not a faculty entry; confirm against the page.
data = [t.text.strip() for t in table[1:]]

# One row per element, raw multi-line text in a single column
df = pd.DataFrame(data, columns=["Extracted Text"])

# Save the raw scrape so later cells can work from the file instead of the browser
df.to_csv(output_filename, index=False)

print(f"CSV file saved as '{output_filename}'")

CSV file saved as 'webscrapedInfo/test9Spring2025_Output_2025-04-02.csv'


In [7]:
# Quit the WebDriver session now that the scraped text has been saved to CSV;
# the elements in `table` cannot be queried after this point
driver.quit()

In [8]:
def parse_professor_info(text):
    """
    Parse one directory entry (a multi-line string) into a dictionary.

    Expected format:
      Line 1: Name
      Line 2: Position
      Line 3: Department
      Subsequent lines: 'Phone:', 'Email:', 'Office:' and optionally 'Hours:'

    The first three non-blank lines are taken positionally, whatever they
    contain. If an entry has no position line, 'department' therefore ends
    up holding the 'Phone: ...' line; a later cleanup step relies on that.

    Args:
        text: The raw entry text. Anything that is not a string (for example
            the NaN pandas produces for a blank CSV cell) yields a record
            whose values are all None.

    Returns:
        dict with keys 'name', 'position', 'department', 'phone', 'email',
        'office', 'Hours'; fields that were not found are None.
    """
    professor = {
        'name': None,
        'position': None,
        'department': None,
        'phone': None,
        'email': None,
        'office': None,
        'Hours': None
    }

    # Blank CSV cells are read back as NaN (a float), which has no splitlines()
    if not isinstance(text, str):
        return professor

    # Drop blank lines and surrounding whitespace
    lines = [line.strip() for line in text.splitlines() if line.strip()]

    # zip stops at the shorter input, so short entries leave later fields None
    for key, line in zip(('name', 'position', 'department'), lines):
        professor[key] = line

    labelled_fields = (
        ('Phone:', 'phone'),
        ('Email:', 'email'),
        ('Office:', 'office'),
        ('Hours:', 'Hours'),
    )
    for line in lines:
        for label, key in labelled_fields:
            if line.startswith(label):
                # Strip only the leading label; the value itself may repeat it
                # (e.g. "Hours: Office Hours: MW 1-2")
                professor[key] = line[len(label):].strip()
                break

    return professor

def main():
    """Convert the raw scraped-text CSV into a structured faculty CSV.

    Reads the "Extracted Text" column from `output_filename`, parses every
    entry with `parse_professor_info`, and writes the resulting table to
    `all_faculty_info_filename`.
    """
    raw_entries = pd.read_csv(output_filename)["Extracted Text"]

    # One dictionary per directory entry
    records = [parse_professor_info(entry) for entry in raw_entries]

    pd.DataFrame(records).to_csv(all_faculty_info_filename, index=False)
    print(f"Parsed CSV saved as '{all_faculty_info_filename}'")


# Run the parsing step when this code is executed as the main module
if __name__ == '__main__':
    main()

Parsed CSV saved as 'webscrapedInfo/test9Spring2025_AllFacultyInfo_2025-04-02.csv'


In [9]:

# Reload the parsed faculty table from disk
df = pd.read_csv(all_faculty_info_filename)

# Seed 'userID' with a copy of 'name'; later cells reshape it into an identifier
df = df.assign(userID=df['name'])

# Write the table (now including 'userID') back to the same file
df.to_csv(all_faculty_info_filename, index=False)

In [10]:
# Remove generational / credential suffixes (Jr., III, II, RN) from names
# Load the CSV file
df = pd.read_csv(all_faculty_info_filename)

# "Jr." is removed only when it ends the name:
#   \s+  -> one or more spaces before "Jr."
#   Jr\. -> the literal "Jr."
#   \s*$ -> optional trailing spaces, then end of string
jr_suffix = r"\s+Jr\.\s*$"

# III / II / RN are removed only when they stand alone as a word (\b = word
# boundary). A plain substring replace would also cut these letters out of
# the middle of a longer word.
standalone_suffix = r"\b(?:III|II|RN)\b"

# Both columns get the identical cleanup
for column in ("userID", "name"):
    df[column] = (
        df[column]
        .str.replace(jr_suffix, "", regex=True)
        .str.replace(standalone_suffix, "", regex=True)
        .str.strip()
    )

# Save the cleaned table as the intermediate contact file
df.to_csv(contactinfo, index=False)

print(f"Filtered data saved as '{contactinfo}'")

Filtered data saved as 'intermediateFiles/test9Spring2025_contactinfo_2025-04-02.csv'


In [11]:
# Reload the intermediate contact CSV; its userID column is reformatted to "Last, First Initial" below
df = pd.read_csv(contactinfo)

def format_name(name):
    """Format a full name as "Last, F" (last word, comma, first initial).

    Args:
        name: The name to format. Non-string values (for example the NaN
            that pandas produces for an empty CSV cell) are returned
            unchanged instead of raising.

    Returns:
        "Last, F" when the name has at least two whitespace-separated words;
        otherwise the input unchanged.
    """
    # Missing names are read from CSV as NaN (a float), which has no split()
    if not isinstance(name, str):
        return name

    name_parts = name.split()
    if len(name_parts) >= 2:
        last_name = name_parts[-1]  # Last name is the last word
        first_initial = name_parts[0][0]  # First initial from first word
        return f"{last_name}, {first_initial}"
    return name  # If format is unexpected, keep the name as is

# Rewrite every userID in "Last, First Initial" form
df["userID"] = df["userID"].map(format_name)

# Overwrite the intermediate contact file with the reformatted userIDs
df.to_csv(contactinfo, index=False)

print(f"Updated file saved as '{contactinfo}'")

Updated file saved as 'intermediateFiles/test9Spring2025_contactinfo_2025-04-02.csv'


In [12]:
# Keep only the columns needed for the final contact file
# Load the cleaned intermediate CSV
df = pd.read_csv(contactinfo)

# Columns to export, in output order
final_columns = ["userID", "name", "position", "department", "phone", "email"]
df_filtered = df.loc[:, final_columns]

# Write the reduced table to the final-files folder
df_filtered.to_csv(contactinfo_Final, index=False)

print(f"Formatted file saved as '{contactinfo_Final}'")

Formatted file saved as 'finalFiles/test9Spring2025_contactinfo_Final_2025-04-02.csv'


In [13]:
# Normalise every field: strip punctuation, join words with underscores, lowercase
df = pd.read_csv(contactinfo_Final)


def _replace_literals(series, replacements):
    """Apply (old, new) literal substring replacements to a string Series, in order.

    regex=False is passed explicitly: several patterns ('.', '(', ')') are
    regex metacharacters, and the default of `regex` differs between pandas
    versions.
    """
    for old, new in replacements:
        series = series.str.replace(old, new, regex=False)
    return series


# Order matters: ' - ' and '- ' must collapse before the bare '-' is replaced
df['position'] = _replace_literals(df['position'], [
    (', ', ' '), (' - ', ' '), ('- ', ' '), ('-', ' '),
    ('.', ''), ("'", ''), ('(', '_'), (')', '_'),
    (';', ''), (':', ''), ('&', ''),
])

df['department'] = _replace_literals(df['department'], [
    (' ', '_'), ('-', ''), ('.', ''), ("'", ''),
    (',', ''), ('(', '_'), (')', '_'), ('&', ''),
])

df["name"] = _replace_literals(df["name"], [
    (' ', '_'), ('-', ''), ('.', ''), ("'", ''),
    ('(', '_'), (')', '_'), (',', ''),
])

# userID was "Last, F"; removing separators leaves e.g. "SmithJ"
df["userID"] = _replace_literals(df["userID"], [
    (',', ''), ('-', ''), (' ', ''), ('.', ''), ("'", ''),
])

# Replace phone "-"
df["phone"] = _replace_literals(df["phone"], [('-', '_')])
# Remove email "."
df["email"] = _replace_literals(df["email"], [('.', '')])

# Fix the missing-position issue: when an entry has no position line, the
# parser leaves the "Phone: ..." line in 'department'; mark those as 'null'
df.loc[df['department'].str.contains('phone', case=False, na=False), 'department'] = 'null'
# Replace en dashes in position
df['position'] = df['position'].str.replace('–', '_', regex=False)

# Lowercase every text (object-dtype) column; other columns are left untouched
df = df.apply(lambda col: col.str.lower() if col.dtype == "object" else col)
# Append the ", " separator that the combining step expects after department
df['department'] = df['department'] + ', '
# Join the remaining words in position with underscores
df['position'] = df['position'].str.replace(' ', '_', regex=False)
# Filter out rows whose userID is a single character or empty
df = df[df['userID'].apply(lambda x: len(str(x)) > 1)]

df.to_csv(contactinfo_Final, index=False)
print(f"Formatted file saved as '{contactinfo_Final}'")

Formatted file saved as 'finalFiles/test9Spring2025_contactinfo_Final_2025-04-02.csv'


In [14]:
# Combine each row's fields into one string
# Load the normalised contact-info CSV written by the previous cell
df = pd.read_csv(contactinfo_Final)

def combine_columns(row):
    """Join one row's fields into a single string.

    Layout: "userID, name, position, departmentphone,email".
    No separator is inserted between department and phone (the department
    value is expected to already end in ", "), and the comma before email
    is not followed by a space.
    """
    template = "{}, {}, {}, {}{},{}"
    field_order = ('userID', 'name', 'position', 'department', 'phone', 'email')
    return template.format(*(row[field] for field in field_order))

# Build one combined string per row
df_combined = df.apply(combine_columns, axis=1)

# Hold the strings in a single-column DataFrame named "contactInfo"
df_prolog_ready = pd.DataFrame(df_combined, columns=["contactInfo"])

# Sort the rows alphabetically by the combined string, then renumber the index
df_prolog_ready = df_prolog_ready.sort_values(by="contactInfo", ascending=True)
df_prolog_ready = df_prolog_ready.reset_index(drop=True)

# Export; this overwrites the multi-column final file with the single-column version
df_prolog_ready.to_csv(contactinfo_Final, index=False)

print(f"Exported to {contactinfo_Final}")

Exported to finalFiles/test9Spring2025_contactinfo_Final_2025-04-02.csv


Fix the contact info userID issue: Done
Think about Prolog questions and what facts would be put into Prolog (e.g. "Office Hours" as a relation): Done
Relations that are implications in first-order and propositional logic

TO DO:
Replace . : Done
Replace commas in department field : Done
delete ' : Done

TO DO:
Write Python code that creates Prolog statements
Classify the questions into their respective logic

Prolog Questions:
What time does this prof have office hours?
What are this prof's email, phone, office location, etc.?
Which profs are in this building?
Which profs are in this department?
What time is this class?
Who teaches this class and section?
Does this prof have an office in building X?
List all the profs that teach a Sub class
List all profs that teach at a certain time
Is prof X in their office at time or day X?