In [6]:
#Utilities import
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm  

In [7]:
#Defining the URLs for Scarping
lhr_URL = "https://lhr.nu.edu.pk/faculty/" 
pwr_URL = "https://pwr.nu.edu.pk/"
isb_URL = "https://isb.nu.edu.pk/Faculty/allfaculty/"
khi_URL = "https://khi.nu.edu.pk/faculty-php/"
cfd_URL = "https://cfd.nu.edu.pk/all-departments/"

In [8]:
#Save function
def save_csv(dataframe, name):
    # Clean and format data
    dataframe['HEC Approved PhD Supervisor'] = dataframe['HEC Approved PhD Supervisor'].astype(bool)
    dataframe = dataframe.drop_duplicates().reset_index(drop=True)
    dataframe.to_csv(name, index=False)

In [9]:
#Lahore Faculty
def extract_lhr_faculty_data(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    all_data = []

    # Find all department containers
    departments = soup.find_all('div', class_='container')

    for dept in tqdm(departments, desc="Processing Departments", unit="dept"):
        # Extract department name
        h1 = dept.find('h1', class_='mb-2 mt-3')
        if not h1:
            continue
        department_name = h1.get_text(strip=True).replace('Faculty', '').strip()

        # Extract faculty cards
        faculty_cards = dept.find_all('div', class_='facultyCard')

        for card in tqdm(faculty_cards, desc=f"Processing {department_name}", unit="faculty", leave=False):
            try:
                # Extract ID and faculty profile link
                faculty_link_tag = card.find('a', class_='faculty-link')
                if not faculty_link_tag:
                    continue
                faculty_link = "https://lhr.nu.edu.pk" + faculty_link_tag['href']
                faculty_id = int(faculty_link_tag['href'].split('/')[-1])

                # Extract Name
                name = card.find('h5', class_='text-center').get_text(strip=True)

                # Extract Designation and HEC Approval
                designation_tag = card.find('p', class_='small text-center font-italic')
                designation_lines = designation_tag.get_text('\n').split('\n') if designation_tag else []
                designation = designation_lines[0].strip() if designation_lines else ''
                hec_approved = any("HEC Approved PhD Supervisor" in line for line in designation_lines)

                # Extract Email
                email = card.find('p', class_='mb-0 text-center').get_text(strip=True)

                # Extract Image URL
                img = card.find('img')
                image_url = "https://lhr.nu.edu.pk" + img['src'] if img else None

                # Fetch Profile Page for Extension & Education
                profile_page = BeautifulSoup(requests.get(faculty_link).text, 'html.parser')

                # Extract Extension
                extension = None
                extension_tag = profile_page.find('span', class_='small')
                if extension_tag:
                    match = re.search(r'Ext[:\s]*([\d]+)', extension_tag.text.strip())
                    extension = match.group(1) if match else None

                # Extract Highest Education
                highest_education = None
                education_heading = profile_page.find('h2', string="Education")
                if education_heading:
                    education_list = education_heading.find_next('ul')
                    if education_list:
                        highest_education = education_list.find('li').get_text(strip=True)

                # Append extracted data
                all_data.append({
                    'ID': faculty_id,
                    'Name': name,
                    'Designation': designation,
                    'HEC Approved PhD Supervisor': hec_approved,
                    'Highest Education': highest_education,
                    'Email': email,
                    'Department': department_name,
                    'Extension': extension,
                    'ImageURL': image_url
                })

            except Exception as e:
                print(f"Error processing faculty in {department_name}: {e}")
                continue

    return all_data

In [10]:
response = requests.get(lhr_URL)
if response.status_code == 200:
    faculty_data = extract_lhr_faculty_data(response.text)
    df = pd.DataFrame(faculty_data)
    save_csv(df,'lhr.csv')

Processing Departments:   0%|          | 0/11 [00:00<?, ?dept/s]
Processing FAST School of Computing:   0%|          | 0/98 [00:00<?, ?faculty/s][A
Processing FAST School of Computing:   1%|          | 1/98 [00:00<01:27,  1.10faculty/s][A
Processing FAST School of Computing:   2%|▏         | 2/98 [00:01<01:28,  1.08faculty/s][A
Processing FAST School of Computing:   3%|▎         | 3/98 [00:02<01:25,  1.12faculty/s][A
Processing FAST School of Computing:   4%|▍         | 4/98 [00:03<01:21,  1.15faculty/s][A
Processing FAST School of Computing:   5%|▌         | 5/98 [00:04<01:18,  1.18faculty/s][A
Processing FAST School of Computing:   6%|▌         | 6/98 [00:05<01:18,  1.17faculty/s][A
Processing FAST School of Computing:   7%|▋         | 7/98 [00:06<01:17,  1.18faculty/s][A
Processing FAST School of Computing:   8%|▊         | 8/98 [00:06<01:17,  1.15faculty/s][A
Processing FAST School of Computing:   9%|▉         | 9/98 [00:07<01:18,  1.13faculty/s][A
Processing FAST School 

Error processing faculty in FAST School of Computing: 'NoneType' object has no attribute 'get_text'



Processing FAST School of Computing:  63%|██████▎   | 62/98 [00:56<00:31,  1.15faculty/s][A
Processing FAST School of Computing:  64%|██████▍   | 63/98 [00:56<00:30,  1.15faculty/s][A
Processing FAST School of Computing:  65%|██████▌   | 64/98 [00:57<00:29,  1.16faculty/s][A
Processing FAST School of Computing:  66%|██████▋   | 65/98 [00:58<00:29,  1.13faculty/s][A
Processing FAST School of Computing:  67%|██████▋   | 66/98 [00:59<00:29,  1.08faculty/s][A
Processing FAST School of Computing:  68%|██████▊   | 67/98 [01:00<00:28,  1.09faculty/s][A
Processing FAST School of Computing:  69%|██████▉   | 68/98 [01:01<00:27,  1.08faculty/s][A
Processing FAST School of Computing:  70%|███████   | 69/98 [01:02<00:26,  1.08faculty/s][A
Processing FAST School of Computing:  71%|███████▏  | 70/98 [01:03<00:25,  1.09faculty/s][A
Processing FAST School of Computing:  72%|███████▏  | 71/98 [01:04<00:24,  1.08faculty/s][A
Processing FAST School of Computing:  73%|███████▎  | 72/98 [01:05<00

In [12]:
#Karachi Faculty
def extract_khi_faculty_data(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    all_data = []

    departments_tag = soup.find('li',
                                class_='menu-item menu-item-type-custom menu-item-object-custom current-menu-ancestor current-menu-parent menu-item-has-children menu-item-6256')
    departments = departments_tag.find_all('li', class_='menu-item')

    department_data = [(dept.a.text.strip(), dept.a['href']) for dept in departments]
    serial_number = 1

    for name, link in department_data:
        department_name = name
        response_inner = requests.get(link)
        faculty = BeautifulSoup(response_inner.text, 'html.parser')
        faculty_cards = faculty.find_all('div', class_="gdlr-core-personnel-list-column")
        print("Scraping: "+department_name)
        for card in faculty_cards:
            try:
                faculty_link = card.find('a')
                response_inner_inner = requests.get(faculty_link['href'])
                faculty_page = BeautifulSoup(response_inner_inner.text, 'html.parser')

                email_tag = faculty_page.find('div', class_='kingster-personnel-info-list kingster-type-email')
                extension_tag = faculty_page.find('div', class_='kingster-personnel-info-list kingster-type-phone')
                name_tag = faculty_page.find('h3', class_='gdlr-core-title-item-title gdlr-core-skin-title')
                designation_hec_tag = faculty_page.find('span',
                                                        class_='gdlr-core-title-item-caption gdlr-core-info-font gdlr-core-skin-caption')
                image_tag = faculty_page.find('a', class_='gdlr-core-lightgallery gdlr-core-js')

                email = email_tag.text.strip() if email_tag else 'N/A'
                extension = extension_tag.text.strip() if extension_tag else 'N/A'
                name = name_tag.text.strip() if name_tag else 'N/A'
                designation = 'N/A'
                hec_approved = False

                if designation_hec_tag:
                    designation_text = designation_hec_tag.get_text(separator=' ').strip()
                    designation = designation_text.split('\n')[0].strip()
                    hec_approved = 'HEC Approved PhD Supervisor' in designation_text

                image_url = image_tag['href'] if image_tag else 'N/A'

                education = 'N/A'
                education_heading = faculty_page.find('i', class_='gdlr-core-icon-list-icon fa fa-graduation-cap')

                if education_heading:
                    education_tag = education_heading.find_next('span')
                    if education_tag:
                        education = education_tag.text.strip()

                if education == 'N/A':
                    education_heading = faculty_page.find(lambda tag: tag.name in ['h3'] and "Education" in tag.text)
                    if education_heading:
                        education_tag = education_heading.find_next('span')
                        if education_tag:
                            education = education_tag.text.strip()

                all_data.append({
                    'ID': serial_number,
                    'Name': name,
                    'Designation': designation,
                    'HEC Approved PhD Supervisor': hec_approved,
                    'Highest Education': education,
                    'Email': email,
                    'Department': department_name,
                    'Extension': extension,
                    'ImageURL': image_url
                })
                serial_number += 1

            except Exception as e:
                print(f"Error processing faculty in {department_name}: {e}")
                continue

    return all_data

In [13]:
response = requests.get(khi_URL)
if response.status_code == 200:
    faculty_data = extract_khi_faculty_data(response.text)
    df = pd.DataFrame(faculty_data)
    save_csv(df,'khi.csv')

Scraping: Department of Computer Science
Scraping: Department of Cyber Security
Scraping: Department of Artificial Intelligence
Scraping: Department of Software Engineering
Scraping: Department of Electrical Engineering
Scraping: Department of Management Sciences
Scraping: Department of Sciences & Humanities


In [14]:
#Faisalabad Faculty
def extract_cfd_faculty_data(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    all_data = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    serial_number = 152
    departments_tag = soup.find('div', class_='kc-elm kc-css-633007 kc_row kc_row_inner')
    departments = departments_tag.find_all('div', class_='kc_col-sm-3')
    for dept in departments:
        department_name = dept.find('div', class_='content-desc').text.strip()
        department_link = dept.find('a')['href']
        response_inner = requests.get(department_link, headers=headers)
        faculty = BeautifulSoup(response_inner.text, 'html.parser')
        faculty_cards = faculty.find_all('div', class_="col-md-3 col-sm-6 col-xs-12")
        for card in faculty_cards:
            try:
                # Extract profile page link
                profile_link_tag = card.find('a', href=True)
                profile_link = profile_link_tag['href'] if profile_link_tag else 'N/A'
                response_inner_inner = requests.get(profile_link, headers=headers)
                profile = BeautifulSoup(response_inner_inner.text, 'html.parser')
                # Find the <li> containing the <span> with text "Ext:"
                teacher_address = profile.find('ul', class_='teacher__address')
                # Find the <li> where <span> contains "Ext:"
                ext_li = teacher_address.find_all('li')
                print(profile_link)

                # Extract the extension number by removing "Ext:" text
                extension = ext_li[3].get_text(strip=True).replace("Ext:", "").strip() if ext_li[3] else 'N/A'
                education = 'N/A'
                education_tag = profile.find('div', class_='htc__skill__container progress__bar--2')
                education = 'N/A'
                if education_tag.find('p'):
                    education = education_tag.find_all('p')[0].text
                if education_tag.find('li'):
                    education = education_tag.find_all('li')[0].text

                education = education.replace("\n", " ").strip()
                # Extract image URL
                image_tag = card.find('img')
                image_url = image_tag['src'] if image_tag else 'N/A'

                # Extract name
                name_tag = card.find('h4')
                name = name_tag.text.strip() if name_tag else 'N/A'

                # Extract designation
                designation_tag = card.find('h6')
                designation = designation_tag.text.strip() if designation_tag else 'N/A'

                # Check for HEC approval
                hec_approved = 'HEC approved PhD Supervisor' in card.text

                # Extract email
                email_tag = card.find('p')
                email = email_tag.text.strip() if email_tag else 'N/A'
                # Store extracted data
                all_data.append({
                    'ID': serial_number,
                    'Name': name,
                    'Designation': designation,
                    'HEC Approved PhD Supervisor': hec_approved,
                    'Highest Education': education,
                    'Email': email,
                    'Department': department_name,
                    'Extension': extension,
                    'ImageURL': image_url,
                })
                serial_number += 1

            except Exception as e:
                print(f"Error processing faculty card: {e}")
                continue

    return all_data

In [15]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
response = requests.get(cfd_URL,headers=headers)
if response.status_code == 200:
    faculty_data = extract_cfd_faculty_data(response.text)
    df = pd.DataFrame(faculty_data)
    save_csv(df,'cfd.csv')

https://cfd.nu.edu.pk/faculty/muhammad-fayyaz/
https://cfd.nu.edu.pk/faculty/muhammad-shahzad-sarfraz/
https://cfd.nu.edu.pk/faculty/muhammad-umar-aftab/
https://cfd.nu.edu.pk/faculty/muhammad-usama/
https://cfd.nu.edu.pk/faculty/ammar-rafiq/
https://cfd.nu.edu.pk/faculty/iqra-muhammad/
https://cfd.nu.edu.pk/faculty/rabia-maqsood/
https://cfd.nu.edu.pk/faculty/usman-ghous/
https://cfd.nu.edu.pk/faculty/adeel-ashraf-cheema/
https://cfd.nu.edu.pk/faculty/asma-sattar/
https://cfd.nu.edu.pk/faculty/hafiz-tayyeb-javed/
https://cfd.nu.edu.pk/faculty/maria-maqsood/
https://cfd.nu.edu.pk/faculty/muhammad-atif/
https://cfd.nu.edu.pk/faculty/rida-ghafoor-hussain/
https://cfd.nu.edu.pk/faculty/rizwan-ul-haq/
https://cfd.nu.edu.pk/faculty/zain-iqbal/
https://cfd.nu.edu.pk/faculty/ahmad-ali-tabassam/
https://cfd.nu.edu.pk/faculty/ali-hamza/
https://cfd.nu.edu.pk/faculty/ali-raza/
https://cfd.nu.edu.pk/faculty/aliza-saeed/
https://cfd.nu.edu.pk/faculty/aqsa-younas/
https://cfd.nu.edu.pk/faculty/ayes

In [28]:
#Islamabad Faculty
from datetime import datetime

# Base URLs
faculty_api_url = "http://isb.nu.edu.pk/Faculty/GetAllEmp?id={}"
faculty_profile_url = "http://isb.nu.edu.pk/Faculty/Details1?id={}"  

# Department ID to Name Mapping
department_mapping = {
    "301": "FAST School of Computing",
    "302": "FAST School of Engineering",
    "303": "FAST School of Management",
    "313": "FAST School of Science & Humanities",
}

# Headers for requests
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

def get_highest_education(emp_id):
    """Fetches the highest education details from the faculty profile page."""
    profile_url = faculty_profile_url.format(emp_id)
    response1 = requests.post(profile_url, headers=headers)
    
    if response1.status_code == 200:
        profile = response1.json()
        
        if "listEEdu" in profile and profile["listEEdu"]:
            last_edu = profile["listEEdu"][0]
            
            degree = last_edu.get("Degree_Name", "N/A")
            subject = last_edu.get("Degree_Subject", "N/A")
            institute = last_edu.get("Degree_Institute", "N/A")
            year = last_edu.get("Degree_Year", "N/A")

            # Convert timestamp if available
            if isinstance(year, str) and year.startswith("/Date("):
                try:
                    timestamp = int(year[6:-2]) / 1000  # Extract timestamp
                    year = datetime.utcfromtimestamp(timestamp).strftime("%Y")
                except ValueError:
                    year = "N/A"
            
            return f"{degree} in {subject}, {institute} ({year})"
    
    return "N/A"

def extract_isb_faculty_data(dept_ids):
    """Fetches faculty data for the given department IDs and returns a list of dictionaries."""
    all_data = []
    
    for dept_id in dept_ids:
        url = faculty_api_url.format(dept_id)
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            try:
                faculty_list = response.json()
                for faculty in faculty_list:
                    emp_id = faculty.get("Emp_ID", "N/A")
                    name = f"{faculty.get('Title', '')} {faculty.get('Name', 'N/A')}".strip()
                    designation = faculty.get("Designation_Title", "N/A")
                    hec_approved = faculty.get("hec_supervisor", False)
                    email = faculty.get("Email", "N/A")
                    department_name = department_mapping.get(faculty.get("Dept_ID", "N/A"), "Unknown Department")
                    extension = faculty.get("Extension", "N/A")
                    image_url = f"http://isb.nu.edu.pk{faculty.get('ImagePath', '')}" if faculty.get("ImagePath") else "N/A"

                    # Fetch highest education
                    education = get_highest_education(emp_id)
                    if education:
                     education = re.sub(r'\s+', ' ', education).strip()
                    if email:
                     email = email.replace("\n", " ").strip()
                    if name:
                     name = name.replace("\n", " ").strip()
                    if designation:
                     designation = designation.replace("\n", " ").strip()
                    if department_name:
                     department_name = department_name.replace("\n", " ").strip()
                    if extension:
                     extension = extension.replace("\n", " ").strip()
                    # Append extracted data
                    all_data.append({
                        'ID': emp_id,
                        'Name': name,
                        'Designation': designation,
                        'HEC Approved PhD Supervisor': hec_approved,
                        'Highest Education': education,
                        'Email': email,
                        'Department': department_name,
                        'Extension': extension,
                        'ImageURL': image_url
                    })
            except ValueError:
                print(f"Failed to parse JSON for department ID {dept_id}")
        else:
            print(f"Request failed for department ID {dept_id} with status code {response.status_code}")
    
    return all_data

In [29]:
dept_ids = ["301", "302", "303","313"]
faculty_data = extract_isb_faculty_data(dept_ids)
df = pd.DataFrame(faculty_data)
save_csv(df,'isb.csv')