Scraping

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time, json

# Function to initialize the driver
def initialize_driver():
    """Start a Chrome session and build its shared explicit wait.

    Returns:
        A ``(driver, wait)`` tuple: the Chrome WebDriver and a
        ``WebDriverWait`` bound to it with a 30 second timeout.
    """
    browser = webdriver.Chrome()
    return browser, WebDriverWait(browser, 30)

# Function to login and verify OTP
def login(driver, wait, phone_number, otp_file):
    """Log in with a phone number and a one-time password read from a file.

    Opens the admit-finder page, submits ``phone_number``, pauses 40 seconds,
    then reads the OTP from ``otp_file`` and types its first six characters
    into the first six ``input[type="tel"]`` fields. Finally clicks the
    "Skip" button once it becomes clickable.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        wait: WebDriverWait bound to ``driver``.
        phone_number: Phone number typed into the login form.
        otp_file: Path of a text file holding the OTP. Presumably the
            operator writes it there during the 40 second pause.

    Raises:
        ValueError: If the file holds fewer than six non-whitespace-trimmed
            characters.
    """
    driver.get("https://dashboard.ambitio.club/admit-finder")

    # Use explicit waits (like the rest of this file) instead of bare
    # find_element, so a form that renders after the load event is still found.
    phone_input = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'input[type="tel"]'))
    )
    phone_input.send_keys(phone_number)

    send_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[type="submit"]'))
    )
    send_button.click()

    time.sleep(40)  # Wait for the OTP to arrive and be entered
    with open(otp_file, "r") as file:
        # Strip surrounding whitespace so a stray newline or space in the
        # file is never typed into an OTP field.
        otp = file.read().strip()

    if len(otp) < 6:
        raise ValueError(
            f"Expected a 6-character OTP in {otp_file!r}, found {len(otp)} character(s)"
        )

    otp_inputs = driver.find_elements(By.CSS_SELECTOR, 'input[type="tel"]')
    for index, digit in enumerate(otp[:6]):
        otp_inputs[index].send_keys(digit)

    skip_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Skip")]')))
    skip_button.click()

# Function to extract profile links
def extract_profile_links(driver):
    """Collect the ``href`` of every profile card on the current listing page.

    Waits up to 30 seconds for the results grid to be present, then reads
    the ``href`` attribute of each ``a.border`` element inside it.

    Args:
        driver: Selenium WebDriver currently showing the listing page.

    Returns:
        A list of ``href`` values, in the order the cards were found.
    """
    grid_locator = (By.CSS_SELECTOR, "div.grid.mob\\:grid-cols-1.grid-cols-3")
    results_grid = WebDriverWait(driver, 30).until(EC.presence_of_element_located(grid_locator))
    cards = results_grid.find_elements(By.CSS_SELECTOR, 'a.border')
    return [card.get_attribute('href') for card in cards]

# Function to extract data from a profile
def extract_profile_data(driver, wait, profile_link):
    """Open one profile page and scrape its details and applications.

    Args:
        driver: Selenium WebDriver used to load the profile.
        wait: WebDriverWait handed to the section extractors.
        profile_link: URL of the profile page.

    Returns:
        A dict with ``link``, ``name`` and ``term``; ``test_scores``,
        ``work_experience`` and ``education`` when a card with the matching
        title is present; and ``applications``, a list of dicts keyed by
        ``University Name``, ``Program`` and ``Status``.
    """
    driver.get(profile_link)
    time.sleep(1)

    name = driver.find_element(By.CSS_SELECTOR, 'p.font-primary-bold.text-primary.text-\\[1\\.1vw\\].whitespace-nowrap.truncate.mob\\:text-\\[3\\.6vw\\]').text
    term = driver.find_element(By.CSS_SELECTOR, 'p.font-secondary-bold.text-\\[\\.93vw\\].mob\\:text-\\[3vw\\].text-granite').text
    record = {'link': profile_link, 'name': name, 'term': term}

    # Number of blocks reported by the work-experience extractor; the
    # education extractor skips that many leading blocks. It stays 0 unless a
    # "Work Experience" card is seen before the "Education" card.
    work_block_count = 0
    for card in driver.find_elements(By.CLASS_NAME, "admitFinderDetailsCard"):
        section_title = card.find_element(By.CLASS_NAME, "admitFinderProfileSectionSubtitle").text
        if section_title == "Test scores":
            record['test_scores'] = extract_test_scores(wait)
        elif section_title == "Work Experience":
            record['work_experience'], work_block_count = extract_work_experience(wait)
        elif section_title == "Education":
            record['education'] = extract_education(wait, work_block_count)

    # Switch to the "Applications" tab before reading the application rows.
    applications_tab = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//div[@class="flex items-center mob:justify-center gap-[.7vw] border-b-[2px]  border-whisper  px-[3vw] pb-[0.5vw] cursor-pointer mob:w-[50%] mob:pb-[2.5vw] "]/p[text()="Applications"]'))
    )
    applications_tab.click()

    application_rows = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.border-whisper.flex.items-center.justify-between.w-full.py-\\[1\\.6vw\\].mob\\:py-\\[4\\.154vw\\]')))

    # University name, program, and status for every row; the status is the
    # text of the last <p> inside the row.
    record['applications'] = [
        {
            'University Name': row.find_element(By.CSS_SELECTOR, 'p.font-primary-medium').text,
            'Program': row.find_element(By.CSS_SELECTOR, 'p.font-secondary-medium').text,
            'Status': row.find_elements(By.TAG_NAME, "p")[-1].text,
        }
        for row in application_rows
    ]

    return record

# Function to extract test scores
def extract_test_scores(wait):
    """Scrape the test-score rows of the currently open profile.

    Args:
        wait: WebDriverWait used to wait for the score rows to be present.

    Returns:
        A list of dicts with ``test_name``, ``score``, ``sub_scores_titles``
        and ``sub_scores`` (the last two are lists of strings).
    """
    def texts_of(row, selector):
        # Text of every element under `row` that matches `selector`.
        return [node.text for node in row.find_elements(By.CSS_SELECTOR, selector)]

    score_rows = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.flex.justify-between.pb-\\[\\.83vw\\].mob\\:pb-\\[3vw\\]")))

    results = []
    for row in score_rows:
        exam = row.find_element(By.CSS_SELECTOR, "p.admitFinderProfileItemsTitle").text
        overall = row.find_element(By.CSS_SELECTOR, "p.font-secondary-bold").text
        sub_values = texts_of(row, "div.text-end p.font-secondary-bold")
        sub_labels = texts_of(row, "div.text-end p.font-primary-regular")
        results.append({
            "test_name": exam,
            "score": overall,
            "sub_scores_titles": sub_labels,
            "sub_scores": sub_values,
        })

    return results

# Function to extract work experience
def extract_work_experience(wait):
    """Scrape the work-experience rows of the currently open profile.

    Args:
        wait: WebDriverWait used to wait for the rows to be present.

    Returns:
        A ``(entries, count)`` tuple: ``entries`` is a list of dicts with
        ``role``, ``role_time``, ``role_type`` and ``company``; ``count`` is
        the number of rows matched.
    """
    # Output key -> CSS selector, in the order the fields are read.
    field_selectors = (
        ("role", "p.admitFinderProfileItemsTitle.mob\\:w-\\[40vw\\]"),
        ("role_time", "p.font-secondary-semibold"),
        ("role_type", "p.font-primary-medium"),
        ("company", "p.font-secondary-bold "),
    )

    rows = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.flex.items-center.gap-\\[\\.5vw\\].mob\\:gap-\\[1\\.5vw\\].w-full")))

    entries = [
        {
            field: row.find_element(By.CSS_SELECTOR, selector).text
            for field, selector in field_selectors
        }
        for row in rows
    ]

    return entries, len(rows)

# Function to extract education details
def extract_education(wait, l=0):
    """Scrape the education rows of the currently open profile.

    Args:
        wait: WebDriverWait used to wait for the rows to be present.
        l: Number of leading matched rows to skip. The caller in this file
            passes the row count returned by the work-experience extractor;
            presumably the selector here also matches those rows (confirm
            against the page markup).

    Returns:
        A list of dicts with ``org``, ``level``, ``grade_type``, ``grade``
        and ``course``. A field is ``None`` when its element is missing.
    """
    # Output key -> CSS selector, in the order the fields are read.
    field_selectors = (
        ("org", "p.admitFinderProfileItemsTitle"),
        ("level", "p.font-secondary-medium.text-granite"),
        ("grade_type", "p.font-primary-medium"),
        ("grade", "p.font-secondary-bold"),
        ("course", "p.font-secondary-semibold"),
    )

    def text_or_none(row, selector):
        # Text of the first match under `row`, or None when nothing matches.
        try:
            return row.find_element(By.CSS_SELECTOR, selector).text
        except NoSuchElementException:
            return None

    rows = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.flex.justify-between.mob\\:pt-\\[4\\.1vw\\]")))

    return [
        {field: text_or_none(row, selector) for field, selector in field_selectors}
        for row in rows[l:]
    ]

# Function to move to the next page
def go_to_next_page(driver, wait):
    """Click the pagination "next" button and wait for the page to change.

    Args:
        driver: Selenium WebDriver showing the listing page.
        wait: WebDriverWait used for the clickable and staleness conditions.
    """
    locator = (By.CSS_SELECTOR, 'button[aria-label="Go to next page"]')
    pager_button = wait.until(EC.element_to_be_clickable(locator))

    # Bring the button into view, then pause briefly before clicking it.
    driver.execute_script("arguments[0].scrollIntoView();", pager_button)
    time.sleep(1)

    pager_button.click()
    # The old button going stale is used as the signal that the page changed.
    wait.until(EC.staleness_of(pager_button))


# Function to go back to the profiles page
def go_back_to_profiles_page(driver, wait):
    """Click the profile page's back button and wait for it to go stale.

    Args:
        driver: Selenium WebDriver (not used directly in this function).
        wait: WebDriverWait used for the clickable and staleness conditions.
    """
    locator = (
        By.CSS_SELECTOR,
        'button.flex.w-max.items-center.mob\\:gap-\\[1\\.1vw\\].pl-\\[2\\.55vw\\].pt-\\[1\\.67vw\\].mob\\:pt-\\[4\\.1vw\\]',
    )
    return_button = wait.until(EC.element_to_be_clickable(locator))
    return_button.click()
    # The old button going stale is used as the signal that navigation happened.
    wait.until(EC.staleness_of(return_button))

# Main function to collect all profiles data across multiple pages
def collect_all_profiles_data(driver, wait, start_page, end_page):
    """Scrape every profile on listing pages ``start_page``..``end_page``.

    Clicks "next" ``start_page - 1`` times to reach the first page, then for
    each page collects the profile links, scrapes each profile and returns to
    the listing before moving on.

    Args:
        driver: Selenium WebDriver, expected to be on the first listing page.
        wait: WebDriverWait shared by the helper functions.
        start_page: 1-based number of the first page to scrape.
        end_page: Number of the last page to scrape (inclusive).

    Returns:
        A list with one profile dict per scraped profile.
    """
    # Skip forward to the first requested page.
    for _ in range(start_page - 1):
        go_to_next_page(driver, wait)

    collected = []
    for page in range(start_page, end_page + 1):
        time.sleep(1)
        print(f"Scraping page {page}")

        for link in extract_profile_links(driver):
            collected.append(extract_profile_data(driver, wait, link))
            go_back_to_profiles_page(driver, wait)
            time.sleep(1)

        is_last_page = page >= end_page
        if not is_last_page:
            go_to_next_page(driver, wait)

    return collected

# Function to save data as JSON
def save_data_as_json(data, file_name):
    """Append ``data`` to the JSON list stored in ``file_name``.

    If the file does not exist it is created holding just ``data``. If it
    exists but is empty or not valid JSON, its content is replaced by
    ``data``. Otherwise the stored list is extended with ``data`` and the
    file is rewritten in place.

    Args:
        data: List of JSON-serialisable records to add.
        file_name: Path of the JSON file to create or update.
    """
    try:
        handle = open(file_name, 'r+')
    except FileNotFoundError:
        # First run: nothing to merge with, so write the new records as-is.
        with open(file_name, 'w') as fresh_file:
            json.dump(data, fresh_file, indent=2)
        return

    with handle:
        try:
            merged = json.load(handle)
        except json.JSONDecodeError:
            # Empty or unparsable file: start over from an empty list.
            merged = []

        merged.extend(data)

        # Rewrite from the start and cut off any leftover bytes from the
        # previous, possibly longer, content.
        handle.seek(0)
        json.dump(merged, handle, indent=2)
        handle.truncate()



# Main script execution
if __name__ == "__main__":
    # Log in, scrape the configured page range, print the result and append
    # it to profiles_data.json.
    driver, wait = initialize_driver()
    try:
        login(driver, wait, '7696884562', 'otp.txt')

        start_page = 21  # Set your starting page
        end_page = 25    # Set your ending page

        all_profiles_data = collect_all_profiles_data(driver, wait, start_page, end_page)
        print(json.dumps(all_profiles_data, indent=2))

        save_data_as_json(all_profiles_data, 'profiles_data.json')
    finally:
        # Always close the browser, even when login or scraping raises, so no
        # Chrome session is left running.
        driver.quit()


Convert to CSV

In [None]:
import json
import csv

def extract_test_scores(test_scores):
    """Flatten a profile's test-score list into CSV-style columns.

    Each test contributes ``test_<name>_score`` plus one
    ``test_<name>_<sub_title>`` column per sub-score. Names and sub-titles
    are lower-cased with spaces replaced by underscores.

    Args:
        test_scores: List of dicts with optional ``test_name``, ``score``,
            ``sub_scores_titles`` and ``sub_scores`` keys.

    Returns:
        A flat dict mapping column name to value.
    """
    flat = {}
    for entry in test_scores:
        slug = entry.get('test_name', '').lower().replace(" ", "_")
        flat[f'test_{slug}_score'] = entry.get('score', '')

        titles = entry.get('sub_scores_titles', [])
        values = entry.get('sub_scores', [])
        # Titles and values are paired positionally; extras on either side
        # are dropped by zip.
        for title, value in zip(titles, values):
            flat[f'test_{slug}_{title.lower().replace(" ", "_")}'] = value
    return flat

def extract_work_experience(work_experience):
    """Flatten a profile's work-experience list into numbered columns.

    Args:
        work_experience: List of dicts with optional ``role``, ``role_time``,
            ``role_type`` and ``company`` keys.

    Returns:
        A flat dict with ``company_<n>_role``, ``company_<n>_role_time``,
        ``company_<n>_role_type`` and ``company_<n>_name`` for each entry,
        numbered from 1. Missing keys become empty strings.
    """
    # (column suffix, source key) pairs, in output order.
    columns = (
        ('role', 'role'),
        ('role_time', 'role_time'),
        ('role_type', 'role_type'),
        ('name', 'company'),
    )
    flat = {}
    for position, job in enumerate(work_experience, start=1):
        for suffix, source_key in columns:
            flat[f'company_{position}_{suffix}'] = job.get(source_key, '')
    return flat

def extract_education(education):
    """Flatten a profile's education list into numbered columns.

    Args:
        education: List of dicts with optional ``org``, ``level``,
            ``grade_type``, ``grade`` and ``course`` keys.

    Returns:
        A flat dict with ``education_<n>_organization``, ``_level``,
        ``_grade_type``, ``_grade`` and ``_course`` for each entry, numbered
        from 1. Missing keys become empty strings.
    """
    # (column suffix, source key) pairs, in output order.
    columns = (
        ('organization', 'org'),
        ('level', 'level'),
        ('grade_type', 'grade_type'),
        ('grade', 'grade'),
        ('course', 'course'),
    )
    flat = {}
    for position, record in enumerate(education, start=1):
        for suffix, source_key in columns:
            flat[f'education_{position}_{suffix}'] = record.get(source_key, '')
    return flat

def extract_applications(applications):
    """Flatten a profile's application list into numbered columns.

    Args:
        applications: List of dicts with optional ``University Name``,
            ``Program`` and ``Status`` keys.

    Returns:
        A flat dict with ``application_<n>_University``, ``_Program`` and
        ``_Status`` for each entry, numbered from 1. Missing keys become
        empty strings.
    """
    # (column suffix, source key) pairs, in output order.
    columns = (
        ('University', 'University Name'),
        ('Program', 'Program'),
        ('Status', 'Status'),
    )
    flat = {}
    for position, application in enumerate(applications, start=1):
        for suffix, source_key in columns:
            flat[f'application_{position}_{suffix}'] = application.get(source_key, '')
    return flat



def flatten_user_data(user_data):
    """Turn one scraped profile dict into a single flat CSV row dict.

    Args:
        user_data: Profile dict with optional ``name``, ``term``, ``link``,
            ``test_scores``, ``work_experience``, ``education`` and
            ``applications`` keys.

    Returns:
        A dict with ``name``, ``term`` and ``profile_link`` (empty string
        when missing), plus the flattened columns of every section present.
    """
    row = {
        'name': user_data.get('name', ''),
        'term': user_data.get('term', ''),
        'profile_link': user_data.get('link', ''),
    }

    # Each section is flattened only when its key is present in the profile.
    if 'test_scores' in user_data:
        scores = extract_test_scores(user_data['test_scores'])
        row.update(scores)

    if 'work_experience' in user_data:
        jobs = extract_work_experience(user_data['work_experience'])
        row.update(jobs)

    if 'education' in user_data:
        schooling = extract_education(user_data['education'])
        row.update(schooling)

    if 'applications' in user_data:
        applied = extract_applications(user_data['applications'])
        row.update(applied)

    return row

def convert_json_to_csv(json_filename, csv_filename):
    """Convert the scraped-profiles JSON file into a flat CSV file.

    Every profile is flattened into one row. The header starts with
    ``name``, ``profile_link`` and ``term``, followed by all other column
    names found in any row, sorted alphabetically. Rows lacking a column get
    an empty cell (the ``csv.DictWriter`` default).

    An empty JSON list produces a CSV holding only the three leading header
    columns instead of raising ``KeyError``.

    Args:
        json_filename: Path of the JSON file holding a list of profile dicts.
        csv_filename: Path of the CSV file to write (overwritten if present).
    """
    with open(json_filename, 'r') as json_file:
        json_data = json.load(json_file)

    flattened_data_list = [flatten_user_data(user) for user in json_data]

    leading_columns = ['name', 'profile_link', 'term']

    # Gather every column seen in any row, then drop the leading ones with
    # a set difference: unlike set.remove, this does not fail when there
    # are no rows at all.
    other_columns = set()
    for row in flattened_data_list:
        other_columns.update(row)
    other_columns.difference_update(leading_columns)

    fieldnames = leading_columns + sorted(other_columns)

    with open(csv_filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(flattened_data_list)

# Run the conversion
if __name__ == "__main__":
    # Reads 'profiles_data.json' and writes the flattened rows to
    # 'profiles_data.csv'; both paths are relative to the current working
    # directory.
    convert_json_to_csv('profiles_data.json', 'profiles_data.csv')
