In [None]:
pip install selenium
pip install requests beautifulsoup4
pip install selenium python-docx webdriver-manager


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import os
import time

def read_gene_values(file_path):
    """Read gene identifiers from a text file, one per line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one whitespace-stripped string per non-blank line, in
        file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # 'utf-8-sig' reads plain UTF-8/ASCII and also drops a leading BOM, which
    # would otherwise end up glued to the first gene value.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing newline) are skipped so that no empty
        # query is submitted later on.
        return [value for value in stripped_lines if value]

# Read gene values from the text file.
# NOTE: 'path to text file' is a placeholder and must be replaced with the
# real path of the gene list before running this cell.
gene_values_file = 'path to text file'  
values_to_test = read_gene_values(gene_values_file)

# Set up the driver: the Service is given whatever ChromeDriverManager().install()
# returns, and Chrome is started with default (unmodified) options.
# `driver` is a module-level global used by every helper function below.
service = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

# Open the website; the helpers below assume the browser is already on this page.
driver.get('https://cancervar.wglab.org/')

def scroll_to_top():
    """Scroll the current page back to position (0, 0) via JavaScript.

    Uses the module-level ``driver``.
    """
    js_scroll_home = "window.scrollTo(0, 0);"
    driver.execute_script(js_scroll_home)

def scroll_into_view(element):
    """Scroll the page so that *element* is brought into view.

    Runs ``scrollIntoView(true)`` on the element through the module-level
    ``driver``.

    Args:
        element: The WebElement to bring into view.
    """
    js_reveal = "arguments[0].scrollIntoView(true);"
    driver.execute_script(js_reveal, element)

def submit_gene_value(value, html_file):
    """Submit one gene value through the search form and save its results.

    Types *value* into the last element with ID 'Gene', selects the
    'qtype_strut' radio button, submits the form, waits for the results
    table wrapper, and hands over to ``extract_table_data``. Any error is
    reported on stdout instead of being raised, so the caller can continue
    with the next value.

    Args:
        value: The text to enter into the gene input field.
        html_file: Writable text file object that receives the table HTML.
    """
    # Tracks whether the form was submitted, i.e. whether the browser may have
    # left the search page and needs to be sent back afterwards.
    submitted = False
    try:
        # Wait for the input field to be present and get all elements with the ID 'Gene'
        gene_inputs = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.ID, 'Gene'))
        )

        # The page may contain several elements with this ID; the last one is used.
        gene_input = gene_inputs[-1]

        # Replace whatever a previous iteration left in the field.
        gene_input.clear()
        gene_input.send_keys(value)

        # Find and click the radio button
        radio_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, 'qtype_strut'))
        )
        radio_button.click()

        # Scroll to the top of the page to avoid element click interception
        scroll_to_top()

        # Find and click the submit button
        submit_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'input[type="submit"].btn-primary'))
        )
        submit_button.click()
        submitted = True

        # Wait for the results table to be present
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div#dataTable_wrapper'))
        )

        # Extract the table data
        extract_table_data(html_file)

    except Exception as e:
        print(f"An error occurred for value {value}: {e}")

    finally:
        # Go back to the search page even when waiting for or extracting the
        # results failed; otherwise every following value would be attempted
        # on the results page and fail as well.
        if submitted:
            try:
                driver.back()
            except Exception as e:
                print(f"An error occurred for value {value}: {e}")

def extract_table_data(html_file):
    """Write every page of the results table to *html_file*.

    For each page, the first ``<table>`` inside 'div#dataTable_wrapper' is
    serialised and written out. The pagination "next" item is then clicked
    until it carries the 'disabled' class or an error occurs.

    Args:
        html_file: Writable text file object that receives the table HTML.
    """
    wrapper_selector = 'div#dataTable_wrapper'
    next_selector = 'li.paginate_button.page-item.next'

    more_pages = True
    while more_pages:
        # Grab the current page's table markup from the live DOM.
        wrapper = driver.find_element(By.CSS_SELECTOR, wrapper_selector)
        parsed = BeautifulSoup(wrapper.get_attribute('outerHTML'), 'html.parser')
        current_table = parsed.find('table')

        if current_table:
            html_file.write(str(current_table))

        try:
            # Look for the pagination "next" item.
            pager_next = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, next_selector))
            )

            # A 'disabled' class on the item means the last page was reached.
            if 'disabled' in pager_next.get_attribute('class'):
                more_pages = False
                continue

            # Bring the item into view first to avoid click interception.
            scroll_into_view(pager_next)
            pager_next.click()

            # Fixed pause to let the next page render (adjust as needed).
            time.sleep(2)

        except Exception as e:
            print(f"An error occurred while navigating pages: {e}")
            more_pages = False

# Create the output directory if it does not exist
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# Path of the combined HTML report holding the tables for all gene values
output_file_path = os.path.join(output_dir, 'cancer2.html')

try:
    # Write as UTF-8 explicitly: the scraped tables may contain characters the
    # platform's default encoding cannot represent.
    with open(output_file_path, 'w', encoding='utf-8') as html_file:
        # Write the initial HTML structure (charset declared to match the file encoding)
        html_file.write('<html><head><meta charset="utf-8"><title>CancerVar Results</title></head><body>')

        # Loop through each value and submit the form
        for value in values_to_test:
            html_file.write(f'<h2>Results for {value}</h2>')
            submit_gene_value(value, html_file)

        # Close the HTML structure
        html_file.write('</body></html>')
finally:
    # Always shut the browser down, even if the loop above raised, so no
    # Chrome/chromedriver process is left running.
    driver.quit()

print("Output file saved to:", output_file_path)

In [None]:
pip install pandas
pip install openpyxl

In [None]:
import pandas as pd
from openpyxl import Workbook

def html_to_excel(html_file, excel_file):
    """Convert all tables of an HTML file into a single Excel workbook.

    Every ``<table>`` found in *html_file* is concatenated into one
    DataFrame, which is written to *excel_file* as 'Sheet1', 'Sheet2', ...
    A new sheet is started whenever the Excel per-sheet row limit is reached.

    Args:
        html_file: Path of the HTML file to read.
        excel_file: Path of the .xlsx file to create.

    Raises:
        ValueError: Propagated from ``pd.read_html`` when no table is found.
    """
    # Read the HTML file
    tables = pd.read_html(html_file)

    # An Excel sheet holds at most 1,048,576 rows and one of them is taken by
    # the header that to_excel() writes, so one fewer data row fits per sheet.
    max_rows_per_sheet = 1048576 - 1

    # Consolidate all tables into a single DataFrame
    all_data = pd.concat(tables, ignore_index=True)

    # max(..., 1) guarantees one (header-only) sheet for an empty frame; a
    # workbook without any sheet cannot be saved.
    chunk_starts = range(0, max(len(all_data), 1), max_rows_per_sheet)

    # The context manager saves and closes the workbook, also on errors.
    # (ExcelWriter.save() no longer exists in pandas >= 2.0.)
    with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
        for sheet_number, start_row in enumerate(chunk_starts, start=1):
            end_row = start_row + max_rows_per_sheet
            sheet_data = all_data.iloc[start_row:end_row]
            sheet_data.to_excel(writer, sheet_name=f'Sheet{sheet_number}', index=False)

    print(f'Converted {html_file} to {excel_file}')

# Example usage.
# NOTE: both strings below are placeholders and must be replaced with real
# paths before running this cell.
html_file = 'give html file name and path' # Must be the HTML file generated by the scraping cell (e.g. output/cancer2.html)
excel_file = 'give excel file name and path'
html_to_excel(html_file, excel_file)