In [12]:
import csv
import warnings
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


# Root URL of the site being scraped; the page paths requested below are appended to it.
BASE_URL = "http://bioinformatics.cs.ntou.edu.tw/adam"

def get_more_clusters(timeout=30):
    """Return the cluster-page links listed on the site's 'more...' page.

    The 'more...' view is requested directly through its URL rather than by
    simulating a click on the button.

    Args:
        timeout (float or tuple): Seconds to wait for the server before giving
            up. Without a timeout, ``requests`` can block indefinitely on an
            unresponsive host.

    Returns:
        list[str]: The ``href`` of every anchor whose target contains
        ``search_d.php?f=cluster``, in document order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    more_url = f"{BASE_URL}/cluster_info.php?f=more"
    # NOTE(review): verify=False disables TLS certificate checks. It has no
    # effect while BASE_URL is plain http -- confirm whether it is still needed.
    response = requests.get(more_url, verify=False, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Keep only the anchors that point at a cluster detail page.
    return [
        anchor["href"]
        for anchor in soup.find_all("a", href=True)
        if "search_d.php?f=cluster" in anchor["href"]
    ]

def get_sequences(cluster_link, timeout=30):
    """Scrape one cluster page and return its (sequence ID, sequence) pairs.

    Args:
        cluster_link (str): Link to the cluster page as found in an ``href``.
            Relative links (e.g. ``./search_d.php?f=cluster,1``), root-relative
            links and absolute URLs are all resolved against ``BASE_URL``.
        timeout (float or tuple): Seconds to wait for the server before giving
            up. Without a timeout, ``requests`` can block indefinitely on an
            unresponsive host.

    Returns:
        list[tuple[str, str]]: One ``(seq_id, sequence)`` tuple per table row
        that has at least three ``<td>`` cells. ``seq_id`` is the stripped text
        of the first cell; ``sequence`` is the stripped text of the third cell
        with embedded newlines removed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # urljoin handles "./", root-relative and absolute hrefs, which plain
    # string concatenation would turn into malformed URLs.
    url = urljoin(BASE_URL.rstrip("/") + "/", cluster_link)
    # NOTE(review): verify=False disables TLS certificate checks. It has no
    # effect while BASE_URL is plain http -- confirm whether it is still needed.
    response = requests.get(url, verify=False, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    sequences = []
    # The first <tr> of the document is treated as the header row and skipped.
    for row in soup.find_all("tr")[1:]:
        cells = row.find_all("td")
        if len(cells) < 3:
            # Rows without an ID cell and a sequence cell carry no record.
            continue
        seq_id = cells[0].text.strip()
        sequence = cells[2].text.strip().replace("\n", "")
        sequences.append((seq_id, sequence))

    return sequences

def save_to_csv(data, filename="sequences.csv"):
    """Write sequence records to a CSV file, preceded by a header row.

    Args:
        data: Iterable of rows, each an iterable of cell values -- here
            ``(sequence ID, sequence)`` tuples.
        filename (str): Destination path. An existing file is overwritten.

    Returns:
        None
    """
    # newline="" lets the csv module control line endings itself, as the
    # csv documentation requires for files passed to csv.writer.
    with open(filename, "w", newline="", encoding="utf-8") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(["Sequence ID", "Sequence"])  # Header
        writer.writerows(data)

def main():
    """Scrape every cluster listed on the 'more...' page and write one CSV.

    Progress is reported on stdout: one line before the cluster list is
    fetched, one per cluster, one before saving and one when finished.
    """
    print("Fetching clusters from the 'more...' page...")

    collected = []
    for link in get_more_clusters():
        print(f"Processing {link}...")
        # Append this cluster's records to the running list.
        collected += get_sequences(link)

    print("Saving sequences to CSV...")
    save_to_csv(collected)
    print("Done!")

# Entry point: run the scrape when this code is executed directly (in a
# notebook cell __name__ is "__main__") rather than imported as a module.
if __name__ == "__main__":
    main()


Fetching clusters from the 'more...' page...
Processing ./search_d.php?f=cluster,1...
Processing ./search_d.php?f=cluster,2...
Processing ./search_d.php?f=cluster,3...
Processing ./search_d.php?f=cluster,4...
Processing ./search_d.php?f=cluster,5...
Processing ./search_d.php?f=cluster,6...
Processing ./search_d.php?f=cluster,7...
Processing ./search_d.php?f=cluster,8...
Processing ./search_d.php?f=cluster,9...
Processing ./search_d.php?f=cluster,10...
Processing ./search_d.php?f=cluster,11...
Processing ./search_d.php?f=cluster,12...
Processing ./search_d.php?f=cluster,13...
Processing ./search_d.php?f=cluster,14...
Processing ./search_d.php?f=cluster,15...
Processing ./search_d.php?f=cluster,16...
Processing ./search_d.php?f=cluster,17...
Processing ./search_d.php?f=cluster,18...
Processing ./search_d.php?f=cluster,19...
Processing ./search_d.php?f=cluster,20...
Processing ./search_d.php?f=cluster,21...
Processing ./search_d.php?f=cluster,22...
Processing ./search_d.php?f=cluster,23..

In [13]:
def count_total_sequences(file_path, encoding=None):
    """
    Sum every line of a text file that consists solely of decimal digits.

    Each line is stripped of surrounding whitespace; lines that are then a
    non-negative integer are added to the total, and all other lines are
    ignored. (Presumably each such line is one cluster's '#SEQ' value --
    confirm against the input file's layout.)

    Args:
        file_path (str): Path to the input file.
        encoding (str, optional): Text encoding used to read the file.
            ``None`` (the default) uses the platform default encoding.

    Returns:
        int: Sum of all integer-only lines. 0 if the file does not exist; if
        another error interrupts reading, the total accumulated so far.
        In both error cases a message is printed instead of raising.
    """
    total_sequences = 0

    try:
        with open(file_path, 'r', encoding=encoding) as file:
            for line in file:
                stripped_line = line.strip()
                # isdecimal() rather than isdigit(): isdigit() also accepts
                # characters such as "²" that int() cannot parse, which would
                # abort the count part-way through the file.
                if stripped_line.isdecimal():
                    total_sequences += int(stripped_line)
    except FileNotFoundError:
        print(f"Error: The file {file_path} does not exist.")
    except Exception as e:
        # Best-effort: report the problem and return what was counted so far.
        print(f"An error occurred: {e}")

    return total_sequences

# Count the sequences listed in ADAM_text.txt (path is relative to the working directory).
file_path = "ADAM_text.txt"
print(f"Total Sequences: {count_total_sequences(file_path)}")


Total Sequences: 2951


In [14]:
import pandas as pd

# Load the scraped CSV file
file_path = 'sequences.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# Flag every row in which any cell contains the text 'sequence'
# (case-insensitive). Presumably these are header lines repeated inside the
# scraped data -- confirm against the file.
contains_sequence = pd.Series(False, index=df.index)
for column in df.columns:
    contains_sequence |= df[column].astype(str).str.contains('sequence', case=False)

# Keep the first row unconditionally, plus every unflagged row. Using a single
# mask (rather than concatenating row 0 onto the filtered frame) avoids
# duplicating the first row when it is not flagged.
is_first_row = pd.Series(range(len(df)), index=df.index) == 0
filtered_df = df[is_first_row | ~contains_sequence]

# Save the cleaned file
filtered_df.to_csv('cleaned_file.csv', index=False)
