<a href="https://colab.research.google.com/github/Alexxpark/Projecy-LLM-/blob/main/WebscrapperWIP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Scrape the PetroWiki "carbon offset" page


import requests
from bs4 import BeautifulSoup as bs
import time
import re
import os

def fetch_page(url, timeout=10):
    """Download a page and return its HTML as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may wait forever on a stalled
            connection and hang the notebook cell.

    Returns:
        The response body as a string, or ``None`` if the request failed
        (the error is printed instead of being raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for 4xx/5xx responses
        return response.text
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching {url}: {e}")
        return None

def parse_page(html):
    """Extract the article title and body text from a page.

    The title is read from ``<h1 id="firstHeading">`` and the body from
    ``<div id="mw-content-text">``.

    Args:
        html: Raw HTML of the page.

    Returns:
        A ``(title, content)`` tuple of strings. ``content`` has all
        whitespace runs collapsed to single spaces.

    Raises:
        ValueError: If the title heading or the content container is not
            present in the document.
    """
    soup = bs(html, 'html.parser')

    heading = soup.find('h1', {'id': 'firstHeading'})
    if heading is None:
        raise ValueError("Page has no <h1 id='firstHeading'> title element")

    body = soup.find('div', {'id': 'mw-content-text'})
    if body is None:
        raise ValueError("Page has no <div id='mw-content-text'> content element")

    title = heading.get_text(strip=True)
    # get_text(strip=True) strips every text node and joins them with no
    # separator, which glues words together across links and other inline
    # tags. Take the text as written and collapse whitespace once instead.
    content = ' '.join(body.get_text().split())
    return title, content

def clean_text(text):
    """Drop numeric citation markers and collapse whitespace.

    Markers such as ``[1]`` or ``[23]`` are removed, every run of whitespace
    becomes a single space, and the result is trimmed at both ends.
    """
    without_citations = re.sub(r'\[\d+\]', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_citations)
    return single_spaced.strip()

def save_text(title, text, label, directory="data"):
    """Write a labelled article to ``<directory>/<sanitised title>.txt``.

    Args:
        title: Article title; also used (minus characters that are unsafe in
            file names) as the file name.
        text: Cleaned article body.
        label: Source label written on the first line of the file.
        directory: Output folder, created if it does not exist.
    """
    os.makedirs(directory, exist_ok=True)

    # Strip characters that are not allowed in file names on common platforms.
    safe_title = re.sub(r'[\\/:"*?<>|]+', "", title)
    filename = os.path.join(directory, safe_title + ".txt")

    content = f"Label: {label}\nTitle: {title}\nContent: {text}"
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(content)

    print(f"Saved {filename}")

def main():
    """Fetch one page, clean its text and save it with label "0"."""
    base_url = "https://petrowiki.spe.org"
    start_page = "/carbon_offset"
    url = f"{base_url}{start_page}"

    print(f"Fetching {url}")
    html = fetch_page(url)
    if not html:
        # fetch_page has already reported the failure.
        return

    title, raw_content = parse_page(html)
    save_text(title, clean_text(raw_content), label="0")
    time.sleep(1)  # Respectful scraping

if __name__ == "__main__":
    main()




Fetching https://petrowiki.spe.org/Acid_fracturing
Saved data/Acid fracturing.txt


In [12]:
# Scrape a page from the AAPG Wiki

import requests
from bs4 import BeautifulSoup as bs
import time
import re
import os

def fetch_page(url, timeout=10):
    """Download a page and return its HTML as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may wait forever on a stalled
            connection and hang the notebook cell.

    Returns:
        The response body as a string, or ``None`` if the request failed
        (the error is printed instead of being raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for 4xx/5xx responses
        return response.text
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching {url}: {e}")
        return None

def parse_page(html):
    """Extract the article title and body text from a page.

    The title is taken from the first ``<h1>`` and the body from the first
    ``<div class="mw-parser-output">``.

    Args:
        html: Raw HTML of the page.

    Returns:
        A ``(title, content)`` tuple of strings. ``content`` has all
        whitespace runs collapsed to single spaces.

    Raises:
        ValueError: If the page has no ``<h1>`` or no
            ``mw-parser-output`` container.
    """
    soup = bs(html, 'html.parser')

    heading = soup.find('h1')
    if heading is None:
        raise ValueError("Page has no <h1> title element")

    body = soup.find('div', {'class': 'mw-parser-output'})
    if body is None:
        raise ValueError("Page has no <div class='mw-parser-output'> content element")

    title = heading.get_text(strip=True)
    # get_text(strip=True) strips every text node and joins them with no
    # separator, which glues words together across links and other inline
    # tags. Take the text as written and collapse whitespace once instead.
    content = ' '.join(body.get_text().split())
    return title, content

def clean_text(text):
    """Return ``text`` without ``[n]`` citation markers and with tidy spacing."""
    # Inner call removes markers like [1]; outer call squeezes whitespace runs.
    return re.sub(r'\s+', ' ', re.sub(r'\[\d+\]', '', text)).strip()

def save_text(title, text, label, directory="data"):
    """Store a cleaned article as a labelled UTF-8 text file.

    The file is named after ``title`` (with file-name-unsafe characters
    removed) and placed in ``directory``, which is created when missing.
    Its contents are the label, the title and the text, one per line.
    """
    os.makedirs(directory, exist_ok=True)

    basename = re.sub(r'[\\/:"*?<>|]+', "", title) + ".txt"
    filename = os.path.join(directory, basename)

    content = f"Label: {label}\nTitle: {title}\nContent: {text}"
    with open(filename, 'w', encoding='utf-8') as handle:
        handle.write(content)
    print(f"Saved {filename}")

def main():
    """Fetch one page, clean its text and save it with label "1"."""
    url = "https://wiki.aapg.org/Carbon_dioxide_(CO2)_storage"

    print(f"Fetching {url}")
    html = fetch_page(url)
    if not html:
        # Nothing to parse; fetch_page has already printed the error.
        return

    title, raw_content = parse_page(html)
    save_text(title, clean_text(raw_content), label="1")
    time.sleep(1)  # Respectful scraping

if __name__ == "__main__":
    main()


Fetching https://wiki.aapg.org/Carbon_dioxide_(CO2)_storage
Saved data/Carbon dioxide (CO2) storage.txt


In [15]:
import requests
from bs4 import BeautifulSoup as bs
import time
import re
import os

def fetch_page(url, timeout=10):
    """Download a page and return its HTML as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may wait forever on a stalled
            connection and hang the notebook cell.

    Returns:
        The response body as a string, or ``None`` if the request failed
        (the error is printed instead of being raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for 4xx/5xx responses
        return response.text
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching {url}: {e}")
        return None

def parse_page(html, section_id='Acid-fracturing_candidate_selection'):
    """Extract one section (its heading plus its paragraphs) from a page.

    Args:
        html: Raw HTML of the page.
        section_id: ``id`` of the ``<span>`` inside the section heading.

    Returns:
        The heading text followed by the text of each ``<p>`` in the section,
        joined by single spaces, or ``""`` if no span with ``section_id``
        exists.
    """
    soup = bs(html, 'html.parser')
    section_header = soup.find('span', {'id': section_id})
    if section_header is None:
        return ""

    # The span sits inside the heading tag; the section body follows as the
    # heading's next siblings.
    section = section_header.parent

    # The section ends at the next heading of the same or a higher level.
    # Stopping at the first non-<p> sibling would cut the section short as
    # soon as a list, figure or table appears.
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    if section.name in heading_tags:
        stop_tags = heading_tags[:heading_tags.index(section.name) + 1]
    else:
        stop_tags = ('h1', 'h2')

    # get_text(strip=True) would glue words together across inline tags, so
    # take the text as written and collapse whitespace instead.
    parts = [' '.join(section.get_text().split())]  # Include the header in the text
    for sibling in section.find_next_siblings():
        if sibling.name in stop_tags:
            break
        if sibling.name == 'p':
            parts.append(' '.join(sibling.get_text().split()))
    return ' '.join(parts)

def clean_text(text):
    """Apply the text clean-up rules in order, then trim the result.

    Rule 1 deletes citation markers such as ``[1]``; rule 2 replaces each run
    of whitespace with one space.
    """
    substitutions = (
        (r'\[\d+\]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def save_text(title, text, label, directory="data"):
    """Save ``text`` under ``directory`` in a file named after ``title``.

    Characters that are unsafe in file names are removed from the title to
    build the file name; the title written inside the file is left as given.
    The file starts with the ``label`` line, followed by title and content.
    """
    os.makedirs(directory, exist_ok=True)

    cleaned_title = re.sub(r'[\\/:"*?<>|]+', "", title)
    filename = os.path.join(directory, cleaned_title + ".txt")
    content = f"Label: {label}\nTitle: {title}\nContent: {text}"

    with open(filename, 'w', encoding='utf-8') as target:
        target.write(content)
    print(f"Saved {filename}")

def main():
    """Fetch the page, pull out one section and save it with label "0"."""
    url = "https://petrowiki.spe.org/Acid_fracturing"

    print(f"Fetching {url}")
    html = fetch_page(url)
    if not html:
        # fetch_page has already reported the failure.
        return

    content = parse_page(html)
    if not content:
        # Section not present on the page: nothing to save.
        return

    save_text("Acid-fracturing_candidate_selection", clean_text(content), label="0")
    time.sleep(1)  # Respectful scraping

if __name__ == "__main__":
    main()


Fetching https://petrowiki.spe.org/Acid_fracturing
Saved data/Acid-fracturing_candidate_selection.txt


In [16]:
import requests
from bs4 import BeautifulSoup as bs
import time
import re
import os

def fetch_page(url, timeout=10):
    """Download a page and return its HTML as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may wait forever on a stalled
            connection and hang the notebook cell.

    Returns:
        The response body as a string, or ``None`` if the request failed
        (the error is printed instead of being raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for 4xx/5xx responses
        return response.text
    except requests.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching {url}: {e}")
        return None

def parse_page(html, max_paragraphs=4):
    """Return the text of the first paragraphs of a page.

    Args:
        html: Raw HTML of the page.
        max_paragraphs: How many leading ``<p>`` elements to keep. ``None``
            keeps every paragraph.

    Returns:
        The selected paragraphs' text joined by single spaces (``""`` when
        the page has no ``<p>`` elements).
    """
    soup = bs(html, 'html.parser')
    paragraphs = soup.find_all('p')[:max_paragraphs]
    # get_text(strip=True) would glue words together across links and other
    # inline tags, so take each paragraph as written and collapse whitespace.
    content = ' '.join(' '.join(p.get_text().split()) for p in paragraphs)
    return content

def clean_text(text):
    """Remove ``[n]`` citation markers, squeeze whitespace, and trim."""
    citation_marker = re.compile(r'\[\d+\]')
    whitespace_run = re.compile(r'\s+')

    stripped_of_citations = citation_marker.sub('', text)
    return whitespace_run.sub(' ', stripped_of_citations).strip()

def save_text(title, text, label, directory="data"):
    """Write label, title and text to ``<directory>/<title>.txt`` (UTF-8).

    ``directory`` is created if needed, and characters that are unsafe in
    file names are dropped from ``title`` when building the file name.
    """
    os.makedirs(directory, exist_ok=True)  # Ensure the directory exists

    file_stem = re.sub(r'[\\/:"*?<>|]+', "", title)
    filename = os.path.join(directory, file_stem + ".txt")

    # Three-line record: label first, then the title, then the body.
    content = f"Label: {label}\nTitle: {title}\nContent: {text}"
    with open(filename, 'w', encoding='utf-8') as sink:
        sink.write(content)
    print(f"Saved {filename}")

def main():
    """Fetch the page, keep its leading paragraphs and save them with label "1".

    Nothing is written when the download fails or when no paragraph text
    could be extracted, so the data folder never receives an empty sample.
    """
    url = "https://wiki.aapg.org/Basin-centered_gas"
    print(f"Fetching {url}")
    html = fetch_page(url)
    if not html:
        # fetch_page has already reported the failure.
        return

    content = parse_page(html)
    cleaned_content = clean_text(content)
    if not cleaned_content:
        # Saving here would create a labelled file with an empty body.
        print(f"No content extracted from {url}; nothing saved")
        return

    save_text("Basin-centered_gas", cleaned_content, label="1")
    time.sleep(1)  # Respectful scraping

if __name__ == "__main__":
    main()


Fetching https://wiki.aapg.org/Basin-centered_gas
Saved data/Basin-centered_gas.txt
