In [1]:
import time
import re
import os
import sys
from random import uniform
from collections import deque
from urllib.parse import urljoin, urlparse
import json
import torch

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

parent_dir = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
sys.path.append(parent_dir)

from utils import logger, create_directories_if_not_exist, save_to_file
from chroma_functions import ChromaDB
from rag import RAG
from utils import current_memory

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
log = logger(__name__)

In [5]:
test = "http://www.hdm-stuttgart.de/news/news20240307102023/fotostrecke20240307141831/musik_z.jpg"
print("example.jpg".endswith(("", ".pdf", ".html", ".htm")))  # ✅ Always True!
print("randomtext".endswith(""))  # ✅ Always True!
print("https://example.com/something.jpg".endswith(""))  # ✅ Always True!

True
True
True


In [11]:
from urllib.parse import urlparse

allowed_extensions = {".pdf", ".html", ".htm", ""}
full_url = "https://www.hdm-stuttgart.de/studium/auslandssemester.jpg"
full_url = "https://www.youtube.com/watch?v=oN-QQdCGXLU&t=102s"

parsed_url = urlparse(full_url)
print(parsed_url)
domain = parsed_url.netloc
print(domain)

if (
    "#" in full_url
    or not full_url.startswith(("http://", "https://"))
    or parsed_url.path.lower() and not parsed_url.path.lower().endswith(tuple(allowed_extensions))
):
    print("out")
else:
    print("ok")


ParseResult(scheme='https', netloc='www.youtube.com', path='/watch', params='', query='v=oN-QQdCGXLU&t=102s', fragment='')
www.youtube.com
ok


In [24]:
def extract_domain_part(url: str) -> str:
    """Takes in an url and returns a string that is based on the urls domain, path and query.

    Args:
        url (str): The URL of the page being parsed.

    Returns:
        str: The url in a form thats usable as a path.
    """
    try:
        parsed_url = urlparse(url)
        print(parsed_url)
        # Extract the base domain (e.g., hdm-stuttgart from www.hdm-stuttgart.de)
        domain_match = re.search(r"(?:www\.)?(.*?)\.(de|com|org|net|pdf)", parsed_url.netloc)
        base_domain = domain_match.group(1) if domain_match else parsed_url.netloc

        # Extract the url path after .de
        path = parsed_url.path.strip("/").replace("/", "_")
        print(path)
        # Extract url query parameters after ?
        query = parsed_url.query.replace("&", "_").replace("=", "_") if parsed_url.query else ""
        print(query)
        # Combine components
        filename = f"{base_domain}"
        if path:
            filename += f"_{path}"
        if query:
            filename += f"_{query}"

        # Ensure filename is safe
        filename = re.sub(r'[<>:"/\\|?*]', "_", filename)
        return filename
    except Exception as e:
        log.error("Error generating filename from URL %s: %s", url, e)
        return "default"

In [31]:
full_url = "https://www.youtube.com/watch?v=oN-QQdCGXLU&t=102s"
full_url = "https://www.w3schools.com/python/python_classes.pdf"

extract_domain_part(full_url)

ParseResult(scheme='https', netloc='www.w3schools.com', path='/python/python_classes.pdf', params='', query='', fragment='')
python_python_classes.pdf



'w3schools_python_python_classes.pdf'

In [108]:
set1 = {"test", "blub", 4, 3}
set2 = {3, 4, 5, 6, "blub"}
set1.symmetric_difference_update(set2)
set1

{5, 6, 'test'}

In [162]:
DISALLOWED_PATHS = [
    "/studienfuehrer/vorlesungsverzeichnis/",
    "/studienfuehrer/Studiengaenge/",
    "/studienfuehrer/dozentenplaene/",
    "/studienfuehrer/raumbelegung/",
    "*/manage",
    "*/manage_main",
    "*/manage_workspace",
    "/pdm/pdm_deutsch/",
    "/pdm/pdm_englisch/",
    "/pdm/pdm_spanisch/",
    "*/html2pdf",
    "*/htmltopdf",
    "*printview=1",
    "/pmm/studiengang/team/mitarbeiter/lindig/",
    "/hochschule/neubau/webcams/tag*",
    "/ifak/startseite/redaktionzukunft/beitrag.html?beitrag_ID=1817&stars=2",
    "/*beitrag.html?beitrag_ID=1817",
    "*view_fotostrecke*",
    "*hdmnewsmail_simple*",
    "/vwif/",
    "splan.hdm-stuttgart.de",
]

def is_allowed(url: str) -> bool:
    """Check if a url is allowed based on the disallowed paths.

    Args:
        url (str): thue url to check

    Returns:
        bool: True if allowed, False if disallowed
    """
    for path in DISALLOWED_PATHS:
        if "*" in path:
            # Match wildcard patterns
            regex_path = path.replace("*", ".*")
            if re.search(regex_path, url):
                return False
        elif path in url:
            return False
    return True

In [None]:
from urllib.parse import urlparse, urljoin

ALLOWED_DOMAINS = {
    "hdm-stuttgart.de",
    "hdm-weiterbildung.de",
    "vs-hdm.de",
}

ALLOWED_EXTENSIONS = {".pdf"}

def extract_links(soup: BeautifulSoup, url: str, visited: set[str], to_visit: set[str]) -> set[str]:
    """Extracts all valid links from a webpage.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object of the page content.
        url (str): The URL of the page being parsed.
        visited (set): Set of already visited URLs.
        to_visit (set): Set of URLs scheduled for visiting.

    Returns:
        Set[str]: A set of valid links extracted from the page.
    """
    try:
        filtered_links = set()
        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            full_url = urljoin(url, href)
            parsed_url = urlparse(full_url)

    	    # must be https
            if not parsed_url.scheme == "https":
                continue

            # must not have # in it
            if parsed_url.fragment:
                continue

            # Only allow URLs from allowed domains
            domain = parsed_url.netloc.lower()
            if not any(domain.endswith(allowed_domain) for allowed_domain in ALLOWED_DOMAINS):
                continue
            
            # Ensure the path has an allowed extension or no extension (to capture general webpages)
            path = parsed_url.path.lower()
            if path and "." in path and not any(path.endswith(ext) for ext in ALLOWED_EXTENSIONS):
                continue

            # Check if the URL is allowed based on disallowed paths
            if not is_allowed(full_url):
                continue

            filtered_links.add(full_url)
            
        # Avoid duplicates in visited or to_visit
        filtered_links.difference_update(visited)
        filtered_links.difference_update(to_visit)
        return filtered_links

    except Exception as e:
        log.error(f"Failed to fetch links from {url}: {e}")
        return set()


In [203]:
tuple(ALLOWED_EXTENSIONS)

('.pdf',)

In [215]:
full_url = "https://www.hdm-stuttgart.de/studies?page=1p"
parsed_url = urlparse(full_url)
# Ensure the path has an allowed extension or no extension (to capture general webpages)
path = parsed_url.path.lower()
print(parsed_url)
print(path)
if path and "." in path and not any(path.endswith(ext) for ext in ALLOWED_EXTENSIONS):
    print("nope")
else:
    print("yes")

ParseResult(scheme='https', netloc='www.hdm-stuttgart.de', path='/studies', params='', query='page=1p', fragment='')
/studies
yes


In [166]:
current_url = "https://www.hdm-stuttgart.de/index_html"
visited = set()
to_visit_set = set()
session = requests.Session()
response = session.get(current_url, timeout=10)
response.raise_for_status()

content_type = response.headers.get("Content-Type", "").lower()
log.info("Content type: %s", content_type)

# Process the content based on the content type
if "text/html" in content_type:
    log.info("Detected HTML: %s", current_url)
    soup = BeautifulSoup(response.content, "html.parser")

    # extracts all links from webpage, and adds them to the list/set if they are not
    # already visited or in the to_visit set
    new_links = extract_links(
        soup=soup, url=current_url, visited=visited, to_visit=to_visit_set
    )
    print(new_links)

2025-02-07 23:38:29 [INFO] __main__ - Content type: text/html; charset=utf-8
2025-02-07 23:38:29 [INFO] __main__ - Detected HTML: https://www.hdm-stuttgart.de/index_html
{'https://www.hdm-stuttgart.de/hochschule/aktuelles/terminkalender', 'https://www.hdm-stuttgart.de/view_news?ident=news20250108171856', 'https://www.hdm-stuttgart.de/studierende/mathevorkurse', 'https://www.hdm-stuttgart.de/hochschule/profil/leitwerte_leitbild', 'https://www.hdm-stuttgart.de/unternehmen/kooperation/deutschlandstipendium_foerderer', 'https://www.hdm-stuttgart.de/science/view_beitrag?science_beitrag_ID=866', 'https://www.hdm-stuttgart.de/hausordnung', 'https://www.hdm-stuttgart.de/hochschule/doepfert', 'https://www.hdm-stuttgart.de/science/view_beitrag?science_beitrag_ID=862', 'https://www.hdm-stuttgart.de/studierende/stundenplan/pers_stundenplan', 'https://www.hdm-stuttgart.de/isms', 'https://www.hdm-stuttgart.de/hochschule/aktuelles', 'https://www.hdm-stuttgart.de/unternehmen/careercenter', 'https://ww

In [115]:
def extract_links_old(soup: BeautifulSoup, url: str, visited: set[str], to_visit: set[str]) -> set[str]:
    """Extracts all valid links from a webpage.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object of the pag content.
        url (str): The URL of the page being parsed.

    Returns:
        Set[str]: A set of valid links extracted from the page.
    """
    try:
        # Extract all <a> tags with href attributes
        filtered_links = set()

        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            full_url = urljoin(url, href)

            if (
                "#" in full_url
                or "hdm" not in full_url
                or full_url.startswith("mailto:")
                or not full_url.lower().endswith((".pdf", ".html", ".htm", ""))
                or not full_url.startswith(("http://", "https://"))
            ):
                continue

            parsed_url = urlparse(full_url)
            domain = parsed_url.netloc.lower()
            if not any(domain.endswith(allowed) for allowed in ALLOWED_DOMAINS):
                continue

            if full_url not in visited and full_url not in to_visit:
                filtered_links.add(full_url)

        return filtered_links

    except requests.RequestException as e:
        log.error("Failed to fetch links from %s: %s", url, e)
        return set()

In [118]:
current_url = "https://www.hdm-stuttgart.de/hochschule/organisation/rektorat"
visited = set()
to_visit_set = set()
session = requests.Session()
response = session.get(current_url, timeout=10)
response.raise_for_status()

content_type = response.headers.get("Content-Type", "").lower()
log.info("Content type: %s", content_type)

# Process the content based on the content type
if "text/html" in content_type:
    log.info("Detected HTML: %s", current_url)
    soup = BeautifulSoup(response.content, "html.parser")

    # extracts all links from webpage, and adds them to the list/set if they are not
    # already visited or in the to_visit set
    new_links = extract_links_old(
        soup=soup, url=current_url, visited=visited, to_visit=to_visit_set
    )
    display(new_links)

2025-02-07 23:07:49 [INFO] __main__ - Content type: text/html; charset=utf-8
2025-02-07 23:07:49 [INFO] __main__ - Detected HTML: https://www.hdm-stuttgart.de/hochschule/organisation/rektorat


{'http://www.hdm-weiterbildung.de',
 'https://moodle.hdm-stuttgart.de',
 'https://ox.hdm-stuttgart.de/',
 'https://vs-hdm.de/',
 'https://www.hdm-stuttgart.de',
 'https://www.hdm-stuttgart.de/anfahrt',
 'https://www.hdm-stuttgart.de/barrierefreiheit',
 'https://www.hdm-stuttgart.de/bibliothek',
 'https://www.hdm-stuttgart.de/bilderupload/23sdfazu.jpg',
 'https://www.hdm-stuttgart.de/bilderupload/6rwvIho7BP.jpg',
 'https://www.hdm-stuttgart.de/bilderupload/OMCpE0KJYy.jpg',
 'https://www.hdm-stuttgart.de/bilderupload/T1mYQMfWLi.jpg',
 'https://www.hdm-stuttgart.de/bilderupload/rBvqrUFbuD.jpg',
 'https://www.hdm-stuttgart.de/datenschutz',
 'https://www.hdm-stuttgart.de/datenschutzverordnung_hdm',
 'https://www.hdm-stuttgart.de/download',
 'https://www.hdm-stuttgart.de/en/about/organization/rectorate',
 'https://www.hdm-stuttgart.de/hausordnung',
 'https://www.hdm-stuttgart.de/hochschule',
 'https://www.hdm-stuttgart.de/hochschule/aktuelles',
 'https://www.hdm-stuttgart.de/hochschule/aktue

In [2]:
import requests

url = "https://www.hdm-stuttgart.de/view_news?ident=news20241212154258"  # Replace with your URL
response = requests.head(url)

if "Last-Modified" in response.headers:
    print("Last Modified:", response.headers["Last-Modified"])
else:
    print("No Last-Modified header found.")

Last Modified: Wed, 06 Dec 2023 12:56:07 GMT


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
pad_token_id = (
    tokenizer.pad_token_id
    if tokenizer.pad_token_id is not None
    else tokenizer.eos_token_id
)

In [6]:
pad_token_id

128001

In [1]:
import time
import re
import os
import sys
from random import uniform
from collections import deque
from urllib.parse import urljoin, urlparse
import json
import torch

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

parent_dir = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
sys.path.append(parent_dir)

from utils import logger, create_directories_if_not_exist, save_to_file
from chroma_functions import ChromaDB
from rag import RAG
from utils import current_memory

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
RAG_MODEL = RAG(
    model_name="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    tokenizer_name="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
)

2025-02-12 21:57:25 [INFO] utils -  used space 1.2451839999999983, free space 11.632902144, total space 12.878086144 in GB
2025-02-12 21:57:26 [INFO] accelerate.utils.modeling - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.06s/it]


2025-02-12 21:57:35 [INFO] utils -  used space 7.379353599999999, free space 5.498732544, total space 12.878086144 in GB
2025-02-12 21:57:35 [INFO] rag - Initialized model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B


In [3]:
current_memory()

2025-02-12 21:57:35 [INFO] utils -  used space 7.379353599999999, free space 5.498732544, total space 12.878086144 in GB


In [14]:
# del RAG_MODEL
import gc
gc.collect()
torch.cuda.empty_cache()

In [15]:
current_memory()

2025-02-12 22:00:56 [INFO] utils -  used space 1.2703498239999984, free space 11.60773632, total space 12.878086144 in GB


In [14]:
RAG_MODEL.generate_text("wie lange geht der studiengang medieninformatik?")

2025-02-08 14:13:36 [INFO] utils -  used space 8.463581183999999, free space 4.41450496, total space 12.878086144 in GB
2025-02-08 14:13:36 [INFO] rag - Adding user query to prompt object


Batches: 100%|██████████| 1/1 [00:00<00:00, 296.54it/s]

2025-02-08 14:13:36 [INFO] chroma_functions - Closest document found: []
2025-02-08 14:13:36 [INFO] rag - No corresponding document found.
2025-02-08 14:13:36 [INFO] rag - Generating an answer for the query: wie lange geht der studiengang medieninformatik?
2025-02-08 14:13:36 [INFO] utils -  used space 8.463581183999999, free space 4.41450496, total space 12.878086144 in GB





2025-02-08 14:13:51 [INFO] rag - Question: wie lange geht der studiengang medieninformatik?
2025-02-08 14:13:51 [INFO] rag - Answer:  in stuttgart

Okay, so I need to figure out how long the study program in Media Information Technology in Stuttgart is. I'm not exactly sure about the specifics, but I can try to work it out.

First, I know that studying in Germany usually involves a Bachelor's and Master's degree. Maybe this program is structured that way. I should check the typical duration for these degrees in Germany.

In Germany, a Bachelor's degree typically takes six semesters, which is three years of study. A Master's degree usually takes another two years, so four semesters in total. That makes the entire program seven semesters or about four years.

Now, applying this to Media Information Technology. I think this might be a dual degree program, combining Bachelor and Master in one cycle. So instead of two separate degrees, it's a single six- or seven-semester program.

Wait, I 

(" in stuttgart\n\nOkay, so I need to figure out how long the study program in Media Information Technology in Stuttgart is. I'm not exactly sure about the specifics, but I can try to work it out.\n\nFirst, I know that studying in Germany usually involves a Bachelor's and Master's degree. Maybe this program is structured that way. I should check the typical duration for these degrees in Germany.\n\nIn Germany, a Bachelor's degree typically takes six semesters, which is three years of study. A Master's degree usually takes another two years, so four semesters in total. That makes the entire program seven semesters or about four years.\n\nNow, applying this to Media Information Technology. I think this might be a dual degree program, combining Bachelor and Master in one cycle. So instead of two separate degrees, it's a single six- or seven-semester program.\n\nWait, I should confirm if it's a combined program. Some universities in Germany offer combined degrees, where you earn both Bache

In [11]:
current_memory()

2025-02-08 14:10:18 [INFO] utils -  used space 7.933001727999999, free space 4.945084416, total space 12.878086144 in GB


In [10]:
torch.cuda.empty_cache()