fixes #65

WolfgangFahl · WolfgangFahl · commit 77d73d2ff8e0 · 2025-12-02T14:43:18.000+01:00
diff --git a/snapquery/github_access.py b/snapquery/github_access.py
@@ -4,28 +4,66 @@
 Created on 2025-12-02
 @author: wf
 """
+import json
+from  pathlib import Path
+from typing import List, Dict, Any, Optional, Union
+
 import requests
 
+
 class GitHub:
     """
     A simple GitHub API client for accessing repository contents.
     """
 
-    def __init__(self, owner: str, repo: str, token: str = None):
+    def __init__(self, owner: str, repo: str, token: Optional[str] = None, session: Optional[requests.Session] = None):
         """
         Initialize GitHub client.
 
         Args:
             owner: Repository owner (username or organization)
             repo: Repository name
             token: Optional GitHub API token for authentication
+            session: Optional requests.Session to reuse; a new Session is created when omitted
         """
         self.owner = owner
         self.repo = repo
         self.token = token
         self.base_url = f"https://api.github.com/repos/{owner}/{repo}"
+        # An explicit token wins; otherwise fall back to the token file
+        # (compatible with GitHubApi). This supersedes the plain assignment above.
+        self.token = token if token is not None else self._read_token()
+
+        # Use the caller's session when given, else create a fresh one
+        self.session = session or requests.Session()
+
+
+
+    def _headers(self) -> Dict[str, str]:
+        """
+        Build the HTTP headers for a GitHub API request.
+
+        Returns:
+            Headers with the GitHub v3 JSON Accept type, plus a
+            "token" Authorization header when self.token is set
+        """
+        headers = {"Accept": "application/vnd.github.v3+json"}
+        if self.token:
+            headers["Authorization"] = f"token {self.token}"
+        return headers
+
+    def _read_token(self) -> Optional[str]:
+        """
+        Read GitHub token from ~/.github/access_token.json
+        (compatible with GitHubApi token storage format).
+
+        The lookup is best effort: a missing, unreadable or malformed
+        token file never raises, it simply means "no token".
+
+        Returns:
+            The value of the "access_token" key, or None if the file is
+            missing, unreadable, not valid JSON or not a JSON object
+        """
+        token_path = Path.home() / ".github" / "access_token.json"
+        try:
+            # EAFP: open directly instead of exists() + open(), so a file that
+            # vanishes in between cannot raise
+            with open(token_path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+        except (OSError, ValueError):
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError
+            return None
+        # valid JSON need not be an object (e.g. a list) - only dicts offer .get
+        if not isinstance(data, dict):
+            return None
+        return data.get("access_token")
 
-    def get_contents(self, path: str = ""):
+
+    def get_contents(self, path: str = "") -> Union[List[Dict[str, Any]], Dict[str, Any]]:
         """
         Get contents of a directory or file from the repository.
 
@@ -36,11 +74,46 @@ def get_contents(self, path: str = ""):
             List of dictionaries for directories, or content for files
         """
         url = f"{self.base_url}/contents/{path}"
-        headers = {"Accept": "application/vnd.github.v3+json"}
+        response = self.session.get(url, headers=self._headers(), timeout=30)
+        response.raise_for_status()
+        return response.json()
 
-        if self.token:
-            headers["Authorization"] = f"token {self.token}"
+    def list_files_recursive(self, path: str = "", suffix: Optional[str] = None) -> List[Dict[str, Any]]:
+        """
+        Recursively list files under a given path. Optionally filter by file suffix.
+
+        Every directory visited costs one get_contents() call, so large
+        trees need many requests. Errors raised by get_contents() propagate.
+
+        Args:
+            path: starting path within the repository
+            suffix: optional filename suffix filter, e.g., ".ttl"
+
+        Returns:
+            A flat list of GitHub content item dicts for files.
+        """
+        items = self.get_contents(path)
+        # get_contents may return a single dict instead of a list - normalize to a list
+        if isinstance(items, dict):
+            items = [items]
+
+        files: List[Dict[str, Any]] = []
+        for item in items:
+            item_type = item.get("type")
+            item_path = item.get("path", "")
+            if item_type == "file":
+                # the suffix is matched against the item's path
+                if suffix is None or item_path.endswith(suffix):
+                    files.append(item)
+            elif item_type == "dir":
+                # descend into the sub directory, keeping the same suffix filter
+                files.extend(self.list_files_recursive(item_path, suffix=suffix))
+            # items of any other type are ignored
+        return files
+
+    def download(self, download_url: str) -> str:
+        """
+        Download raw file content via a download_url.
 
-        response = requests.get(url, headers=headers)
+        Args:
+            download_url: The GitHub-provided raw download URL
+
+        Returns:
+            The text content of the file
+
+        Raises:
+            requests.HTTPError: if the response status signals an error (raise_for_status)
+        """
+        # NOTE(review): _headers() adds the Authorization token (if any) for whatever
+        # URL is passed in - presumably callers only pass GitHub download URLs; confirm
+        response = self.session.get(download_url, headers=self._headers(), timeout=30)
         response.raise_for_status()
-        return response.json()
+        return response.text
diff --git a/snapquery/samples/endpoints.yaml b/snapquery/samples/endpoints.yaml
@@ -76,6 +76,31 @@ endpoints:
       - blazegraph
       - commons
 
+  wikidata-qlever:
+    description: QLever Freiburg Wikidata.
+    lang: sparql
+    method: POST
+    database: qlever
+    endpoint: https://qlever.dev/api/wikidata
+    website: https://qlever.dev/wikidata
+    calls_per_minute: 60
+    prefix_sets:
+      - rdf
+      - wikidata
+      - commons
+
+  wikidata-qlever-dbis:
+    description: QLever RWTH Wikidata.
+    lang: sparql
+    method: POST
+    database: qlever
+    endpoint: https://qlever-api.wikidata.dbis.rwth-aachen.de/
+    website: https://qlever.wikidata.dbis.rwth-aachen.de/wikidata
+    calls_per_minute: 60
+    prefix_sets:
+      - rdf
+      - wikidata
+
   dblp:
     description: DBLP official (qlever - reliable).
     lang: sparql
diff --git a/snapquery/samples/endpoints_optional.yaml b/snapquery/samples/endpoints_optional.yaml
@@ -26,30 +26,6 @@ endpoints:
       - rdf
       - wikidata
 
-  wikidata-qlever:
-    description: QLever Freiburg Wikidata.
-    lang: sparql
-    method: POST
-    database: qlever
-    endpoint: https://qlever.cs.uni-freiburg.de/api/wikidata
-    website: https://qlever.cs.uni-freiburg.de/wikidata
-    calls_per_minute: 60
-    prefix_sets:
-      - rdf
-      - wikidata
-      - commons
-
-  wikidata-qlever-dbis:
-    description: QLever RWTH Wikidata.
-    lang: sparql
-    method: POST
-    database: qlever
-    endpoint: https://qlever-api.wikidata.dbis.rwth-aachen.de/
-    website: https://qlever.wikidata.dbis.rwth-aachen.de/wikidata
-    calls_per_minute: 60
-    prefix_sets:
-      - rdf
-      - wikidata
 
   wikidata-scatter:
     description: Scatter experimental.
diff --git a/snapquery/snapquery_cmd.py b/snapquery/snapquery_cmd.py
@@ -3,7 +3,7 @@
 
 @author: wf
 """
-
+from tqdm import tqdm
 import logging
 import sys
 from argparse import ArgumentParser
@@ -87,6 +87,12 @@ def getArgParser(self, description: str, version_msg) -> ArgumentParser:
             action=StoreDictKeyPair,
             help="query parameters as Key-value pairs in the format key1=value1,key2=value2",
         )
+        parser.add_argument(
+            "--progress",
+            action="store_true",
+            help="show progress bars when testing queries (--testQueries)",
+        )
+
         parser.add_argument(
             "--domain",
             type=str,
@@ -144,6 +150,56 @@ def cmd_parse(self, argv: Optional[list] = None):
             self.args.func(self.args)
         return self.args
 
+    def handle_test_queries(self):
+        """
+        Handle the --testQueries option by executing queries against endpoints.
+        The endpoint is the outer loop, queries are the inner loop.
+
+        The endpoint given via --endpointName is used when set, otherwise all
+        endpoints of the named query manager are tested. Queries are selected
+        by --domain and --namespace. With --progress a tqdm bar is shown for
+        the endpoints and a transient one for the queries of each endpoint.
+        """
+        # Determine which endpoints to use
+        if self.args.endpointName:
+            endpoint_names = [self.args.endpointName]
+        else:
+            endpoint_names = list(self.nqm.endpoints.keys())
+
+        # Get all queries to test
+        queries = self.nqm.get_all_queries(domain=self.args.domain, namespace=self.args.namespace)
+
+        # Create execution instance
+        execution = Execution(self.nqm, debug=self.args.debug)
+
+        # Loop invariants: resolve once instead of once per endpoint/query pair
+        show_progress = self.args.progress
+        total = len(queries)
+        prefix_merger = QueryPrefixMerger.get_by_name(self.args.prefix_merger)
+
+        # Outer loop: endpoints
+        endpoint_iter = tqdm(endpoint_names, desc="Testing endpoints") if show_progress else endpoint_names
+        for endpoint_name in endpoint_iter:
+            # Inner loop: queries
+            query_iter = tqdm(queries, desc=f"Queries for {endpoint_name}", leave=False) if show_progress else queries
+            for i, nq in enumerate(query_iter, start=1):
+                execution.execute(
+                    nq,
+                    endpoint_name=endpoint_name,
+                    context=self.args.context,
+                    title=f"{endpoint_name}::query {i:3}/{total}",
+                    prefix_merger=prefix_merger,
+                )
+
+    def handle_test_queries_no_progress_version(self):
+        """
+        Variant of handle_test_queries with the loops swapped:
+        queries are the outer loop, endpoints are the inner loop.
+
+        Despite its name this variant still wraps the query loop in a tqdm
+        progress bar when --progress is set.
+        NOTE(review): no caller is visible in this change - presumably kept
+        for reference; confirm before relying on it.
+        """
+        if self.args.endpointName:
+            endpoint_names = [self.args.endpointName]
+        else:
+            endpoint_names = list(self.nqm.endpoints.keys())
+        queries = self.nqm.get_all_queries(domain=self.args.domain, namespace=self.args.namespace)
+        execution = Execution(self.nqm, debug=self.args.debug)
+        # a single progress bar over the queries (outer loop) when requested
+        query_iter = tqdm(queries, desc="Testing queries") if self.args.progress else queries
+        for i, nq in enumerate(query_iter, start=1):
+            # each query is run against every selected endpoint before moving on
+            for endpoint_name in endpoint_names:
+                execution.execute(
+                    nq,
+                    endpoint_name=endpoint_name,
+                    context=self.args.context,
+                    title=f"query {i:3}/{len(queries)}::{endpoint_name}",
+                    prefix_merger=QueryPrefixMerger.get_by_name(self.args.prefix_merger),
+                )
+
+
     def handle_args(self, args) -> bool:
         """
         handle the command line args
@@ -174,21 +230,8 @@ def handle_args(self, args) -> bool:
                 print(f"{namespace}:{count}")
             handled = True
         elif self.args.testQueries:
-            if self.args.endpointName:
-                endpoint_names = [self.args.endpointName]
-            else:
-                endpoint_names = list(nqm.endpoints.keys())
-            queries = self.nqm.get_all_queries(domain=self.args.domain, namespace=self.args.namespace)
-            execution = Execution(self.nqm, debug=self.args.debug)
-            for i, nq in enumerate(queries, start=1):
-                for endpoint_name in endpoint_names:
-                    execution.execute(
-                        nq,
-                        endpoint_name=endpoint_name,
-                        context=self.args.context,
-                        title=f"query {i:3}/{len(queries)}::{endpoint_name}",
-                        prefix_merger=QueryPrefixMerger.get_by_name(self.args.prefix_merger),
-                    )
+            self.handle_test_queries()
+            handled = True
         elif self.args.queryName is not None or self.args.query_id is not None:
             if self.args.query_id is not None:
                 query_name = QueryName.from_query_id(self.args.query_id)
diff --git a/snapquery/snapquery_core.py b/snapquery/snapquery_core.py
@@ -728,7 +728,7 @@ def from_samples(
             for source_class, pk in [
                 (NamedQuery, "query_id"),
                 (QueryStats, "stats_id"),
-                (QueryDetails, "quer_id"),
+                (QueryDetails, "query_id"),
             ]:
                 # Fetch sample records from the specified class
                 sample_records = cls.get_sample_records(source_class=source_class)
diff --git a/tests/test_sib_examples.py b/tests/test_sib_examples.py
@@ -1,35 +1,74 @@
 """
+Test SIB SPARQL Examples fetching.
+Verifies integration between SIB fetcher and Snapquery Core.
 
-SIB Swiss Institute of Bioinformatics
-
-sparql examples
-
-@author wf
+Created on 2025-12-02
+@author: wf
 """
+import os
 import unittest
-
 from basemkit.basetest import Basetest
-from rdflib import Graph
-
+from snapquery.snapquery_core import NamedQueryManager
+from snapquery.sib_sparql_examples import SibSparqlExamples
 
 class TestSibExamples(Basetest):
     """
-    Test for Issue #59
-    https://github.com/WolfgangFahl/snapquery/issues/59
-    https://github.com/sib-swiss/sparql-examples/
-
-    Snap query should consider using the sparql-examples style of encoding queries as their own entities as the basis of it's data interchange
+    Test retrieving SIB SPARQL examples using GitHub cache/api.
     """
 
-    def setUp(self, debug=False, profile=True):
+    def setUp(self, debug=True, profile=True):
         Basetest.setUp(self, debug=debug, profile=profile)
 
-
-    @unittest.skip("needs github clone to work -postpone")
-    def testBgee(self):
+    @unittest.skipIf(Basetest.inPublicCI(), "avoid github rate limit in CI")
+    def test_sib_examples_fetch_and_store(self):
         """
-        test a single example
+        Test retrieving SIB examples, populating the DB, and exporting to YAML.
         """
-        g= Graph().parse("examples/Bgee/001.ttl", format="turtle")
-        query = g.value(None, g.namespace("sh")["select"])  # Gets sh:select literal
-        print(query)  # Full SPARQL text
+        # NOTE(review): fixed /tmp paths - not portable beyond POSIX and shared
+        # between runs; leftovers of a previous run are removed below
+        db_path = "/tmp/sib_examples.db"
+        yaml_path = "/tmp/sib_examples.yaml"
+
+        if os.path.exists(db_path):
+            os.remove(db_path)
+        if os.path.exists(yaml_path):
+            os.remove(yaml_path)
+
+        nqm = NamedQueryManager.from_samples(db_path=db_path)
+        sib_fetcher = SibSparqlExamples(nqm, debug=self.debug)
+
+        # Limit for testing efficiency
+        limit = 7  # fixed small limit; this test is skipped in public CI anyway
+        if self.debug:
+            print(f"Fetching SIB examples (limit={limit})...")
+
+        loaded_queries = sib_fetcher.extract_queries(limit=limit, debug_print=self.debug)
+
+        self.assertTrue(len(loaded_queries) > 0, "Should have loaded at least one query")
+        self.assertEqual(len(sib_fetcher.named_query_set), len(loaded_queries))
+
+        # Verify SQL Database Storage
+        records = nqm.sql_db.query(
+            """
+            SELECT count(*) as count
+            FROM NamedQuery
+            WHERE namespace=? AND domain=?
+            """,
+            (sib_fetcher.named_query_set.namespace, sib_fetcher.named_query_set.domain)
+        )
+        db_count = records[0]['count']
+        self.assertEqual(db_count, len(loaded_queries), "DB count should match loaded count")
+
+        # Verify YAML Export
+        if self.debug:
+            print(f"Exporting to {yaml_path}...")
+        sib_fetcher.save_to_yaml(yaml_path)
+
+        self.assertTrue(os.path.exists(yaml_path))
+
+        # Read the export back and check for the expected markers in the YAML text
+        with open(yaml_path, 'r') as f:
+            content = f.read()
+            self.assertIn("sib-examples", content)
+            self.assertIn("queries:", content)
+
+        if self.debug:
+            print(f"Successfully processed {db_count} queries.")