In [2]:
!pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.1.1-py3-none-any.whl.metadata (11 kB)
Collecting isodate<1.0.0,>=0.7.2 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.1.1-py3-none-any.whl (562 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.4/562.4 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.7.2 rdflib-7.1.1


In [3]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time

In [4]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time

# Set up SPARQL endpoint
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Initialize an empty list to store data
data = []

# Define query parameters
rows_per_query = 50  # Number of rows to fetch per query
total_records = 100  # Total number of records to fetch
retry_delay = 5  # Delay in seconds before retrying on failure

# SPARQL query loop
for offset in range(0, total_records, rows_per_query):
    print(f"Fetching rows {offset + 1} to {offset + rows_per_query}...")

    # SPARQL query with pagination
    sparql.setQuery(f"""
        SELECT ?tree ?name ?family ?genus ?order ?nativeArea ?conservationStatus
        WHERE {{
            ?tree a dbo:Plant .
            ?tree foaf:name ?name .
            OPTIONAL {{ ?tree dbo:family ?family . }}
            OPTIONAL {{ ?tree dbo:genus ?genus . }}
            OPTIONAL {{ ?tree dbo:order ?order . }}
            OPTIONAL {{ ?tree dbo:nativeArea ?nativeArea . }}
            OPTIONAL {{ ?tree dbo:conservationStatus ?conservationStatus . }}
            FILTER (lang(?name) = 'en')  # Ensure English results
        }}
        LIMIT {rows_per_query} OFFSET {offset}
    """)
    sparql.setReturnFormat(JSON)

    success = False
    retries = 3  # Retry count

    while not success and retries > 0:
        try:
            # Execute query
            results = sparql.query().convert()
            for result in results["results"]["bindings"]:
                # Append result to data
                data.append({
                    "Tree": result["tree"]["value"],
                    "Name": result["name"]["value"],
                    "Family": result.get("family", {}).get("value"),
                    "Genus": result.get("genus", {}).get("value"),
                    "Order": result.get("order", {}).get("value"),
                    "Native Region": result.get("nativeArea", {}).get("value"),
                    "Conservation Status": result.get("conservationStatus", {}).get("value"),
                })
            success = True
        except Exception as e:
            print(f"Error occurred for offset {offset}: {e}")
            retries -= 1
            if retries > 0:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print("Skipping this batch after multiple failed attempts.")

    # Add a delay to avoid rate-limiting
    time.sleep(1)

# Convert collected data to a DataFrame
df = pd.DataFrame(data)

# Save the dataset to a CSV file
df.to_csv("tree_dataset.csv", index=False)
print("Dataset saved as tree_dataset.csv")

# Display the first few rows of the dataset
print(df.head())


Fetching rows 1 to 50...
Fetching rows 51 to 100...
Dataset saved as tree_dataset.csv
                                                Tree                    Name  \
0       http://dbpedia.org/resource/American_ginseng        American ginseng   
1       http://dbpedia.org/resource/American_ginseng        American ginseng   
2       http://dbpedia.org/resource/American_ginseng        American ginseng   
3       http://dbpedia.org/resource/American_ginseng        American ginseng   
4  http://dbpedia.org/resource/Amesiella_philippe...  Amesiella philippensis   

                                       Family  \
0     http://dbpedia.org/resource/Aralioideae   
1      http://dbpedia.org/resource/Araliaceae   
2     http://dbpedia.org/resource/Aralioideae   
3      http://dbpedia.org/resource/Araliaceae   
4  http://dbpedia.org/resource/Epidendroideae   

                                   Genus  \
0      http://dbpedia.org/resource/Panax   
1      http://dbpedia.org/resource/Panax   
2     

In [6]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time

# Set up SPARQL endpoint
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Initialize an empty list to store data
data = []

# Define query parameters
rows_per_query = 50  # Number of rows to fetch per query
total_records = 100  # Total number of records to fetch
retry_delay = 5  # Delay in seconds before retrying on failure

# SPARQL query loop
for offset in range(0, total_records, rows_per_query):
    print(f"Fetching rows {offset + 1} to {offset + rows_per_query}...")

    # SPARQL query with pagination (fetch only ?tree)
    sparql.setQuery(f"""
        SELECT DISTINCT ?tree
        WHERE {{
            ?tree a dbo:Plant .
        }}
        LIMIT {rows_per_query} OFFSET {offset}
    """)
    sparql.setReturnFormat(JSON)

    success = False
    retries = 3  # Retry count

    while not success and retries > 0:
        try:
            # Execute query
            results = sparql.query().convert()
            for result in results["results"]["bindings"]:
                # Append only the Tree link
                data.append({"Tree": result["tree"]["value"]})
            success = True
        except Exception as e:
            print(f"Error occurred for offset {offset}: {e}")
            retries -= 1
            if retries > 0:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print("Skipping this batch after multiple failed attempts.")

    # Add a delay to avoid rate-limiting
    time.sleep(1)

# Convert collected data to a DataFrame
df = pd.DataFrame(data)

# Save the dataset to a CSV file
df.to_csv("tree_dataset_100.csv", index=False)
print("tree_dataset_100.csv")

# Display the first few rows of the dataset
print(df.head())


Fetching rows 1 to 50...
Fetching rows 51 to 100...
tree_dataset_100.csv
                                               Tree
0      http://dbpedia.org/resource/Cadaba_insularis
1  http://dbpedia.org/resource/Caesalpinia_coriaria
2          http://dbpedia.org/resource/Caia_(plant)
3              http://dbpedia.org/resource/Calabash
4     http://dbpedia.org/resource/Caladenia_lyallii


In [7]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time

# Set up SPARQL endpoint
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Initialize an empty list to store data
data = []

# Define query parameters
rows_per_query = 100  # Number of rows to fetch per query
total_records = 1000  # Total number of records to fetch
retry_delay = 5  # Delay in seconds before retrying on failure

# SPARQL query loop
for offset in range(0, total_records, rows_per_query):
    print(f"Fetching rows {offset + 1} to {offset + rows_per_query}...")

    # SPARQL query with pagination (fetch only ?tree)
    sparql.setQuery(f"""
        SELECT DISTINCT ?tree
        WHERE {{
            ?tree a dbo:Plant .
        }}
        LIMIT {rows_per_query} OFFSET {offset}
    """)
    sparql.setReturnFormat(JSON)

    success = False
    retries = 3  # Retry count

    while not success and retries > 0:
        try:
            # Execute query
            results = sparql.query().convert()
            for result in results["results"]["bindings"]:
                # Append only the Tree link
                data.append({"Tree": result["tree"]["value"]})
            success = True
        except Exception as e:
            print(f"Error occurred for offset {offset}: {e}")
            retries -= 1
            if retries > 0:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print("Skipping this batch after multiple failed attempts.")

    # Add a delay to avoid rate-limiting
    time.sleep(1)

# Convert collected data to a DataFrame
df = pd.DataFrame(data)

# Save the dataset to a CSV file
df.to_csv("tree_dataset_1000.csv", index=False)
print("tree_dataset_1000.csv")

# Display the first few rows of the dataset
print(df.head())


Fetching rows 1 to 100...
Fetching rows 101 to 200...
Fetching rows 201 to 300...
Fetching rows 301 to 400...
Fetching rows 401 to 500...
Fetching rows 501 to 600...
Fetching rows 601 to 700...
Fetching rows 701 to 800...
Fetching rows 801 to 900...
Fetching rows 901 to 1000...
tree_dataset_1000.csv
                                               Tree
0      http://dbpedia.org/resource/Cadaba_insularis
1  http://dbpedia.org/resource/Caesalpinia_coriaria
2          http://dbpedia.org/resource/Caia_(plant)
3              http://dbpedia.org/resource/Calabash
4     http://dbpedia.org/resource/Caladenia_lyallii


In [8]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time

# Set up SPARQL endpoint
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Initialize an empty list to store data
data = []

# Define query parameters
rows_per_query = 500  # Number of rows to fetch per query
total_records = 10000  # Total number of records to fetch
retry_delay = 5  # Delay in seconds before retrying on failure

# SPARQL query loop
for offset in range(0, total_records, rows_per_query):
    print(f"Fetching rows {offset + 1} to {offset + rows_per_query}...")

    # SPARQL query with pagination (fetch only ?tree)
    sparql.setQuery(f"""
        SELECT DISTINCT ?tree
        WHERE {{
            ?tree a dbo:Plant .
        }}
        LIMIT {rows_per_query} OFFSET {offset}
    """)
    sparql.setReturnFormat(JSON)

    success = False
    retries = 3  # Retry count

    while not success and retries > 0:
        try:
            # Execute query
            results = sparql.query().convert()
            for result in results["results"]["bindings"]:
                # Append only the Tree link
                data.append({"Tree": result["tree"]["value"]})
            success = True
        except Exception as e:
            print(f"Error occurred for offset {offset}: {e}")
            retries -= 1
            if retries > 0:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print("Skipping this batch after multiple failed attempts.")

    # Add a delay to avoid rate-limiting
    time.sleep(1)

# Convert collected data to a DataFrame
df = pd.DataFrame(data)

# Save the dataset to a CSV file
df.to_csv("tree_dataset_10K.csv", index=False)
print("tree_dataset_10K.csv")

# Display the first few rows of the dataset
print(df.head())


Fetching rows 1 to 500...
Fetching rows 501 to 1000...
Fetching rows 1001 to 1500...
Fetching rows 1501 to 2000...
Fetching rows 2001 to 2500...
Fetching rows 2501 to 3000...
Fetching rows 3001 to 3500...
Fetching rows 3501 to 4000...
Fetching rows 4001 to 4500...
Fetching rows 4501 to 5000...
Fetching rows 5001 to 5500...
Fetching rows 5501 to 6000...
Fetching rows 6001 to 6500...
Fetching rows 6501 to 7000...
Fetching rows 7001 to 7500...
Fetching rows 7501 to 8000...
Fetching rows 8001 to 8500...
Fetching rows 8501 to 9000...
Fetching rows 9001 to 9500...
Fetching rows 9501 to 10000...
tree_dataset_10K.csv
                                               Tree
0      http://dbpedia.org/resource/Cadaba_insularis
1  http://dbpedia.org/resource/Caesalpinia_coriaria
2          http://dbpedia.org/resource/Caia_(plant)
3              http://dbpedia.org/resource/Calabash
4     http://dbpedia.org/resource/Caladenia_lyallii


In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time

# Set up SPARQL endpoint
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Initialize an empty list to store data
data = []

# Define query parameters
rows_per_query = 500  # Number of rows to fetch per query
total_records = 1000000  # Total number of records to fetch
retry_delay = 5  # Delay in seconds before retrying on failure

# SPARQL query loop
for offset in range(0, total_records, rows_per_query):
    print(f"Fetching rows {offset + 1} to {offset + rows_per_query}...")

    # SPARQL query with pagination (fetch only ?tree)
    sparql.setQuery(f"""
        SELECT DISTINCT ?tree
        WHERE {{
            ?tree a dbo:Plant .
        }}
        LIMIT {rows_per_query} OFFSET {offset}
    """)
    sparql.setReturnFormat(JSON)

    success = False
    retries = 3  # Retry count

    while not success and retries > 0:
        try:
            # Execute query
            results = sparql.query().convert()
            for result in results["results"]["bindings"]:
                # Append only the Tree link
                data.append({"Tree": result["tree"]["value"]})
            success = True
        except Exception as e:
            print(f"Error occurred for offset {offset}: {e}")
            retries -= 1
            if retries > 0:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print("Skipping this batch after multiple failed attempts.")

    # Add a delay to avoid rate-limiting
    time.sleep(1)

# Convert collected data to a DataFrame
df = pd.DataFrame(data)

# Save the dataset to a CSV file
df.to_csv("tree_dataset_1M.csv", index=False)
print("tree_dataset_1M.csv")

# Display the first few rows of the dataset
print(df.head())


Fetching rows 1 to 500...
Fetching rows 501 to 1000...
Fetching rows 1001 to 1500...
Fetching rows 1501 to 2000...
Fetching rows 2001 to 2500...
Fetching rows 2501 to 3000...
Fetching rows 3001 to 3500...
Fetching rows 3501 to 4000...
Fetching rows 4001 to 4500...
Fetching rows 4501 to 5000...
Fetching rows 5001 to 5500...
Fetching rows 5501 to 6000...
Fetching rows 6001 to 6500...
Fetching rows 6501 to 7000...
Fetching rows 7001 to 7500...
Fetching rows 7501 to 8000...
Fetching rows 8001 to 8500...
Fetching rows 8501 to 9000...
Fetching rows 9001 to 9500...
Fetching rows 9501 to 10000...
Fetching rows 10001 to 10500...
Fetching rows 10501 to 11000...
Fetching rows 11001 to 11500...
Fetching rows 11501 to 12000...
Fetching rows 12001 to 12500...
Fetching rows 12501 to 13000...
Fetching rows 13001 to 13500...
Fetching rows 13501 to 14000...
Fetching rows 14001 to 14500...
Fetching rows 14501 to 15000...
Fetching rows 15001 to 15500...
Fetching rows 15501 to 16000...
Fetching rows 16001 