In [2]:
!pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.1.1-py3-none-any.whl.metadata (11 kB)
Collecting isodate<1.0.0,>=0.7.2 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.1.1-py3-none-any.whl (562 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.4/562.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.7.2 rdflib-7.1.1


In [3]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

In [4]:

sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Adjusted SPARQL query
sparql.setQuery("""
    SELECT ?device ?name ?manufacturer ?releaseDate
    WHERE {
        ?device a dbo:Device .
        ?device foaf:name ?name .
        OPTIONAL { ?device dbo:manufacturer ?manufacturer . }
        OPTIONAL { ?device dbo:releaseDate ?releaseDate . }
        FILTER (lang(?name) = 'en')
    }
    LIMIT 100
""")

sparql.setReturnFormat(JSON)

try:
    results = sparql.query().convert()
    # Process results
    data = []
    for result in results["results"]["bindings"]:
        data.append({
            "Device": result["device"]["value"],
            "Name": result["name"]["value"],
            "Manufacturer": result.get("manufacturer", {}).get("value"),
            "Release Date": result.get("releaseDate", {}).get("value"),
        })

    df = pd.DataFrame(data)
    df.to_csv("electronic_devices_fixed.csv", index=False)
    print("Dataset saved as electronic_devices_fixed.csv")
    print(df.head())
except Exception as e:
    print(f"Error: {e}")


Dataset saved as electronic_devices_fixed.csv
                                              Device  \
0  http://dbpedia.org/resource/Casio_G'zOne_Commando   
1         http://dbpedia.org/resource/Royole_FlexPai   
2      http://dbpedia.org/resource/Samsung_Continuum   
3  http://dbpedia.org/resource/Samsung_Galaxy_(or...   
4     http://dbpedia.org/resource/Samsung_Galaxy_A01   

                                   Name  \
0                 Casio G'zOne Commando   
1                        Royole FlexPai   
2                Samsung Continuum i400   
3  Samsung Galaxy (GT-I7500, GT-I7500L)   
4                    Samsung Galaxy A01   

                                        Manufacturer Release Date  
0  http://dbpedia.org/resource/NEC_Casio_Mobile_C...   2011-04-28  
1                 http://dbpedia.org/resource/Royole   2018-01-08  
2    http://dbpedia.org/resource/Samsung_Electronics   2010-06-04  
3                http://dbpedia.org/resource/Samsung   2009-06-29  
4    http://dbpedi

In [5]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

# Set up the SPARQL endpoint
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Initialize an empty list to store data
data = []

# Define query parameters
rows_per_query = 100  # Number of rows to fetch per query
total_records = 1000  # Target total number of records

for offset in range(0, total_records, rows_per_query):
    # SPARQL query with pagination
    sparql.setQuery(f"""
        SELECT ?device ?name ?manufacturer ?releaseDate
        WHERE {{
            ?device a dbo:Device .
            ?device foaf:name ?name .
            OPTIONAL {{ ?device dbo:manufacturer ?manufacturer . }}
            OPTIONAL {{ ?device dbo:releaseDate ?releaseDate . }}
            FILTER (lang(?name) = 'en')
        }}
        LIMIT {rows_per_query} OFFSET {offset}
    """)
    sparql.setReturnFormat(JSON)

    try:
        # Execute the query
        results = sparql.query().convert()
        for result in results["results"]["bindings"]:
            # Append data to the list
            data.append({
                "Device": result["device"]["value"],
                "Name": result["name"]["value"],
                "Manufacturer": result.get("manufacturer", {}).get("value"),
                "Release Date": result.get("releaseDate", {}).get("value"),
            })
    except Exception as e:
        print(f"Error occurred for offset {offset}: {e}")
        break

# Convert the collected data to a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv("electronic_devices_1000.csv", index=False)
print("Dataset saved as electronic_devices_1000.csv")

# Display the first few rows
print(df.head())


Dataset saved as electronic_devices_1000.csv
                                              Device  \
0  http://dbpedia.org/resource/Casio_G'zOne_Commando   
1         http://dbpedia.org/resource/Royole_FlexPai   
2      http://dbpedia.org/resource/Samsung_Continuum   
3  http://dbpedia.org/resource/Samsung_Galaxy_(or...   
4     http://dbpedia.org/resource/Samsung_Galaxy_A01   

                                   Name  \
0                 Casio G'zOne Commando   
1                        Royole FlexPai   
2                Samsung Continuum i400   
3  Samsung Galaxy (GT-I7500, GT-I7500L)   
4                    Samsung Galaxy A01   

                                        Manufacturer Release Date  
0  http://dbpedia.org/resource/NEC_Casio_Mobile_C...   2011-04-28  
1                 http://dbpedia.org/resource/Royole   2018-01-08  
2    http://dbpedia.org/resource/Samsung_Electronics   2010-06-04  
3                http://dbpedia.org/resource/Samsung   2009-06-29  
4    http://dbpedia

In [6]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time  # To add delay if needed to avoid overloading the endpoint

# Set up the SPARQL endpoint
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Initialize an empty list to store data
data = []

# Define query parameters
rows_per_query = 500  # Number of rows to fetch per query
total_records = 10000  # Target total number of records

for offset in range(0, total_records, rows_per_query):
    print(f"Fetching rows {offset + 1} to {offset + rows_per_query}...")

    # SPARQL query with pagination
    sparql.setQuery(f"""
        SELECT ?device ?name ?manufacturer ?releaseDate
        WHERE {{
            ?device a dbo:Device .
            ?device foaf:name ?name .
            OPTIONAL {{ ?device dbo:manufacturer ?manufacturer . }}
            OPTIONAL {{ ?device dbo:releaseDate ?releaseDate . }}
            FILTER (lang(?name) = 'en')
        }}
        LIMIT {rows_per_query} OFFSET {offset}
    """)
    sparql.setReturnFormat(JSON)

    try:
        # Execute the query
        results = sparql.query().convert()
        for result in results["results"]["bindings"]:
            # Append data to the list
            data.append({
                "Device": result["device"]["value"],
                "Name": result["name"]["value"],
                "Manufacturer": result.get("manufacturer", {}).get("value"),
                "Release Date": result.get("releaseDate", {}).get("value"),
            })

        # Add a small delay to avoid overloading the endpoint
        time.sleep(1)  # Optional: Adjust if you experience rate-limiting issues
    except Exception as e:
        print(f"Error occurred for offset {offset}: {e}")
        break

# Convert the collected data to a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv("electronic_devices_10000.csv", index=False)
print("Dataset saved as electronic_devices_10000.csv")

# Display the first few rows
print(df.head())


Fetching rows 1 to 500...
Fetching rows 501 to 1000...
Fetching rows 1001 to 1500...
Fetching rows 1501 to 2000...
Fetching rows 2001 to 2500...
Fetching rows 2501 to 3000...
Fetching rows 3001 to 3500...
Fetching rows 3501 to 4000...
Fetching rows 4001 to 4500...
Fetching rows 4501 to 5000...
Fetching rows 5001 to 5500...
Fetching rows 5501 to 6000...
Fetching rows 6001 to 6500...
Fetching rows 6501 to 7000...
Fetching rows 7001 to 7500...
Fetching rows 7501 to 8000...
Fetching rows 8001 to 8500...
Fetching rows 8501 to 9000...
Fetching rows 9001 to 9500...
Fetching rows 9501 to 10000...
Dataset saved as electronic_devices_10000.csv
                                              Device  \
0  http://dbpedia.org/resource/Casio_G'zOne_Commando   
1         http://dbpedia.org/resource/Royole_FlexPai   
2      http://dbpedia.org/resource/Samsung_Continuum   
3  http://dbpedia.org/resource/Samsung_Galaxy_(or...   
4     http://dbpedia.org/resource/Samsung_Galaxy_A01   

                     

In [7]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time

# Set up the SPARQL endpoint
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Initialize an empty list to store data
data = []

# Define query parameters
rows_per_query = 500  # Number of rows to fetch per query
total_records = 1000000  # Target total number of records
retry_delay = 5  # Seconds to wait before retrying a failed query

for offset in range(0, total_records, rows_per_query):
    print(f"Fetching rows {offset + 1} to {offset + rows_per_query}...")

    # SPARQL query with pagination
    sparql.setQuery(f"""
        SELECT ?device ?name ?manufacturer ?releaseDate
        WHERE {{
            ?device a dbo:Device .
            ?device foaf:name ?name .
            OPTIONAL {{ ?device dbo:manufacturer ?manufacturer . }}
            OPTIONAL {{ ?device dbo:releaseDate ?releaseDate . }}
            FILTER (lang(?name) = 'en')
        }}
        LIMIT {rows_per_query} OFFSET {offset}
    """)
    sparql.setReturnFormat(JSON)

    success = False
    retries = 3  # Number of retries for each query

    while not success and retries > 0:
        try:
            # Execute the query
            results = sparql.query().convert()
            for result in results["results"]["bindings"]:
                # Append data to the list
                data.append({
                    "Device": result["device"]["value"],
                    "Name": result["name"]["value"],
                    "Manufacturer": result.get("manufacturer", {}).get("value"),
                    "Release Date": result.get("releaseDate", {}).get("value"),
                })
            success = True
        except Exception as e:
            print(f"Error occurred for offset {offset}: {e}")
            retries -= 1
            if retries > 0:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print("Skipping this batch after multiple failed attempts.")

    # Optional: Save intermediate results periodically to avoid data loss
    if offset % 10000 == 0 and offset > 0:
        df_partial = pd.DataFrame(data)
        df_partial.to_csv(f"electronic_devices_partial_{offset}.csv", index=False)
        print(f"Saved intermediate dataset up to offset {offset}.")

    # Add a small delay to avoid overloading the endpoint
    time.sleep(1)  # Optional: Adjust this delay if rate limits occur

# Convert the collected data to a DataFrame
df = pd.DataFrame(data)

# Save the complete dataset to a CSV file
df.to_csv("electronic_devices_1M.csv", index=False)
print("Dataset saved as electronic_devices_1M.csv")

# Display the first few rows
print(df.head())


Fetching rows 1 to 500...
Fetching rows 501 to 1000...
Fetching rows 1001 to 1500...
Fetching rows 1501 to 2000...
Fetching rows 2001 to 2500...
Fetching rows 2501 to 3000...
Fetching rows 3001 to 3500...
Fetching rows 3501 to 4000...
Fetching rows 4001 to 4500...
Fetching rows 4501 to 5000...
Fetching rows 5001 to 5500...
Fetching rows 5501 to 6000...
Fetching rows 6001 to 6500...
Fetching rows 6501 to 7000...
Fetching rows 7001 to 7500...
Fetching rows 7501 to 8000...
Fetching rows 8001 to 8500...
Fetching rows 8501 to 9000...
Fetching rows 9001 to 9500...
Fetching rows 9501 to 10000...
Fetching rows 10001 to 10500...
Saved intermediate dataset up to offset 10000.
Fetching rows 10501 to 11000...
Fetching rows 11001 to 11500...
Fetching rows 11501 to 12000...
Fetching rows 12001 to 12500...
Fetching rows 12501 to 13000...
Fetching rows 13001 to 13500...
Fetching rows 13501 to 14000...
Fetching rows 14001 to 14500...
Fetching rows 14501 to 15000...
Fetching rows 15001 to 15500...
Fetch

In [5]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

# Set up the SPARQL endpoint
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Initialize an empty list to store data
data = []

# Define query parameters
rows_per_query = 500  # Number of rows to fetch per query
total_records = 1000000  # Target total number of records

for offset in range(0, total_records, rows_per_query):
    # SPARQL query with pagination
    sparql.setQuery(f"""
        SELECT ?device ?name ?manufacturer ?releaseDate
        WHERE {{
            ?device a dbo:Device .
            ?device foaf:name ?name .
            OPTIONAL {{ ?device dbo:manufacturer ?manufacturer . }}
            OPTIONAL {{ ?device dbo:releaseDate ?releaseDate . }}
            FILTER (lang(?name) = 'en')
        }}
        LIMIT {rows_per_query} OFFSET {offset}
    """)
    sparql.setReturnFormat(JSON)

    try:
        # Execute the query
        results = sparql.query().convert()
        for result in results["results"]["bindings"]:
            # Append data to the list
            data.append({
                "Device": result["device"]["value"],
                "Name": result["name"]["value"],
                "Manufacturer": result.get("manufacturer", {}).get("value"),
                "Release Date": result.get("releaseDate", {}).get("value"),
            })
    except Exception as e:
        print(f"Error occurred for offset {offset}: {e}")
        break

# Convert the collected data to a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv("electronic_devices_1M.csv", index=False)
print("Dataset saved as electronic_devices_1M.csv")

# Display the first few rows
print(df.head())


Dataset saved as electronic_devices_1M.csv
                                              Device  \
0  http://dbpedia.org/resource/Casio_G'zOne_Commando   
1         http://dbpedia.org/resource/Royole_FlexPai   
2      http://dbpedia.org/resource/Samsung_Continuum   
3  http://dbpedia.org/resource/Samsung_Galaxy_(or...   
4     http://dbpedia.org/resource/Samsung_Galaxy_A01   

                                   Name  \
0                 Casio G'zOne Commando   
1                        Royole FlexPai   
2                Samsung Continuum i400   
3  Samsung Galaxy (GT-I7500, GT-I7500L)   
4                    Samsung Galaxy A01   

                                        Manufacturer Release Date  
0  http://dbpedia.org/resource/NEC_Casio_Mobile_C...   2011-04-28  
1                 http://dbpedia.org/resource/Royole   2018-01-08  
2    http://dbpedia.org/resource/Samsung_Electronics   2010-06-04  
3                http://dbpedia.org/resource/Samsung   2009-06-29  
4    http://dbpedia.o