In [None]:
!pip install SPARQLWrapper


Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.1.0-py3-none-any.whl.metadata (11 kB)
Collecting html5lib-modern<2.0,>=1.2 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading html5lib_modern-1.2-py2.py3-none-any.whl.metadata (7.8 kB)
Collecting isodate<1.0.0,>=0.7.2 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Downloading rdflib-7.1.0-py3-none-any.whl (562 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.1/562.1 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading html5lib_modern-1.2-py2.py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.2/116.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, html5lib

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

In [None]:

# Client bound to DBpedia's public SPARQL endpoint (reused by later cells)
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

# Only ?person and ?name are mandatory patterns; every other variable lives in
# an OPTIONAL block and can therefore be missing from an individual result row.
sparql.setQuery("""
    SELECT ?person ?name ?birthYear ?occupation ?country ?field ?notableAchievement
    WHERE {
        ?person a dbo:Person .
        ?person foaf:name ?name .
        OPTIONAL { ?person dbo:birthYear ?birthYear . }
        OPTIONAL { ?person dbo:occupation ?occupation . }
        OPTIONAL { ?person dbo:birthPlace ?birthPlace . ?birthPlace dbo:country ?country . }
        OPTIONAL { ?person dbo:field ?field . }
        OPTIONAL { ?person dbp:notableAchievement ?notableAchievement . }
        FILTER (lang(?name) = 'en')
    }
    LIMIT 100  # Adjust the limit for more data
""")

sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Output column -> SPARQL variable, for the attributes that may be unbound.
# Order here fixes the column order of the resulting DataFrame / CSV.
optional_fields = (
    ("Birth Year", "birthYear"),
    ("Occupation", "occupation"),
    ("Country", "country"),
    ("Field", "field"),
    ("Notable Achievement", "notableAchievement"),
)

# Flatten each JSON binding into a plain dict; unbound variables become None
data = []
for binding in results["results"]["bindings"]:
    row = {
        "Link": binding["person"]["value"],
        "Name": binding["name"]["value"],
    }
    for column, variable in optional_fields:
        row[column] = binding[variable]["value"] if variable in binding else None
    data.append(row)

# Tabulate the rows
df = pd.DataFrame(data)

# Persist to disk (no index column) and report where it went
df.to_csv("famous_persons_dbpedia.csv", index=False)
print("Dataset saved as famous_persons_dbpedia.csv")

# Quick look at the first rows
print(df.head())


Dataset saved as famous_persons_dbpedia.csv
                                        Link    Name Birth Year  \
0  http://dbpedia.org/resource/Cabral_Ibacka  Cabral       1977   
1  http://dbpedia.org/resource/Cabral_Ibacka  Cabral       1977   
2  http://dbpedia.org/resource/Cabral_Ibacka  Cabral       1977   
3  http://dbpedia.org/resource/Cabral_Ibacka  Cabral       1977   
4  http://dbpedia.org/resource/Cabral_Ibacka  Cabral       1977   

                                          Occupation  \
0  http://dbpedia.org/resource/Cabral_Ibacka__Per...   
1  http://dbpedia.org/resource/Television_persona...   
2                  http://dbpedia.org/resource/Actor   
3         http://dbpedia.org/resource/Philanthropist   
4           http://dbpedia.org/resource/TV_presenter   

                               Country Field Notable Achievement  
0  http://dbpedia.org/resource/Romania  None                None  
1  http://dbpedia.org/resource/Romania  None                None  
2  http://dbped

In [None]:
# Pagination settings: the query is re-issued with a growing OFFSET until
# roughly `total_records` rows have been requested.
rows_per_query = 1000
total_records = 10000  # Adjust this to the approximate number of records you want
data = []

# Output column -> SPARQL variable, for the attributes wrapped in OPTIONAL
# (they may be absent from a binding). Order fixes the CSV column order.
optional_fields = (
    ("Birth Year", "birthYear"),
    ("Occupation", "occupation"),
    ("Country", "country"),
    ("Field", "field"),
    ("Notable Achievement", "notableAchievement"),
)

# The response format is the same for every page, so set it once up front
# instead of on every iteration.
sparql.setReturnFormat(JSON)

# NOTE(review): the query has no ORDER BY, so the endpoint is not required to
# return rows in a stable order between requests; pages may overlap or skip
# rows. Adding ORDER BY may be costly on the public endpoint - confirm before
# changing the query.
for offset in range(0, total_records, rows_per_query):
    # Update query with limit and offset for pagination
    sparql.setQuery(f"""
        SELECT ?person ?name ?birthYear ?occupation ?country ?field ?notableAchievement
        WHERE {{
            ?person a dbo:Person .
            ?person foaf:name ?name .
            OPTIONAL {{ ?person dbo:birthYear ?birthYear . }}
            OPTIONAL {{ ?person dbo:occupation ?occupation . }}
            OPTIONAL {{ ?person dbo:birthPlace ?birthPlace . ?birthPlace dbo:country ?country . }}
            OPTIONAL {{ ?person dbo:field ?field . }}
            OPTIONAL {{ ?person dbp:notableAchievement ?notableAchievement . }}
            FILTER (lang(?name) = 'en')
        }}
        LIMIT {rows_per_query} OFFSET {offset}
    """)

    results = sparql.query().convert()
    bindings = results["results"]["bindings"]

    # An empty page is treated as the end of the result set: stop here rather
    # than sending the remaining requests, which would only add load.
    if not bindings:
        break

    # Flatten each JSON binding into a plain dict; unbound variables become None
    for result in bindings:
        person_data = {
            "Link": result["person"]["value"],
            "Name": result["name"]["value"],
        }
        for column, variable in optional_fields:
            person_data[column] = result[variable]["value"] if variable in result else None
        data.append(person_data)

# Convert to a DataFrame and save (no index column)
df = pd.DataFrame(data)
df.to_csv("famous_persons_dbpedia_large.csv", index=False)
print("Large dataset saved as famous_persons_dbpedia_large.csv")



Large dataset saved as famous_persons_dbpedia_large.csv


In [None]:
from google.colab import files

# Hand the CSV written by the 100-row query cell to Colab's download helper.
# NOTE(review): this is the small sample file, not
# famous_persons_dbpedia_large.csv - confirm that is the intended download.
csv_to_download = "famous_persons_dbpedia.csv"
files.download(csv_to_download)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Pagination settings: the query is re-issued with a growing OFFSET until
# roughly `total_records` rows have been requested.
rows_per_query = 1000
total_records = 100000  # Adjust this to the approximate number of records you want
data = []

# Output column -> SPARQL variable, for the attributes wrapped in OPTIONAL
# (they may be absent from a binding). Order fixes the CSV column order.
optional_fields = (
    ("Birth Year", "birthYear"),
    ("Occupation", "occupation"),
    ("Country", "country"),
    ("Field", "field"),
    ("Notable Achievement", "notableAchievement"),
)

# The response format is the same for every page, so set it once up front
# instead of on every iteration.
sparql.setReturnFormat(JSON)

# NOTE(review): the query has no ORDER BY, so the endpoint is not required to
# return rows in a stable order between requests; pages may overlap or skip
# rows. Adding ORDER BY may be costly on the public endpoint - confirm before
# changing the query.
for offset in range(0, total_records, rows_per_query):
    # Update query with limit and offset for pagination
    sparql.setQuery(f"""
        SELECT ?person ?name ?birthYear ?occupation ?country ?field ?notableAchievement
        WHERE {{
            ?person a dbo:Person .
            ?person foaf:name ?name .
            OPTIONAL {{ ?person dbo:birthYear ?birthYear . }}
            OPTIONAL {{ ?person dbo:occupation ?occupation . }}
            OPTIONAL {{ ?person dbo:birthPlace ?birthPlace . ?birthPlace dbo:country ?country . }}
            OPTIONAL {{ ?person dbo:field ?field . }}
            OPTIONAL {{ ?person dbp:notableAchievement ?notableAchievement . }}
            FILTER (lang(?name) = 'en')
        }}
        LIMIT {rows_per_query} OFFSET {offset}
    """)

    results = sparql.query().convert()
    bindings = results["results"]["bindings"]

    # An empty page is treated as the end of the result set: stop here rather
    # than sending the remaining requests, which would only add load.
    if not bindings:
        break

    # Flatten each JSON binding into a plain dict; unbound variables become None
    for result in bindings:
        person_data = {
            "Link": result["person"]["value"],
            "Name": result["name"]["value"],
        }
        for column, variable in optional_fields:
            person_data[column] = result[variable]["value"] if variable in result else None
        data.append(person_data)

# Convert to a DataFrame and save (no index column)
df = pd.DataFrame(data)
df.to_csv("famous_persons_dbpedia_large.csv", index=False)
print("Large dataset saved as famous_persons_dbpedia_large.csv")



Large dataset saved as famous_persons_dbpedia_large.csv
