In [3]:
import matplotlib.pyplot as plt
import numpy as np
import practicalSPARQL
import pandas as pd

In [6]:
# Folder names (presumably for query files and result files; not used in the cells shown here)
root_q = 'queries'
root_d = 'results'

# Endpoint URL and credentials are read from a JSON config file instead of being typed
# into the notebook; the first entry of each field is used.
login = pd.read_json('sparql/config_sparql.json')
ENDPOINT, USERNAME, PASSWORD = (
    login[field][0] for field in ('endpoint', 'username', 'password')
)

# Wrapper object that the query cells below call; the credentials are registered on it here
sparql = practicalSPARQL.practicalWrapper(ENDPOINT)
sparql.setCredentials(USERNAME, PASSWORD)

print("--- Querying ENDPOINT: {} ---".format(ENDPOINT))

--- Querying ENDPOINT: http://devmeta.sphaera.mpiwg-berlin.mpg.de/sparql ---


In [7]:
# Main query of the notebook: its result becomes `df`
# (stringify_SPARQL presumably returns the query text stored in the given file)
elements_query_file = 'sparql/elements_query_050824.sparql'
q = practicalSPARQL.stringify_SPARQL(elements_query_file)
df = sparql.select_as_dataframe(q)

In [8]:
# TODO: books without images still have to be added to df; a query that returns books
# together with their parts is needed for that step.

# Run the books query and keep its result as a separate dataframe
books_query_file = 'sparql/books_query.sparql'
q = practicalSPARQL.stringify_SPARQL(books_query_file)
books = sparql.select_as_dataframe(q)

In [11]:
# Columns carried into the rest of the notebook (names as they read after the rename below)
columns_to_keep = ['images', 'cluster', 'cks', 'bid', 'part_id', 'place', 'year', 'flag']

# Shorten two of the query's column names, then narrow the frame to the columns listed above
df = df.rename(
    columns={'cluster_name': 'cluster', 'custom_identifier': 'part_id'}
)[columns_to_keep]

In [12]:
# All adaptions of part 100
adaptions_100_file = 'sparql/adaptions_100.sparql'
q = practicalSPARQL.stringify_SPARQL(adaptions_100_file)
df_adaptions_100 = sparql.select_as_dataframe(q)

In [13]:
# All parts influenced by part 100
influenced_100_file = 'sparql/influenced_100.sparql'
q = practicalSPARQL.stringify_SPARQL(influenced_100_file)
df_influenced_100 = sparql.select_as_dataframe(q)

In [14]:
# All adaptions of part 104
adaptions_104_file = 'sparql/adaptions_104.sparql'
q = practicalSPARQL.stringify_SPARQL(adaptions_104_file)
df_adaptions_104 = sparql.select_as_dataframe(q)

In [15]:
# All parts influenced by part 104
# NOTE(review): this file name starts with a capital "I", unlike influenced_100.sparql;
# it is kept exactly as written so it keeps matching the file on disk.
influenced_104_file = 'sparql/Influenced_104.sparql'
q = practicalSPARQL.stringify_SPARQL(influenced_104_file)
df_influenced_104 = sparql.select_as_dataframe(q)

In [16]:
def _with_numeric_column(frame, column):
    """Return a copy of `frame` in which `column` has been parsed with pd.to_numeric.

    Values that cannot be parsed become NaN (errors='coerce'). The column is
    replaced as a whole, so it takes the numeric dtype produced by the conversion.
    """
    return frame.assign(**{column: pd.to_numeric(frame[column], errors='coerce')})


# Convert part_id to numeric, forcing errors to NaN.
# The column is replaced instead of written through `.loc[:, col] = ...`: since pandas 2.0
# that form writes into the existing column in place and keeps its previous dtype when the
# values fit, so the column would hold numbers without becoming a numeric column.
df = _with_numeric_column(df, 'part_id')

# Same conversion for the identifier column of each relationship dataframe
df_adaptions_100 = _with_numeric_column(df_adaptions_100, 'custom_identifier')
df_influenced_100 = _with_numeric_column(df_influenced_100, 'custom_identifier')
df_adaptions_104 = _with_numeric_column(df_adaptions_104, 'custom_identifier')
df_influenced_104 = _with_numeric_column(df_influenced_104, 'custom_identifier')

# Unique identifiers per relationship dataframe (a NaN from a failed conversion stays in the array)
ids_adaptions_100 = df_adaptions_100['custom_identifier'].unique()
ids_influenced_100 = df_influenced_100['custom_identifier'].unique()
ids_adaptions_104 = df_adaptions_104['custom_identifier'].unique()
ids_influenced_104 = df_influenced_104['custom_identifier'].unique()

# Print the arrays of unique identifiers
print("Adaptions 100:", ids_adaptions_100)
print("Influenced 100:", ids_influenced_100)
print("Adaptions 104:", ids_adaptions_104)
print("Influenced 104:", ids_influenced_104)


Adaptions 100: [231 296 411 229 412 329 417 346 270 259 267 408 261 226 260 888 291 274
 279 268 343 418 410 937 938 940 939 941 322 432 429 421 425 426 932 313
 295 276 283 359 934 423 390 272 419 379 292 273 935 933 936 282 428 266
 367 316 287 348 333 212]
Influenced 100: [688 779 777 808 812 919 928 929 251 233 309 315 370 520 503 654 192 257
 311 368 578 635 650 568 542 561 659 661 651 677]
Adaptions 104: [277 297 320 349 424 6002 6005 6004 6010 6003 6012 6013 6001 6009 6007
 6000]
Influenced 104: [6015]


In [17]:

# Unique identifiers of each relationship group, used for the membership checks below
ids_adaptions_100, ids_influenced_100, ids_adaptions_104, ids_influenced_104 = (
    frame['custom_identifier'].unique()
    for frame in (df_adaptions_100, df_influenced_100, df_adaptions_104, df_influenced_104)
)

# (label, identifiers) pairs in priority order: an id present in several groups
# takes the label of the first group that contains it.
_part_type_groups = (
    ('adaption_100', ids_adaptions_100),
    ('influenced_100', ids_influenced_100),
    ('adaption_104', ids_adaptions_104),
    ('influenced_104', ids_influenced_104),
)


def _part_type_for(pid):
    """Return the part_type label for a single part_id value."""
    # The two reference parts are labelled with their own number
    if pid == 100:
        return '100'
    if pid == 104:
        return '104'
    for label, group_ids in _part_type_groups:
        if pid in group_ids:
            return label
    # No group matched
    return 'other'


# Label every row according to its part_id
df.loc[:, 'part_type'] = df['part_id'].apply(_part_type_for)

# Show which labels actually occur after the assignment
print(df['part_type'].unique())



['adaption_100' 'other' '104' '100' 'adaption_104' 'influenced_100']


In [18]:
# Number of distinct books (bid values) observed for each part_type
bids_by_part_type = df.groupby('part_type')['bid']
part_type_counts = bids_by_part_type.nunique()

# Show the counts
print(part_type_counts)


part_type
100                48
104                10
adaption_100      195
adaption_104       10
influenced_100    109
other             196
Name: bid, dtype: int64


In [19]:
# Display df as the cell output to inspect it, including the part_type column assigned earlier
df

Unnamed: 0,images,cluster,cks,bid,part_id,place,year,flag,part_type
0,http://db.sphaera.mpiwg-berlin.mpg.de/containe...,SAC_SIL_00061,CK_Introduction to Geometry,1622,329.0,Venice,1494,,adaption_100
1,http://db.sphaera.mpiwg-berlin.mpg.de/containe...,SAC_SIL_01218,"CK_09 Spheres, CK_T-O Maps",1622,329.0,Venice,1494,,adaption_100
2,http://db.sphaera.mpiwg-berlin.mpg.de/containe...,SAC_SIL_01467,CK_Geometry for Geometry,1622,329.0,Venice,1494,,adaption_100
3,http://db.sphaera.mpiwg-berlin.mpg.de/containe...,SAC_SIL_00326,CK_Sphericity of the Water Element,1622,329.0,Venice,1494,,adaption_100
4,http://db.sphaera.mpiwg-berlin.mpg.de/containe...,SAC_SIL_01349,"CK_Sphericity of the Earth, CK_Populated Earth...",1622,329.0,Venice,1494,,adaption_100
...,...,...,...,...,...,...,...,...,...
20949,http://www.researchspace.org/ontology/ImageReg...,SAC_SIL_01695,CK_Negligible Dimensions of the Earth,2281,940.0,Antwerp,1582,,adaption_100
20950,http://www.researchspace.org/ontology/ImageReg...,SAC_SIL_01616,"CK_Sphericity of the Heavens, CK_Apparent Size...",2281,940.0,Antwerp,1582,,adaption_100
20951,http://www.researchspace.org/ontology/ImageReg...,SAC_SIL_01511,"CK_Sphericity of the Water Element, CK_Terraqu...",2281,940.0,Antwerp,1582,,adaption_100
20952,http://www.researchspace.org/ontology/ImageReg...,SAC_SIL_01507,CK_Daily Rotation of the Firmament,2281,940.0,Antwerp,1582,,adaption_100


In [20]:
# One row per keyword: split the comma-separated 'cks' text into a list, give every list
# element its own row, then trim the whitespace left around each keyword.
df_exploded = (
    df.assign(cks=df['cks'].str.split(','))
      .explode('cks')
      .assign(cks=lambda frame: frame['cks'].str.strip())
)

# Quick check of the result
print(df_exploded[['bid', 'part_type', 'cks']].head())


    bid     part_type                                 cks
0  1622  adaption_100         CK_Introduction to Geometry
1  1622  adaption_100                       CK_09 Spheres
1  1622  adaption_100                         CK_T-O Maps
2  1622  adaption_100            CK_Geometry for Geometry
3  1622  adaption_100  CK_Sphericity of the Water Element


In [21]:
# Add a year-interval column to df_exploded.

# Cast year to int. Plain column assignment replaces the column together with its dtype;
# `.loc[:, 'year'] = ...` writes into the existing column in place on pandas >= 2.0 and
# would keep the old dtype.
df_exploded['year'] = df_exploded['year'].astype(int)

# Left-closed 20-year bins. The last edge is 1651 rather than 1650 so that the final bin,
# labelled '1630-1650', really contains the year 1650: with right=False an upper edge of
# 1650 excludes that year and its rows would get NaN as interval.
bins = [1470, 1490, 1510, 1530, 1550, 1570, 1590, 1610, 1630, 1651]
labels = [
    '1470-1489', '1490-1509', '1510-1529', '1530-1549',
    '1550-1569', '1570-1589', '1590-1609', '1610-1629', '1630-1650'
]

# Years outside 1470-1650 still fall in no bin and get NaN
df_exploded['year_interval'] = pd.cut(df_exploded['year'], bins=bins, labels=labels, right=False)

In [22]:
# Place categories: each listed city belongs to one size class

large_centers = ['Venice', 'Paris', 'Wittenberg']
medium_centers = ['Antwerp', 'Leipzig', 'Frankfurt (Main)', 'Lyon', 'Cologne', 'London']
small_centers = ['Rome', 'Strasbourg', 'Seville', 'Leiden', 'Milan', 'Saint Gervais', 
                 'Florence', 'Kraków', 'Salamanca', 'Lisbon', 'Bologna', 'Madrid', 
                 'Sine loco', 'Basel', 'Lemgo', 'Dijon', 'Valladolid', 'Perugia']
one_book_centers = ['Siena', 'Avignon', 'Vienna', 'Ferrara', 'Padua', 'Nuremberg', 
                    'Neustadt an der Weinstraße', 'Mexico City', 'Mainz', 'Coimbra', 'Leuven', 
                    'Ingolstadt', 'Heidelberg', 'Geneva', 'Dillingen an der Donau', 'Alcalá de Henares']

# City -> category lookup built in one pass. The groups are visited from large to one-book,
# so a city listed in two groups would end up with the category of the later group.
city_to_category = {
    name: category
    for category, names in (
        ('Large Center', large_centers),
        ('Medium Center', medium_centers),
        ('Small Center', small_centers),
        ('One Book Center', one_book_centers),
    )
    for name in names
}


def assign_place_category(city):
    """Return the place category of `city`, or 'Unknown' when the city is in none of the lists."""
    if city in city_to_category:
        return city_to_category[city]
    return 'Unknown'

# Categorise every row's place of publication and store the label in a new column
df_exploded['place_category'] = df_exploded['place'].map(assign_place_category)

In [23]:
# Write the exploded dataframe to a CSV file in the working directory, without the index column
output_csv = 'kremer_project.csv'
df_exploded.to_csv(output_csv, index=False)