In [1]:
import re
from pathlib import Path

import pandas as pd
from ollama import chat, ChatResponse
from seshat_api import SeshatAPI

# Questions for DeepSeek-R1

Can a recent top performing LLM (DeepSeek-R1) correctly predict whether a variable from the Seshat Global History Databank (e.g. "Scientific Literature") should be "present" or "absent" for a selection of polities, given a definition of the variable, the name of the polity and the years in which it existed?

If DeepSeek has a good understanding of history, can it use this to guess from the polity name and years what time/place the prompt is referring to, and whether the variable in question would have been present?

**Important:** I have included some example CSVs of DeepSeek responses to prompts, since running hundreds of prompts took several hours on my laptop (M1 Mac). If you want to generate new responses from DeepSeek, make sure you uncomment the calls to the `deepseek_responses` function below, but also make sure to *not* overwrite your own responses in the next cell where I load the pre-made responses from CSV.

In [2]:
client = SeshatAPI(base_url="https://seshat-db.com/api")

## Getting data from Seshat

First let's define our variable to be used in an LLM prompt later (taken from seshat-db.com), then load the data from the Seshat API to use as our ground truth to test against.

In [3]:
variable = 'Scientific Literature'
definition = "Talking about Kinds of Written Documents, Scientific literature includes mathematics, natural sciences, social sciences"

In [4]:
from seshat_api.sc import ScientificLiteratures
# Wrap the API client for this variable; get_all() presumably returns every record
# for the variable from the Seshat API - TODO confirm against seshat_api docs
scientific_literatures = ScientificLiteratures(client)
scientific_literatures_df = pd.DataFrame(scientific_literatures.get_all())
# Display how many records were retrieved
len(scientific_literatures_df)

421

In [5]:
# scientific_literatures_df[scientific_literatures_df['new_name'] == 'tr_ottoman_emp_1']
scientific_literatures_df.head()

Unnamed: 0,id,year_from,year_to,description,note,finalized,created_date,modified_date,tag,is_disputed,is_uncertain,expert_reviewed,drb_reviewed,name,scientific_literature,polity,comment,private_comment,citations,curator
0,154,930.0,1100.0,absent/present/unknown Durrenberger notes the...,,True,,,TRS,False,False,True,,scientific_literature,absent,"{'id': 115, 'name': 'IsCommw', 'start_year': 9...",,"{'id': 1, 'text': 'NO_PRIVATE_COMMENTS'}",[],[]
1,467,1066.0,1115.0,,,False,2024-11-24T18:12:19.616490Z,2024-11-24T18:14:04.438817Z,SSP,False,False,False,False,Scientific_literature,absent,"{'id': 586, 'name': 'gb_england_norman', 'star...","{'id': 1029, 'text': 'a new_comment_text'}",,[],[]
2,155,1101.0,1262.0,absent/present/unknown Durrenberger notes the...,,True,,,TRS,False,False,True,,scientific_literature,present,"{'id': 115, 'name': 'IsCommw', 'start_year': 9...",,"{'id': 1, 'text': 'NO_PRIVATE_COMMENTS'}",[],[]
3,466,1116.0,1153.0,,,False,2024-11-24T17:57:20.281912Z,2024-11-24T18:05:21.430097Z,TRS,False,False,False,False,Scientific_literature,present,"{'id': 586, 'name': 'gb_england_norman', 'star...","{'id': 1028, 'text': 'a new_comment_text'}",,[],[]
4,236,1200.0,1299.0,"""Astronomical almanacs inferred for Classic p...",,True,,,SSP,False,False,True,,scientific_literature,unknown,"{'id': 15, 'name': 'MxPostM', 'start_year': 12...",,"{'id': 1, 'text': 'NO_PRIVATE_COMMENTS'}",[],[]


In [6]:
# Expand each record's nested 'polity' dict into its own set of columns
test = pd.DataFrame(scientific_literatures_df['polity'].tolist())
# Attach the coded value of the variable to each polity row
test['scientific_literature'] = scientific_literatures_df['scientific_literature']
# Spot check: look up a single polity by its Seshat ID ('new_name')
test[test['new_name'] == 'tr_ottoman_emp_1']

Unnamed: 0,id,name,start_year,end_year,long_name,new_name,polity_tag,general_description,shapefile_name,private_comment,created_date,modified_date,home_nga,home_seshat_region,private_comment_n,scientific_literature
351,174,TrOttm2,1402,1517,Ottoman Empire I,tr_ottoman_emp_1,LEGACY,During the fifteenth century the Ottomans reco...,,,,,"{'id': 11, 'name': 'Konya Plain', 'subregion':...","{'id': 43, 'name': 'Anatolia-Caucasus', 'subre...","{'id': 1, 'text': 'NO_PRIVATE_COMMENTS'}",present


Let's just use expert reviewed data and ignore examples where the value is anything other than "present" or "absent" to create a subsample of the dataset.

We should also reformat the dataframe so we have information about the polities such as the start and end year alongside the variable value, then remove columns we aren't interested in.

In [7]:
def process_df(seshat_df, variable_id):
    """Build a polity-level table of expert-reviewed present/absent records.

    Parameters
    ----------
    seshat_df : pandas.DataFrame
        Records for one Seshat variable. Must contain a 'polity' column of
        dicts, an 'expert_reviewed' column and a column named `variable_id`.
    variable_id : str
        Name of the column holding the coded value (e.g. 'scientific_literature').

    Returns
    -------
    pandas.DataFrame
        Columns 'new_name', 'long_name', 'start_year', 'end_year',
        `variable_id` and 'general_description'. There is one row per retained
        record (rows are not de-duplicated by 'new_name'), and the rows keep
        the index labels they had in `seshat_df`.
    """
    # Expand the nested polity dicts into columns. Reuse the caller's index so
    # the column assignments below align row-for-row even when `seshat_df`
    # does not have a default 0..n-1 index (e.g. after filtering or sorting).
    polities_df = pd.DataFrame(seshat_df['polity'].tolist(), index=seshat_df.index)

    # Bring the coded value and the review flag alongside the polity fields
    polities_df[variable_id] = seshat_df[variable_id]
    polities_df['expert_reviewed'] = seshat_df['expert_reviewed']

    # Keep only expert-reviewed records whose value is exactly 'present' or
    # 'absent'; isin() is already False for NaN, so no separate NaN filter is needed.
    is_reviewed = polities_df['expert_reviewed'].eq(True)
    is_binary = polities_df[variable_id].isin(['present', 'absent'])
    polities_df = polities_df[is_reviewed & is_binary]
    print("There are", len(polities_df), variable_id, "records after filtering.")

    # Get rid of the columns we don't need
    return polities_df[['new_name', 'long_name', 'start_year', 'end_year', variable_id, 'general_description']]

In [8]:
polities_with_scientific_literatures_df = process_df(scientific_literatures_df, 'scientific_literature')

There are 255 scientific_literature records after filtering.


In [9]:
# Sense check
polities_with_scientific_literatures_df[polities_with_scientific_literatures_df['new_name'] == 'tr_ottoman_emp_1']

Unnamed: 0,new_name,long_name,start_year,end_year,scientific_literature,general_description
351,tr_ottoman_emp_1,Ottoman Empire I,1402,1517,present,During the fifteenth century the Ottomans reco...


In [10]:
polities_with_scientific_literatures_df.sample(5)

Unnamed: 0,new_name,long_name,start_year,end_year,scientific_literature,general_description
361,ye_qatabanian_commonwealth,Qatabanian Commonwealth,-450,-111,absent,The Yemeni Coastal Plain or Plateau is the nor...
321,mx_monte_alban_1_early,Early Monte Alban I,-500,-300,absent,The Monte Albán Early I phase runs from 500 to...
335,pe_cuzco_6,Cuzco - Late Intermediate II,1250,1400,absent,"After the collapse of the Wari empire, the Cuz..."
223,uz_sogdiana_city_states,Sogdiana - City-States Period,604,711,present,General description:<br>The Sogdian City State...
250,in_achik_2,Late A'chik,1867,1956,absent,"The Garo Hills, located in Meghalaya in northe..."


## "Prompt engineering"

First let's define a prompt function to use with our dataframe:

In [11]:
def year_CE(year):
    """Format a signed year number as an 'N CE' or 'N BCE' string.

    Years greater than or equal to zero are labelled CE; negative years are
    labelled BCE using their absolute value. Whole-number floats (e.g. 930.0)
    are shown without the trailing '.0' so the prompt reads '930 CE'.
    """
    # Normalise 930.0 -> 930; non-integral or NaN floats are left untouched
    if isinstance(year, float) and year.is_integer():
        year = int(year)
    if year >= 0:
        return f"{year} CE"
    return f"{abs(year)} BCE"

def prompt_func(seshat_df, polity_name, variable, variable_definition):
    """Compose the present/absent question put to the LLM for one polity.

    The polity is looked up in `seshat_df` by its 'new_name'; the first
    matching row supplies the long name and the start/end years used in the
    prompt text. The prompt ends by asking for one of two 'XXX...XXX' markers
    so the answer can be found with a regex later.
    """
    matches = seshat_df[seshat_df['new_name'] == polity_name]
    polity = matches['long_name'].tolist()[0]
    # TODO: matches['general_description'] could be added to the prompt for more context
    start_year = matches['start_year'].tolist()[0]
    end_year = matches['end_year'].tolist()[0]

    fragments = [
        "Use your knowledge of world history to answer the following question. ",
        f"Given your knowledge of the historical polity '{polity}', ",
        f"a polity that existed between {year_CE(start_year)} and {year_CE(end_year)}",
        f", do you expect that {variable} was present or absent? ",
        f"{variable} is defined as: '{variable_definition}'. ",
        # Marker strings that the answer-extraction regex searches for
        "Answer 'XXXpresentXXX' if you expect it to be present, and 'XXXabsentXXX' if you expect it to be absent.",
    ]
    return "".join(fragments)
    

How does the prompt look with an example "new_name" (Seshat ID) of `tn_fatimid_cal`, which we know has a record for scientific literature in the database:

In [12]:
test_seshat_polity_name = 'tn_fatimid_cal'
test_prompt = prompt_func(polities_with_scientific_literatures_df, test_seshat_polity_name, variable, definition)
test_prompt

"Use your knowledge of world history to answer the following question. Given your knowledge of the historical polity 'Fatimid Caliphate', a polity that existed between 909 CE and 1171 CE, do you expect that Scientific Literature was present or absent? Scientific Literature is defined as: 'Talking about Kinds of Written Documents, Scientific literature includes mathematics, natural sciences, social sciences'. Answer 'XXXpresentXXX' if you expect it to be present, and 'XXXabsentXXX' if you expect it to be absent."

Ok, let's see what DeepSeek responds for this prompt:

In [13]:
# Send the test prompt to the 'deepseek-r1' model as a single user message
response: ChatResponse = chat(model='deepseek-r1', messages=[
  {
    'role': 'user',
    'content': test_prompt,
  },
])
# Print the full reply text (the output below starts with the model's <think> reasoning)
print(response.message.content)

<think>
Alright, so I need to figure out whether scientific literature existed during the Fatimid Caliphate period between 909 CE and 1171 CE. The question defines scientific literature as including math, natural sciences, and social sciences.

First, I remember that the Fatimid Caliphate was a significant cultural and scientific hub in the Islamic Golden Age. It was based in Cairo, which is known for its libraries like the Dar al-Hikma (House of Wisdom). I think scholars there contributed to various fields including medicine, astronomy, and mathematics.

I recall that during this period, there were major advancements. For example, Al-Fazari is mentioned as a significant translator of Indian numerals into Arabic, which would have been part of mathematical literature. There was also work on optics by Al-Kindi and others, so natural sciences were definitely active.

In social sciences, there might have been studies on inheritance laws or societal structures since the Fatimids had a compl

Was that correct? Let's check the dataframe to see:

In [14]:
polities_with_scientific_literatures_df[polities_with_scientific_literatures_df['new_name'] == test_seshat_polity_name]

Unnamed: 0,new_name,long_name,start_year,end_year,scientific_literature,general_description
346,tn_fatimid_cal,Fatimid Caliphate,909,1171,present,The Fatimid Caliphate lasted from 909 to 1171 ...


### Get DeepSeek's answer data

Let's write a function to collect responses from DeepSeek for all the polities in our dataset and make a new dataframe with the results:

In [15]:
def deepseek_responses(seshat_df, variable, definition, count=None, max_attempts=None):
    """Ask DeepSeek-R1 about every polity in `seshat_df` and collect its answers.

    Parameters
    ----------
    seshat_df : pandas.DataFrame
        Polity table with a 'new_name' column (plus the columns prompt_func
        reads). 'new_name' values are iterated as-is, so a polity that appears
        in several rows is queried once per row.
    variable : str
        Human-readable variable name inserted into the prompt.
    definition : str
        Definition of the variable inserted into the prompt.
    count : int, optional
        Use the count parameter to limit the number of responses collected.
    max_attempts : int, optional
        Maximum number of times to query the model for one polity. The default
        (None) keeps asking until a valid answer is found. When the limit is
        reached without a valid answer, the row is stored with answer None.

    Returns
    -------
    pandas.DataFrame
        Columns 'new_name', 'answer' ('present' or 'absent') and 'full' (the
        complete text of the reply the answer was taken from).
    """
    answer_pattern = re.compile(r'XXX(absent|present)XXX')

    def generate_response(polity):
        return chat(model='deepseek-r1', messages=[
            {
                'role': 'user',
                'content': prompt_func(seshat_df, polity, variable, definition),
            },
        ])

    def extract_answer(content):
        # The reply starts with the model's reasoning, in which it may merely
        # restate the instructions (and so mention either marker). Only look at
        # the text after the reasoning block. Assumes the block is closed with
        # '</think>', as DeepSeek-R1 does; if the tag is missing the whole
        # reply is searched.
        final_text = content.rsplit('</think>', 1)[-1]
        found = answer_pattern.findall(final_text)
        # Take the last marker: the conclusion normally comes at the end
        return found[-1] if found else None

    # Collect plain records and build the frame once at the end, instead of
    # concatenating a one-row DataFrame per polity.
    records = []
    for polity in seshat_df['new_name']:
        if count is not None and len(records) >= count:
            break

        attempts = 0
        while True:
            response = generate_response(polity)
            attempts += 1
            answer = extract_answer(response.message.content)
            if answer is not None:
                break  # valid answer found
            if max_attempts is not None and attempts >= max_attempts:
                break  # give up on this polity; answer stays None

        records.append({
            'new_name': polity,
            'answer': answer,
            'full': response.message.content,
        })

    # Passing columns keeps the expected schema even when no rows were collected
    return pd.DataFrame(records, columns=['new_name', 'answer', 'full'])

Now let's generate the answer data for this variable across polities in our dataset:

In [16]:
# Get the responses for the scientific literature variable.
# Reuse previously saved responses when the CSV exists, so that "Run All" does not
# spend hours re-querying the model. Delete the CSV to regenerate the responses
# (you can pass count=... to deepseek_responses to limit the number of responses).
scientific_literature_csv = Path('scientific_literature_responses.csv')
if scientific_literature_csv.exists():
    scientific_literature_responses = pd.read_csv(scientific_literature_csv)
else:
    scientific_literature_responses = deepseek_responses(polities_with_scientific_literatures_df, variable, definition)

In [17]:
# Save the responses to a CSV file. NOTE: this line is live and overwrites any
# existing 'scientific_literature_responses.csv'; comment it out to keep a saved file.
scientific_literature_responses.to_csv('scientific_literature_responses.csv', index=False)

# Load previously saved responses from the CSV file instead - uncomment to use
# scientific_literature_responses = pd.read_csv('scientific_literature_responses.csv')

In [18]:
scientific_literature_responses.sample(5)

Unnamed: 0,new_name,answer,full
170,ir_il_khanate,present,"<think>\nOkay, so I need to figure out whether..."
216,mx_monte_alban_1_late,absent,"<think>\nOkay, so I need to figure out whether..."
101,af_kushan_emp,absent,"<think>\nOkay, so I need to figure out whether..."
56,it_roman_rep_1,present,"<think>\nOkay, so I have this question about w..."
24,cn_northern_wei_dyn,absent,"<think>\nOkay, so I have this question about t..."


### How well did the model perform?

Let's do a simple check to see what percentage of the answers were correct, according to the Seshat ground truth:

In [19]:
def performance(polities_df, responses_df, variable_id):
    """Print and return the percentage of responses that match the Seshat value.

    Parameters
    ----------
    polities_df : pandas.DataFrame
        Ground truth with a 'new_name' column and a `variable_id` column.
        When a polity appears more than once, its first row is used.
    responses_df : pandas.DataFrame
        Model answers with 'new_name' and 'answer' columns.
    variable_id : str
        Name of the ground-truth column in `polities_df`.

    Returns
    -------
    float
        Percentage (0-100) of rows in `responses_df` whose 'answer' equals the
        ground-truth value; 0.0 when `responses_df` has no rows.

    Raises
    ------
    IndexError
        If a response refers to a polity that is not in `polities_df`.
    """
    # Build the lookup once (first occurrence wins) instead of rescanning the
    # ground-truth frame for every response row.
    truth = {}
    for name, value in zip(polities_df['new_name'], polities_df[variable_id]):
        truth.setdefault(name, value)

    total = len(responses_df)
    correct = 0
    for polity, answer in zip(responses_df['new_name'], responses_df['answer']):
        if polity not in truth:
            raise IndexError(f"polity {polity!r} has no '{variable_id}' record in polities_df")
        if truth[polity] == answer:
            correct += 1

    # Guard against division by zero when there are no responses
    percentage = correct / total * 100 if total else 0.0
    print(f"Correct: {correct}, Total: {total}, Percentage: {percentage:.2f}%")
    return percentage

In [20]:
performance(polities_with_scientific_literatures_df, scientific_literature_responses, 'scientific_literature')

Correct: 180, Total: 255, Percentage: 70.59%


70.58823529411765

## Let's try that again

Now let's run the pipeline again with a different variable:

In [21]:
from seshat_api.sc import DrinkingWaterSupplies
drinking_water_supplies = DrinkingWaterSupplies(client)
drinking_water_supplies_df = pd.DataFrame(drinking_water_supplies.get_all())
polities_with_drinking_water_supplies_df = process_df(drinking_water_supplies_df, 'drinking_water_supply_system')
polities_with_drinking_water_supplies_df.sample(5)

There are 201 drinking_water_supply_system records after filtering.


Unnamed: 0,new_name,long_name,start_year,end_year,drinking_water_supply_system,general_description
236,iq_ubaid,Ubaid,-5500,-4201,present,The name of 'Ubaid polity' derives from the ar...
208,fr_capetian_k_2,French Kingdom - Late Capetian,1150,1328,present,In the history of France the Late Capetian per...
269,mx_basin_of_mexico_3,Early Formative Basin of Mexico,-1200,-801,absent,The Basin or Valley of Mexico is a highlands p...
184,ec_shuar_1,Shuar - Colonial,1534,1830,absent,"The forested foothills of the Andes, near the ..."
228,in_achik_2,Late A'chik,1867,1956,absent,"The Garo Hills, located in Meghalaya in northe..."


In [22]:
# Get the responses for the Drinking Water Supply System variable.
# Reuse previously saved responses when the CSV exists, so that "Run All" does not
# spend hours re-querying the model. Delete the CSV to regenerate the responses.
drinking_water_supplies_csv = Path('polities_with_drinking_water_supplies_responses.csv')
if drinking_water_supplies_csv.exists():
    polities_with_drinking_water_supplies_responses = pd.read_csv(drinking_water_supplies_csv)
else:
    polities_with_drinking_water_supplies_responses = deepseek_responses(
        polities_with_drinking_water_supplies_df,
        'Drinking Water Supply System',
        "Talking about Specialized Buildings, drinking water supply systems are polity owned (which includes owned by the community, or the state), we have coded the absence or presence of the variable",
    )

In [23]:
# Save the responses to a CSV file
polities_with_drinking_water_supplies_responses.to_csv('polities_with_drinking_water_supplies_responses.csv', index=False)

# Load the responses from the CSV file
# polities_with_drinking_water_supplies_responses = pd.read_csv('polities_with_drinking_water_supplies_responses.csv')

In [24]:
polities_with_drinking_water_supplies_responses.sample(5)

Unnamed: 0,new_name,answer,full
36,is_icelandic_commonwealth,present,"<think>\nOkay, so I need to figure out whether..."
119,fr_capetian_k_2,present,"<think>\nOkay, so I need to figure out whether..."
165,jp_tokugawa_shogunate,present,"<think>\nOkay, so I need to figure out whether..."
70,us_woodland_2,absent,"<think>\nOkay, so I need to figure out whether..."
67,us_woodland_1,present,"<think>\nOkay, so I need to figure out whether..."


In [25]:
performance(polities_with_drinking_water_supplies_df, polities_with_drinking_water_supplies_responses, 'drinking_water_supply_system')

Correct: 127, Total: 201, Percentage: 63.18%


63.18407960199005

## Another one!

In [26]:
from seshat_api.sc import Roads
roads = Roads(client)
roads_df = pd.DataFrame(roads.get_all())
polities_with_roads_df = process_df(roads_df, 'road')
polities_with_roads_df.sample(5)

There are 271 road records after filtering.


Unnamed: 0,new_name,long_name,start_year,end_year,road,general_description
97,eg_old_k_1,Egypt - Classic Old Kingdom,-2650,-2350,present,The Old Kingdom period of Egypt covers the Thi...
176,us_woodland_2,Cahokia - Middle Woodland,-150,300,absent,2000 BCE<div>Period of population growth begin...
157,tr_konya_mnl,Konya Plain - Ceramic Neolithic,-7000,-6600,absent,
262,mr_wagadu_2,Middle Wagadu Empire,700,1077,present,The Kingdom of Ghana was the first documented ...
336,tr_byzantine_emp_2,Byzantine Empire II,867,1072,present,The phase of the Byzantine Empire from 867-107...


In [27]:
# Get the responses for the Roads variable.
# Reuse previously saved responses when the CSV exists, so that "Run All" does not
# spend hours re-querying the model. Delete the CSV to regenerate the responses.
roads_csv = Path('polities_with_roads_responses.csv')
if roads_csv.exists():
    polities_with_roads_responses = pd.read_csv(roads_csv)
else:
    polities_with_roads_responses = deepseek_responses(
        polities_with_roads_df,
        'Roads',
        "Talking about Transport infrastructure, roads refers to deliberately constructed roads that connect settlements or other sites. It excludes streets/accessways within settlements and paths between settlements that develop through repeated use",
    )

In [28]:
# Save the responses to a CSV file
polities_with_roads_responses.to_csv('polities_with_roads_responses.csv', index=False)

# Load the responses from the CSV file
# polities_with_roads_responses = pd.read_csv('polities_with_roads_responses.csv')

In [29]:
polities_with_roads_responses.sample(5)

Unnamed: 0,new_name,answer,full
203,jp_azuchi_momoyama,present,"<think>\nOkay, so I need to figure out whether..."
98,id_kediri_k,absent,"<think>\nOkay, so I need to figure out whether..."
65,is_icelandic_commonwealth,present,"<think>\nOkay, so I need to figure out whether..."
29,cn_eastern_han_dyn,present,"<think>\nOkay, so I need to figure out whether..."
14,cn_wei_k,absent,"<think>\nOkay, so I need to figure out whether..."


In [30]:
performance(polities_with_roads_df, polities_with_roads_responses, 'road')

Correct: 194, Total: 271, Percentage: 71.59%


71.58671586715867

## One more...

In [31]:
from seshat_api.wf import Irons
irons = Irons(client)
irons_df = pd.DataFrame(irons.get_all())
polities_with_irons_df = process_df(irons_df, 'iron')
polities_with_irons_df.sample(5)

There are 345 iron records after filtering.


Unnamed: 0,new_name,long_name,start_year,end_year,iron,general_description
149,cn_jin_spring_and_autumn,Jin,-780,-404,absent,The Spring and Autumn period was a period of t...
335,tr_tabal_k,Tabal Kingdoms,-900,-730,present,During the 900-730 BCE period the region of th...
340,us_oneota,Oneota,1400,1650,absent,'Oneota' is the modern name given to a group o...
226,ir_ak_koyunlu,Ak Koyunlu,1339,1501,present,The Ak Koyunlu were a loose confederation of n...
5,mn_hunnu_early,Early Xiongnu,-1400,-300,present,The Orkhon Valley is located on either side of...


In [32]:
# Get the responses for the Military use of Metals: Iron variable.
# Reuse previously saved responses when the CSV exists, so that "Run All" does not
# spend hours re-querying the model. Delete the CSV to regenerate the responses.
irons_csv = Path('polities_with_irons_responses.csv')
if irons_csv.exists():
    polities_with_irons_responses = pd.read_csv(irons_csv)
else:
    polities_with_irons_responses = deepseek_responses(
        polities_with_irons_df,
        'Military use of Metals: Iron',
        "The absence or presence of iron as a military technology used in warfare",
    )

In [33]:
# Save the responses to a CSV file
polities_with_irons_responses.to_csv('polities_with_irons_responses.csv', index=False)

# Load the responses from the CSV file
# polities_with_irons_responses = pd.read_csv('polities_with_irons_responses.csv')

In [34]:
polities_with_irons_responses.sample(5)

Unnamed: 0,new_name,answer,full
80,ir_susa_3,present,"<think>\nOkay, so I need to figure out whether..."
311,pk_kachi_urban_2,present,"<think>\nOkay, so I need to figure out whether..."
118,tr_konya_lca,present,"<think>\nOkay, so I need to figure out whether..."
63,in_vakataka_k,present,"<think>\nOkay, so I need to figure out whether..."
59,in_hoysala_k,present,"<think>\nOkay, so I need to figure out whether..."


In [35]:
performance(polities_with_irons_df, polities_with_irons_responses, 'iron')

Correct: 220, Total: 345, Percentage: 63.77%


63.76811594202898

# Conclusion

Using DeepSeek-R1 as an example of a recent language model with advanced capabilities, we provided a series of prompts to test its knowledge of history, using expert-reviewed data from the Seshat Global History Databank as ground truth. We specifically asked questions that require some level of reasoning, rather than just recall. Since the model was run locally, it was unable to query online resources; all answers came directly from the model itself.

The questions are all about social complexity and military technology, and pertain to variables for which data has been collected in Seshat, across a range of historical polities. These polities vary by era and geography, but the only information we gave the LLM was the polity name and the years in which it was active. The prompts read something like this:

> "Use your knowledge of world history to answer the following question. Given your knowledge of the historical polity **'Fatimid Caliphate'**, a polity that existed between **909 CE** and **1171 CE**, do you expect that **Scientific Literature** was *present* or *absent*? **Scientific Literature** is defined as: **'Talking about Kinds of Written Documents, Scientific literature includes mathematics, natural sciences, social sciences'**. Answer *'XXXpresentXXX'* if you expect it to be present, and *'XXXabsentXXX'* if you expect it to be absent."

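We then extracted the answers from DeepSeek's responses and compared them with the values recorded in Seshat. In the runs shown above, the model matched Seshat for 70.59% of polities for Scientific Literature (180/255), 63.18% for Drinking Water Supply System (127/201), 71.59% for Roads (194/271) and 63.77% for Military use of Metals: Iron (220/345).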

# Add the correct (Seshat) answer to each CSV

In [36]:
def correctness(polities_df, responses_df, variable_id):
    """Add a 'correct' column holding the Seshat value for each response row.

    The column stores the ground-truth value itself (e.g. 'present'), not a
    True/False flag. `responses_df` is modified in place and also returned.
    When a polity appears more than once in `polities_df`, its first row is used.

    Raises
    ------
    IndexError
        If a response refers to a polity that is not in `polities_df`.
    """
    # Build the lookup once (first occurrence wins) instead of filtering the
    # ground-truth frame for every response row.
    truth = {}
    for name, value in zip(polities_df['new_name'], polities_df[variable_id]):
        truth.setdefault(name, value)

    expected = []
    for polity in responses_df['new_name']:
        if polity not in truth:
            raise IndexError(f"polity {polity!r} has no '{variable_id}' record in polities_df")
        expected.append(truth[polity])

    # Assigning a plain list also works when responses_df has no rows
    responses_df['correct'] = expected
    return responses_df

In [37]:
scientific_literature_responses = correctness(polities_with_scientific_literatures_df, scientific_literature_responses, 'scientific_literature')
polities_with_drinking_water_supplies_responses = correctness(polities_with_drinking_water_supplies_df, polities_with_drinking_water_supplies_responses, 'drinking_water_supply_system')
polities_with_roads_responses = correctness(polities_with_roads_df, polities_with_roads_responses, 'road')
polities_with_irons_responses = correctness(polities_with_irons_df, polities_with_irons_responses, 'iron')

In [38]:
# Re-save the responses (now including the 'correct' column) to their CSV files
scientific_literature_responses.to_csv('scientific_literature_responses.csv', index=False)
polities_with_drinking_water_supplies_responses.to_csv('polities_with_drinking_water_supplies_responses.csv', index=False)
polities_with_roads_responses.to_csv('polities_with_roads_responses.csv', index=False)
polities_with_irons_responses.to_csv('polities_with_irons_responses.csv', index=False)

In [39]:
scientific_literature_responses[scientific_literature_responses['new_name'] == 'tr_ottoman_emp_1']

Unnamed: 0,new_name,answer,full,correct
242,tr_ottoman_emp_1,present,"<think>\nOkay, so I need to figure out whether...",present
