In [None]:
from seshat_api import SeshatAPI
import pandas as pd
from ollama import chat, ChatResponse
import re

# Questions for DeepSeek-R1

Can a recent top performing LLM (DeepSeek-R1) correctly predict whether a variable from the Seshat Global History Databank (e.g. "Scientific Literature") should be "present" or "absent" for a selection of polities, given a definition of the variable, the name of the polity and the years in which it existed?

If DeepSeek has a good understanding of history, can it use this to guess from the polity name and years what time/place the prompt is referring to, and whether the variable in question would have been present?

**Important:** I have included some example CSVs of DeepSeek responses to prompts, since running hundreds of prompts took several hours on my laptop (M1 Mac). If you want to generate new responses from DeepSeek, make sure the calls to the `deepseek_responses` function below are uncommented, and take care *not* to overwrite your own saved responses in the cell that follows each call: that cell saves the responses to CSV, and can instead load the pre-made responses from CSV.

In [None]:
client = SeshatAPI(base_url="https://seshat-db.com/api")

## Getting data from Seshat

First let's define our variable to be used in an LLM prompt later (taken from seshat-db.com), then load the data from the Seshat API to use as our ground truth to test against.

In [None]:
variable = 'Scientific Literature'
definition = "Talking about Kinds of Written Documents, Scientific literature includes mathematics, natural sciences, social sciences"

In [None]:
from seshat_api.sc import ScientificLiteratures
scientific_literatures = ScientificLiteratures(client)
scientific_literatures_df = pd.DataFrame(scientific_literatures.get_all())
len(scientific_literatures_df)

In [None]:
# scientific_literatures_df[scientific_literatures_df['new_name'] == 'tr_ottoman_emp_1']
scientific_literatures_df.head()

In [None]:
test = pd.DataFrame(scientific_literatures_df['polity'].tolist())
test['scientific_literature'] = scientific_literatures_df['scientific_literature']
test[test['new_name'] == 'tr_ottoman_emp_1']

Let's just use expert reviewed data and ignore examples where the value is anything other than "present" or "absent" to create a subsample of the dataset.

We should also reformat the dataframe so we have information about the polities such as the start and end year alongside the variable value, then remove columns we aren't interested in.

In [None]:
def process_df(seshat_df, variable_id):
    """Flatten Seshat variable records into polity rows with a present/absent value.

    Keeps only records that are expert reviewed and whose value for
    ``variable_id`` is exactly 'present' or 'absent'. Prints the number of
    records that survive the filtering.

    Args:
        seshat_df: DataFrame of records with a 'polity' column (presumably one
            dict per record, providing at least 'new_name', 'long_name',
            'start_year', 'end_year' and 'general_description'), an
            'expert_reviewed' column and a ``variable_id`` column.
        variable_id: Name of the column holding the variable's value,
            e.g. 'scientific_literature'.

    Returns:
        A new DataFrame with the columns 'new_name', 'long_name', 'start_year',
        'end_year', ``variable_id`` and 'general_description'.
    """
    # Extract the polities column to a new dataframe (this gets a fresh 0..n-1 index)
    polities_df = pd.DataFrame(seshat_df['polity'].tolist())

    # Add columns to the new dataframe *by position*. Assigning the Series
    # directly would align on index labels, silently producing NaN whenever
    # seshat_df does not carry a default 0..n-1 index (e.g. after filtering).
    polities_df[variable_id] = seshat_df[variable_id].to_numpy()
    polities_df['expert_reviewed'] = seshat_df['expert_reviewed'].to_numpy()

    # Keep expert reviewed records whose value is either 'present' or 'absent'
    # (isin() is False for NaN, so missing values are dropped as well)
    is_reviewed = polities_df['expert_reviewed'].eq(True)
    is_binary = polities_df[variable_id].isin(['present', 'absent'])
    polities_df = polities_df[is_reviewed & is_binary]
    print("There are", len(polities_df), variable_id, "records after filtering.")

    # Get rid of the columns we don't need; copy so the result is independent
    # of the intermediate frame it was sliced from
    wanted_columns = ['new_name', 'long_name', 'start_year', 'end_year', variable_id, 'general_description']
    return polities_df[wanted_columns].copy()

In [None]:
polities_with_scientific_literatures_df = process_df(scientific_literatures_df, 'scientific_literature')

In [None]:
# Sense check
polities_with_scientific_literatures_df[polities_with_scientific_literatures_df['new_name'] == 'tr_ottoman_emp_1']

In [None]:
polities_with_scientific_literatures_df.sample(5)

## "Prompt engineering"

First let's define a prompt function to use with our dataframe:

In [None]:
def year_CE(year):
    """Format a signed year number as text with a CE or BCE suffix.

    Years that compare ``>= 0`` are shown unchanged with ' CE'; any other
    value is shown as its absolute value with ' BCE'.
    """
    in_common_era = year >= 0
    label = f"{year} CE" if in_common_era else f"{abs(year)} BCE"
    return label

def prompt_func(seshat_df, polity_name, variable, variable_definition):
    """Build the present/absent question put to the LLM for a single polity.

    Args:
        seshat_df: DataFrame with 'new_name', 'long_name', 'start_year' and
            'end_year' columns.
        polity_name: Value of 'new_name' identifying the polity.
        variable: Human-readable name of the variable being asked about.
        variable_definition: Definition of the variable, quoted in the prompt.

    Returns:
        The prompt text. It ends by asking for 'XXXpresentXXX' or
        'XXXabsentXXX' so the answer can later be found with a regex.

    Raises:
        IndexError: If no row of ``seshat_df`` has ``polity_name`` as 'new_name'.
    """
    matching_rows = seshat_df[seshat_df['new_name'] == polity_name]

    # Only the first matching record is used
    long_name = matching_rows['long_name'].tolist()[0]
    # TODO: we could add matching_rows['general_description'] to the prompt to add more context
    first_year = matching_rows['start_year'].tolist()[0]
    last_year = matching_rows['end_year'].tolist()[0]

    pieces = [
        "Use your knowledge of world history to answer the following question. ",
        f"Given your knowledge of the historical polity '{long_name}', ",
        f"a polity that existed between {year_CE(first_year)} and {year_CE(last_year)}",
        f", do you expect that {variable} was present or absent? ",
        f"{variable} is defined as: '{variable_definition}'. ",
        # To help extract the answer from the text response later, make sure we have a string that can be found with a regex:
        "Answer 'XXXpresentXXX' if you expect it to be present, and 'XXXabsentXXX' if you expect it to be absent.",
    ]
    return "".join(pieces)
    

How does the prompt look with an example "new_name" (Seshat ID) of `tn_fatimid_cal`, which we know has a record for scientific literature in the database:

In [None]:
test_seshat_polity_name = 'tn_fatimid_cal'
test_prompt = prompt_func(polities_with_scientific_literatures_df, test_seshat_polity_name, variable, definition)
test_prompt

Ok, let's see what DeepSeek responds for this prompt:

In [None]:
response: ChatResponse = chat(model='deepseek-r1', messages=[
  {
    'role': 'user',
    'content': test_prompt,
  },
])
print(response.message.content)

Was that correct? Let's check the dataframe to see:

In [None]:
polities_with_scientific_literatures_df[polities_with_scientific_literatures_df['new_name'] == test_seshat_polity_name]

### Get DeepSeek's answer data

Let's write a function to collect responses from DeepSeek for all the polities in our dataset and make a new dataframe with the results:

In [None]:
def deepseek_responses(seshat_df, variable, definition, count=None, max_retries=None):
    """Ask DeepSeek-R1 the present/absent question for each polity and collect the answers.

    Args:
        seshat_df: DataFrame of polities; one prompt is sent per entry of its
            'new_name' column (see ``prompt_func`` for the other columns used).
        variable: Human-readable variable name used in the prompt.
        definition: Definition of the variable used in the prompt.
        count: Optional limit on the number of responses to collect.
        max_retries: Optional limit on how many times a polity is re-asked when
            the reply holds no usable answer. ``None`` (the default) keeps
            asking until a valid answer is given.

    Returns:
        DataFrame with the columns 'new_name', 'answer' ('present' or 'absent')
        and 'full' (the complete, unmodified response text).

    Raises:
        RuntimeError: If ``max_retries`` is set and is exceeded for a polity.
    """
    answer_pattern = re.compile(r'XXX(absent|present)XXX')
    # DeepSeek-R1 may include its chain of thought in the content, wrapped in
    # <think>...</think>. That reasoning can mention both markers (it tends to
    # echo the prompt), so it must not be searched for the final answer.
    reasoning_pattern = re.compile(r'<think>.*?</think>', flags=re.DOTALL)

    def generate_response(polity):
        response: ChatResponse = chat(model='deepseek-r1', messages=[
            {
                'role': 'user',
                'content': prompt_func(seshat_df, polity, variable, definition),
            },
        ])
        return response

    def extract_answer(text):
        """Return 'present'/'absent' from the text outside the reasoning block, else None."""
        match = answer_pattern.search(reasoning_pattern.sub('', text))
        return match.group(1) if match else None

    # Collect plain records and build the DataFrame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies every row on each iteration.
    records = []
    for polity in seshat_df['new_name']:
        if count is not None and len(records) >= count:
            break

        retries = 0
        while True:
            response = generate_response(polity)
            answer = extract_answer(response.message.content)
            if answer is not None:
                break
            # No valid answer: ask again, unless the retry budget is used up
            retries += 1
            if max_retries is not None and retries > max_retries:
                raise RuntimeError(
                    f"No valid answer for polity '{polity}' after {max_retries} retries"
                )

        records.append({
            'new_name': polity,
            'answer': answer,
            'full': response.message.content,
        })

    return pd.DataFrame(records, columns=['new_name', 'answer', 'full'])

Now let's generate the answer data for this variable across polities in our dataset:

In [None]:
# Get the responses for the scientific literature variable.
# Generating them takes hours, so previously saved responses are reused when the CSV exists;
# delete or rename the CSV to force a fresh run (you can pass count to deepseek_responses to limit the number of responses).
try:
    scientific_literature_responses = pd.read_csv('scientific_literature_responses.csv')
except FileNotFoundError:
    scientific_literature_responses = deepseek_responses(polities_with_scientific_literatures_df, variable, definition)

In [None]:
# Save the responses to a CSV file (this overwrites any existing file of the same name)
scientific_literature_responses.to_csv('scientific_literature_responses.csv', index=False)

# Alternatively, load previously saved responses from the CSV file - uncomment to run
# scientific_literature_responses = pd.read_csv('scientific_literature_responses.csv')

In [None]:
scientific_literature_responses.sample(5)

### How well did the model perform?

Let's do a simple check to see what percentage of the answers were correct, according to the Seshat ground truth:

In [None]:
def performance(polities_df, responses_df, variable_id):
    """Print and return the percentage of LLM answers that match the Seshat value.

    Args:
        polities_df: Ground-truth DataFrame with a 'new_name' column and a
            ``variable_id`` column. If a polity appears more than once, its
            first row is used.
        responses_df: DataFrame with 'new_name' and 'answer' columns.
        variable_id: Name of the ground-truth column in ``polities_df``.

    Returns:
        The percentage (0-100) of rows in ``responses_df`` whose 'answer'
        equals the ground truth, or NaN when ``responses_df`` is empty.

    Raises:
        IndexError: If a response refers to a polity missing from ``polities_df``.
    """
    total = len(responses_df)
    if total == 0:
        # Nothing to score; avoid dividing by zero
        print("Correct: 0, Total: 0, Percentage: n/a (no responses to evaluate)")
        return float('nan')

    # One ground-truth value per polity (first occurrence), looked up by name
    truth = polities_df.drop_duplicates(subset='new_name').set_index('new_name')[variable_id]

    unknown = responses_df.loc[~responses_df['new_name'].isin(truth.index), 'new_name']
    if len(unknown) > 0:
        raise IndexError(f"No ground-truth record for polities: {unknown.unique().tolist()}")

    expected = responses_df['new_name'].map(truth)
    correct = sum(1 for want, got in zip(expected, responses_df['answer']) if want == got)

    percentage = correct / total * 100
    print(f"Correct: {correct}, Total: {total}, Percentage: {percentage:.2f}%")
    return percentage

In [None]:
performance(polities_with_scientific_literatures_df, scientific_literature_responses, 'scientific_literature')

## Let's try that again

Now let's run the pipeline again with a different variable:

In [None]:
from seshat_api.sc import DrinkingWaterSupplies
drinking_water_supplies = DrinkingWaterSupplies(client)
drinking_water_supplies_df = pd.DataFrame(drinking_water_supplies.get_all())
polities_with_drinking_water_supplies_df = process_df(drinking_water_supplies_df, 'drinking_water_supply_system')
polities_with_drinking_water_supplies_df.sample(5)

In [None]:
# Get the responses for the Drinking Water Supply System variable.
# Generating them takes hours, so previously saved responses are reused when the CSV exists;
# delete or rename the CSV to force a fresh run.
try:
    polities_with_drinking_water_supplies_responses = pd.read_csv('polities_with_drinking_water_supplies_responses.csv')
except FileNotFoundError:
    polities_with_drinking_water_supplies_responses = deepseek_responses(polities_with_drinking_water_supplies_df,
                                                         'Drinking Water Supply System',
                                                         "Talking about Specialized Buildings, drinking water supply systems are polity owned (which includes owned by the community, or the state), we have coded the absence or presence of the variable",
                                                         )

In [None]:
# Save the responses to a CSV file
polities_with_drinking_water_supplies_responses.to_csv('polities_with_drinking_water_supplies_responses.csv', index=False)

# Load the responses from the CSV file
# polities_with_drinking_water_supplies_responses = pd.read_csv('polities_with_drinking_water_supplies_responses.csv')

In [None]:
polities_with_drinking_water_supplies_responses.sample(5)

In [None]:
performance(polities_with_drinking_water_supplies_df, polities_with_drinking_water_supplies_responses, 'drinking_water_supply_system')

## Another one!

In [None]:
from seshat_api.sc import Roads
roads = Roads(client)
roads_df = pd.DataFrame(roads.get_all())
polities_with_roads_df = process_df(roads_df, 'road')
polities_with_roads_df.sample(5)

In [None]:
# Get the responses for the Roads variable.
# Generating them takes hours, so previously saved responses are reused when the CSV exists;
# delete or rename the CSV to force a fresh run.
try:
    polities_with_roads_responses = pd.read_csv('polities_with_roads_responses.csv')
except FileNotFoundError:
    polities_with_roads_responses = deepseek_responses(polities_with_roads_df,
                                                         'Roads',
                                                         "Talking about Transport infrastructure, roads refers to deliberately constructed roads that connect settlements or other sites. It excludes streets/accessways within settlements and paths between settlements that develop through repeated use"
                                                         )

In [None]:
# Save the responses to a CSV file
polities_with_roads_responses.to_csv('polities_with_roads_responses.csv', index=False)

# Load the responses from the CSV file
# polities_with_roads_responses = pd.read_csv('polities_with_roads_responses.csv')

In [None]:
polities_with_roads_responses.sample(5)

In [None]:
performance(polities_with_roads_df, polities_with_roads_responses, 'road')

## One more...

In [None]:
from seshat_api.wf import Irons
irons = Irons(client)
irons_df = pd.DataFrame(irons.get_all())
polities_with_irons_df = process_df(irons_df, 'iron')
polities_with_irons_df.sample(5)

In [None]:
# Get the responses for the Military use of Metals: Iron variable.
# Generating them takes hours, so previously saved responses are reused when the CSV exists;
# delete or rename the CSV to force a fresh run.
try:
    polities_with_irons_responses = pd.read_csv('polities_with_irons_responses.csv')
except FileNotFoundError:
    polities_with_irons_responses = deepseek_responses(polities_with_irons_df,
                                                         'Military use of Metals: Iron',
                                                         "The absence or presence of iron as a military technology used in warfare"
                                                         )

In [None]:
# Save the responses to a CSV file
polities_with_irons_responses.to_csv('polities_with_irons_responses.csv', index=False)

# Load the responses from the CSV file
# polities_with_irons_responses = pd.read_csv('polities_with_irons_responses.csv')

In [None]:
polities_with_irons_responses.sample(5)

In [None]:
performance(polities_with_irons_df, polities_with_irons_responses, 'iron')

# Conclusion

Using DeepSeek-R1 as an example of a recent language model with advanced capabilities, we have provided a series of prompts to test its knowledge of history, using expert reviewed data from the Seshat Global History Databank as ground truth. We have specifically asked a set of questions that would require some level of reasoning, rather than just knowledge. Since the model was run locally, it was unable to query online resources, so all answers came directly from the model itself.

The questions are all about social complexity and military technology, and pertain to variables for which data has been collected in Seshat, across a range of historical polities. These polities vary by era and geography, but the only information we gave the LLM was the polity name and the years in which it was active. The prompts read something like this:

> "Use your knowledge of world history to answer the following question. Given your knowledge of the historical polity **'Fatimid Caliphate'**, a polity that existed between **909 CE** and **1171 CE**, do you expect that **Scientific Literature** was *present* or *absent*? **Scientific Literature** is defined as: **'Talking about Kinds of Written Documents, Scientific literature includes mathematics, natural sciences, social sciences'**. Answer *'XXXpresentXXX'* if you expect it to be present, and *'XXXabsentXXX'* if you expect it to be absent."

We then extracted the answers from DeepSeek's responses. The results showed that the number of correct answers was not far from **50%**, which is what you might expect if it were picking at random given the binary choice of "absent" and "present" (other values such as "transitional" and "unknown" exist for these variables in Seshat data, but we didn't include those polities here).

In the CSV example answers I have saved in this repository, the results in terms of correct answers were:
- Scientific Literature: 52.67% (79/150) 
- Drinking Water Supply System: 46.43% (52/112)
- Roads: 58.82% (110/187) 
- Military use of Metals - Iron: 62.02% (209/337)

# Add the correct (ground truth) answer to the CSVs

In [None]:
def correctness(polities_df, responses_df, variable_id):
    """Add a 'correct' column holding the Seshat ground-truth value for each response.

    Note that 'correct' stores the expected value itself (e.g. 'present'),
    not a flag saying whether the LLM's answer matched it.

    Args:
        polities_df: Ground-truth DataFrame with a 'new_name' column and a
            ``variable_id`` column. If a polity appears more than once, its
            first row is used.
        responses_df: DataFrame with a 'new_name' column. It is modified in
            place: the 'correct' column is added (or overwritten).
        variable_id: Name of the ground-truth column in ``polities_df``.

    Returns:
        ``responses_df`` itself, with the 'correct' column set.

    Raises:
        IndexError: If a response refers to a polity missing from ``polities_df``.
    """
    # One ground-truth value per polity (first occurrence), looked up by name
    truth = polities_df.drop_duplicates(subset='new_name').set_index('new_name')[variable_id]

    unknown = responses_df.loc[~responses_df['new_name'].isin(truth.index), 'new_name']
    if len(unknown) > 0:
        raise IndexError(f"No ground-truth record for polities: {unknown.unique().tolist()}")

    # Add a new column to the responses_df to store the correct answer
    responses_df['correct'] = responses_df['new_name'].map(truth)
    return responses_df

In [None]:
scientific_literature_responses = correctness(polities_with_scientific_literatures_df, scientific_literature_responses, 'scientific_literature')
polities_with_drinking_water_supplies_responses = correctness(polities_with_drinking_water_supplies_df, polities_with_drinking_water_supplies_responses, 'drinking_water_supply_system')
polities_with_roads_responses = correctness(polities_with_roads_df, polities_with_roads_responses, 'road')
polities_with_irons_responses = correctness(polities_with_irons_df, polities_with_irons_responses, 'iron')

In [None]:
# Re-save the responses to their CSV files, now including the 'correct' column
scientific_literature_responses.to_csv('scientific_literature_responses.csv', index=False)
polities_with_drinking_water_supplies_responses.to_csv('polities_with_drinking_water_supplies_responses.csv', index=False)
polities_with_roads_responses.to_csv('polities_with_roads_responses.csv', index=False)
polities_with_irons_responses.to_csv('polities_with_irons_responses.csv', index=False)

In [None]:
scientific_literature_responses[scientific_literature_responses['new_name'] == 'tr_ottoman_emp_1']