In [None]:
import re

import pandas as pd
from ollama import chat, ChatResponse
from seshat_api import SeshatAPI

# Questions for DeepSeek-R1

Can a recent top-performing LLM (DeepSeek-R1) correctly predict whether a variable from the Seshat Global History Databank (e.g. "Scientific Literature") should be "present" or "absent" for a selection of polities, given a definition of the variable, the name of the polity and the years in which it existed?

If DeepSeek has a good understanding of history, can it use this to guess from the polity name and years what time/place the prompt is referring to, and whether the variable in question would have been present?

In [None]:
# API client for the public Seshat endpoint; it is handed to the variable
# classes below when they fetch records.
client = SeshatAPI(base_url="https://seshat-db.com/api")

## Getting data from Seshat

First let's define our variable to be used in an LLM prompt later (taken from seshat-db.com), then load the data from the Seshat API to use as our ground truth to test against.

In [None]:
# Human-readable variable name and its definition; both are interpolated
# verbatim into the LLM prompt built later in the notebook.
variable = 'Scientific Literature'
definition = "Talking about Kinds of Written Documents, Scientific literature includes mathematics, natural sciences, social sciences"

In [None]:
from seshat_api.sc import ScientificLiteratures
# Fetch every record for this variable through the client and load the
# results into a DataFrame.
scientific_literatures = ScientificLiteratures(client)
scientific_literatures_df = pd.DataFrame(scientific_literatures.get_all())
# Number of raw records, before any filtering
len(scientific_literatures_df)

Let's just use expert-reviewed data and ignore examples where the value is anything other than "present" or "absent" to create a subsample of the dataset.

We should also reformat the dataframe so we have information about the polities such as the start and end year alongside the variable value, then remove columns we aren't interested in.

In [None]:
def process_df(seshat_df, variable_id):
    """Build a per-polity table of expert-reviewed present/absent values.

    Parameters
    ----------
    seshat_df : pandas.DataFrame
        Records for one variable. Must contain an 'expert_reviewed' column,
        a ``variable_id`` column, and a 'polity' column whose entries are
        dict-like and provide the keys 'new_name', 'long_name', 'start_year',
        'end_year' and 'general_description'.
    variable_id : str
        Name of the column that holds the variable value.

    Returns
    -------
    pandas.DataFrame
        One row per retained record with the columns 'new_name', 'long_name',
        'start_year', 'end_year', ``variable_id`` and 'general_description'.

    Notes
    -----
    The row count is printed after each step so the effect of every filter
    is visible in the notebook output.
    """
    # Keep only the records that are expert reviewed
    reviewed_df = seshat_df[seshat_df['expert_reviewed'] == True]  # noqa: E712
    print(len(reviewed_df))

    # Keep only the records whose value is either 'present' or 'absent'
    coded_df = reviewed_df[reviewed_df[variable_id].isin(['present', 'absent'])]
    print(len(coded_df))

    # Expand the polity dicts into their own dataframe (fresh 0..n-1 index)
    polities_df = pd.DataFrame(coded_df['polity'].tolist())

    # Attach the variable value positionally. `coded_df` still carries the
    # index labels of the unfiltered frame, so assigning the Series directly
    # would align on those labels and pair values with the wrong polity
    # (or produce NaN) whenever any row was filtered out above.
    polities_df[variable_id] = coded_df[variable_id].to_numpy()
    print(len(polities_df))

    # Defensive: drop any record where the variable value is missing
    polities_df = polities_df[polities_df[variable_id].notna()]
    print(len(polities_df))

    # Get rid of the columns we don't need
    polities_df = polities_df[['new_name', 'long_name', 'start_year', 'end_year', variable_id, 'general_description']]

    return polities_df

In [None]:
# Ground-truth table for this variable; 'scientific_literature' is the name
# of the value column in the raw records.
polities_with_scientific_literatures_df = process_df(scientific_literatures_df, 'scientific_literature')

In [None]:
# Preview five randomly chosen rows (no seed is set, so the rows differ between runs)
polities_with_scientific_literatures_df.sample(5)

## "Prompt engineering"

First let's define a prompt function to use with our dataframe:

In [None]:
def year_CE(year):
    """Render a signed year number as an era-labelled string.

    Zero and positive years become "<year> CE"; negative years become
    "<absolute value> BCE".
    """
    return f"{year} CE" if year >= 0 else f"{abs(year)} BCE"

def prompt_func(seshat_df, polity_name, variable, variable_definition):
    """Compose the present/absent question for one polity.

    The first row of ``seshat_df`` whose 'new_name' equals ``polity_name``
    supplies the polity's long name and its start and end years. The returned
    prompt asks whether ``variable`` (described by ``variable_definition``)
    was present or absent, and tells the model to reply with one of the
    marker strings 'XXXpresentXXX' / 'XXXabsentXXX' so that the answer can
    later be located with a regex.
    """
    matching_rows = seshat_df[seshat_df['new_name'] == polity_name]

    # The first matching record provides the polity details
    long_name = matching_rows['long_name'].tolist()[0]
    # general_description is available on the same rows and could be added
    # to the prompt for more context (TODO)
    first_year = matching_rows['start_year'].tolist()[0]
    last_year = matching_rows['end_year'].tolist()[0]

    pieces = [
        "Use your knowledge of world history to answer the following question. ",
        f"Given your knowledge of the historical polity '{long_name}', ",
        f"a polity that existed between {year_CE(first_year)} and {year_CE(last_year)}",
        f", do you expect that {variable} was present or absent? ",
        f"{variable} is defined as: '{variable_definition}'. ",
        # Marker strings that the response parser searches for
        "Answer 'XXXpresentXXX' if you expect it to be present, and 'XXXabsentXXX' if you expect it to be absent.",
    ]
    return "".join(pieces)
    

How does the prompt look with an example "new_name" (Seshat ID) of `tn_fatimid_cal`, which we know has a record for scientific literature in the database:

In [None]:
# Build the prompt for a single known polity and display it as the cell output
test_seshat_polity_name = 'tn_fatimid_cal'
test_prompt = prompt_func(polities_with_scientific_literatures_df, test_seshat_polity_name, variable, definition)
test_prompt

OK, let's see how DeepSeek responds to this prompt:

In [None]:
# Send the test prompt as a single user message to the 'deepseek-r1' model
# and print the text of the reply.
response: ChatResponse = chat(model='deepseek-r1', messages=[
  {
    'role': 'user',
    'content': test_prompt,
  },
])
print(response.message.content)

Was that correct? Let's check the dataframe to see:

In [None]:
# Show the ground-truth row(s) for the test polity to compare with the model's reply
polities_with_scientific_literatures_df[polities_with_scientific_literatures_df['new_name'] == test_seshat_polity_name]

### Get DeepSeek's answer data

Let's write a function to collect responses from DeepSeek for all the polities in our dataset and make a new dataframe with the results:

In [None]:
def deepseek_responses(seshat_df, variable, definition, count):  # Use the count parameter to limit the number of responses
    """Ask DeepSeek-R1 the present/absent question for polities in ``seshat_df``.

    Parameters
    ----------
    seshat_df : pandas.DataFrame
        Polity table; its 'new_name' column is iterated and the frame is
        passed on to ``prompt_func`` to build each prompt.
    variable : str
        Variable name inserted into the prompt.
    definition : str
        Variable definition inserted into the prompt.
    count : int
        Maximum number of polities to query.

    Returns
    -------
    pandas.DataFrame
        Columns 'new_name', 'answer' ('present', 'absent', or '' when no
        marker string was found) and 'full' (the unmodified response text).
    """
    answer_pattern = re.compile(r'(XXXabsentXXX|XXXpresentXXX)')
    # DeepSeek-R1 may emit its reasoning inside <think>...</think>, where it
    # can echo both marker strings from the prompt. Searching only the text
    # outside that block avoids mistaking an echo for the final answer; text
    # without such a block is searched unchanged.
    reasoning_pattern = re.compile(r'<think>.*?</think>', flags=re.DOTALL)

    def generate_response(polity):
        """Send the prompt for one polity as a single user message."""
        return chat(model='deepseek-r1', messages=[
            {
                'role': 'user',
                'content': prompt_func(seshat_df, polity, variable, definition),
            },
        ])

    # Collect plain records and build the DataFrame once at the end instead
    # of concatenating a new frame on every iteration.
    records = []
    for polity in seshat_df['new_name']:
        if len(records) >= count:
            break
        response = generate_response(polity)
        full_text = response.message.content

        match = answer_pattern.search(reasoning_pattern.sub('', full_text or ''))
        if match is None:
            # Only a missing marker is treated as "no answer"; any other
            # error propagates instead of being silently swallowed.
            print("No answer found for", polity)
            answer = ""
        else:
            answer = match.group(1).replace("XXX", "")
        print(polity, ": ", answer)

        records.append({
            'new_name': polity,
            'answer': answer,
            'full': full_text,
        })

    return pd.DataFrame(records, columns=['new_name', 'answer', 'full'])

Now let's generate the answer data for this variable across polities in our dataset:

In [None]:
# Get the responses for the scientific literature variable
scientific_literature_responses = deepseek_responses(polities_with_scientific_literatures_df, variable, definition, 2)
scientific_literature_responses

In [None]:
# Save the responses to a CSV file
scientific_literature_responses.to_csv('scientific_literature_responses.csv', index=False)

# Load the responses from the CSV file
# scientific_literature_responses = pd.read_csv('scientific_literature_responses.csv')

### How well did the model perform?

Let's do a simple check to see what percentage of the answers were correct, according to the Seshat ground truth:

In [None]:
def performance(polities_df, responses_df, variable_id):
    """Score the model's answers against the Seshat ground truth.

    Parameters
    ----------
    polities_df : pandas.DataFrame
        Ground-truth table with a 'new_name' column and a ``variable_id``
        column. When a polity appears more than once, its first row is used.
    responses_df : pandas.DataFrame
        Model answers with 'new_name' and 'answer' columns.
    variable_id : str
        Name of the ground-truth value column in ``polities_df``.

    Returns
    -------
    float
        Percentage of responses whose 'answer' equals the ground-truth
        value; 0.0 when ``responses_df`` has no rows.
    """
    total = len(responses_df)
    if total == 0:
        # Nothing to score: report zero rather than dividing by zero
        print("Correct: 0, Total: 0, Percentage: 0.00%")
        return 0.0

    correct = 0
    for _, response in responses_df.iterrows():
        polity = response['new_name']
        # First ground-truth value recorded for this polity
        seshat_answer = polities_df.loc[polities_df['new_name'] == polity, variable_id].values[0]
        if seshat_answer == response['answer']:
            correct += 1

    percentage = correct / total * 100
    print(f"Correct: {correct}, Total: {total}, Percentage: {percentage:.2f}%")
    return percentage

In [None]:
# Percentage of DeepSeek answers that match the Seshat value for this variable
performance(polities_with_scientific_literatures_df, scientific_literature_responses, 'scientific_literature')

## Let's try that again

Now let's run the pipeline again with a different variable: