In [2]:
from langchain_community.llms import Ollama

In [3]:
import wikipediaapi

In [4]:
import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents

In [334]:
import json
import time


In [6]:
import os

### Try Historical Novels Page

https://en.wikipedia.org/wiki/List_of_historical_novels


TODONE:

* created semantic filter class!
* pipelined the whole thing all the way out to generating books


TODO:

* need pipeline verification cuz there are some real authors in there...; try jay's idea of just the first 10 words or so? Frank Yerby and Joseph Henry Shorthouse two examples
* also we still haven't incorporated check results into a pipeline...might want a pipeline class that shows you each aspect of the pipeline. the part we need to include is to add the check quality class and create a csv, then write that csv. the csv will be what our front end ingests for geodisplay. might want some other tertiary stuff like getting lat lons for the city country pairs?  anyways, include it so you don't lose that part of the pipeline!



In [338]:
os.path.exists('filtered_pages.json')

True

In [407]:

os.path.exists('data/whn_pages/' + 'filtered_pages_final.json')  # means self.filter_pages() ran


True

In [429]:
class FetchWikiHNPages:
    """
    Going to get a nice clean set of book pages from this website: https://en.wikipedia.org/wiki/List_of_historical_novels
    """

    def __init__(self, root_page="List_of_historical_novels", data_dir='data/whn_pages/'):
        self.llm = Ollama(model="llama3")
        self.wiki = wikipediaapi.Wikipedia('MyProjectName (merlin@example.com)', 'en')
        self.root_page = self.wiki.page(root_page)
        self.data_dir = data_dir

    def triple_filter_short_pipeline(self):
        """
        temp pipeline to get triple filtering running
        TODO: merge with run_full_pipeline
        note: inefficient because we're better off just running the pages we already single filtered
        """

        self.load_filtered_and_unfiltered_pages()
        self.triple_filter_pages()

    def run_full_pipeline(self, mode='continue'):
        """

        runs full pipeline
        TODO: write this so that we can tell where we are and if we failed jobs. Wherever state of completion we are when
        we have the list, we start from there and keep going. there could also be a setting for delete current records and start from
        scratch. So if we already have a filtered_pages_final.json just use that. that would look like:


        mode: takes on vals 'continue' to continue from last known point or 'overwrite' if we want to start from the beginning
        """

        if mode == 'continue':
            # use existence of files written during process to determine where we left off
            if os.path.exists(self.data_dir + 'filtered_pages_final.json'):  # means self.filter_pages() ran
                print('final page list already exists--loading...')
                with open(self.data_dir + 'filtered_pages_final.json', "r") as f:
                    self.filtered_pages = json.load(f)

            # elif os.path.exists(
                #    self.data_dir + 'filtered_pages_cache.json'):  # means self.filter_pages ran but did not complete...
                # hard case...because there are some edge cases, but let's right a solution that updates with cached results and continues
                # not really adequate because for instance filtered_pages_cache might update but unfiltered_pages_cache might not, in same loop
                # meaning they would be mismatched
                # actually...let's just leave this case for another day, I want to move on to generating actual data rather than writing endless cases

            # replace / reformat this elif and below with load_filtered_and_unfiltered_pages
            # note the get_unfiltered_pages() in the else is included in load_filtered_and_unfiltered_pages()
            elif os.path.exists(self.data_dir + 'filtered_pages.json'):  # means self.reduce_pages() ran
                with open(self.data_dir + "unfiltered_pages.json", "r") as f:
                    self.unfiltered_pages = json.load(f)
                with open(self.data_dir + "filtered_pages.json", "r") as f:
                    self.filtered_pages = json.load(f)
                self.filter_pages()
            else:

                self.get_unfiltered_pages()
                self.reduce_pages()
                self.filter_pages()

        elif mode == 'overwrite':

            self.get_unfiltered_pages()
            self.reduce_pages()
            self.filter_pages()

    def load_filtered_and_unfiltered_pages(self):

        if os.path.exists(self.data_dir + 'filtered_pages.json'):  # means self.reduce_pages() ran
            with open(self.data_dir + "unfiltered_pages.json", "r") as f:
                self.unfiltered_pages = json.load(f)
            with open(self.data_dir + "filtered_pages.json", "r") as f:
                self.filtered_pages = json.load(f)  

        else:
            self.get_unfiltered_pages()
            self.reduce_pages()

    def get_unfiltered_pages(self):
        """
        Queries pages from hns site and does a validity check
        """
        print('started querying unfiltered pages at ' + time.ctime())

        root_links = self.root_page.links  # gets a dictionary whose keys are all the linked pages, potentially including red links though
        self.unfiltered_pages = list(root_links.keys())

        self.unfiltered_pages = [pg for pg in self.unfiltered_pages if self.wiki.page(pg).exists()]
        self.filtered_pages = []
        print('ended querying unfiltered pages at ' + time.ctime())

    def reduce_pages(self):
        """
        We want to limit the number of API calls we'll make, so let's move any pages with novel in the title over to our filtered list.
        This isn't robust to all possible edge cases but looks good for the actual list we're dealing with here (based on direct inspection).
        """

        # manually parsed suffixes that identify whether a wiki page in our list is a novel or a non-novel
        novel_title_suffix_list = ['novel', 'historical novel', 'book', 'Uris novel', 'Seton novel', '1962 novel',
                                   'Prus novel', 'novel series', 'Foote novel', 'Gann novel', 'Mario Puzo novel',
                                   'Spanish novel', 'Sutcliff novel', 'Scott novel', 'Penman novel']
        not_novel_title_suffix_list = ['journalist', 'author', 'novelist', 'writer', 'abolitionist', 'World War II',
                                       'consul 218 BC', 'fictional character', '535-554', '1919–1922', '410',
                                       'Third Punic War', 'Polish history']
        novels_by_wiki_title = [pg for pg in self.unfiltered_pages if
                                pg.rstrip(')').split('(')[-1] in novel_title_suffix_list]
        not_novels_by_wiki_title = [pg for pg in self.unfiltered_pages if
                                    pg.rstrip(')').split('(')[-1] in not_novel_title_suffix_list]

        self.unfiltered_pages = list(
            set(self.unfiltered_pages) - set(novels_by_wiki_title) - set(not_novels_by_wiki_title))
        self.filtered_pages.extend(novels_by_wiki_title)

        # write results of this stage of pipeline
        with open(self.data_dir + "unfiltered_pages.json", "w") as f:
            json.dump(self.unfiltered_pages, f)
        with open(self.data_dir + "filtered_pages.json", "w") as f:
            json.dump(self.filtered_pages, f)

    def filter_pages(self):
        """
        Checks that a page exists and that it's the page for an actual author
        """
        print('started filtering pages at ' + time.ctime())

        self.unfiltered_pages_class_dict = {}  # just as a backup, record the llm-generated class in a dict
        self.unfiltered_remaining = self.unfiltered_pages.copy()
        for pg in self.unfiltered_pages:
            self.wc = WikiClassify(w_page=pg)
            print('filtering page:  ' + str(pg) + ' at ' + time.ctime())

            self.wc.classify_page_by_summary()
            self.unfiltered_pages_class_dict[pg] = self.wc.parsed_json['class']
            if self.wc.parsed_json['class'] == 'book':
                self.filtered_pages.append(pg)
                with open(self.data_dir + "filtered_pages_cache.json", "w") as f:
                    json.dump(self.filtered_pages, f)

                self.unfiltered_remaining.remove(pg)
                with open(self.data_dir + "unfiltered_pages_cache.json", "w") as f:
                    json.dump(self.unfiltered_remaining, f)

        with open(self.data_dir + "filtered_pages_final.json", "w") as f:
            json.dump(self.filtered_pages, f)


    def triple_filter_pages(self):
        """
        Checks that a page exists and that it's the page for an actual author
        Uses three filters: classifying the summary page, classifying the text up to the first period, and up to the first comma.
        We use a triple filter because we saw that just classifying based on the summary text yielded too much misclassification
        Instead, just pass through pages that are classified as books by all three 
        """
        print('started filtering pages at ' + time.ctime())
        ## TODO: need to manage self.filtered_pages and self.unfiltered_pages and connect it to pipeline
        self.triple_filter_results = {}
        for pg in self.unfiltered_pages:
            self.wc = WikiClassify(w_page=pg)
            print('filtering page:  ' + str(pg) + ' at ' + time.ctime())

            self.wc.classify_page_by_summary()
            first_pass = self.wc.parsed_json['class']

            self.wc.classify_page_to_first_period()
            second_pass = self.wc.parsed_json['class']

            self.wc.classify_page_to_first_comma()
            third_pass = self.wc.parsed_json['class']

            self.triple_filter_results[pg] = {'first_pass': first_pass, 'second_pass':second_pass, 'third_pass':third_pass}

            if (first_pass=='book') & (second_pass=='book') & (third_pass=='book'):
                self.filtered_pages.append(pg)


        with open(self.data_dir + "triple_filtered_pages.json", "w") as f:
            json.dump(self.filtered_pages, f)

        with open(self.data_dir + "triple_filtered_pages_verbose.json", "w") as f:
            json.dump(self.triple_filter_results, f)


In [None]:
hnp = FetchWikiHNPages()
hnp.triple_filter_short_pipeline()

started filtering pages at Tue Jul  2 17:32:53 2024
filtering page:  The Stronghold at Tue Jul  2 17:32:54 2024


In [416]:
tdict = {}

x = 'hi'
y = 'bye'

tdict[x] = y

In [417]:
tdict

{'hi': 'bye'}

In [418]:
with open("dummy.json", "w") as f:
    json.dump(tdict, f)

In [412]:
hnp = FetchWikiHNPages()
hnp.run_full_pipeline()

started filtering pages at Tue Jul  2 15:57:19 2024
filtering page:  The Stronghold at Tue Jul  2 15:57:19 2024
filtering page:  American Civil War at Tue Jul  2 15:57:33 2024
filtering page:  Operation Wrath of God at Tue Jul  2 15:57:45 2024
filtering page:  Reconquista at Tue Jul  2 15:57:51 2024
filtering page:  Charles Frazier at Tue Jul  2 15:58:01 2024
filtering page:  Victor Hugo at Tue Jul  2 15:58:06 2024


KeyboardInterrupt: 

In [351]:
hnp = FetchWikiHNPages()
hnp.run_full_pipeline()

started filtering pages at Tue Jun 25 21:52:16 2024
filtering page:  The Stronghold at Tue Jun 25 21:52:16 2024
filtering page:  American Civil War at Tue Jun 25 21:52:22 2024
filtering page:  Operation Wrath of God at Tue Jun 25 21:52:35 2024
filtering page:  Reconquista at Tue Jun 25 21:52:40 2024
filtering page:  Charles Frazier at Tue Jun 25 21:52:50 2024
filtering page:  Victor Hugo at Tue Jun 25 21:52:55 2024
filtering page:  The Siege of Krishnapur at Tue Jun 25 21:53:02 2024
filtering page:  The Glory at Tue Jun 25 21:53:08 2024
filtering page:  Nellie Campobello at Tue Jun 25 21:53:11 2024
filtering page:  Socrates at Tue Jun 25 21:53:17 2024
filtering page:  Valerio Massimo Manfredi at Tue Jun 25 21:53:25 2024
filtering page:  Edward P. Jones at Tue Jun 25 21:53:29 2024
filtering page:  Sacred Hunger at Tue Jun 25 21:53:33 2024
filtering page:  The Heart of Jade at Tue Jun 25 21:53:38 2024
filtering page:  Tai-Pan at Tue Jun 25 21:53:43 2024
filtering page:  Oldest Living Con

In [366]:
hnp.filtered_pages

['Aangan (novel)',
 'Alaska (novel)',
 'All That Matters (novel)',
 'Andersonville (novel)',
 'Avalon (novel)',
 'Aztec (book)',
 'Aztec (novel)',
 'Bano (novel)',
 'Beloved (novel)',
 'Brother Jonathan (novel)',
 'Burr (novel)',
 'Captain Blood (novel)',
 'Captain from Castile (novel)',
 'Caravans (novel)',
 'Caribbean (novel)',
 'Centennial (novel)',
 'Chesapeake (novel)',
 'Cold Mountain (novel)',
 'Cry of the Peacock (novel)',
 'Destiny in Sydney (novel)',
 'Dharmaraja (novel)',
 'Doctor Zhivago (novel)',
 'Dragon Seed (novel)',
 'Empire of the Sun (novel)',
 'Empress (novel)',
 'Exodus (Uris novel)',
 'Fingersmith (novel)',
 'Flashman (novel)',
 'Footsteps (novel)',
 'Forever Amber (novel)',
 'Funeral Games (novel)',
 'Gai-Jin (novel)',
 'Girl with a Pearl Earring (novel)',
 'Gone with the Wind (novel)',
 'Hawaii (novel)',
 'Hawksmoor (novel)',
 'Hereward the Wake (novel)',
 'Hild (novel)',
 'Hypatia (novel)',
 'Ides of March (novel)',
 'Jalna (novel series)',
 'Journey (novel)',


In [353]:
hnp.unfiltered_pages_class_dict['Frank Yerby']

'book'

In [395]:
x = 'Musashi_(novel)'

In [397]:
wc = WikiClassify(x)
wc.classify_page_by_summary()
wc.parsed_json

{'class': 'book'}

In [392]:

wc.classify_page_by_first_sentence(x)
wc.parsed_json

{'class': 'book'}

In [393]:
wc.classify_page_by_first_comma(x)
wc.parsed_json

{'class': 'book'}

In [394]:
class WikiClassify:
    """
    Classify wikipedia articles
    """

    def __init__(self, w_page):
        self.llm = Ollama(model="llama3")
        self.wiki = wikipediaapi.Wikipedia('MyProjectName (merlin@example.com)', 'en')
        self.w_page = w_page
        self.query_summary()

    def query_summary(self):

        self.w_summary = self.wiki.page(self.w_page).summary
        self.w_summary_to_first_period = self.w_summary.split('.')[0]
        self.w_summary_to_first_comma = self.w_summary.split(',')[0]

    def classify_page_to_first_period(self):
        self.classify_page_by_w_text(w_text = self.w_summary_to_first_period)

    def classify_page_to_first_comma(self):
        self.classify_page_by_w_text(w_text = self.w_summary_to_first_comma)

    def classify_page_by_summary(self):
        self.classify_page_by_w_text(w_text = self.w_summary)
        
    def classify_page_by_w_text(self, w_text):
        """
        For a wikipedia page, classify it as belonging to a novel, an author, or neither ("other")
        """

        output1 = '<result>\n{\n"class": "author"\n}\n</result>'
        output2 = '<result>\n{\n"class": "book"\n}\n</result>'
        output3 = '<result>\n{\n"class": "other"\n}\n</result>'
        
        prompt = "You are a bot that reads wikipedia articles about books and returns information about them. Your output must be in valid JSON. \
        Do not output anything other than the JSON.\
        First, determine if the wikipedia page is a page for an author, a book, or something else.\
        If it is a page for an author (not a book), add the value author to the JSON with the key class. \
        If it is a page for a book (not an author), add the value book to the JSON with the key class. \
        If it is neither for a book nor an author, add the value other to the JSON with the key class \
        Finally, surround your JSON output with <result></result> tags. \
        \
        Ensure the JSON output has one key, class, and one value, which can either be author, book, or other.\
        Do not return any other type of output.\
        \
        Here is the text from the wikipedia article for this book: "
        
        
        
        end_prompt = "Based on the book's wikipedia article provided above, create a JSON describing this book. Your output must be in valid JSON. \
        Do not output anything other than the JSON.\
        First, determine if the wikipedia page is a page for an author, a book, or something else.\
        If it is a page for an author (not a book), add the value author to the JSON with the key class. \
        If it is a page for a book (not an author), add the value book to the JSON with the key class. \
        If it is neither for a book nor an author, add the value other to the JSON with the key class \
        \
        Ensure the JSON output has one key, class, and one value, which can either be author,book, or other.\
        Do not return any other type of output.\
        \
        Finally, surround your JSON output with <result></result> tags.\
        The three allowable outputs are: "
        
                
        self.llm_response = self.llm(prompt + w_text + end_prompt + output1 + "or " + output2 + "or " + output3)
        self.parse_response()

    def parse_response(self):

        json_string = self.llm_response.split('<result>')[1].split('</result>')[0]
        self.parsed_json = json.loads(json_string.strip())

In [248]:
wc = WikiClassify()
wc.classify_page()


In [201]:
class SaveWork:
    """
    This is great work so far. We are able to pass a book's wiki page title, query it on wikipedia, \
    parse the article, and extract key info: author, title, and city, and country.
    """
    def __init__(self):
        self.llm = Ollama(model="llama3")
        self.wiki = wikipediaapi.Wikipedia('MyProjectName (merlin@example.com)', 'en')
        return 

    def pull_book(self, wpage ="Musashi_(novel)" ):

        prompt = "You are a bot that reads wikipedia articles about books and returns information about them. Your output must be in valid JSON. \
        Do not output anything other than the JSON.\
        First, find the book's title and add it to the JSON with the key 'title'.\
        Next, find the book's author and add it to the JSON with the key 'author'.\
        Next, find the city that the book is set in and add it to the JSON with the key 'city'. If the book is not set in a real city on earth or\
        you cannot determine what city it is set in, add 'NA' to the JSON with the key 'city'.\
        Next, find the country that the book is set in and add it to the JSON with the key 'country'. \
        If the book is not set in a real country on earth or you cannot determine what country it is set in, add 'NA' to the JSON\
        with the key 'country'. Finally, surround your JSON output with <result></result> tags. \
        Here is the text from the wikipedia article for this book: "
        
        
        
        end_prompt = "Based on the book's wikipedia article provided above, create a JSON describing this book. Your output must be in valid JSON. \
        Do not output anything other than the JSON.\
        First, find the book's title and add it to the JSON with the key 'title'.\
        Next, find the book's author and add it to the JSON with the key 'author'.\
        Next, find the city that the book is set in and add it to the JSON with the key 'city'. If the book is not set in a real city on earth or\
        you cannot determine what city it is set in, add 'NA' to the JSON with the key 'city'.\
        Next, find the country that the book is set in and add it to the JSON with the key 'country'. \
        If the book is not set in a real country on earth or you cannot determine what country it is set in, add 'NA' to the JSON\
        with the key 'country'. Finally, surround your JSON output with <result></result> tags."
                
        self.llm_response = self.llm(prompt + self.wiki.page(wpage).text + end_prompt)

    def parse_response(self):

        json_string = self.llm_response.split('<result>')[1].split('</result>')[0]
        self.parsed_json = json.loads(json_string.strip())
        
sw = SaveWork()
sw.pull_book(wpage='Adalbert Stifter')
sw.parse_response()

In [None]:
wc.wiki.page()

In [249]:
wc.llm_response

'<result>\n{\n"class": "book"\n}\n</result>'

In [259]:
wc.parsed_json['class']

'other'

In [239]:
wc.parse_response()

In [250]:
wc.classify_page('Adalbert_Stifter')
wc.parsed_json

{'class': 'author'}

In [252]:
wc.classify_page('International_Booker_Prize')
wc.parsed_json


{'class': 'other'}

In [251]:
wc.llm_response

'<result>\n{\n"class": "author"\n}\n</result>'

In [242]:
x = wc.llm_response.split('<result>')[1].split('</result>')[0]

In [243]:
x

'\n{\n"class": "author",\n}\n'

In [244]:
json.loads(x.strip())

JSONDecodeError: Expecting property name enclosed in double quotes: line 3 column 1 (char 21)

In [227]:
json.loads(wc.llm_response.split('<result>')[1].split('</result>')[0].strip())

{'class': 'book'}

In [187]:

wiki = wikipediaapi.Wikipedia('MyProjectName (merlin@example.com)', 'en')  
hns = wiki.page("List_of_historical_novels")

        


In [192]:
x = hns.sections

In [198]:
y = hns.links

In [256]:
list(y.keys())

['1715 Jacobite rising',
 '1745 Jacobite rising',
 '1960 United States presidential election',
 'A Body in the Bath House',
 'A Council of Dolls',
 'A Dying Light in Corduba',
 'A Journal of the Plague Year',
 'A Place of Greater Safety',
 'A Spoke in the Wheel',
 'A Star Called Henry',
 'A Struggle for Rome',
 'A Tale of Old Mortality',
 'A Tale of Two Cities',
 'A fekete város',
 'Aag Ka Darya',
 'Aangan (novel)',
 'Aaron Burr',
 'Abd-ar-Rahman III',
 'Abd ar-Rahman II',
 'Across Five Aprils',
 'Adalbert Stifter',
 'Aden Polydoros',
 'Akhenaten',
 'Akilan',
 'Alans',
 'Alaska',
 'Alaska (novel)',
 'Alberto Laiseca',
 'Alberto Vázquez-Figueroa',
 'Alcibiades',
 'Alejandro Núñez Alonso',
 'Alejo Carpentier',
 'Aleksander Kamiński',
 'Aleksandr Solzhenitsyn',
 'Alessandro Baricco',
 'Alessandro Manzoni',
 'Alex Haley',
 'Alex Rutherford',
 'Alexander Farnese, Duke of Parma',
 'Alexander the Great',
 'Alexandre Dumas, père',
 'Alfarrábios',
 'Alfonso VIII of Castile',
 'Alfonso VII of Le

In [196]:
x[2]

Section: Australia (1):
The Playmaker and Bring Larks and Heroes by Thomas Keneally (18th century colonial Australia)
Morgan's Run by Colleen McCullough (end of the 18th century)
Destiny in Sydney by D. Manning Richards (1787–1902 Scots-Irish, Aboriginal, and Chinese family saga story)
The Lambing Flat by Nerida Newton (mid-19th century Australian gold rushes)
The Secret River by Kate Grenville (19th century colonial Australia)
Jack Maggs by Peter Carey (19th century colonial Australia)
The Thorn Birds by Colleen McCullough (end of the 19th century)
True History of the Kelly Gang by Peter Carey (Kelly Gang, 1878–1880)
An Angel in Australia by Thomas Keneally (World War II)
Oscar and Lucinda by Peter Carey
Jasper Jones by Craig Silvey (1960s Western Australia)
Bila Yarrudhanggalangdhuray by Anita Heiss (19th century indigenous New South Wales)
Subsections (0):

In [197]:
x

[Section: Afghanistan (1):
 The Afghan Campaign by Steven Pressfield (Alexander the Great's invasion of the Afghan kingdoms in 330 BC)
 Flashman by George MacDonald Fraser (1840s, First Anglo-Afghan War)
 Caravans by James Michener (post-World War II)
 Subsections (0):,
 Section: Argentina (1):
 On Heroes and Tombs by Ernesto Sabato (19th century, during the Civil War)
 Facundo: Civilization and Barbarism by Domingo F. Sarmiento (19th century)
 Santa Evita by Tomás Eloy Martínez (20th century, Eva Perón)
 El combate perpetuo by Marcos Aguinis (19th century, Admiral William Brown)
 La fragata Proserpina by Luis Delgado Bañón (19th century)
 El queche Hiena by Luis Delgado Bañón (19th century)
 Subsections (0):,
 Section: Australia (1):
 The Playmaker and Bring Larks and Heroes by Thomas Keneally (18th century colonial Australia)
 Morgan's Run by Colleen McCullough (end of the 18th century)
 Destiny in Sydney by D. Manning Richards (1787–1902 Scots-Irish, Aboriginal, and Chinese family s

### Check Results

Check results of extraction to see if db is of good quality


find early way to display results?

In [7]:
db_dir = './../data/'

In [19]:
booklist  = os.listdir(db_dir)
booklist.sort()
booklist[1]

'10 Minutes 38 Seconds in This Strange World.json'

In [23]:
book = booklist[1]

In [41]:
booklist

['.ipynb_checkpoints',
 '10 Minutes 38 Seconds in This Strange World.json',
 'A Bend in the River.json',
 'A Brief History of Seven Killings.json',
 'A Case of Exploding Mangoes.json',
 'A Clubbable Woman.json',
 'A Disaffection.json',
 'A Fairly Honourable Defeat.json',
 'A Fine Balance.json',
 'A Fraction of the Whole.json',
 'A Guilty Thing Surprised.json',
 'A Little Life.json',
 'A Long Long Way.json',
 'A Passage North.json',
 'A Place in England.json',
 'A Short History of Tractors in Ukrainian.json',
 'A Spell of Good Things.json',
 'A Spool of Blue Thread.json',
 'A Tale for the Time Being.json',
 'According to Mark.json',
 'According to Queeney.json',
 'Alias Grace.json',
 'All That Man Is.json',
 'Amongst Women.json',
 'An Artist of the Floating World.json',
 'An Ice-Cream War.json',
 'An Instant in the Wind.json',
 'An Orchestra of Minorities.json',
 "Animal's People.json",
 'Anthills of the Savannah.json',
 'Any Human Heart.json',
 'Arthur & George.json',
 'Astonishing Spl

In [45]:
[book for book in booklist if book.split('.')[-1] == 'json']

['10 Minutes 38 Seconds in This Strange World.json',
 'A Bend in the River.json',
 'A Brief History of Seven Killings.json',
 'A Case of Exploding Mangoes.json',
 'A Clubbable Woman.json',
 'A Disaffection.json',
 'A Fairly Honourable Defeat.json',
 'A Fine Balance.json',
 'A Fraction of the Whole.json',
 'A Guilty Thing Surprised.json',
 'A Little Life.json',
 'A Long Long Way.json',
 'A Passage North.json',
 'A Place in England.json',
 'A Short History of Tractors in Ukrainian.json',
 'A Spell of Good Things.json',
 'A Spool of Blue Thread.json',
 'A Tale for the Time Being.json',
 'According to Mark.json',
 'According to Queeney.json',
 'Alias Grace.json',
 'All That Man Is.json',
 'Amongst Women.json',
 'An Artist of the Floating World.json',
 'An Ice-Cream War.json',
 'An Instant in the Wind.json',
 'An Orchestra of Minorities.json',
 "Animal's People.json",
 'Anthills of the Savannah.json',
 'Any Human Heart.json',
 'Arthur & George.json',
 'Astonishing Splashes of Colour.json',


In [24]:
book

'10 Minutes 38 Seconds in This Strange World.json'

In [26]:
book_dir = db_dir+book

In [27]:
book_dir

'./../data/10 Minutes 38 Seconds in This Strange World.json'

In [34]:
with open(book_dir, 'r') as f:
    data = json.load(f)
    print(data)

{'title': '10 Minutes and 38 Seconds in This Strange World', 'author': 'Elif Shafak', 'city': 'Istanbul', 'country': 'Turkey', 'page': '10 Minutes 38 Seconds in This Strange World'}


In [37]:
data['title']

'10 Minutes and 38 Seconds in This Strange World'

In [164]:
class CheckOutput:

    def __init__(self, db_dir = './../data/', book_schema = {'title':'', 'author':'', 'city':'', 'country':'', 'page':''}):
        self.db_dir = db_dir
        self.book_schema = book_schema

    def run(self):
        self.clean_db()
        
    def clean_db(self):
        booklist = os.listdir(self.db_dir)
        booklist.sort()


        self.book_dicts = {}
        self.failed_check = []
        self.failed_schema = {}

        # only allow json
        self.booklist = [book for book in booklist if book.split('.')[-1] == 'json'] # ensures book ends in .json

        for book in self.booklist:
            self.check_book(book)

        
        self.clean_db_df = pd.DataFrame(list(self.book_dicts.values()))
        
    def check_book(self, book):
        
        book_file = self.db_dir + book
        book_data = self.get_book_data(book_file)
        if isinstance(book_data, dict):
            self.book_dicts[book] = book_data
            """
            if book_data.keys() == self.book_schema.keys():
                self.book_dicts[book] = book_data
            else:
                self.failed_check.append(book)
                self.failed_schema[book] = book_data
            """
        else:
            self.failed_check.append(book)
    
        
    @staticmethod 
    def get_book_data(book_file):

        with open(book_file, 'r') as f:
            data = json.load(f)

        return data

In [165]:
co = CheckOutput()
co.clean_db()

In [169]:
co.clean_db_df[co.clean_db_df.city=='Kyoto']

Unnamed: 0,title,author,city,country,page


In [181]:
co.clean_db_df.loc[co.clean_db_df.city.isna()]

Unnamed: 0,title,author,city,country,page
4,A Clubbable Woman,Reginald Hill,,,A Clubbable Woman
8,A Fraction of the Whole,Steve Toltz,,,A Fraction of the Whole
29,Any Human Heart,William Boyd,,,Any Human Heart
31,Astonishing Splashes of Colour,Clare Morrall,,,Astonishing Splashes of Colour
50,Do Not Say We Have Nothing,,,,Do Not Say We Have Nothing
56,"England, England",Julian Barnes,,,"England, England"
61,Fire From Heaven,Mary Renault,,,Fire From Heaven
63,Flaubert's Parrot,Julian Barnes,,,Flaubert's Parrot
69,"Girl, Woman, Other",Bernadine Evaristo,,,"Girl, Woman, Other"
78,Hotel World,Ali Smith,,,Hotel World


In [158]:
co.book_dicts['10 Minutes 38 Seconds in This Strange World.json'].keys() == co.book_schema.keys()

True

In [163]:
pd.DataFrame(list(co.book_dicts.values()))

Unnamed: 0,title,author,city,country,page
0,10 Minutes and 38 Seconds in This Strange World,Elif Shafak,Istanbul,Turkey,10 Minutes 38 Seconds in This Strange World
1,A Bend in the River,V.S. Naipaul,Kisangani,Zaire,A Bend in the River
2,A Brief History of Seven Killings,Marlon James,,Jamaica,A Brief History of Seven Killings
3,A Thousand Splendid Suns,Khaled Hosseini,Lahore,Pakistan,A Case of Exploding Mangoes
4,A Clubbable Woman,Reginald Hill,,,A Clubbable Woman
...,...,...,...,...,...
278,What Was Lost,Catherine O'Flynn,Birmingham,England,What Was Lost
279,What's Bred in the Bone,Robertson Davies,,Canada,What's Bred in the Bone
280,When We Were Orphans,Kazuo Ishiguro,Shanghai,China,When We Were Orphans
281,Wolf Hall,Hilary Mantel,,England,Wolf Hall


In [155]:
co.failed_schema

{'A Clubbable Woman.json': {'title': 'A Clubbable Woman',
  'author': 'Reginald Hill',
  'page': 'A Clubbable Woman'},
 'A Fraction of the Whole.json': {'title': 'A Fraction of the Whole',
  'author': 'Steve Toltz',
  'page': 'A Fraction of the Whole'},
 'Any Human Heart.json': {'title': 'Any Human Heart',
  'author': 'William Boyd',
  'page': 'Any Human Heart'},
 'Astonishing Splashes of Colour.json': {'title': 'Astonishing Splashes of Colour',
  'author': 'Clare Morrall',
  'page': 'Astonishing Splashes of Colour'},
 'Do Not Say We Have Nothing.json': {'title': 'Do Not Say We Have Nothing',
  'author': '',
  'page': 'Do Not Say We Have Nothing'},
 'England, England.json': {'title': 'England, England',
  'author': 'Julian Barnes',
  'page': 'England, England'},
 'Fire From Heaven.json': {'title': 'Fire From Heaven',
  'author': 'Mary Renault',
  'page': 'Fire From Heaven'},
 "Flaubert's Parrot.json": {'title': "Flaubert's Parrot",
  'author': 'Julian Barnes',
  'page': "Flaubert's Par

In [144]:
co.failed_check

[]

In [86]:
co.book_dicts['A Brief History of Seven Killings.json'].keys() ==['title', 'author', 'city', 'country', 'page']

False

In [74]:
empty_list = []
for key, value in co.book_dicts.items():
    if value=={}:
        empty_list.append(key)
        

In [75]:
empty_list

[]

In [47]:
co = CheckOutput
x = co.get_dict(book_dir)

In [49]:
x.is_dict()

AttributeError: 'dict' object has no attribute 'is_dict'

In [52]:
x

{'title': '10 Minutes and 38 Seconds in This Strange World',
 'author': 'Elif Shafak',
 'city': 'Istanbul',
 'country': 'Turkey',
 'page': '10 Minutes 38 Seconds in This Strange World'}

In [51]:
if isinstance(x, dict):
    print('hai!')

hai!


  warn_deprecated(


{'title': 'Musashi',
 'author': 'Eiji Yoshikawa',
 'city': 'NA',
 'country': 'Japan'}

# using ollama 

### PM Time:

Currently we can pull a wikipedia page and have a seemingly reliable llm prompt that rips name, author, city, and country info from the book. Now we want to generate an eligible list of wikipedia page titles, starting with all booker prize books, and then scriptify so we can scrape their info correctly. 

### infra for whole notebook


In [5]:
llm = Ollama(model="llama3")
llm("The first man on the summit of Mount Everest, the highest peak on Earth, was ...")

  warn_deprecated(


"Tenzing Norgay, a Nepali Sherpa mountaineer, and Sir Edmund Hillary, a New Zealand mountaineer and explorer. They reached the summit of Mount Everest (then known as Chomolungma) on May 29, 1953. This achievement marked a major milestone in the history of mountain climbing and earned them international recognition.\n\nTenzing Norgay and Sir Edmund Hillary's successful ascent of Mount Everest was part of a British expedition led by John Hunt. The team spent several weeks acclimatizing to the high altitude and making their way up the mountain via the South Col route. After a long and challenging climb, Tenzing and Hillary reached the summit at around 11:30 am local time.\n\nTheir achievement marked the first time humans had reached the top of Mount Everest, which stands at an incredible 8,848 meters (29,029 feet) above sea level. Today, the summit is considered one of the most iconic and coveted destinations in the world for mountaineers and adventure seekers."

In [6]:
wiki_wiki = wikipediaapi.Wikipedia('MyProjectName (merlin@example.com)', 'en')
page_py = wiki_wiki.page('Python_(programming_language)')

## Access the List of Valid Books

booker prize books

struggling to get table data with the wikipedia api so let's try more old school

https://medium.com/analytics-vidhya/web-scraping-a-wikipedia-table-into-a-dataframe-c52617e1f451

had success with beautiful soup!

OK totally devious plan, most excellent--we take the scraped list of book titles from the beautiful soup made dataframe, and then we cross ref against the list of linked pages we get from the wikipedia api's .links call. from that intersection we'll get the set of all book pages that are booker winners AND have active wiki pages. From there we can run our llm. 

In [23]:
class BookList:

    def __init__(self):
        self.wiki = wikipediaapi.Wikipedia('MyProjectName (merlin@example.com)', 'en')
        self.booker_page = 'List_of_winners_and_nominated_authors_of_the_Booker_Prize'
        self.booker_url = 'https://en.wikipedia.org/wiki/' + self.booker_page

    def run(self):

        self.get_booker_table()
        self.xref_booker_pages()
        
    def get_booker_table(self):
        wikiurl=self.booker_url
        table_class="wikitable sortable jquery-tablesorter"
        response=requests.get(wikiurl)
        print(response.status_code)
        
        self.soup = BeautifulSoup(response.text, 'html.parser')
        bookertable=self.soup.find('table',{'class':"wikitable"})
        df=pd.read_html(str(bookertable))
        self.booker_df=pd.DataFrame(df[0])

    def xref_booker_pages(self):
        self.booker_titles_list = self.booker_df.Title.dropna().tolist()
        self.booker = self.wiki.page(self.booker_page)
        self.booker_wiki_links = list(self.booker.links.keys())
        self.booker_eligible_pages = list(set(self.booker_wiki_links) & set(self.booker_titles_list))
            

In [24]:
bl = BookList()
bl.run()

200


  df=pd.read_html(str(bookertable))


In [28]:
bl.booker_df.sort_values('Author')

Unnamed: 0,Year,Award,Author,Title,Publisher,Judges
387,2011[17],Shortlist,A D Miller,Snowdrops,Atlantic Books,Dame Stella Rimington (chair) Matthew d'Ancona...
7,1970,Shortlist,A. L. Barker,John Brown's Body,Hogarth,Antonia Fraser Ross Higgins Richard Hoggart Da...
342,2007[13],Longlist,A. N. Wilson,Winnie & Wolf,Hutchinson,Howard Davies (chair) Wendy Cope Giles Foden R...
357,2009[15],Shortlist,A. S. Byatt,The Children's Book,Chatto and Windus,James Naughtie (chair) Lucasta Miller John Mul...
140,1990,Winner,A. S. Byatt,Possession: A Romance,Chatto & Windus,Sir Denis Forman (chair) Susannah Clapp A. Wal...
...,...,...,...,...,...,...
470,2017,Longlist,Zadie Smith,Swing Time,Hamish Hamilton,Baroness Lola Young (chair) Lila Azam Zanganeh...
247,2002[8],Longlist,Zadie Smith,The Autograph Man,Hamish Hamilton,Lisa Jardine (chair) David Baddiel Russell Cel...
253,2003[9],Shortlist,Zoë Heller,Notes on a Scandal,Viking,John Carey (Chair) A. C. Grayling Francine Sto...
220,2001[7],Longlist,Zvi Jagendorf,Wolfy and the Strudelbakers,Dewi Lewis,Kenneth Baker (chair) Philip Hensher Michèle R...


In [29]:
bl.booker_df

Unnamed: 0,Year,Award,Author,Title,Publisher,Judges
0,1969,Winner,P. H. Newby,Something to Answer For,Faber & Faber,David Farrer Frank Kermode Stephen Spender Dam...
1,1969,Shortlist,Barry England,Figures in a Landscape,Jonathan Cape,David Farrer Frank Kermode Stephen Spender Dam...
2,1969,Shortlist,Nicholas Mosley,Impossible Object,Hodder & Stoughton,David Farrer Frank Kermode Stephen Spender Dam...
3,1969,Shortlist,Iris Murdoch,The Nice and the Good,Chatto & Windus,David Farrer Frank Kermode Stephen Spender Dam...
4,1969,Shortlist,Muriel Spark,The Public Image,Macmillan,David Farrer Frank Kermode Stephen Spender Dam...
...,...,...,...,...,...,...
546,2023,Longlist[38],Siân Hughes,Pearl,Indigo Press,Esi Edugyan (chair) Adjoa Andoh Mary Jean Chan...
547,2023,Longlist[38],Viktoria Lloyd-Barlow,All the Little Bird-Hearts,Tinder Press,Esi Edugyan (chair) Adjoa Andoh Mary Jean Chan...
548,2023,Longlist[38],Martin MacInnes,In Ascension,Atlantic Books,Esi Edugyan (chair) Adjoa Andoh Mary Jean Chan...
549,2023,Longlist[38],Tan Twan Eng,The House of Doors,Canongate,Esi Edugyan (chair) Adjoa Andoh Mary Jean Chan...


In [25]:
bl.booker_eligible_pages # also can get author and prize type (winner, shortlist, etc) from the table

['Satin Island',
 'Translated Accounts',
 'Arthur & George',
 'Last Letters from Hav',
 'Rumours of Rain',
 'According to Queeney',
 'Elizabeth Costello',
 'The Nice and the Good',
 'Sacred Hunger',
 'The Mulberry Empire',
 'Figures in a Landscape',
 'Girl, Woman, Other',
 'A Fine Balance',
 'An Insular Possession',
 'Rhine Journey',
 'The Folding Star',
 'Slow Man',
 'Such a Fun Age',
 'The 27th Kingdom',
 'Night Boat to Tangier',
 'Nice Work',
 'The Hand Reared Boy',
 'An Artist of the Floating World',
 'Dorian, an Imitation',
 'The Long Take',
 'No One Is Talking About This',
 "Flaubert's Parrot",
 'Beyond Black',
 'The Vivisector',
 'Briefing for a Descent into Hell',
 'Sour Sweet',
 'The Bee Sting',
 'Grace Notes',
 'Possession: A Romance',
 'The Testament of Mary',
 'Bring Up the Bodies',
 'The Amber Spyglass',
 'The Blackwater Lightship',
 'The Bird of Night',
 'The Stars in the Bright Sky',
 'The Big Chapel',
 'A Clubbable Woman',
 'Love and Summer',
 'The Stone Carvers',
 'A B

In [16]:
bl.booker_eligible_pages.sort()

In [17]:
bl.booker_eligible_pages

['10 Minutes 38 Seconds in This Strange World',
 'A Bend in the River',
 'A Brief History of Seven Killings',
 'A Case of Exploding Mangoes',
 'A Clubbable Woman',
 'A Disaffection',
 'A Fairly Honourable Defeat',
 'A Fine Balance',
 'A Five-Year Sentence',
 'A Fraction of the Whole',
 'A Green Equinox',
 'A Guilty Thing Surprised',
 'A Little Life',
 'A Little Of What You Fancy?',
 'A Long Long Way',
 'A Passage North',
 'A Place in England',
 'A Short History of Tractors in Ukrainian',
 'A Spell of Good Things',
 'A Spool of Blue Thread',
 'A Tale for the Time Being',
 'According to Mark',
 'According to Queeney',
 'Alias Grace',
 'All That Man Is',
 'Amongst Women',
 'An Artist of the Floating World',
 'An Ice-Cream War',
 'An Instant in the Wind',
 'An Insular Possession',
 'An Orchestra of Minorities',
 "Animal's People",
 'Anthills of the Savannah',
 'Any Human Heart',
 'Arthur & George',
 'Astonishing Splashes of Colour',
 'Beside the Ocean of Time',
 'Bewilderment',
 'Beyond Bl

In [None]:
bl.get_booker_table()

In [97]:
# get the response in the form of html
wikiurl="https://en.wikipedia.org/wiki/List_of_winners_and_nominated_authors_of_the_Booker_Prize"
table_class="wikitable sortable jquery-tablesorter"
response=requests.get(wikiurl)
print(response.status_code)

200


In [101]:
soup = BeautifulSoup(response.text, 'html.parser')
bookertable=soup.find('table',{'class':"wikitable"})

In [104]:
df=pd.read_html(str(bookertable))
# convert list to dataframe
df=pd.DataFrame(df[0])


  df=pd.read_html(str(bookertable))


In [110]:
df.Title.dropna().tolist()

['Something to Answer For',
 'Figures in a Landscape',
 'Impossible Object',
 'The Nice and the Good',
 'The Public Image',
 'From Scenes Like These',
 'The Elected Member',
 "John Brown's Body",
 'Eva Trout',
 "Bruno's Dream",
 "Mrs Eckdorf in O'Neill's Hotel",
 'The Conjunction',
 'Troubles',
 'The Birds on the Trees',
 'The Bay of Noon',
 'Fire From Heaven',
 "The Driver's Seat",
 'The Vivisector',
 'The Hand Reared Boy',
 'A Little Of What You Fancy?',
 'A Place in England',
 'Down All the Days',
 'Bomber',
 'The Circle',
 'A Clubbable Woman',
 "I'm the King of the Castle",
 'A Domestic Animal',
 'The Fire Dwellers',
 'Out of the Shelter',
 'A Fairly Honourable Defeat',
 'Fireflies',
 'Master and Commander',
 'Head To Toe',
 'A Guilty Thing Surprised',
 'In a Free State',
 'The Big Chapel',
 'Briefing for a Descent into Hell',
 "St. Urbain's Horseman",
 'Goshawk Squadron',
 'Mrs. Palfrey at the Claremont',
 'G.',
 'The Bird of Night',
 'The Chant of Jimmie Blacksmith',
 'Pasmore',


In [92]:
booker = wiki_wiki.page('List_of_winners_and_nominated_authors_of_the_Booker_Prize')

In [114]:
list(booker.links.keys())

['10 Minutes 38 Seconds in This Strange World',
 '2006 Man Booker Prize',
 '2007 Man Booker Prize',
 '2008 Man Booker Prize',
 '2009 Man Booker Prize',
 '2010 Man Booker Prize',
 '2011 Man Booker Prize',
 '2012 Man Booker Prize',
 '2013 Man Booker Prize',
 '2014 Man Booker Prize',
 '2015 Man Booker Prize',
 '2016 Man Booker Prize',
 '2017 Man Booker Prize',
 '2018 Man Booker Prize',
 '2019 Booker Prize',
 '2020 Booker Prize',
 '2021 Booker Prize',
 '2022 Booker Prize',
 '2023 Booker Prize',
 '4 3 2 1 (novel)',
 '4th Estate',
 '4th Estate (imprint)',
 'A. C. Grayling',
 'A. L. Barker',
 'A. L. Kennedy',
 'A. N. Wilson',
 'A. S. Byatt',
 'A. Walton Litz',
 'A Bend in the River',
 'A Brief History of Seven Killings',
 'A Case of Exploding Mangoes',
 'A Clubbable Woman',
 'A Disaffection',
 'A Distant Shore (novel)',
 'A Fairly Honourable Defeat',
 'A Fine Balance',
 'A Five-Year Sentence',
 'A Fraction of the Whole',
 'A Green Equinox',
 'A Guilty Thing Surprised',
 'A Little Life',
 'A L

In [5]:
p_wiki = wiki_wiki.page("The House of Doors")

In [22]:
p_wiki.text

'The House of Doors is a 2023 historical novel by Tan Twan Eng, published by Bloomsbury Publishing. The novel, set in the 1920s British colony of the Federated Malay States, tells the stories of the local residents and visitors, including a fictionalized version of William Somerset Maugham.\nThe novel was longlisted for the 2023 Booker Prize and listed among notable fiction works in 2023 by The Washington Post and The Financial Times.\n\nNarrative\nThe book tells of a fictionalized account of William Somerset Maugham\'s travels through the Federated Malay States in the 1920s. While in Penang, Maugham and Gerald Haxton, who is ostensibly his travelling secretary but is actually his lover, stay with Maugham\'s friend Robert Hamlyn. Robert and his wife Lesley are British expatriots living in the Federated Malay States. While staying with the Hamlyns, Maughan develops a friendship with Lesley.\nThis eventually leads Lesley to confide in Maugham, revealing many personal secrets which would 

In [7]:
llm("Tell me which city this novel takes place in, if it takes place in a city, based on its wikipedia summary which is as follows:" + p_wiki.text)

'According to the Wikipedia summary, the novel "The House of Doors" by Tan Twan Eng takes place in Penang, which is part of the Federated Malay States (now Malaysia). Therefore, the city where this novel takes place is Penang.'

In [39]:
prompt = "I am building a database of novels set in different cities and countries on Earth. Based on the wikipedia article for this novel \
tell me the name of the novel, the name of the author, the city it is most focused on or set in, and the country it is most focused on or \
set in. if multiple cities are mentioned in a novel but the novel is mostly set in one place, then return that place (city and country). \
if a novel is set in many places and not focused on one place, then return NA as your response for city and country. \
if a novel is not set in a real place on earth, for example Duke being set in Arrakis, then return NA as your response for city and country.\
return the name, author, city, and country information as a dictionary with name, author, city, and country as four dictionary keys.\
\Finally, it is possible that the wikipedia article provided is not the wikipedia article for a book. \
Add a fifth dictionary key called is_book and if the text provided seems like the wikipedia article for a book, return True as the value. \
If the wikipedia article is not that of a book, return False as the value for is_book. In this case, return NA for all other values.\
Do not return any text other than the dictionary in your response. Do not make a human readable sentence. \
Here is the text of the wikipedia article for which you will return a dictionary: "


In [None]:
p_wiki = wiki_wiki.page("The House of Doors")

In [64]:
llm(prompt + p_wiki.text)

"{'name': 'The House of Doors', 'author': 'Tan Twan Eng', 'city': 'Penang', 'country': 'Malaysia', 'is_book': True}"

In [60]:
test_wiki = wiki_wiki.page("In_Ascension")


In [65]:
prompt = "I am building a database of novels set in different cities and countries on Earth. Based on the wikipedia article for this novel \
tell me the name of the novel, the name of the author, the city it is most focused on or set in, and the country it is most focused on or \
set in. if multiple cities are mentioned in a novel but the novel is mostly set in one place, then return that place (city and country). \
if a novel is set in many places and not focused on one place, then return NA as your response for city and country. \
if a novel is not set in a real place on earth, for example Duke being set in Arrakis, then return NA as your response for city and country.\
return the name, author, city, and country information as a dictionary with name, author, city, and country as four dictionary keys.\
\Finally, it is possible that the wikipedia article provided is not the wikipedia article for a book. \
Add a fifth dictionary key called is_book and if the text provided seems like the wikipedia article for a book, return True as the value. \
If the wikipedia article is not that of a book, return False as the value for is_book. In this case, return NA for all other values.\
Do not return any text other than the dictionary in your response. Do not make a human readable sentence. \
Here is the text of the wikipedia article for which you will return a dictionary: "

reminder_text = " Remember, your task is to return a python dictionary with keys name, author, city, country, and is_book. \
Do not return any text other than the dictionary. An example of legitimate output is: \
{'name': 'The House of Doors', 'author': 'Tan Twan Eng', 'city': 'Penang', 'country': 'Malaysia', 'is_Book': True} \
DO NOT RETURN ANY TEXT OTHER THAN THE DICTIONARY"

llm(prompt + test_wiki.text)

#fails on our test case Dune. doesn't on House of Doors though. 

"{'name': 'In Ascension', 'author': 'Martin MacInnes', 'city': 'Rotterdam', 'country': 'Netherlands', 'is_Book': True}"

In [83]:
prompt = "You are a bot that reads wikipedia articles about books and returns information about them. Your output must be in valid JSON. \
Do not output anything other than the JSON.\
First, find the book's title and add it to the JSON with the key 'title'.\
Next, find the book's author and add it to the JSON with the key 'author'.\
Next, find the city that the book is set in and add it to the JSON with the key 'city'. If the book is not set in a real city on earth or\
you cannot determine what city it is set in, add 'NA' to the JSON with the key 'city'.\
Next, find the country that the book is set in and add it to the JSON with the key 'country'. \
If the book is not set in a real country on earth or you cannot determine what country it is set in, add 'NA' to the JSON\
with the key 'country'. Finally, surround your JSON output with <result></result> tags. \
Here is the text from the wikipedia article for this book: "



end_prompt = "Based on the book's wikipedia article provided above, create a JSON describing this book. Your output must be in valid JSON. \
Do not output anything other than the JSON.\
First, find the book's title and add it to the JSON with the key 'title'.\
Next, find the book's author and add it to the JSON with the key 'author'.\
Next, find the city that the book is set in and add it to the JSON with the key 'city'. If the book is not set in a real city on earth or\
you cannot determine what city it is set in, add 'NA' to the JSON with the key 'city'.\
Next, find the country that the book is set in and add it to the JSON with the key 'country'. \
If the book is not set in a real country on earth or you cannot determine what country it is set in, add 'NA' to the JSON\
with the key 'country'. Finally, surround your JSON output with <result></result> tags."

reminder_text = "Return a valid JSON with keys 'title','author', 'city', and 'country' describing this book according to the instructions given previously. "

llm_response = llm(prompt + wiki_wiki.page("Shuggie_Bain").text + end_prompt)

In [85]:
import json

json_string = llm_response.split('<result>')[1].split('</result>')[0]
parsed_json = json.loads(json_string.strip())

parsed_json

{'title': 'Shuggie Bain',
 'author': 'Douglas Stuart',
 'city': 'Glasgow',
 'country': 'Scotland'}

In [86]:
prompt = "You are a bot that reads wikipedia articles about books and returns information about them. Your output must be in valid JSON. \
Do not output anything other than the JSON.\
First, find the book's title and add it to the JSON with the key 'title'.\
Next, find the book's author and add it to the JSON with the key 'author'.\
Next, find the city that the book is set in and add it to the JSON with the key 'city'. If the book is not set in a real city on earth or\
you cannot determine what city it is set in, add 'NA' to the JSON with the key 'city'.\
Next, find the country that the book is set in and add it to the JSON with the key 'country'. \
If the book is not set in a real country on earth or you cannot determine what country it is set in, add 'NA' to the JSON\
with the key 'country'. Finally, surround your JSON output with <result></result> tags. \
Here is the text from the wikipedia article for this book: "



end_prompt = "Based on the book's wikipedia article provided above, create a JSON describing this book. Your output must be in valid JSON. \
Do not output anything other than the JSON.\
First, find the book's title and add it to the JSON with the key 'title'.\
Next, find the book's author and add it to the JSON with the key 'author'.\
Next, find the city that the book is set in and add it to the JSON with the key 'city'. If the book is not set in a real city on earth or\
you cannot determine what city it is set in, add 'NA' to the JSON with the key 'city'.\
Next, find the country that the book is set in and add it to the JSON with the key 'country'. \
If the book is not set in a real country on earth or you cannot determine what country it is set in, add 'NA' to the JSON\
with the key 'country'. Finally, surround your JSON output with <result></result> tags."

reminder_text = "Return a valid JSON with keys 'title','author', 'city', and 'country' describing this book according to the instructions given previously. "

llm_response = llm(prompt + wiki_wiki.page("Dune_(novel)").text + end_prompt)

In [87]:
json_string = llm_response.split('<result>')[1].split('</result>')[0]
parsed_json = json.loads(json_string.strip())

parsed_json

{'title': 'Dune', 'author': 'Frank Herbert'}

In [88]:
llm_response

'<result>\n{\n"title": "Dune",\n"author": "Frank Herbert"\n}\n</result>'

In [None]:
class SaveWork:

    def __init__(self):
        self.llm = Ollama(model="llama3")
        self.wiki = wikipediaapi.Wikipedia('MyProjectName (merlin@example.com)', 'en')

        return 

    def pull_book(self):

        prompt = "You are a bot that reads wikipedia articles about books and returns information about them. Your output must be in valid JSON. \
        Do not output anything other than the JSON.\
        First, find the book's title and add it to the JSON with the key 'title'.\
        Next, find the book's author and add it to the JSON with the key 'author'.\
        Next, find the city that the book is set in and add it to the JSON with the key 'city'. If the book is not set in a real city on earth or\
        you cannot determine what city it is set in, add 'NA' to the JSON with the key 'city'.\
        Next, find the country that the book is set in and add it to the JSON with the key 'country'. \
        If the book is not set in a real country on earth or you cannot determine what country it is set in, add 'NA' to the JSON\
        with the key 'country'. Finally, surround your JSON output with <result></result> tags. \
        Here is the text from the wikipedia article for this book: "
        
        
        
        end_prompt = "Based on the book's wikipedia article provided above, create a JSON describing this book. Your output must be in valid JSON. \
        Do not output anything other than the JSON.\
        First, find the book's title and add it to the JSON with the key 'title'.\
        Next, find the book's author and add it to the JSON with the key 'author'.\
        Next, find the city that the book is set in and add it to the JSON with the key 'city'. If the book is not set in a real city on earth or\
        you cannot determine what city it is set in, add 'NA' to the JSON with the key 'city'.\
        Next, find the country that the book is set in and add it to the JSON with the key 'country'. \
        If the book is not set in a real country on earth or you cannot determine what country it is set in, add 'NA' to the JSON\
        with the key 'country'. Finally, surround your JSON output with <result></result> tags."
                
        self.llm_response = self.llm(prompt + self.wiki.page("Dune_(novel)").text + end_prompt)

    def parse_response(self):

        json_string = self.llm_response.split('<result>')[1].split('</result>')[0]
        self.parsed_json = json.loads(json_string.strip())
        


In [82]:
prompt = "You are a bot that reads wikipedia articles about books and returns information about them. Your output must be a python dictionary. \
Do not output anything other than the python dictionary.\
First, find the book's title and add it to the dictionary with the key 'title'.\
Next, find the book's author and add it to the dictionary with the key 'author'.\
Next, find the city that the book is set in and add it to the dictionary with the key 'city'. If the book is not set in a real city on earth or\
you cannot determine what city it is set in, add 'NA' to the dictionary with the key 'city'.\
Next, find the country that the book is set in and add it to the dictionary with the key 'country'. \
If the book is not set in a real country on earth or you cannot determine what country it is set in, add 'NA' to the dictionary\
with the key 'country'. Finally, surround your dictinary output with <result></result> tags. \
Here is the text from the wikipedia article for this book: "



end_prompt = "Based on the book's wikipedia article provided above, create a python dictionary describing this book. Your output must be a python dictionary. \
Do not output anything other than the python dictionary.\
First, find the book's title and add it to the dictionary with the key 'title'.\
Next, find the book's author and add it to the dictionary with the key 'author'.\
Next, find the city that the book is set in and add it to the dictionary with the key 'city'. If the book is not set in a real city on earth or\
you cannot determine what city it is set in, add 'NA' to the dictionary with the key 'city'.\
Next, find the country that the book is set in and add it to the dictionary with the key 'country'. \
If the book is not set in a real country on earth or you cannot determine what country it is set in, add 'NA' to the dictionary\
with the key 'country'. Finally, surround your dictinary output with <result></result> tags. \
Here is the text from the wikipedia article for this book: "

llm(prompt + wiki_wiki.page("Shuggie_Bain").text + end_prompt)

"<result>\n{\n'title': 'Shuggie Bain',\n'author': 'Douglas Stuart',\n'city': 'Glasgow',\n'country': 'Scotland'\n}\n</result>"

In [89]:
bookers = wiki_wiki.page('List_of_winners_and_nominated_authors_of_the_Booker_Prize')

In [91]:
bookers.links

{'10 Minutes 38 Seconds in This Strange World': 10 Minutes 38 Seconds in This Strange World (id: ??, ns: 0),
 '2006 Man Booker Prize': 2006 Man Booker Prize (id: ??, ns: 0),
 '2007 Man Booker Prize': 2007 Man Booker Prize (id: ??, ns: 0),
 '2008 Man Booker Prize': 2008 Man Booker Prize (id: ??, ns: 0),
 '2009 Man Booker Prize': 2009 Man Booker Prize (id: ??, ns: 0),
 '2010 Man Booker Prize': 2010 Man Booker Prize (id: ??, ns: 0),
 '2011 Man Booker Prize': 2011 Man Booker Prize (id: ??, ns: 0),
 '2012 Man Booker Prize': 2012 Man Booker Prize (id: ??, ns: 0),
 '2013 Man Booker Prize': 2013 Man Booker Prize (id: ??, ns: 0),
 '2014 Man Booker Prize': 2014 Man Booker Prize (id: ??, ns: 0),
 '2015 Man Booker Prize': 2015 Man Booker Prize (id: ??, ns: 0),
 '2016 Man Booker Prize': 2016 Man Booker Prize (id: ??, ns: 0),
 '2017 Man Booker Prize': 2017 Man Booker Prize (id: ??, ns: 0),
 '2018 Man Booker Prize': 2018 Man Booker Prize (id: ??, ns: 0),
 '2019 Booker Prize': 2019 Booker Prize (id: ?

In [77]:
prompt = "I am building a database of novels set in different cities and countries on Earth. Based on the wikipedia article for this novel \
tell me the name of the novel, the name of the author, the city it is most focused on or set in, and the country it is most focused on or \
set in. if multiple cities are mentioned in a novel but the novel is mostly set in one place, then return that place (city and country). \
if a novel is set in many places and not focused on one place, then return NA as your response for city and country. \
if a novel is not set in a real place on earth, for example Duke being set in Arrakis, then return NA as your response for city and country.\
return the name, author, city, and country information as a dictionary with name, author, city, and country as four dictionary keys.\
\Finally, it is possible that the wikipedia article provided is not the wikipedia article for a book. \
Add a fifth dictionary key called is_book and if the text provided seems like the wikipedia article for a book, return True as the value. \
If the wikipedia article is not that of a book, return False as the value for is_book. In this case, return NA for all other values.\
Do not return any text other than the dictionary in your response. Do not make a human readable sentence. \
Here is the text of the wikipedia article for which you will return a dictionary: "

reminder_text = " Remember, your task is to return a python dictionary with keys name, author, city, country, and is_book. \
Do not return any text other than the dictionary. An example of legitimate output is: \
{'name': 'The House of Doors', 'author': 'Tan Twan Eng', 'city': 'Penang', 'country': 'Malaysia', 'is_Book': True} \
DO NOT RETURN ANY TEXT OTHER THAN THE DICTIONARY"

llm(prompt + wiki_wiki.page("Shuggie_Bain").text)

'Based on the information provided, here are some awards and nominations for "Shuggie Bain":\n\n* Longlisted for the 2021 Andrew Carnegie Medal for Excellence in Fiction\n* Shortlisted for the 2020 Center for Fiction First Novel Prize\n* Finalist for the 2020 Kirkus Prize\n* Finalist for the 2020 National Book Award for Fiction\n* Winner of the 2020 Booker Prize\n\nAdditionally, "Shuggie Bain" was included on several lists of best books of 2020, including:\n\n* The Telegraph\'s list of 50 best books of 2020\n* The Times\' list of best novels of 2020\n* Kirkus Reviews\' list of best books of 2020\n* Vogue and Elle\'s lists of best books of 2020\n\nIt was also named as one of the best books of the year by several other publications, including The New York Times, The Washington Post, Time, and the BBC.'

### pre-parse book status?

tried using

In [15]:
# ok the check book part fails when we include it in the summarization. maybe doing a two step filter is better
# step one check if it's a book 

check_book_prompt = "based on the text of the wikipedia article provided, tell me if the wikipedia article is a book or not. \
return your response as a python dictionary with key is_book, and value True or False. \
Do not return any text other than the dictionary in your response. Do not make a human readable sentence. \
Here is the text of the wikipedia article for which you will return a dictionary: "




In [31]:
llm(check_book_prompt + test_wiki.text)

# it's not working...let's parse the output like this
# https://www.gettingstarted.ai/how-to-langchain-output-parsers-convert-text-to-objects/

ConnectionError: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /w/api.php?action=query&prop=extracts&titles=Dune&explaintext=1&exsectionformat=wiki&format=json&redirects=1 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x112c644c0>: Failed to resolve 'en.wikipedia.org' ([Errno 8] nodename nor servname provided, or not known)"))

### just figuring out how to return as json




In [20]:
from typing import List

from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator


In [21]:
# Define your desired data structure.
class Joke(BaseModel):
    setup: str = Field(description="question to set up a joke")
    punchline: str = Field(description="answer to resolve the joke")

    # You can add custom validation logic easily with Pydantic.
    @validator("setup")
    def question_ends_with_question_mark(cls, field):
        if field[-1] != "?":
            raise ValueError("Badly formed question!")
        return field


# And a query intented to prompt a language model to populate the data structure.
joke_query = "Tell me a joke."

# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=Joke)

prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | parser

chain.invoke({"query": joke_query})

Joke(setup="Why don't scientists trust atoms?", punchline='Because they make up everything!')

In [58]:
class Book(BaseModel):
    name: str = Field(description="name of the novel")
    author: str = Field(description="name of the author")
    city: str = Field(description="the city it is most focused on or set in")
    country: str = Field(description="the country it is most focused on or set in")



# And a query intented to prompt a language model to populate the data structure.

writing_prompt = "I am building a database of novels set in different cities and countries on Earth. Based on the wikipedia article for this novel \
tell me the name of the novel, the name of the author, the city it is most focused on or set in, and the country it is most focused on or \
set in. \
Here is the text of the wikipedia article: "

book_query = writing_prompt + p_wiki.text

# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=Book)

prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | parser

chain.invoke({"query": book_query})

OutputParserException: Failed to parse Book from completion {"properties": {"name": {"title": "Name", "description": "name of the novel", "type": "string"}, "author": {"title": "Author", "description": "name of the author", "type": "string"}, "city": {"title": "City", "description": "the city it is most focused on or set in", "type": "string"}, "country": {"title": "Country", "description": "the country it is most focused on or set in", "type": "string"}}, "required": ["name", "author", "city", "country"]}. Got: 4 validation errors for Book
name
  field required (type=value_error.missing)
author
  field required (type=value_error.missing)
city
  field required (type=value_error.missing)
country
  field required (type=value_error.missing)

In [56]:
class Book(BaseModel):
    name: str = Field(description="name of the novel")
    author: str = Field(description="name of the author")
    city: str = Field(description="the city it is most focused on or set in")
    country: str = Field(description="the country it is most focused on or set in")



# And a query intented to prompt a language model to populate the data structure.

writing_prompt = "I am building a database of novels set in different cities and countries on Earth. Based on the wikipedia article for this novel \
tell me the name of the novel, the name of the author, the city it is most focused on or set in, and the country it is most focused on or \
set in. \
Here is the text of the wikipedia article: "

book_query = writing_prompt + p_wiki.text

# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=Book)

prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | parser

chain.invoke({"query": book_query})

OutputParserException: Failed to parse Book from completion {"properties": {"name": {"title": "Name", "description": "name of the novel", "type": "string"}, "author": {"title": "Author", "description": "name of the author", "type": "string"}, "city": {"title": "City", "description": "the city it is most focused on or set in", "type": "string"}, "country": {"title": "Country", "description": "the country it is most focused on or set in", "type": "string"}}, "required": ["name", "author", "city", "country"]}. Got: 4 validation errors for Book
name
  field required (type=value_error.missing)
author
  field required (type=value_error.missing)
city
  field required (type=value_error.missing)
country
  field required (type=value_error.missing)

In [53]:
class Book(BaseModel):
    name: str = Field(description="name of the novel")
    author: str = Field(description="name of the author")
    city: str = Field(description="the city it is most focused on or set in")
    country: str = Field(description="the country it is most focused on or set in")



# And a query intented to prompt a language model to populate the data structure.

writing_prompt = "I am building a database of novels set in different cities and countries on Earth. Based on the wikipedia article for this novel \
tell me the name of the novel, the name of the author, the city it is most focused on or set in, and the country it is most focused on or \
set in. If the article is not for a book, or the book is not set on a real city or country on Earth, you can use NaN as a response. \
Here is the text of the wikipedia article: "

book_query = writing_prompt + test_wiki.text

# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=Book)

prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | parser

chain.invoke({"query": book_query})

OutputParserException: Failed to parse Book from completion {"answer": [{"title": "Tales from the Twilight World", "inspiration_source": "Paul Atreides' visions of future and past"}, {"title": "Fear is The Mindkiller", "inspiration_source": "litany against fear"}, {"title": "Near Fantastica", "inspiration_source": "Frank Herbert's Dune series"}, {"title": "Dune and Philosophy", "edits": [{"name": "Kevin S. Decker"}, {"name": "Dominic J. Nardi"}]}, {"title": "The Wisdom of the Sand: Philosophy and Frank Herbert's Dune", "author": "Kevin C. Williams"}], "related": [{"url": "https://www.litweb.net"}, {"url": "https://www.tor.com"}, {"url": "https://www.nautilus.org"}]}. Got: 4 validation errors for Book
name
  field required (type=value_error.missing)
author
  field required (type=value_error.missing)
city
  field required (type=value_error.missing)
country
  field required (type=value_error.missing)

In [55]:
class Book(BaseModel):
    name: str = Field(description="name of the novel")
    author: str = Field(description="name of the author")
    city: str = Field(description="the city it is most focused on or set in")
    country: str = Field(description="the country it is most focused on or set in")



# And a query intented to prompt a language model to populate the data structure.

writing_prompt = "I am building a database of novels set in different cities and countries on Earth. Based on the wikipedia article for this novel \
tell me the name of the novel, the name of the author, the city it is most focused on or set in, and the country it is most focused on or \
set in. If the article is not for a book, or the book is not set on a real city or country on Earth, you can use NaN as a response. \
Here is the text of the wikipedia article: "

book_query = writing_prompt + p_wiki.text

# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=Book)

prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | llm | parser

chain.invoke({"query": book_query})

OutputParserException: Failed to parse Book from completion {"properties": {"name": {"title": "Name", "description": "name of the novel", "type": "string"}, "author": {"title": "Author", "description": "name of the author", "type": "string"}, "city": {"title": "City", "description": "the city it is most focused on or set in", "type": "string"}, "country": {"title": "Country", "description": "the country it is most focused on or set in", "type": "string"}}, "required": ["name", "author", "city", "country"]}. Got: 4 validation errors for Book
name
  field required (type=value_error.missing)
author
  field required (type=value_error.missing)
city
  field required (type=value_error.missing)
country
  field required (type=value_error.missing)

In [12]:
from pydantic import BaseModel, Field



In [14]:
parser = PydanticOutputParser(pydantic_object=Book)

NameError: name 'PydanticOutputParser' is not defined

In [None]:
prompt = PromptTemplate(
    template=reservation_template,
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
) 

### wiki stuff

In [15]:
p_wiki.categories

{'Category:2023 novels': Category:2023 novels (id: ??, ns: 14),
 'Category:Articles with short description': Category:Articles with short description (id: ??, ns: 14),
 'Category:Historical novels': Category:Historical novels (id: ??, ns: 14),
 'Category:Malaysian novels': Category:Malaysian novels (id: ??, ns: 14),
 'Category:Novels about writers': Category:Novels about writers (id: ??, ns: 14),
 'Category:Novels set in Malaysia': Category:Novels set in Malaysia (id: ??, ns: 14),
 'Category:Novels set in the 1920s': Category:Novels set in the 1920s (id: ??, ns: 14),
 'Category:Short description matches Wikidata': Category:Short description matches Wikidata (id: ??, ns: 14),
 'Category:Use mdy dates from November 2023': Category:Use mdy dates from November 2023 (id: ??, ns: 14)}

In [74]:
list(p_wiki.links.keys())

['2023 Booker Prize',
 'Bloomsbury Publishing',
 'Booker Prize',
 'Ethel Proudlock case',
 'Federated Malay States',
 'First World War',
 'Gerald Haxton',
 'ISBN (identifier)',
 'NPR',
 'Penang',
 'Sun Yat-sen',
 'Tan Twan Eng',
 'The Financial Times',
 'The Guardian',
 'The Washington Post',
 'William Somerset Maugham',
 'Category:Use mdy dates from November 2023']

In [75]:
test_wiki = wiki_wiki.page(list(p_wiki.links.keys())[-1])


In [76]:
test_wiki.text

'Wikipedia articles (tagged in this month) that use mm dd yyyy date formats, whether by application of the first main contributor rule or by virtue of close national ties to the subject belong in this category. Use {{Use mdy dates}} to add an article to this category. See MOS:DATE.\nThis system of tagging or categorisation is used as a status monitor of all articles that use mm dd yyyy date formats, and not as a clean up.'

### next data scraping steps

ok let's start by acquiring our lists of books. we then need to actually check that these are books 

In [None]:
# ok one downstream task is to acquire the author of a book and then acquire the nationality of that author. 
# could use the llm to first generate the list of authors