In [1]:
import requests
import bs4
import re
import pickle
from collections import Counter, defaultdict
import json

---
API overview
---

In [2]:
# fetch the API index; the returned JSON lists the available endpoints
requests.get("https://api.ishtar-collective.net/").json()

{'navigation': {'home': 'http://api.ishtar-collective.net/',
  'categories': 'http://api.ishtar-collective.net/categories',
  'cards': 'http://api.ishtar-collective.net/cards',
  'items': 'http://api.ishtar-collective.net/items',
  'entries': 'http://api.ishtar-collective.net/entries',
  'transcripts': 'http://api.ishtar-collective.net/transcripts'}}

---
Download helpers
---

In [3]:
def download_cards(category):
    """
    Download every grimoire card listed under `category` in the API.

    Arguments:
    category -- category reference used in the API url

    Returns:
    data -- dict mapping a card name to its description as plain text
            (markup stripped); a missing description becomes ""

    Raises:
    requests.HTTPError -- if the API answers with an error status
    """
    data = {}
    listing = requests.get(f"http://api.ishtar-collective.net/categories/{category}",
                           timeout=30)
    listing.raise_for_status()
    cards = listing.json()['category']['grimoire_cards']
    for card in cards:
        name = card['name']
        # a timeout keeps one stalled connection from hanging the whole download
        card_response = requests.get(card['api_url'], timeout=30)
        card_response.raise_for_status()
        response = card_response.json()["grimoire_card"]
        description = response["description"] if response["description"] is not None else ""
        # the parser is named explicitly so the extracted text does not depend
        # on which optional parsers happen to be installed
        data[name] = bs4.BeautifulSoup(description, "html.parser").get_text()
        print("Card downloaded:\t" + name)
    return data

In [17]:
def _get_entries_urls(category):
    """
    Since the entries are not linked with corresponding categories
    in the API they need to be manually scraped from the website.

    Arguments:
    category -- category reference used in the website url

    Returns:
    links -- list of API urls, one per matching link found on the category
             page, in the order find_all returns them (duplicates and
             '#fragment' parts are kept as found)

    Raises:
    requests.HTTPError -- if the website answers with an error status
    """
    page = requests.get(f"https://www.ishtar-collective.net/categories/{category}",
                        timeout=30)
    page.raise_for_status()
    # the parser is named explicitly so the result does not depend on
    # which optional parsers happen to be installed
    soup = bs4.BeautifulSoup(page.content, "html.parser")
    # find all links that start with '/entries/'; the pattern is anchored with
    # '^' because an unanchored pattern also matches hrefs that merely contain
    # '/entries/' somewhere, which would produce a malformed API url below
    entries = soup.find_all('a', attrs={'href': re.compile(r"^/entries/")})
    return [f"http://api.ishtar-collective.net{entry.get('href')}" for entry in entries]

def download_entries(category):
    """
    Download every entry linked from the website page of `category`.

    Arguments:
    category -- category reference used in the website url

    Returns:
    data -- dict mapping an entry name to its short summary as plain text
            (markup stripped); a missing summary becomes ""

    Raises:
    requests.HTTPError -- if the API answers with an error status
    """
    data = {}
    already_fetched = set()
    for url in _get_entries_urls(category):
        # NOTE(review): this link was excluded by hand in the original code;
        # the reason is not recorded - presumably a broken link, TODO confirm
        if url == "http://api.ishtar-collective.net/entries/new-normal#book-evas-journey":
            continue
        # the '#fragment' part is never sent to the server, so urls differing
        # only by fragment return the same entry; fetch each one only once
        base_url = url.split('#', 1)[0]
        if base_url in already_fetched:
            continue
        already_fetched.add(base_url)
        entry_response = requests.get(base_url, timeout=30)
        entry_response.raise_for_status()
        response = entry_response.json()['entry']
        name = response['name']
        # same None guard as download_cards: BeautifulSoup cannot parse None
        summary = response['short_summary'] if response['short_summary'] is not None else ""
        data[name] = bs4.BeautifulSoup(summary, "html.parser").get_text()
        print("Entry downloaded:\t" + name)
    return data

In [5]:
def download_categories():
    """
    Walk the paginated category listing of the API, starting at page 1.

    Returns:
    categories -- list with the 'ishtar_ref' of every category
    category_map -- dict linking each 'ishtar_ref' with the category 'name'
    """
    refs = []
    # ref_to_name links the api names with their user-friendly equivalents
    ref_to_name = {}

    page_url = "http://api.ishtar-collective.net/categories/page/1"
    while page_url is not None:
        payload = requests.get(page_url).json()
        for item in payload['categories']:
            ref = item['ishtar_ref']
            refs.append(ref)
            ref_to_name[ref] = item['name']
            print("Downloaded category:\t" + item['name'])
        # a missing 'next_page_url' key yields None, which ends the loop
        page_url = payload['meta'].get('next_page_url')
    return refs, ref_to_name

---
Download loop
---

In [18]:
# data maps each category reference to {"cards": {...}, "entries": {...}},
# where the inner dicts map a card/entry name to its plain text
data = {}
categories, category_map = download_categories()
for idx, category in enumerate(categories, 1):
    data[category] = {}
    print(f"\n\n---Downloading data for {category} {idx}/{len(categories)}----\n")
    # each helper issues one request per card / per entry link
    data[category]["cards"] = download_cards(category)
    data[category]["entries"] = download_entries(category)

Downloaded category:	Omar Agah
Downloaded category:	Ahamkara
Downloaded category:	The Great Ahamkara Hunt
Downloaded category:	Alpha Lupi
Downloaded category:	Aphelion
Downloaded category:	Ares One
Downloaded category:	Book: The Awoken of the Reef
Downloaded category:	Rezyl Azzir
Downloaded category:	Book: The Black Armory Papers
Downloaded category:	Andal Brask
Downloaded category:	Ana Bray
Downloaded category:	The Bray Family
Downloaded category:	Emperor Calus
Downloaded category:	Cayde-6
Downloaded category:	Book: The Man They Call Cayde
Downloaded category:	CHASM
Downloaded category:	Clovis Bray
Downloaded category:	The Consensus
Downloaded category:	The First Crota fireteam
Downloaded category:	Crows
Downloaded category:	Book: Dawning Delights
Downloaded category:	The Dawning
Downloaded category:	Deep Stone Crypt
Downloaded category:	Dormant SIVA
Downloaded category:	Book: The Dreaming City
Downloaded category:	Book: A Drifter's Gambit
Downloaded category:	The Drifter
Downloaded c

Entry downloaded:	Qiao's Passing
Entry downloaded:	Hardy's Calm
Entry downloaded:	Mihaylova's Instruments
Entry downloaded:	Mihaylova's Choice
Entry downloaded:	Hardy's Orders
Entry downloaded:	Qiao's Care
Entry downloaded:	Mihaylova's Triumph
Entry downloaded:	Qiao's Grin
Entry downloaded:	Mihaylova's Tale
Entry downloaded:	Hardy's Control
Entry downloaded:	Qiao's Heart
Entry downloaded:	Mihaylova's Path
Entry downloaded:	Qiao's Strides


---Downloading data for book-the-awoken-of-the-reef 7/124----

Entry downloaded:	Revanche II
Entry downloaded:	Revanche III
Entry downloaded:	Revanche IV
Entry downloaded:	Revanche V
Entry downloaded:	Telic I
Entry downloaded:	Telic II
Entry downloaded:	Tyrannocide I
Entry downloaded:	Tyrannocide II
Entry downloaded:	Tyrannocide III
Entry downloaded:	Tyrannocide IV
Entry downloaded:	Tyrannocide V
Entry downloaded:	Regent
Entry downloaded:	Illyn
Entry downloaded:	Nitrogen
Entry downloaded:	Refusal
Entry downloaded:	Fleet
Entry downloaded:	Of Earth and

Card downloaded:	The Dreadnaught
Card downloaded:	The Garden's Spire
Card downloaded:	The Guardian
Card downloaded:	The Nexus: Revisited
Card downloaded:	The Nightstalker's Trail
Card downloaded:	The Taken War: Earth
Card downloaded:	The Undying Mind
Card downloaded:	The Will of Crota: Revisited
Card downloaded:	Vanguard Quartermaster
Card downloaded:	Winter's Run
Card downloaded:	Zone Control
Entry downloaded:	Fold
Entry downloaded:	Call
Entry downloaded:	All-In
Entry downloaded:	Bad Beat
Entry downloaded:	Solstice Cloak (Rekindled)
Entry downloaded:	Deal
Entry downloaded:	Turn
Entry downloaded:	Winner Take All
Entry downloaded:	Roll Call
Entry downloaded:	Some Kind of Luck
Entry downloaded:	Two Cells
Entry downloaded:	A Hero's Requiem
Entry downloaded:	Ace of Spades
Entry downloaded:	act|choose|react
Entry downloaded:	A Hero's Requiem
Entry downloaded:	All-In
Entry downloaded:	Asher Mir's One-Way Ticket
Entry downloaded:	A Tale Twice Told
Entry downloaded:	Bad Beat
Entry downloaded:	

Entry downloaded:	Tradition Is Bigger Than You
Entry downloaded:	Winter Lotus Shell
Entry downloaded:	You Can Never Go Home Again


---Downloading data for deep-stone-crypt 23/124----

Card downloaded:	Ghost Fragment: Legends
Entry downloaded:	All-In
Entry downloaded:	Winter's Guile


---Downloading data for dormant-siva 24/124----

Card downloaded:	Dormant SIVA: Clovis Bray 1.0
Card downloaded:	Dormant SIVA: Clovis Bray 1.1
Card downloaded:	Dormant SIVA: Clovis Bray 1.2
Card downloaded:	Dormant SIVA: Clovis Bray 1.3
Card downloaded:	Dormant SIVA: Clovis Bray 1.4
Card downloaded:	Dormant SIVA: Clovis Bray 1.5
Card downloaded:	Dormant SIVA: Clovis Bray 1.6
Card downloaded:	Dormant SIVA: Clovis Bray 1.7
Card downloaded:	Dormant SIVA: Clovis Bray 1.8
Card downloaded:	Dormant SIVA: Clovis Bray 1.9
Card downloaded:	Dormant SIVA: Fallen 3.0
Card downloaded:	Dormant SIVA: Fallen 3.1
Card downloaded:	Dormant SIVA: Fallen 3.2
Card downloaded:	Dormant SIVA: Fallen 3.3
Card downloaded:	Dormant SI

Entry downloaded:	Just Another Day at the Tower
Entry downloaded:	Hiding at Home
Entry downloaded:	Caretaker
Entry downloaded:	Invisible Scars
Entry downloaded:	Just Another Day at the Tower
Entry downloaded:	Loss of Light
Entry downloaded:	Hiding at Home
Entry downloaded:	The New Normal
Entry downloaded:	The Good Fight
Entry downloaded:	Last Day
Entry downloaded:	Caretaker
Entry downloaded:	Invisible Scars
Entry downloaded:	You Can Never Go Home Again


---Downloading data for the-exo-stranger 35/124----

Card downloaded:	Ghost Fragment: Rasputin 4
Card downloaded:	Ghost Fragment: The Exo Stranger
Card downloaded:	Ghost Fragment: The Exo Stranger 2
Card downloaded:	No Time To Explain
Card downloaded:	The Exo Stranger


---Downloading data for lord-felwinter 36/124----

Card downloaded:	Dormant SIVA: Iron Lords 2.6
Card downloaded:	Felwinter Peak
Card downloaded:	Gabi 55-30
Card downloaded:	Ghost Fragment: Mysteries 3
Card downloaded:	Iron Banner
Card downloaded:	King of the Mountain
C

Card downloaded:	Ghost Fragment: The Last Word
Card downloaded:	Ghost Fragment: The Last Word 2
Card downloaded:	Ghost Fragment: The Last Word 3
Card downloaded:	Ghost Fragment: The Last Word 4
Card downloaded:	Ghost Fragment: The Last Word 5
Card downloaded:	Ghost Fragment: The Ocean of Storms
Card downloaded:	Ghost Fragment: The Ocean of Storms 2
Card downloaded:	Ghost Fragment: The Queen
Card downloaded:	Ghost Fragment: The Queen 2
Card downloaded:	Ghost Fragment: The Reef
Card downloaded:	Ghost Fragment: The Reef 2
Card downloaded:	Ghost Fragment: The Reef 3
Card downloaded:	Ghost Fragment: The Reef 4
Card downloaded:	Ghost Fragment: The Rusted Lands
Card downloaded:	Ghost Fragment: The Traveler
Card downloaded:	Ghost Fragment: The Traveler 2
Card downloaded:	Ghost Fragment: The Traveler 3
Card downloaded:	Ghost Fragment: Thieves' Den
Card downloaded:	Ghost Fragment: Thorn
Card downloaded:	Ghost Fragment: Thorn 2
Card downloaded:	Ghost Fragment: Thorn 3
Card downloaded:	Ghost Fragm

Card downloaded:	XLIV:  strict proof eternal
Card downloaded:	XVI: The Sword Logic
Card downloaded:	XXI: an incision
Card downloaded:	XXII: The High War
Card downloaded:	XXVIII: King of Shapes
Card downloaded:	XXXII: Majestic. Majestic.
Card downloaded:	XXXIX: open your eye : go into it
Card downloaded:	XXXV: This Love Is War
Entry downloaded:	act|choose|react
Entry downloaded:	react|choose|act
Entry downloaded:	Reverie Dawn Casque
Entry downloaded:	Reverie Dawn Helm
Entry downloaded:	Reverie Dawn Hood
Entry downloaded:	Tyrannocide IV
Entry downloaded:	Tyrannocide V
Entry downloaded:	Verity's Brow
Entry downloaded:	Wormhusk Crown


---Downloading data for lysander 56/124----

Card downloaded:	Bannerfall
Card downloaded:	Ghost Fragment: The City Age
Card downloaded:	Lysander's Cry
Card downloaded:	Osiris
Entry downloaded:	An Insurmountable Skullfort


---Downloading data for malok 57/124----

Card downloaded:	Blighted Chalice
Card downloaded:	Malok, Pride of Oryx


---Downloading data f

Entry downloaded:	Message from Aunor IV
Entry downloaded:	Message from Aunor IX
Entry downloaded:	Message from Aunor VIII
Entry downloaded:	Message to Aunor
Entry downloaded:	Motion to Compel
Entry downloaded:	Motion to Suppress
Entry downloaded:	Motion to Vacate
Entry downloaded:	Prosecutor
Entry downloaded:	Reextinction
Entry downloaded:	Relentless
Entry downloaded:	Scales
Entry downloaded:	Synesthesia
Entry downloaded:	The Bone
Entry downloaded:	The End
Entry downloaded:	The Gate
Entry downloaded:	The Kell
Entry downloaded:	The Leviathan
Entry downloaded:	The Long Walk
Entry downloaded:	The Nine
Entry downloaded:	The Red Box
Entry downloaded:	The Stacks
Entry downloaded:	The Witch
Entry downloaded:	You Get Used to Him


---Downloading data for wei-ning 69/124----

Card downloaded:	Ghost Fragment: The Hellmouth 2
Card downloaded:	Ghost Fragment: Warlock 2
Card downloaded:	Oryx: Rebuked
Entry downloaded:	Cloak of the Great Hunt
Entry downloaded:	Eriana's Vengeance
Entry downloaded:	Fi



---Downloading data for pilgrimages 77/124----



---Downloading data for praedyth 78/124----

Card downloaded:	Mystery: Praedyth's Door
Card downloaded:	No Time To Explain
Card downloaded:	The Taken War: Venus
Entry downloaded:	Age-Old Bond


---Downloading data for pujari 79/124----

Card downloaded:	Legend: The Black Garden
Card downloaded:	The Darkness
Entry downloaded:	Difference of Opinion
Entry downloaded:	Wings of Sacred Dawn


---Downloading data for pulled-pork 80/124----

Entry downloaded:	Pulled Pork


---Downloading data for lord-radegast 81/124----

Card downloaded:	Ghost Fragment: Mysteries 3
Card downloaded:	Iron Banner
Card downloaded:	Lady Jolder
Card downloaded:	Lady Perun
Card downloaded:	Lady Skorri
Card downloaded:	Lord Felwinter
Card downloaded:	Lord Radegast
Entry downloaded:	Loose Ends, pt. III


---Downloading data for radiolaria 82/124----

Card downloaded:	Hobgoblin
Entry downloaded:	A Cautionary Tale
Entry downloaded:	At the Gate | Part II
Entry downloade

Entry downloaded:	Universal Wavefunction
Entry downloaded:	Vanguard Armor
Entry downloaded:	Vesper of Radius
Entry downloaded:	Voidwalker
Entry downloaded:	West of Sunfall 7
Entry downloaded:	Wise Warlock's Bond
Entry downloaded:	Zavala's Authority


---Downloading data for riven 88/124----

Entry downloaded:	act|choose|react
Entry downloaded:	asudeM
Entry downloaded:	Bond of the Great Hunt
Entry downloaded:	Boots of the Great Hunt
Entry downloaded:	Fifteenth Wish
Entry downloaded:	Gauntlets of the Great Hunt
Entry downloaded:	Gloves of the Great Hunt
Entry downloaded:	Greaves of the Great Hunt
Entry downloaded:	Helm of the Great Hunt
Entry downloaded:	Honored
Entry downloaded:	Hood of the Great Hunt
Entry downloaded:	Illyn
Entry downloaded:	Mark of the Great Hunt
Entry downloaded:	Medusa
Entry downloaded:	Oracle
Entry downloaded:	Reextinction
Entry downloaded:	Revanche V
Entry downloaded:	Reverie Dawn Bond
Entry downloaded:	Reverie Dawn Mark
Entry downloaded:	Riven
Entry downloaded:	R

Card downloaded:	XXXIII: When do monsters have dreams
Card downloaded:	XXXIV: More beautiful to know
Card downloaded:	XXXV: This Love Is War
Card downloaded:	XXXVI: Eater of Hope
Card downloaded:	XXXVII: shapes : points
Card downloaded:	XXXVIII: The partition of death
Card downloaded:	XXXIX: open your eye : go into it
Card downloaded:	XL: An Emperor For All Outcomes
Card downloaded:	XLI: Dreadnaught
Card downloaded:	XLII: <>|<>|<>
Card downloaded:	XLIII: End of Failed Timeline
Card downloaded:	XLIV:  strict proof eternal
Card downloaded:	XLV: I'd shut them all in cells.
Card downloaded:	XLVI: The Gift Mast
Card downloaded:	XLVII: Apocalypse Refrains
Card downloaded:	XLVIII: aiat, aiat, aiat, aiat, aiat
Card downloaded:	XLIX: Forever And A Blade
Card downloaded:	L: Wormfood
Card downloaded:	Calcified Fragments: Insight


---Downloading data for queen-mara-sov 99/124----

Card downloaded:	Ghost Fragment: Fallen 4
Card downloaded:	Ghost Fragment: Queen's Brother
Card downloaded:	Ghost Fra

Card downloaded:	Iron Banner
Card downloaded:	Lord Timur
Card downloaded:	Vostok Observatory


---Downloading data for messages-from-toland 107/124----

Card downloaded:	Ghost Fragment: The Hellmouth
Card downloaded:	Oryx, The Taken King
Card downloaded:	Echo of Oryx
Card downloaded:	Court of Oryx
Card downloaded:	Dark-Drinker
Card downloaded:	Oryx: Rebuked
Card downloaded:	Oryx: Defeated
Card downloaded:	King's Fall


---Downloading data for toland-the-shattered 108/124----

Card downloaded:	Ascendant Sword
Card downloaded:	Bad Juju
Card downloaded:	Blades of Crota
Card downloaded:	Court of Oryx
Card downloaded:	Crota's End
Card downloaded:	Dark-Drinker
Card downloaded:	Echo of Oryx
Card downloaded:	Eyes of Crota
Card downloaded:	Ghost Fragment: Darkness 3
Card downloaded:	Ghost Fragment: Hive 4
Card downloaded:	Ghost Fragment: The Hellmouth
Card downloaded:	Ghost Fragment: The Hellmouth 2
Card downloaded:	Hand of Crota
Card downloaded:	Ir Yût, the Deathsinger
Card downloaded:	King's 

Entry downloaded:	V: Echoes Followed by Silence
Entry downloaded:	VI: A Gift and a Touch of Gray


---Downloading data for book-the-warlock-aunor 117/124----

Entry downloaded:	Message from Aunor I
Entry downloaded:	Message from Aunor II
Entry downloaded:	Message from Aunor III
Entry downloaded:	Message from Aunor IV
Entry downloaded:	Message from Aunor V
Entry downloaded:	Message to Aunor
Entry downloaded:	Message from Aunor VI
Entry downloaded:	Message from Aunor VII
Entry downloaded:	Message from Aunor VIII
Entry downloaded:	Surveillance Transcript
Entry downloaded:	Message from Aunor IX
Entry downloaded:	Message from Aunor X
Entry downloaded:	The Salt Mines
Entry downloaded:	District 125
Entry downloaded:	Civilian Atrium South


---Downloading data for warminds 118/124----

Card downloaded:	Dormant SIVA: Iron Lords 2.5
Card downloaded:	Dormant SIVA: Iron Lords 2.6
Card downloaded:	Dust Palace
Card downloaded:	Fallen S.A.B.E.R.
Card downloaded:	Felwinter Peak
Card downloaded:	Ghost 

Entry downloaded:	Message from Aunor X
Entry downloaded:	Revelations and Invitations
Entry downloaded:	SUFFERING
Entry downloaded:	VI: Focus
Entry downloaded:	VIII: Secrets
Entry downloaded:	VII: Joining
Entry downloaded:	V: Purpose


---Downloading data for zavala 124/124----

Card downloaded:	Black Shield
Card downloaded:	Cayde's Stash
Card downloaded:	Cerberus Vae III
Card downloaded:	Crota's Bane
Card downloaded:	Devils' Lair
Card downloaded:	Draksis, Winter Kell
Card downloaded:	Dread Patrol
Card downloaded:	Fallen S.A.B.E.R.
Card downloaded:	Fleetbase Korus, Phobos
Card downloaded:	Ghost Fragment: Cayde-6
Card downloaded:	Ghost Fragment: Rasputin
Card downloaded:	Ghost Fragment: The City Age
Card downloaded:	House of Winter
Card downloaded:	Iron Banner
Card downloaded:	Iron Banner Rep
Card downloaded:	Outbound Signal
Card downloaded:	Raze-Lighter
Card downloaded:	Regicide
Card downloaded:	Restoration
Card downloaded:	Sand Eaters
Card downloaded:	Scourge of Winter
Card downloaded:

---
Data processing
---

In [20]:
def count_uniqes(dataset=None, category_names=None):
    """
    Returns a collections.defaultdict with each character
    in the entire dataset and the amount of times it appears

    Arguments:
    dataset -- dict shaped like the global `data`:
               {category: {"cards": {name: text}, "entries": {name: text}}};
               defaults to the global `data`
    category_names -- iterable with the keys of `dataset` to go through;
                      defaults to the global `categories` when `dataset` is
                      omitted, otherwise to every key of `dataset`

    Keys are inserted in the order the characters are first met
    (categories in the given order, cards before entries).
    """
    if category_names is None:
        category_names = categories if dataset is None else list(dataset)
    if dataset is None:
        dataset = data
    unique_chars = defaultdict(int)
    for category in category_names:
        for kind in ("cards", "entries"):
            for text in dataset[category][kind].values():
                for key, value in Counter(text).items():
                    unique_chars[key] += value
    return unique_chars

In [21]:
# iterating a dict yields its keys, so sorting the result directly gives the sorted characters
print(f"All characters in the dataset: {sorted(count_uniqes())}")

All characters in the dataset: ['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '±', 'Û', 'à', 'ç', 'é', 'ê', 'û', 'ō', 'θ', '–', '—', '‘', '’', '“', '”', '…']


In [22]:
# some unusual characters can be replaced with more readable
# counterparts while losing almost no information from the text
def fix_characters(text):
    """
    Return `text` with typographic punctuation swapped for plain ASCII:
    the ellipsis becomes three dots, curly quotes become straight quotes
    and both long dashes become a hyphen.
    """
    plain_equivalents = {
        '…': "...",
        "”": '"',
        "“": '"',
        "‘": "'",
        "’": "'",
        "—": "-",
        "–": "-",
    }
    return text.translate(str.maketrans(plain_equivalents))

In [23]:
# all texts will end with a "␃" character to indicate that the text is over;
# the marker is added only when it is missing, so running this cell again
# does not stack several markers at the end of a text
for category in categories:
    for kind in ("cards", "entries"):
        texts = data[category][kind]
        # only existing keys are reassigned, so iterating the keys is safe
        for idx in texts.keys():
            text = fix_characters(texts[idx])
            if not text.endswith("␃"):
                text += "␃"
            texts[idx] = text

In [24]:
# recount after the clean-up, then list the rarest characters first
chars = count_uniqes()
print("Characters sorted by occurrence")
for symbol, occurrences in sorted(chars.items(), key=lambda pair: pair[1]):
    print(f" {symbol!r} :\t{occurrences}")

Characters sorted by occurrence
 'θ' :	1
 'ō' :	1
 '=' :	2
 '\\' :	2
 '^' :	2
 'ç' :	4
 '±' :	8
 '|' :	9
 '{' :	10
 '}' :	10
 'ê' :	11
 'à' :	20
 'Û' :	20
 '&' :	26
 '$' :	32
 'é' :	36
 '#' :	45
 '<' :	55
 '*' :	58
 '%' :	95
 '_' :	115
 'û' :	222
 '8' :	230
 '(' :	274
 ')' :	276
 '9' :	300
 '7' :	309
 '~' :	369
 '5' :	424
 '+' :	443
 '>' :	450
 '6' :	466
 'Z' :	555
 '4' :	561
 'X' :	580
 '3' :	783
 '!' :	958
 'J' :	1056
 'Q' :	1067
 'q' :	1170
 ';' :	1183
 '/' :	1301
 'z' :	1352
 '2' :	1377
 'K' :	1498
 '0' :	1668
 '1' :	1807
 'U' :	2021
 ']' :	2092
 '[' :	2094
 'j' :	2238
 '␃' :	2686
 'V' :	2828
 'Y' :	2953
 'F' :	3011
 'P' :	3146
 'B' :	3549
 'G' :	3585
 '?' :	3926
 'D' :	4083
 'M' :	4123
 ':' :	4179
 'x' :	4280
 'L' :	4365
 'N' :	4375
 'R' :	4999
 'O' :	5465
 'W' :	5893
 'C' :	5963
 'H' :	6323
 'E' :	6962
 '-' :	8474
 'S' :	9669
 'A' :	9987
 "'" :	11521
 '"' :	11810
 'T' :	14143
 'I' :	16482
 '\n' :	19829
 'v' :	21033
 'k' :	21294
 'b' :	26352
 ',' :	26955
 'p' :	30615
 'y' :	38863


In [25]:
# sum of all per-character counts held in chars
print(f"Total numer of characters: {sum(chars.values())}")

Total numer of characters: 2718600


In [26]:
# some characters appear so rarely that it is not worth including them in the dataset
# fewer characters == lower complexity == better model

In [27]:
# characters that occur fewer than this many times in the whole dataset are treated as too rare to keep
DELETE_THRESHOLD = 200

In [28]:
# every character whose total count is below DELETE_THRESHOLD
to_be_deleted = [symbol for symbol, count in chars.items() if count < DELETE_THRESHOLD]

In [29]:
# show which characters fell below the threshold
print(to_be_deleted)

['ê', 'à', '±', 'Û', '$', '%', '&', '*', '=', '#', 'é', '<', '\\', '|', '{', '}', '_', '^', 'ç', 'θ', 'ō']


In [30]:
# add up the length of every text that holds at least one rare character
will_be_deleted = 0
for category in categories:
    for kind in ("cards", "entries"):
        for text in data[category][kind].values():
            if any(symbol in to_be_deleted for symbol in text):
                will_be_deleted += len(text)

In [31]:
# will_be_deleted as a percentage of the total character count
print(f"After removig the {len(to_be_deleted)} least ocurring characters "
      f"the dataset will shrink by {(will_be_deleted / sum(chars.values()))*100:.2f}%")

After removig the 21 least ocurring characters the dataset will shrink by 5.80%


In [32]:
# keep only the texts free of rare characters, grouped per category
# (cards first, then entries), and collect the characters that remain
new_dataset = defaultdict(list)
new_characters = set()
for category in categories:
    for kind in ("cards", "entries"):
        for text in data[category][kind].values():
            if any(symbol in to_be_deleted for symbol in text):
                continue
            new_dataset[category].append(text)
            new_characters.update(text)
# plain dict: a category that kept no text never got a key
new_dataset = dict(new_dataset)

In [33]:
# the categories that still hold at least one text
new_categories = list(new_dataset)
print(f"Final categories {sorted(new_categories)}")

Final categories ['ahamkara', 'alpha-lupi', 'ana-bray', 'andal-brask', 'ares-one', 'asher-mir', 'ayane-takanome', 'battle-of-twilight-gap', 'book-a-drifters-gambit', 'book-dawning-delights', 'book-dust', 'book-ecdysis', 'book-evas-journey', 'book-for-every-rose-a-thorn', 'book-ghost-stories', 'book-letters-from-a-renegade', 'book-marasenna', 'book-most-loyal', 'book-the-awoken-of-the-reef', 'book-the-black-armory-papers', 'book-the-book-of-unmaking', 'book-the-dreaming-city', 'book-the-forsaken-prince', 'book-the-lawless-frontier', 'book-the-man-they-call-cayde', 'book-the-man-with-no-name', 'book-the-warlock-aunor', 'book-truth-to-power', 'book-wall-of-wishes', 'books-of-sorrow', 'bray-family', 'cayde-6', 'chasm', 'clovis-bray', 'court-of-oryx', 'crows', 'deep-stone-crypt', 'dormant-siva', 'dr-shirazi', 'dredgen-yor', 'eliksni', 'emperor-calus', 'eriana-3', 'eris-morn', 'festival-of-the-lost-2015', 'festival-of-the-lost-2016', 'ghost-fragments', 'harbingers', 'holborn', 'ikora-rey', '

In [35]:
# sorted() accepts the set directly and returns a list
print(f"Final characters: {sorted(new_characters)}\t({len(new_characters)} characters)")

Final characters: ['\n', ' ', '!', '"', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~', 'û', '␃']	(83 characters)


---
Saving
---

In [36]:
# store the surviving characters as a sorted JSON list
with open("data/default_characters.json", 'w') as characters_file:
    json.dump(sorted(new_characters), characters_file)

In [37]:
# category_map still lists the categories that lost all of their texts,
# so only the entries of the surviving categories are written out
new_cat_map = {ref: category_map[ref] for ref in sorted(new_categories)}
with open("data/default_categories.json", 'w') as categories_file:
    json.dump(new_cat_map, categories_file)

In [38]:
# store the filtered dataset ({category: [text, ...]}) as a pickle
with open("data/dataset.pickle", 'wb') as dataset_file:
    pickle.dump(new_dataset, dataset_file)