In [84]:
import regex as re

In [140]:
class BasicTokenizer:
    def __init__(self):
        self.vocab = {idx: bytes([idx]) for idx in range(256)}
        self.merge_items = {}
        self.ids = []
        self.regex = re.compile(r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""")
    def get_stats(self, current_ids):
        mp = {}
        for it1, it2 in zip(current_ids, current_ids[1:]):
            mp[(it1, it2)] = mp.get((it1, it2), 0) + 1
        return mp
    def merge(self, pair, new_id, current_ids):
        new_ids = []
        idx = 0
        while idx + 1 < len(current_ids):
            if pair[0] == current_ids[idx] and pair[1] == current_ids[idx + 1]:
                new_ids.append(new_id)
                idx += 2
            else:
                new_ids.append(current_ids[idx])
                idx += 1
        if idx < len(current_ids):
            new_ids.append(current_ids[idx])
            idx += 1
        return new_ids
    def build_vocab_after_train(self):
        for pair, val in self.merge_items.items():
            self.vocab[val] = self.vocab[pair[0]] + self.vocab[pair[1]]
    def train(self, text, vocab_size, verbose=False):
        running_id = len(self.vocab)
        parts = re.findall(self.regex, text)
        for part in parts:
            # print("previous", list(map(int, part.encode('utf-8'))))
            current_ids = self.encode(part)
            # print("normalized", current_ids)
            new_id = running_id
            for i in range(new_id, vocab_size):
                stats = self.get_stats(current_ids)
                if not stats:
                    break
                pair_to_replace = max(stats, key=stats.get)
                current_ids = self.merge(pair_to_replace, i, current_ids)
                self.merge_items[pair_to_replace] = i
                running_id = i + 1
            self.ids.extend(current_ids)
            self.build_vocab_after_train()
            # print("new" , current_ids)
            
    def encode(self, text):
        current_ids = []
        encoded_ids = []
        parts = re.findall(self.regex, text)
        for part in parts:
            current_ids = list(map(int, part.encode('utf-8')))
            while True:
                stats = self.get_stats(current_ids)
                if not stats:
                    break
                pair = min(stats, key=lambda p: self.merge_items.get(p, float("inf")))
                if pair not in self.merge_items:
                    break
                current_ids = self.merge(pair, self.merge_items[pair], current_ids)
            encoded_ids.extend(current_ids)
        return encoded_ids
    def decode(self, encoded_tokens):
        raw_bytes = b"".join(self.vocab[token] for token in encoded_tokens)
        decoded_text = raw_bytes.decode('utf-8', errors='replace')
        return decoded_text

In [144]:
with open("taylorswift.txt", "r") as r:
    text = r.read()
tokenizer = BasicTokenizer()

tokenizer.train(text , 1000)
# print(tokenizer.merge_items)
# print(tokenizer.ids)
# print(tokenizer.decode(tokenizer.encode("I am a bogy. he. sdfh!? sf? sdsdf...!! sldfhj")) == "I am a bogy. he. sdfh!? sf? sdsdf...!! sldfhj")
# print(tokenizer.encode("I am a bogy. he. sdfh!? sf? sdsdf...!!"))

In [145]:
text_data = '''Sigh ... everybody likes to be "pythonic" and goes for the least characters to type. First, another criteria is readability. Second, the first test in the answer above is true not only if the dict exists and is empty, but also if test_dict is None. So use this test only when you know that the dict object exists (or when the difference does not matter). The second way also has that behavior. Only the third way barks if test_dict is None. – 
Andreas Maier
 CommentedDec 12, 2016 at 19:37 
1
@AndreasMaier Exactly my feeling as well. Also, python is dynamically typed. Inside a function it's common to check "if x is non-empty dictionary, then do this; if x is non-empty numpy array, then do that". Then the first code will fail on if x when x is numpy array – 
jf328
 CommentedDec 15, 2016 at 8:36 
1
@Wajih you link is still irrelevant here... See why – 
Ulysse BN
 CommentedFeb 17, 2017 at 20:44
5
Not upvoting this although technically correct due to concerns I share with. @AndreasMaier – 
Stunner
 CommentedDec 17, 2018 at 23:36
@AndreasMaier and the pythonic way to solve this is to add if test_dict is not None: print("Dict is None") at the beginning – 
lupodellasleppa
 CommentedSep 20, 2023 at 13:43
Add a comment
43

Simple ways to check an empty dict are below:

a = {}
if a == {}:
  print ('empty dict')
if not a:
  print ('empty dict')
Method 1 is more strict, because when a = None, method 1 will provide the correct result, but method 2 will give an incorrect result.

Share
Improve this answer
Follow
edited Jan 5, 2023 at 13:05
Gino Mempin's user avatar
Gino Mempin
29.8k3131 gold badges119119 silver badges166166 bronze badges
answered Dec 11, 2018 at 10:11
Shagun Pruthi's user avatar
Shagun Pruthi
2,0611818 silver badges1919 bronze badges
This should be the accepted answer as it is the only answer differentiating between empty dict's and anything evaluating to None. – 
Wör Du Schnaffzig
 CommentedMay 2, 2024 at 9:25
Add a comment
39

d = {}
print(len(d.keys()))
If the length is zero, it means that the dict is empty.

Share
Improve this answer
Follow
edited Jan 5, 2023 at 13:10
Gino Mempin's user avatar
Gino Mempin
29.8k3131 gold badges119119 silver badges166166 bronze badges
answered Dec 16, 2016 at 10:00
Achilles Ram Nakirekanti's user avatar
Achilles Ram Nakirekanti
4,02111 gold badge2323 silver badges1313 bronze badges
5
While this code snippet may solve the question, including an explanation really helps to improve the quality of your post. Remember that you are answering the question for readers in the future, and those people might not know the reasons for your code suggestion. – 
DimaSan
 CommentedDec 16, 2016 at 11:43
9
len(dict.keys()) is equivalent to len(dict) – 
pdpAxis
 CommentedJun 4, 2020 at 18:45 
@pdpAxis In the value it gives, though I bet the implementation of dict.__len__ is probably a bit faster. :) – 
Mateen Ulhaq
 CommentedJun 11, 2020 at 2:21
1
len(dict.keys()) is NOT equivalent to len(dict). The first case fails on lists and tuples, the second not. So, the first statement is more explicit. – 
Wör Du Schnaffzig
 CommentedMay 2, 2024 at 9:30 
Add a comment
10

A dictionary can be automatically cast to boolean which evaluates to False for empty dictionary and True for non-empty dictionary.

if myDictionary: non_empty_clause()
else: empty_clause()
If this looks too idiomatic, you can also test len(myDictionary) for zero, or set(myDictionary.keys()) for an empty set, or simply test for equality with {}.

The isEmpty function is not only unnecessary but also your implementation has multiple issues that I can spot prima-facie.

The return False statement is indented one level too deep. It should be outside the for loop and at the same level as the for statement. As a result, your code will process only one, arbitrarily selected key, if a key exists. If a key does not exist, the function will return None, which will be cast to boolean False. Ouch! All the empty dictionaries will be classified as false-nagatives.
If the dictionary is not empty, then the code will process only one key and return its value cast to boolean. You cannot even assume that the same key is evaluated each time you call it. So there will be false positives.
Let us say you correct the indentation of the return False statement and bring it outside the for loop. Then what you get is the boolean OR of all the keys, or False if the dictionary empty. Still you will have false positives and false negatives. Do the correction and test against the following dictionary for an evidence.
myDictionary={0:'zero', '':'Empty string', None:'None value', False:'Boolean False value', ():'Empty tuple'}

Share
Improve this answer
Follow
edited Jan 16, 2019 at 3:33
answered Jan 14, 2019 at 10:25
Della's user avatar
Della
1,64833 gold badges2626 silver badges5050 bronze badges
Add a comment
8

1st Way
len(given_dic_obj) 
It returns 0 if there are no elements. Else, returns the size of the dictionary.'''

In [147]:
print(tokenizer.decode(tokenizer.encode(text)) == text)
# print(tokenizer.encode("abv"))
# tokenizer.decode(tokenizer.encode("abv"))

True


In [160]:
regex = re.compile(r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""")
for idx, val in tokenizer.merge_items.items():
    st = tokenizer.decode([idx[0]]) + tokenizer.decode([idx[1]])
    if len(re.findall(regex, st)) > 1:
        print("culprint found", tokenizer.decode([idx[0]]) , tokenizer.decode([idx[1]]))

In [148]:
for idx, val in tokenizer.merge_items.items():
    print(tokenizer.decode([idx[0]]), " + ", tokenizer.decode([idx[1]]), " = ", val)

C  +  o  =  256
Co  +  p  =  257
Cop  +  y  =  258
   +  p  =  259
 p  +  a  =  260
 pa  +  s  =  261
 pas  +  t  =  262
 past  +  e  =  263
   +  o  =  264
 o  +  f  =  265
   +  t  =  266
 t  +  h  =  267
 th  +  e  =  268
   +  W  =  269
 W  +  i  =  270
 Wi  +  k  =  271
 Wik  +  i  =  272
 Wiki  +  p  =  273
 Wikip  +  e  =  274
 Wikipe  +  d  =  275
 Wikiped  +  i  =  276
 Wikipedi  +  a  =  277
   +  a  =  278
 a  +  r  =  279
 ar  +  t  =  280
 art  +  i  =  281
 arti  +  c  =  282
 artic  +  l  =  283
 articl  +  e  =  284
 o  +  n  =  285
   +  T  =  286
 T  +  a  =  287
 Ta  +  y  =  288
 Tay  +  l  =  289
 Tayl  +  o  =  290
 Taylo  +  r  =  291
   +  S  =  292
 S  +  w  =  293
 Sw  +  i  =  294
 Swi  +  f  =  295
 Swif  +  t  =  296
 a  +  s  =  297
   +  F  =  298
 F  +  e  =  299
 Fe  +  b  =  300
1  +  6  =  301
2  +  0  =  302
20  +  2  =  303
.  +  
  =  304
-  +  -  =  305
--  +  -  =  306
---  +  
  =  307
---
  +  
  =  308
M  +  a  =  309
Ma  +  i  =  310
Mai  +  