In [1]:
# A dependency of the preprocessing for BERT inputs
!pip install tensorflow-text
!pip install tf-models-official
!pip install tensorflow-addons
!pip install bert-for-tf2
!pip install sentencepiece
!pip install transformers

Collecting tensorflow-text
[?25l  Downloading https://files.pythonhosted.org/packages/b6/c0/c0fed4301f592c3b56638ae7292612c17d91a43891ba1aaf9636d535beae/tensorflow_text-2.4.3-cp37-cp37m-manylinux1_x86_64.whl (3.4MB)
[K     |████████████████████████████████| 3.4MB 12.8MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.4.3
Collecting tf-models-official
[?25l  Downloading https://files.pythonhosted.org/packages/57/4a/23a08f8fd2747867ee223612e219eeb0d11c36116601d99b55ef3c72e707/tf_models_official-2.4.0-py2.py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 13.4MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 38.9MB/s 
[?25hCollecting pyyaml>=5.1
[?25l  Downloading https://files.pythonhosted.org/packag

In [2]:
import os
import shutil
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras import layers

from official.nlp import optimization  # to create AdamW optmizer
import bert
from transformers import AutoTokenizer
import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [3]:
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 100)

In [4]:
bert_model_name = 'bert_en_uncased_L-4_H-512_A-8'

In [5]:
tfhub_handle_preprocess = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
bert_model = hub.load('https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1')

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'.
INFO:absl:Downloaded https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3, Total size: 1.96MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'.
INFO:absl:Downloaded https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1, Total size: 115.55MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'.


In [6]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [7]:
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_type_ids', 'input_word_ids', 'input_mask']
Shape      : (1, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
Input Mask : [1 1 1 1 1 1 1 1 1 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


In [8]:
text_test = ['this is such an amazing movie!', 'This is a third class movie.']
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][:, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][:, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][:, :12]}')

Keys       : ['input_type_ids', 'input_word_ids', 'input_mask']
Shape      : (2, 128)
Word Ids   : [[ 101 2023 2003 2107 2019 6429 3185  999  102    0    0    0]
 [ 101 2023 2003 1037 2353 2465 3185 1012  102    0    0    0]]
Input Mask : [[1 1 1 1 1 1 1 1 1 0 0 0]
 [1 1 1 1 1 1 1 1 1 0 0 0]]
Type Ids   : [[0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]]


In [9]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
#FullTokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [10]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_test[0]))

[2023, 2003, 2107, 2019, 6429, 3185, 999]

# Vocabulary

In [11]:
vocab_filepath = bert_model.vocab_file.asset_path.numpy().decode('utf-8')
!head -10 $vocab_filepath

[PAD]
[unused0]
[unused1]
[unused2]
[unused3]
[unused4]
[unused5]
[unused6]
[unused7]
[unused8]


In [12]:
new_file = f'./vocaulary_{bert_model_name}.txt'
!cp $vocab_filepath $new_file

In [13]:
with open(f'{new_file}', 'r') as f:
    vocab = f.readlines()
vocab = [v.strip().replace('[', '').replace(']', '') for v in vocab]
print(f"Total Number of Tokens : {len(vocab)}")

Total Number of Tokens : 30522


In [14]:
vocab[:4]

['PAD', 'unused0', 'unused1', 'unused2']

In [15]:
# Index of special tokens
special_tokens = ['PAD', 'UNK', 'MASK', 'CLS', 'SEP']
token_idx = []
for st in special_tokens:
    token_idx.append((st, vocab.index(st)))
pd.DataFrame(token_idx, columns=['token', 'index'])

Unnamed: 0,token,index
0,PAD,0
1,UNK,100
2,MASK,103
3,CLS,101
4,SEP,102


In [16]:
df_vocab = pd.DataFrame(vocab, columns=['token'])
df_vocab['len'] = df_vocab['token'].apply(len)
print(df_vocab.shape)

(30522, 2)


In [17]:
df_vocab.head()

Unnamed: 0,token,len
0,PAD,3
1,unused0,7
2,unused1,7
3,unused2,7
4,unused3,7


In [18]:
# number of unused tokens
a = df_vocab[df_vocab['token'].str.startswith('unused')]
print(f"Number of unused tokens - {len(a)}")

Number of unused tokens - 995


In [19]:
# number of unit length tokens
df_len_1 = df_vocab[df_vocab['len'] == 1][['token']]
print(f"Number of unused tokens - {len(df_len_1)}")

Number of unused tokens - 995


In [20]:
df_len_1.T.head(100)

Unnamed: 0,999,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024,1025,1026,1027,1028,1029,1030,1032,1034,1035,1036,1037,1038,1039,1040,1041,1042,1043,1044,1045,1046,1047,1048,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058,1059,1060,1061,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071,1072,1073,1074,1075,1076,1077,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102,1103,1104,1105,1106,1107,1108,1109,1110,1111,1112,1113,1114,1115,1116,1117,1118,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128,1129,1130,1131,1132,1133,1134,1135,1136,1137,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,1148,1149,1150,1151,1152,1153,1154,1155,1156,1157,1158,1159,1160,1161,1162,1163,1164,1165,1166,1167,1168,1169,1170,1171,1172,1173,1174,1175,1176,1177,1178,1179,1180,1181,1182,1183,1184,1185,1186,1187,1188,1189,1190,1191,1192,1193,1194,1195,1196,1197,1198,1199,1200,1201,1202,1203,1204,1205,1206,1207,1208,1209,1210,1211,1212,1213,1214,1215,1216,1217,1218,1219,1220,1221,1222,1223,1224,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234,1235,1236,1237,1238,1239,1240,1241,1242,1243,1244,1245,1246,1247,1248,1249,1250,1251,1252,1253,1254,1255,1256,1257,1258,1259,1260,1261,1262,1263,1264,1265,1266,1267,1268,1269,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279,1280,1281,1282,1283,1284,1285,1286,1287,1288,1289,1290,1291,1292,1293,1294,1295,1296,1297,1298,1299,1300,1301,1302,1303,1304,1305,1306,1307,1308,1309,1310,1311,1312,1313,1314,1315,1316,1317,1318,1319,1320,1321,1322,1323,1324,1325,1326,1327,1328,1329,1330,1331,1332,1333,1334,1335,1336,1337,1338,1339,1340,1341,1342,1343,1344,1345,1346,1347,1348,1349,1350,1351,1352,1353,1354,1355,1356,1357,1358,1359,1360,1361,1362,1363,1364,1365,1366,1367,1368,1369,1370,1371,1372,1373,1374,1375,1376,1377,1378,1379,1380,1381,1382,1383,1384,1385,1386,1387,1388,1389,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399,1400,1401,1402,1403,1404,1405,1406,1407,1408,1409,1410,1411,1412,1413,1414,1415,1416,1417,1418,1419,1420,1421,1422,1423,1424,1425,1426,1427,1428,1429,1430,1431,1432,1433,1434,1435,1436,1437,1438,1439,1440,1441,1442,1443,1444,1445,1446,1447,1448,1449,1450,1451,1452,1453,1454,1455,1456,1457,1458,1459,1460,1461,1462,1463,1464,1465,1466,1467,1468,1469,1470,1471,1472,1473,1474,1475,1476,1477,1478,1479,1480,1481,1482,1483,1484,1485,1486,1487,1488,1489,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499,1500,1501,1502,1503,1504,1505,1506,1507,1508,1509,1510,1511,1512,1513,1514,1515,1516,1517,1518,1519,1520,1521,1522,1523,1524,1525,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535,1536,1537,1538,1539,1540,1541,1542,1543,1544,1545,1546,1547,1548,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558,1559,1560,1561,1562,1563,1564,1565,1566,1567,1568,1569,1570,1571,1572,1573,1574,1575,1576,1577,1578,1579,1580,1581,1582,1583,1584,1585,1586,1587,1588,1589,1590,1591,1592,1593,1594,1595,1596,1597,1598,1599,1600,1601,1602,1603,1604,1605,1606,1607,1608,1609,1610,1611,1612,1613,1614,1615,1616,1617,1618,1619,1620,1621,1622,1623,1624,1625,1626,1627,1628,1629,1630,1631,1632,1633,1634,1635,1636,1637,1638,1639,1640,1641,1642,1643,1644,1645,1646,1647,1648,1649,1650,1651,1652,1653,1654,1655,1656,1657,1658,1659,1660,1661,1662,1663,1664,1665,1666,1667,1668,1669,1670,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682,1683,1684,1685,1686,1687,1688,1689,1690,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700,1701,1702,1703,1704,1705,1706,1707,1708,1709,1710,1711,1712,1713,1714,1715,1716,1717,1718,1719,1720,1721,1722,1723,1724,1725,1726,1727,1728,1729,1730,1731,1732,1733,1734,1735,1736,1737,1738,1739,1740,1741,1742,1743,1744,1745,1746,1747,1748,1749,1750,1751,1752,1753,1754,1755,1756,1757,1758,1759,1760,1761,1762,1763,1764,1765,1766,1767,1768,1769,1770,1771,1772,1773,1774,1775,1776,1777,1778,1779,1780,1781,1782,1783,1784,1785,1786,1787,1788,1789,1790,1791,1792,1793,1794,1795,1796,1797,1798,1799,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815,1816,1817,1818,1819,1820,1821,1822,1823,1824,1825,1826,1827,1828,1829,1830,1831,1832,1833,1834,1835,1836,1837,1838,1839,1840,1841,1842,1843,1844,1845,1846,1847,1848,1849,1850,1851,1852,1853,1854,1855,1856,1857,1858,1859,1860,1861,1862,1863,1864,1865,1866,1867,1868,1869,1870,1871,1872,1873,1874,1875,1876,1877,1878,1879,1880,1881,1882,1883,1884,1885,1886,1887,1888,1889,1890,1891,1892,1893,1894,1895,1896,1897,1898,1899,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,1913,1914,1915,1916,1917,1918,1919,1920,1921,1922,1923,1924,1925,1926,1927,1928,1929,1930,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941,1942,1943,1944,1945,1946,1947,1948,1949,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995
token,!,"""",#,$,%,&,',(,),*,+,",",-,.,/,0,1,2,3,4,5,6,7,8,9,:,;,<,=,>,?,@,\,^,_,`,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,{,|,},~,¡,¢,£,¤,¥,¦,§,¨,©,ª,«,¬,®,°,±,²,³,´,µ,¶,·,¹,º,»,¼,½,¾,¿,×,ß,æ,ð,÷,ø,þ,đ,ħ,ı,ł,ŋ,œ,ƒ,ɐ,ɑ,ɒ,ɔ,ɕ,ə,ɛ,ɡ,ɣ,ɨ,ɪ,ɫ,ɬ,ɯ,ɲ,ɴ,ɹ,ɾ,ʀ,ʁ,ʂ,ʃ,ʉ,ʊ,ʋ,ʌ,ʎ,ʐ,ʑ,ʒ,ʔ,ʰ,ʲ,ʳ,ʷ,ʸ,ʻ,ʼ,ʾ,ʿ,ˈ,ː,ˡ,ˢ,ˣ,ˤ,α,β,γ,δ,ε,ζ,η,θ,ι,κ,λ,μ,ν,ξ,ο,π,ρ,ς,σ,τ,υ,φ,χ,ψ,ω,а,б,в,г,д,е,ж,з,и,к,л,м,н,о,п,р,с,т,у,ф,х,ц,ч,ш,щ,ъ,ы,ь,э,ю,я,ђ,є,і,ј,љ,њ,ћ,ӏ,ա,բ,գ,դ,ե,թ,ի,լ,կ,հ,մ,յ,ն,ո,պ,ս,վ,տ,ր,ւ,ք,־,א,ב,ג,ד,ה,ו,ז,ח,ט,י,ך,כ,ל,ם,מ,ן,נ,ס,ע,ף,פ,ץ,צ,ק,ר,ש,ת,،,ء,ا,ب,ة,ت,ث,ج,ح,خ,د,ذ,ر,ز,س,ش,ص,ض,ط,ظ,ع,غ,ـ,ف,ق,ك,ل,م,ن,ه,و,ى,ي,ٹ,پ,چ,ک,گ,ں,ھ,ہ,ی,ے,अ,आ,उ,ए,क,ख,ग,च,ज,ट,ड,ण,त,थ,द,ध,न,प,ब,भ,म,य,र,ल,व,श,ष,स,ह,ा,ि,ी,ो,।,॥,ং,অ,আ,ই,উ,এ,ও,ক,খ,গ,চ,ছ,জ,ট,ড,ণ,ত,থ,দ,ধ,ন,প,ব,ভ,ম,য,র,ল,শ,ষ,স,হ,া,ি,ী,ে,க,ச,ட,த,ந,ன,ப,ம,ய,ர,ல,ள,வ,ா,ி,ு,ே,ை,ನ,ರ,ಾ,ක,ය,ර,ල,ව,ා,ก,ง,ต,ท,น,พ,ม,ย,ร,ล,ว,ส,อ,า,เ,་,།,ག,ང,ད,ན,པ,བ,མ,འ,ར,ལ,ས,မ,ა,ბ,გ,დ,ე,ვ,თ,ი,კ,ლ,მ,ნ,ო,რ,ს,ტ,უ,ᄀ,ᄂ,ᄃ,ᄅ,ᄆ,ᄇ,ᄉ,ᄊ,ᄋ,ᄌ,ᄎ,ᄏ,ᄐ,ᄑ,ᄒ,ᅡ,ᅢ,ᅥ,ᅦ,ᅧ,ᅩ,ᅪ,ᅭ,ᅮ,ᅯ,ᅲ,ᅳ,ᅴ,ᅵ,ᆨ,ᆫ,ᆯ,ᆷ,ᆸ,ᆼ,ᴬ,ᴮ,ᴰ,ᴵ,ᴺ,ᵀ,ᵃ,ᵇ,ᵈ,ᵉ,ᵍ,ᵏ,ᵐ,ᵒ,ᵖ,ᵗ,ᵘ,ᵢ,ᵣ,ᵤ,ᵥ,ᶜ,ᶠ,‐,‑,‒,–,—,―,‖,‘,’,‚,“,”,„,†,‡,•,…,‰,′,″,›,‿,⁄,⁰,ⁱ,⁴,⁵,⁶,⁷,⁸,⁹,⁺,⁻,ⁿ,₀,₁,₂,₃,₄,₅,₆,₇,₈,₉,₊,₍,₎,ₐ,ₑ,ₒ,ₓ,ₕ,ₖ,ₗ,ₘ,ₙ,ₚ,ₛ,ₜ,₤,₩,€,₱,₹,ℓ,№,ℝ,™,⅓,⅔,←,↑,→,↓,↔,↦,⇄,⇌,⇒,∂,∅,∆,∇,∈,−,∗,∘,√,∞,∧,∨,∩,∪,≈,≡,≤,≥,⊂,⊆,⊕,⊗,⋅,─,│,■,▪,●,★,☆,☉,♠,♣,♥,♦,♭,♯,⟨,⟩,ⱼ,⺩,⺼,⽥,、,。,〈,〉,《,》,「,」,『,』,〜,あ,い,う,え,お,か,き,く,け,こ,さ,し,す,せ,そ,た,ち,っ,つ,て,と,な,に,ぬ,ね,の,は,ひ,ふ,へ,ほ,ま,み,む,め,も,や,ゆ,よ,ら,り,る,れ,ろ,を,ん,ァ,ア,ィ,イ,ウ,ェ,エ,オ,カ,キ,ク,ケ,コ,サ,シ,ス,セ,タ,チ,ッ,ツ,テ,ト,ナ,ニ,ノ,ハ,ヒ,フ,ヘ,ホ,マ,ミ,ム,メ,モ,ャ,ュ,ョ,ラ,リ,ル,レ,ロ,ワ,ン,・,ー,一,三,上,下,不,世,中,主,久,之,也,事,二,五,井,京,人,亻,仁,介,代,仮,伊,会,佐,侍,保,信,健,元,光,八,公,内,出,分,前,劉,力,加,勝,北,区,十,千,南,博,原,口,古,史,司,合,吉,同,名,和,囗,四,国,國,土,地,坂,城,堂,場,士,夏,外,大,天,太,夫,奈,女,子,学,宀,宇,安,宗,定,宣,宮,家,宿,寺,將,小,尚,山,岡,島,崎,川,州,巿,帝,平,年,幸,广,弘,張,彳,後,御,德,心,忄,志,忠,愛,成,我,戦,戸,手,扌,政,文,新,方,日,明,星,春,昭,智,曲,書,月,有,朝,木,本,李,村,東,松,林,森,楊,樹,橋,歌,止,正,武,比,氏,民,水,氵,氷,永,江,沢,河,治,法,海,清,漢,瀬,火,版,犬,王,生,田,男,疒,発,白,的,皇,目,相,省,真,石,示,社,神,福,禾,秀,秋,空,立,章,竹,糹,美,義,耳,良,艹,花,英,華,葉,藤,行,街,西,見,訁,語,谷,貝,貴,車,軍,辶,道,郎,郡,部,都,里,野,金,鈴,镇,長,門,間,阝,阿,陳,陽,雄,青,面,風,食,香,馬,高,龍,龸,ﬁ,ﬂ,！,（,）,，,－,．,／,：,？,～


In [21]:
print(df_vocab[df_vocab['len'] == 1]['token'].values)

['!' '"' '#' '$' '%' '&' "'" '(' ')' '*' '+' ',' '-' '.' '/' '0' '1' '2'
 '3' '4' '5' '6' '7' '8' '9' ':' ';' '<' '=' '>' '?' '@' '\\' '^' '_' '`'
 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q' 'r'
 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' '{' '|' '}' '~' '¡' '¢' '£' '¤' '¥' '¦'
 '§' '¨' '©' 'ª' '«' '¬' '®' '°' '±' '²' '³' '´' 'µ' '¶' '·' '¹' 'º' '»'
 '¼' '½' '¾' '¿' '×' 'ß' 'æ' 'ð' '÷' 'ø' 'þ' 'đ' 'ħ' 'ı' 'ł' 'ŋ' 'œ' 'ƒ'
 'ɐ' 'ɑ' 'ɒ' 'ɔ' 'ɕ' 'ə' 'ɛ' 'ɡ' 'ɣ' 'ɨ' 'ɪ' 'ɫ' 'ɬ' 'ɯ' 'ɲ' 'ɴ' 'ɹ' 'ɾ'
 'ʀ' 'ʁ' 'ʂ' 'ʃ' 'ʉ' 'ʊ' 'ʋ' 'ʌ' 'ʎ' 'ʐ' 'ʑ' 'ʒ' 'ʔ' 'ʰ' 'ʲ' 'ʳ' 'ʷ' 'ʸ'
 'ʻ' 'ʼ' 'ʾ' 'ʿ' 'ˈ' 'ː' 'ˡ' 'ˢ' 'ˣ' 'ˤ' 'α' 'β' 'γ' 'δ' 'ε' 'ζ' 'η' 'θ'
 'ι' 'κ' 'λ' 'μ' 'ν' 'ξ' 'ο' 'π' 'ρ' 'ς' 'σ' 'τ' 'υ' 'φ' 'χ' 'ψ' 'ω' 'а'
 'б' 'в' 'г' 'д' 'е' 'ж' 'з' 'и' 'к' 'л' 'м' 'н' 'о' 'п' 'р' 'с' 'т' 'у'
 'ф' 'х' 'ц' 'ч' 'ш' 'щ' 'ъ' 'ы' 'ь' 'э' 'ю' 'я' 'ђ' 'є' 'і' 'ј' 'љ' 'њ'
 'ћ' 'ӏ' 'ա' 'բ' 'գ' 'դ' 'ե' 'թ' 'ի' 'լ' 'կ' 'հ' 'մ' 'յ' 'ն' 'ո' 'պ' 'ս'
 'վ' 'տ' 'ր' 'ւ' 'ք' '־' 'א' 'ב' 'ג' 'ד' 'ה' 'ו' '

In [22]:
# Distribution of length of tokens
df_vocab['len'].value_counts(normalize=True)

6     0.166699
5     0.145633
7     0.139178
8     0.109953
9     0.107463
4     0.098093
3     0.077485
10    0.049702
1     0.032599
11    0.028602
2     0.016054
12    0.015988
13    0.007994
14    0.003014
15    0.001016
16    0.000360
17    0.000066
0     0.000066
18    0.000033
Name: len, dtype: float64

In [23]:
# Number of subwords (starts with ## (double hash))
a = df_vocab[df_vocab['token'].str.startswith('##')]
print(f"Number of subwords - {len(a)}")

Number of subwords - 5828


In [24]:
# position of first english alphabetic token except special tokens
df_vocab[(~df_vocab['token'].isin(special_tokens)) & (df_vocab['token'].str.isalpha())].head(5)

Unnamed: 0,token,len
1037,a,1
1038,b,1
1039,c,1
1040,d,1
1041,e,1


In [25]:
# Find out tokens with contractions e.g. what's, don't etc.
df_vocab[df_vocab['token'].str.contains("'")]

Unnamed: 0,token,len
1005,',1
29618,##',3


In [26]:
('dont' in df_vocab['token'],
 'cant' in df_vocab['token'],
 'whats' in df_vocab['token'],
 'gotta' in df_vocab['token'])

(False, False, False, False)

In [27]:
# check the names
!wget 'http://www.gutenberg.org/files/3201/files/NAMES.TXT'

--2021-03-29 09:28:48--  http://www.gutenberg.org/files/3201/files/NAMES.TXT
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 179353 (175K) [text/plain]
Saving to: ‘NAMES.TXT’


2021-03-29 09:28:49 (425 KB/s) - ‘NAMES.TXT’ saved [179353/179353]



In [28]:
with open('NAMES.TXT', 'rb') as f:
    names = []
    for n in f.readlines():
        try:
            names.append(n.decode('utf-8').strip().lower())
        except:
            pass

print(names[:4], len(names))

['aaberg', 'aalst', 'aara', 'aaren'] 21985


In [29]:
names_in_vocab = set(names).intersection(set(df_vocab['token']))
print(len(names_in_vocab))

3865


In [30]:
print(names_in_vocab)

{'argent', 'mark', 'marie', 'irene', 'clements', 'darcy', 'micro', 'deck', 'colon', 'timothy', 'agnes', 'santos', 'hunter', 'ronald', 'julian', 'wiltshire', 'wilcox', 'champagne', 'sera', 'bishop', 'jamie', 'gavin', 'olaf', 'shane', 'steiner', 'tempest', 'roy', 'lamar', 'con', 'kincaid', 'cole', 'augustine', 'corbin', 'chi', 'stephenson', 'leopold', 'su', 'trojan', 'wesley', 'teddy', 'robin', 'alain', 'moe', 'dimitri', 'mab', 'kiel', 'kung', 'kristin', 'orson', 'barlow', 'vigor', 'camden', 'plume', 'bolton', 'tay', 'joey', 'wiley', 'hartman', 'henrietta', 'simmons', 'may', 'edge', 'erich', 'lowry', 'bryant', 'van', 'muller', 'eric', 'kendall', 'tierney', 'nelly', 'carlton', 'faust', 'christi', 'diana', 'dolphin', 'salim', 'cousins', 'zealand', 'chew', 'janet', 'kay', 'ella', 'dail', 'beaumont', 'maxim', 'foster', 'bancroft', 'dream', 'tally', 'rod', 'dieter', 'church', 'kelsey', 'prentice', 'doe', 'lloyd', 'lamp', 'arte', 'lucille', 'dunn', 'moon', 'beau', 'andreas', 'menon', 'hamish',