# Hash Stat

This notebook collects statistics on the number of references returned for each hash.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the per-hashtag statistics table (one 'counts' value per hashtag)
# and preview its first rows.
df_stat = pd.read_pickle('./pickle/hash_stat.pickle')
df_stat.head()

Unnamed: 0_level_0,counts
hashtag,Unnamed: 1_level_1
repubblica#venezia,6508
san#marco,4759
storia#venezia,4671
atti#convegno,4603
cura#1985,3684


In [3]:
# In thousands: bucket k holds the hashtags whose count lies in [k*1000, (k+1)*1000).
# Vectorised floor division yields the same column as the former row-wise
# `.apply(lambda ...)`, without one Python-level call per row.
df_stat['k_group'] = df_stat['counts'] // 1000
# Number of hashtags in each thousands bucket
df_stat.groupby('k_group')['counts'].count()

k_group
0    22374351
1          67
2          16
3          21
4           3
6           1
Name: counts, dtype: int64

In [5]:
# In hundreds: bucket h holds the hashtags whose count lies in [h*100, (h+1)*100).
# Vectorised floor division replaces the former row-wise `.apply(lambda ...)`.
df_stat['h_group'] = df_stat['counts'] // 100
# Tally only the hashtags with fewer than 1,000 references
df_stat.loc[df_stat['counts'] < 1000].groupby('h_group')['counts'].count()

h_group
0    22369404
1        3587
2         773
3         289
4         134
5          58
6          41
7          27
8          24
9          14
Name: counts, dtype: int64

In [6]:
# In tens: bucket t holds the hashtags whose count lies in [t*10, (t+1)*10).
# Vectorised floor division replaces the former row-wise `.apply(lambda ...)`.
df_stat['t_group'] = df_stat['counts'] // 10
# Tally only the hashtags with fewer than 100 references
df_stat.loc[df_stat['counts'] < 100].groupby('t_group')['counts'].count()

t_group
0    22228720
1       90892
2       24089
3       10675
4        5635
5        3282
6        2410
7        1609
8        1179
9         913
Name: counts, dtype: int64

In [7]:
# In ones: buckets of width 1 (floor division by 1), one bucket per distinct count.
# Vectorised floor division replaces the former row-wise `.apply(lambda ...)`.
df_stat['d_group'] = df_stat['counts'] // 1
# Tally only the hashtags with fewer than 10 references
df_stat.loc[df_stat['counts'] < 10].groupby('d_group')['counts'].count()

d_group
1    20004324
2     1401939
3      380207
4      178342
5      100915
6       63798
7       43459
8       31810
9       23926
Name: counts, dtype: int64

In [8]:
# Hashtags with 1,000 or more references (thousands bucket of at least 1)
df_stat.loc[df_stat['k_group'] > 0].index.values

array(['repubblica#venezia', 'san#marco', 'storia#venezia',
       'atti#convegno', 'cura#1985', 'eta#moderna', 'cura#1984',
       'cura#1987', 'cura#1983', 'cura#1993', 'cura#1996', 'cura#1986',
       'cura#1997', 'cura#1981', 'cura#1995', 'cura#1988', 'cura#1989',
       'cura#1994', 'cura#1991', 'cura#1992', 'cura#1998',
       'catalogo#mostra', 'cura#1990', 'cura#1982', 'cura#1980',
       'renaissance#venice', 'storia#cultura', 'cultura#veneta',
       'cura#1979', 'moyen#age', 'cura#1999', 'repubblica#veneta',
       'cura#2000', 'storia#arte', 'cura#2001', 'cura#2002', 'cura#2003',
       'secolo#xvi', 'cura#atti#convegno', 'cura#1978', 'cura#1973',
       'cura#1977', 'cura#1974', 'cura#1972', 'cura#1968',
       'dizionario#biografico', 'storia#italia', 'cura#1967', 'cura#2004',
       'cura#1976', 'cura#1969', 'cura#1966', 'biografico#italiani',
       'xvi#secolo', 'cura#1975', 'storia#economica', 'cura#1970',
       'civilta#veneziana', 'venezia#eta', 'cura#2005', 'cura#

**Brief Analysis:**
1. `venezia` and `venetia` are quite common.
2. `cultura` means 'culture' and `storia` means 'history'.
3. `san#marco` is one of the six sestieri of Venice.
4. 'atti del convegno' means 'conference proceedings'. `atti#convegno` is parsed as part of title.
5. 'a cura di' means 'edited by'. `cura` is included in the hashtag as part of the author due to a parsing error.
6. `eta#moderna` means 'modern age'; `moyen#age` and `middle#age` mean 'Middle Ages'.
7. 'catalogo della mostra' means 'exhibition catalog'. `catalogo#mostra` is parsed as part of title.
8. `xvi#secolo`, `secoli#xvi`, `sixteenth#century`, `xviii#secolo` mean '16th century' or '18th century'.
9. `dizionario#biografico`, `biografico#italiani` are part of 'Dizionario biografico degli Italiani', which is a biographical dictionary.
10. `palazzo#ducale` is a palace built in Venetian Gothic style.
11. `convegno#internazionale` means 'international conference'.
12. `santa#maria` always shows up as part of a church name.
13. `citta#nobilissima`, `sansovino#citta#nobilissima` point to the commonly cited reference 'Venetia città nobilissima et singolare'.
14. `origini#caduta` points to the reference 'Storia di Venezia nella vita privata dalle origini alla caduta della Repubblica' that is commonly cited.
15. `paolo#sarpi` is an Italian historian.
16. `cozzi#repubblica#venezia` is a combination of author 'cozzi' and title bigram 'repubblica#venezia' which is the favorite topic of this author.

**Blacklist for this part:**
- Meaningless: `cura#...`
- **??Too common??**: `venezia`, `veneta`, `palazzo#ducale`, `paolo#sarpi`, etc.
- Not necessary: `secolo`, `secoli`, `century` ... date information which is not the main part of the title
- Publication name or similar: `convegno#internazionale`, `dizionario#biografico`, `biografico#italiani`

In [19]:
# Hashtags with 500-999 references (hundreds bucket 5 or higher, count below 1,000)
tags = df_stat.loc[(df_stat['h_group'] >= 5) & (df_stat['counts'] < 1000)].index.values
# Drop the 'cura#...' tags and the century tags ('secoli#...', '...#secolo', '...#century').
# NOTE(review): tags that start with 'secolo#' are not matched by this filter —
# confirm that is intended.
np.array([
    tag for tag in tags
    if not (tag.startswith(('cura#', 'secoli#')) or tag.endswith(('#secolo', '#century')))
])

array(['medio#evo', 'storia#repubblica', 'memorie#storiche',
       'seconda#meta', 'studi#storici', 'maggior#consiglio',
       'nobilissima#singolare', 'xvii#xviii', 'andrea#palladio',
       'sansovino#venetia#citta', 'vita#privata', 'storia#patria',
       'documentata#venezia', 'storia#diritto', 'societa#veneta',
       'luzzatto#storia#economica', 'internazionale#studi',
       'marco#venezia', 'museo#correr', 'sansovino#nobilissima#singolare',
       'chiesa#san', 'romanin#storia#documentata', 'medioevo#eta',
       'early#modem', 'chiesa#venezia', 'arnaldi#cultura#veneta',
       'ateneo#veneto', 'archivio#stato', 'caduta#serenissima',
       'early#modern', 'stato#venezia', 'vita#opere',
       'arnaldi#storia#cultura', 'romanin#documentata#venezia',
       'patriziato#veneziano', 'aspetti#problemi', 'xvi#xviii',
       'belle#arti', 'ducale#venezia', 'xvie#siecle',
       'venezia#cinquecento', 'convegno#studi', 'marco#polo',
       'italian#renaissance', 'secolo#xiv', 'hale#

## Trying to Build a Blacklist - should the overly common hashes be included?

In [None]:
# Every hashtag that begins with the 'cura#' token
hash_cura = df_stat.index[df_stat.index.str.startswith('cura#')].tolist()

In [None]:
# Century-related hashtags, limited to those with at most one '#' separator (bigrams)
hash_seco = [
    tag for tag in df_stat.index.values
    if tag.count('#') <= 1
    and (tag.startswith('secoli#') or tag.endswith(('#secolo', '#century')))
]

In [None]:
# Hashes that are part of the name of a publication the reference comes from.
# TODO (left open in the original notes; presumably the list still needs completing)
hash_public = [
    'studi#onore',
    'archivio#veneto',
    'lettere#arti',
    'scienze#lettere',
    'dizionario#biografico',
    'biografico#italiani',
    'convegno#internazionale',
    'storia#patria',
]

In [None]:
# Hashes produced by exactly one reference; collected so that
# useless searches on them can be avoided.
hash_one = df_stat.loc[df_stat['counts'].eq(1)].index.values

## Match Result without Hash Blacklist

If the match is run **without building a hash blacklist**, the result for $\frac{3}{4}$ of all full refs is:

In [36]:
def number_level(n):
    """Return the coarse size label for the count ``n``.

    The label names the largest threshold among 1000, 500, 200, 100, 50, 20
    and 10 that ``n`` reaches, followed by '+'. When ``n`` reaches none of
    them, the label is '1' if ``n`` equals 1 and '1+' for any other value.
    """
    # Ordered from the largest threshold down, so the first hit is the tightest bucket.
    buckets = (
        (1000, '1000+'),
        (500, '500+'),
        (200, '200+'),
        (100, '100+'),
        (50, '50+'),
        (20, '20+'),
        (10, '10+'),
    )
    for threshold, label in buckets:
        if n // threshold > 0:
            return label
    return '1' if n == 1 else '1+'

In [43]:
# Display order for the size labels, largest bucket first
level = [
    '1000+', '500+', '200+', '100+',
    '50+', '20+', '10+',
    '1+', '1',
]

In [20]:
# Load the match results from hash_ref1 and record, per row, the length of its 'match' entry
df_1 = pd.read_pickle("./pickle/hash_ref1.pickle")
df_1['len'] = df_1['match'].map(len)
# Summary statistics of the match lengths
df_1['len'].describe()

count    117029.000000
mean        830.340873
std        1714.278039
min           1.000000
25%          20.000000
50%         124.000000
75%         679.000000
max       21467.000000
Name: len, dtype: float64

In [44]:
# Number of rows per size label, listed in the order given by `level`.
# `.reindex(..., fill_value=0)` reports a label with no rows as 0, whereas
# indexing with `[level]` raises KeyError as soon as one label is absent.
df_1['len'].apply(number_level).value_counts().reindex(level, fill_value=0)

1000+    23537
500+     10689
200+     15591
100+     12693
50+      11732
20+      14202
10+       8865
1+       14880
1         4840
Name: len, dtype: int64

In [21]:
# Load the match results from hash_ref2 and record, per row, the length of its 'match' entry
df_2 = pd.read_pickle("./pickle/hash_ref2.pickle")
df_2['len'] = df_2['match'].map(len)
# Summary statistics of the match lengths
df_2['len'].describe()

count    130925.000000
mean       1137.804377
std        2074.346238
min           1.000000
25%          37.000000
50%         212.000000
75%        1063.000000
max       31267.000000
Name: len, dtype: float64

In [45]:
# Number of rows per size label, listed in the order given by `level`.
# `.reindex(..., fill_value=0)` reports a label with no rows as 0, whereas
# indexing with `[level]` raises KeyError as soon as one label is absent.
df_2['len'].apply(number_level).value_counts().reindex(level, fill_value=0)

1000+    33701
500+     13159
200+     19707
100+     14276
50+      12577
20+      13947
10+       8193
1+       12264
1         3101
Name: len, dtype: int64

In [22]:
# Load the match results from hash_ref3 and record, per row, the length of its 'match' entry
df_3 = pd.read_pickle("./pickle/hash_ref3.pickle")
df_3['len'] = df_3['match'].map(len)
# Summary statistics of the match lengths
df_3['len'].describe()

count    130800.000000
mean       1260.774205
std        2202.370634
min           1.000000
25%          43.000000
50%         242.000000
75%        1269.000000
max       36125.000000
Name: len, dtype: float64

In [46]:
# Number of rows per size label, listed in the order given by `level`.
# `.reindex(..., fill_value=0)` reports a label with no rows as 0, whereas
# indexing with `[level]` raises KeyError as soon as one label is absent.
df_3['len'].apply(number_level).value_counts().reindex(level, fill_value=0)

1000+    36840
500+     13386
200+     19116
100+     14061
50+      12154
20+      13618
10+       7851
1+       11157
1         2617
Name: len, dtype: int64