In [2]:
import pandas as pd
import re
from nltk.cluster import KMeansClusterer
import nltk
import numpy as np
import spacy
from sklearn.cluster import KMeans
from tqdm import tqdm
import requests

In [4]:
# load file

locations = pd.read_csv('./data/librettos_theaters.csv', delimiter='\t')
locations['id'] = [int(re.findall(r'\d+', s)[0]) for s in locations.file_name]
locations = locations.drop(columns=['Unnamed: 0'])
locations.sample(5)

Unnamed: 0,file_name,title,date,coperta,pot_city_name,coperta_appended,city_name,latitude,longitude,pot_city_name_fuzzy,...,predicted_cluster_method_1,inferred_latitude,inferred_longitude,title_opera_method_1,title_opera_method_2,title_vec_method_1,predicted_title_cluster_method_1,composer,inferred_title,title_mediawiki_pageid
819,data_935.json,Medea. tragedia lirica in tre atti da rapprese...,1850,"['—', '_', '_', 'ey', 'PLES', 'ae', ')', 'ro',...",[],0,0,0.0,0.0,[],...,3,45.4337096,12.3338572,Medea,Medea,[ 5.0539675 3.0430608 10.380463 -4.242064...,251,Not found,Medea,691887
283,data_601.json,"L' amore costante, commedia per musica in quat...",1787,"['{', 'foartedi', '“Sova', 'nn', '!', ')', 'Po...",[],0,0,0.0,0.0,[],...,73,Not found,Not found,L' amore costante,Not found,[ 0.448929 0.53146785 2.7469213 -2.901008...,59,Not found,L' amore costante,2922269
677,data_217.json,"Erminia, farsa giocosa per musica del sig. L. ...",1805,"['f', '\\', 'LOCO', 'SOCHEOOOOS', 'F', ':', 'E...","['livorn', 'casale']",0,livorn,43.54427,10.32615,"['livorn', 'venezia']",...,4,41.1297436,14.783396,Erminia,Erminia,[ 6.9308467 3.0184045 9.5799885 -4.379949...,126,musica del sig. L. G. Buonavoglia l,Erminia,1142922
547,data_302.json,"Gli avventurieri, opera buffa in due atti da r...",1842,"['GLI', 'AVVENTURIERI', 'OPERA', 'BUFFA', '18'...","['venezia', 'venezia', 'venetia']",0,venezia,45.43713,12.33265,"['venezia', 'venezia', 'venetia']",...,3,45.4337096,12.3338572,Gli avventurieri,Not found,[-2.6041934e+00 3.1466422e+00 4.8280487e+00 ...,7,maestro Antonio Buzzolla,Gli avventurieri,6435055
1022,data_376.json,"Il Pastor fido, tragicomedia pastorale del sig...",1784,"['3', 'Qi', 'x', 'Galore', 'icy', 'Toe', 'PASS...","['venezia', 'venezia', 'venetia']",1,venezia,45.43713,12.33265,"['venezia', 'venezia', 'venetia']",...,1,Not found,Not found,"Il Pastor fido, tragicomedia pastorale del sig...",Pastor fido,[ 0.5116386 0.69495374 2.4874156 0.542860...,156,Not found,"Il Pastor fido, tragicomedia pastorale del sig...",Not found


In [18]:
expr = r'''favola pastorale|fauola musicale|burletta in musica|componimento sagro|azione musicale|componimento sacro|spettacolo fantastico|scena in versi|fauola tragicomica|fauola pastorale|diuertimento comico|azione romantica|intermezzo in musica|intermezzi|favola in prosa|componimento sagro|componimento|azione sacra|actio sacra|libretto fantastico|opera|tragedia|farsa|commeddia|commedia|operetta|festa|cantata|dramma|drama|melodramma|melodrama|melo-dramma|oratorio|libretto|fiaba|fauola|favola'''
locations['genre_opera'] = [re.search(expr,s.lower()).group(0) if re.search(expr,s.lower()) else 'Not found' for s in locations.title]

gergo2it = {'fauola':'favola', 'drama':'dramma', 'melodrama':'melodramma', 
            'melo-dramma':'melodramma', 'actio sacra':'azione sacra', 'componimento sagro':'componimento sacro',
            'fauola pastorale':'favola pastorale', 'commeddia':'commedia', 'diuertimento comico':'divertimento comico',
            'fauola tragicomica': 'favola tragicomica'
           }
locations['genre_opera'] = locations['genre_opera'].map(lambda x: gergo2it[x] if x in gergo2it.keys() else x)
locations.genre_opera.value_counts()

dramma                   710
melodramma                89
Not found                 55
tragedia                  42
componimento sacro        37
farsa                     29
opera                     25
azione sacra              21
oratorio                  19
festa                     12
commedia                  11
cantata                   11
intermezzi                 8
favola pastorale           7
componimento               7
libretto                   5
favola                     5
operetta                   4
fiaba                      2
fauola musicale            1
spettacolo fantastico      1
azione musicale            1
divertimento comico        1
scena in versi             1
libretto fantastico        1
favola tragicomica         1
favola in prosa            1
burletta in musica         1
intermezzo in musica       1
azione romantica           1
Name: genre_opera, dtype: int64

In [35]:
nlp = spacy.load("it_core_news_sm")
locations['genre_vec_method_1'] = [np.mean(np.array([token.vector for token in nlp(loc)]), axis=0) for loc in locations.genre_opera]
print('Processed the set')

NUM_CLUSTERS = 15
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=0, max_iter=500, verbose=10000).fit(locations['genre_vec_method_1'].tolist())

Processed the set
Initialization complete
Iteration 0, inertia 16061.076542260764
Iteration 1, inertia 13226.865880966596
Iteration 2, inertia 13195.09129230648
Converged at iteration 2: strict convergence.
Initialization complete
Iteration 0, inertia 15712.991658011408
Iteration 1, inertia 11817.625426569226
Iteration 2, inertia 11787.777340259454
Converged at iteration 2: strict convergence.
Initialization complete
Iteration 0, inertia 13984.209133167722
Iteration 1, inertia 11794.77631444457
Iteration 2, inertia 11765.676743042563
Converged at iteration 2: strict convergence.
Initialization complete
Iteration 0, inertia 14924.019186460195
Iteration 1, inertia 11258.527757515667
Converged at iteration 1: strict convergence.
Initialization complete
Iteration 0, inertia 13984.209133167722
Iteration 1, inertia 11794.77631444457
Iteration 2, inertia 11765.676743042563
Converged at iteration 2: strict convergence.
Initialization complete
Iteration 0, inertia 13984.209133167722
Iteration 1

In [36]:
locations['predicted_genre_cluster_method_1'] = locations['genre_vec_method_1'].apply(lambda x: kmeans.predict([x.tolist()])[0])
locations['predicted_genre_cluster_method_1']

0       0
1       0
2       2
3       4
4       0
       ..
1105    0
1106    0
1107    0
1108    0
1109    0
Name: predicted_genre_cluster_method_1, Length: 1110, dtype: int64

In [37]:
cluster_genre = {}
for i, cluster in enumerate(locations.predicted_genre_cluster_method_1):
    if cluster in cluster_genre.keys():
        if locations.genre_opera[i] != 'Not found':
            cluster_genre[cluster].append(locations.genre_opera[i])
    else:
            cluster_genre[cluster] = [locations.genre_opera[i]]

In [38]:
cluster_genre

{0: ['dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
  'dramma',
 

In [39]:
for k in cluster_genre.keys():
    cluster_genre[k] = max(set(cluster_genre[k]), key=cluster_genre[k].count)
    
cluster_genre

{0: 'dramma',
 2: 'melodramma',
 4: 'componimento sacro',
 3: 'Not found',
 5: 'opera',
 7: 'farsa',
 6: 'tragedia',
 1: 'azione sacra',
 10: 'festa',
 11: 'commedia',
 12: 'cantata',
 8: 'oratorio',
 14: 'favola pastorale',
 13: 'libretto',
 9: 'intermezzi'}

In [40]:
locations['inferred_genre'] = [cluster_genre[cluster] for cluster in locations.predicted_genre_cluster_method_1]
locations.inferred_genre.value_counts()

dramma                710
melodramma             89
Not found              55
componimento sacro     46
tragedia               42
farsa                  29
azione sacra           28
opera                  25
oratorio               19
favola pastorale       15
commedia               15
festa                  12
cantata                11
intermezzi              8
libretto                6
Name: inferred_genre, dtype: int64

In [41]:
S = requests.Session()
URL = "https://it.wikipedia.org/w/api.php"

In [42]:
locations['genre_mediawiki_pageid'] = [S.get(url=URL, params={"action": "query",
                                                              "format": "json",
                                                              "list": "search",
                                                              "srsearch": title}).json()
                                            ['query']['search'][0]['pageid'] 
                                       if len(S.get(url=URL, params={"action": "query",
                                                                     "format": "json",
                                                                     "list": "search",
                                                                     "srsearch": title}).json()
                                              ['query']['search']) > 0
                                       else
                                           'Not found'
                                       for title in locations.inferred_genre
                                       ]
locations['genre_mediawiki_pageid']

0        375957
1        375957
2       2615411
3        710671
4        375957
         ...   
1105     375957
1106     375957
1107     375957
1108     375957
1109     375957
Name: genre_mediawiki_pageid, Length: 1110, dtype: int64

In [43]:
locations.to_csv('./data/librettos_theaters.csv', sep='\t')