# Part I - LocalMaxs Extractor

Implementation of the multi-word Relevant Expressions LocalMaxs extractor, taking into account the following requirements:

- To obtain tokens, you may add a space character before and/or after characters such as “;”, “:”, “!”, “?”, “<”, “>”, “&”, “)”, “(“, “]”, “[", among others that do not change the semantics of the text, in order to improve the reliability of token frequencies.

- Choose a sufficiently efficient programming language so you can use the extractor in corpus of at least 1.5 million words.

- Let it be possible to use more than one cohesion metric, such as SCP, Dice, φ2, amomg others.

- Consider n-grams of length up to 7.

- Consider a minimum frequency filter as necessary requirement for an n-gram to be considered as Relevant Expression (RE); for example, the frequency of a RE must be at least 2.

- Evaluate the results of the extractor through the Precision, Recall and F metric, for at least two corpora. Consider one or more languages.

In [None]:
MaxXaxisValStopWords = 4000
DeltaX = 4

def stopwords(trigrams):
    stopWordsCandidates = {}
    for i in range(len(trigrams)):
        if(trigrams[i][0] not in stopWordsCandidates):
            stopWordsCandidates[trigrams[i][0]]=1
        else:
            stopWordsCandidates[trigrams[i][0]]+=1
        if(trigrams[i][2] not in stopWordsCandidates):
            stopWordsCandidates[trigrams[i][2]]=1
        else:
            stopWordsCandidates[trigrams[i][2]]+=1
    stopWordsCandidatesOrdered = calculateRatioOcurrSyllablesWord(stopWordsCandidates)
    prev=float('inf')
    StopWords=[]
    ElbowFound=False
    for b, word in enumerate(stopWordsCandidatesOrdered):
        if b < MaxXaxisValStopWords:
            if not ElbowFound:
                if b % DeltaX == 0:
                    val = stopWordsCandidates[word]
                    if prev - val < DeltaX:
                        prev = val
                        ElbowFound = True
                    else:
                        prev = val
                StopWords.append(word)
            else:
                break
        else:
            break

    return StopWords

def calculateRatioOcurrSyllablesWord(stopWordsCandidates):
    for word in stopWordsCandidates:
        ocurr=stopWordsCandidates[word]
        syl=countSyllables(word)
        if syl==0:
            syl=1
        stopWordsCandidates[word]= ocurr/syl
    return dict(sorted(stopWordsCandidates.items(), key=lambda item: item[1], reverse=True))

def countSyllables(word):
    vowels = 'aeiouyàáãâíìéèêóòúù'
    count = 0
    prev_char = None

    for char in word.lower():
        if char in vowels and (prev_char is None or prev_char not in vowels):
            count += 1
        prev_char = char

    if word.lower().endswith('e'):
        count -= 1

    return count

In [14]:
import re
from collections import Counter
import os

def LocalMaxsExtractor(directory, max_n, p=2, glue='SCP'):    
    """
    Implementation of the LocalMaxs Extractor algorithm
    INPUTS
    - file_path: path of the file
    - max_n: max length of the ngram to extract
    - p: exponent of the generalised mean
    OUTPUTS
    - RMWE: dictionary containing key = doc, value a dictionary containing REs (keys) probabilities (values)
            example: doc: {RE_1: P(RE_1), RE_2: P(RE_2), ..., RE_n: P(RE_n)}
    """
    
    def extract_ngrams(txt, max_n):
        """
        Function to extract the n-grams from the txt
        n goes from 1 to max_n
        """
        ngrams = [] # container to store the n-grams
        characters = r';:!<>&()\[/]'
        for line in txt:
            mod_line = re.sub(f'([{re.escape(characters)}])', r' \1 ', line)
            tokens = mod_line.split()  # split it into tokens
            for curr_n in range(1, max_n + 1): 
                for i in range(len(tokens) - curr_n + 1):
                    curr_n_gram = tuple(tokens[i:i + curr_n])
                    ngrams.append(curr_n_gram)
        return ngrams
    
    def extract_trigrams(txt):
        """
        Function to extract the trigrams from the txt
        """
        trigrams = [] # container to store the n-grams
        for line in txt:
            mod_line = re.sub(r'[^\w\s]', '', line)
            tokens = mod_line.split()  # split it into tokens
            for i in range(len(tokens) - 3 + 1):
                curr_trigram = tuple(tokens[i:i + 3])
                trigrams.append(curr_trigram)
        return trigrams
    
    

    def scp(n_gram_freq, pseudo_two_grams_freq):
        """
        SCP_f cohesion function
        """
        den = 0
        for tup in pseudo_two_grams_freq:
            temp_1 , temp_2 = tup
            den += temp_1*temp_2 
        den /= len(pseudo_two_grams_freq)
        return (n_gram_freq**2)/den
    
    def dice(n_gram_freq, pseudo_two_grams_freq):
        """
        DICE_f cohesion function
        """
        den = 0
        for tup in pseudo_two_grams_freq:
            temp_1 , temp_2 = tup
            den += temp_1 + temp_2
        den /= len(pseudo_two_grams_freq)
        return (n_gram_freq*2)/den
    
    def len_doc(text):
        """
        count the number of words of text
        """
        return len(re.sub(r'[^\w\s]', ' ', text).split())
    
    if glue=='SCP': glue = scp
    elif glue=='Dice': glue = dice
    
    RMWE = {}  # store the Relevant MultiWord Expressions
    
    for filename in os.listdir(directory):
        RMWE[filename] = []
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as fp:
            buffer = fp.readlines()
        
        stopword = set(stopwords(extract_trigrams(txt=buffer)))
            
        counter = Counter(extract_ngrams(txt=buffer, max_n=max_n)) # extract the n-grams from the file and compute freq
        
        characters = r';:!<>&()\[/]'
        N = len(re.sub(f'([{re.escape(characters)}])', r' \1 ', " ".join(buffer)).split()) # compute the number of words in buffer
        
        d = {ngram: {'freq': freq, 'prob' : freq/N} for ngram, freq in counter.items()} # add ngram as keys and freq as values
        
        
        # Compute the glue for each n-gram with n>1
        for ngram, info in d.items():
            if len(ngram) > 1:
                pseudo_two_grams_freq = [] # store the pseudo two-gram used for computing the cohesion function
                for i in range(1, len(ngram)):
                    pseudo_two_grams_freq.append( (d[ngram[:i]]['freq'], d[ngram[i:]]['freq']) )
                d[ngram]['glue'] = glue(info['freq'], pseudo_two_grams_freq)  # calling the cohesion function
        
        # Compute max_omega_plus and max_omega_minus and store it in d
        for ngram, info in d.items():
            if len(ngram) > 2:
                
                # Omega minus
                l, r = d[ngram[:-1]]['glue'], d[ngram[1:]]['glue']
                info['omega_minus'] = max(l,r)
                
                # Omega Plus
                if 'omega_plus' not in d[ngram[:-1]]:
                    d[ngram[:-1]]['omega_plus'] = info['glue']
                else:
                    d[ngram[:-1]]['omega_plus'] = max(d[ngram[:-1]]['omega_plus'], info['glue'])
                
                if 'omega_plus' not in d[ngram[1:]]:
                    d[ngram[1:]]['omega_plus'] = info['glue']
                else:
                    d[ngram[1:]]['omega_plus'] = max(d[ngram[1:]]['omega_plus'], info['glue'])
        
        for W, info in d.items():
            if ((W[0] not in stopword) and (W[-1] not in stopword)) and info['freq']>1 and 'omega_plus' in info:
                if len(W)==2 and info['glue'] >= info['omega_plus']:
                    if len(RMWE[filename]) == 0: RMWE[filename] = {W: info['prob']}
                    else: RMWE[filename][W] = info['prob']
                elif len(W)>2 and len(W)<max_n:
                    generalised_mean = (info['omega_minus']**p + info['omega_plus']**p)**(1/p)
                    if info['glue'] >= generalised_mean:
                        if len(RMWE[filename]) == 0: RMWE[filename] = {W: info['prob']}
                        else: RMWE[filename][W] = info['prob']
        
    return RMWE

In [15]:
%%time

## Test with SCP
RMWE = LocalMaxsExtractor(directory='/Users/riccardo/Documents/PAD/Module_2/Progetto/corpus2mw', 
                          max_n=8, 
                          p=2, 
                          glue='SCP')

CPU times: user 1min 31s, sys: 1.02 s, total: 1min 32s
Wall time: 1min 36s


In [16]:
RMWE['fil_1']

{('body', 'parts'): 0.0005564830272676684,
 ('19th', 'century,'): 0.0005564830272676684,
 ('<', 'doc'): 0.0007419773696902244,
 ('en.wikipedia.org', '/'): 0.0007419773696902244,
 ('url="http', ':', '/', '/', 'en.wikipedia.org'): 0.0007419773696902244,
 (')', ','): 0.0020404377666481174,
 (',', 'hence'): 0.0003709886848451122,
 ('Indic', 'scripts'): 0.0007419773696902244,
 ('Southeast', 'Asia.'): 0.0003709886848451122,
 (')', '.'): 0.001669449081803005,
 ('Orthodox', 'Church,'): 0.0003709886848451122,
 ('have', 'been'): 0.0007419773696902244,
 ('may', 'be', 'given', 'to', 'any'): 0.0003709886848451122,
 ('World', 'War'): 0.0005564830272676684,
 ('UEFA', 'Cup'): 0.0003709886848451122,
 ('1', 'Kings'): 0.0005564830272676684,
 ('Kings', '16'): 0.0003709886848451122,
 ('adverse', 'events'): 0.0003709886848451122,
 ('Latin', 'Empire'): 0.0003709886848451122,
 ('depart', 'for', 'the', 'Holy'): 0.0003709886848451122,
 ('crime', 'rate'): 0.0005564830272676684,
 ('national', 'average'): 0.000370

In [65]:
%%time
RMWE = LocalMaxsExtractor(directory="/Users/riccardo/Documents/PAD/Module_2/Progetto/FR6.1Mw", 
                          max_n=8, 
                          language='french', 
                          p=2, 
                          glue='SCP')

CPU times: user 9min 5s, sys: 15min 7s, total: 24min 13s
Wall time: 43min 29s


In [66]:
RMWE

{'FR6.1Mw': {(')', ','): 0.0041328274987487874,
  ('premières', 'décennies'): 1.7007520570982665e-06,
  ('.', 'Il'): 0.00016899290894621866,
  ('Châteaumeillant', '(', 'Cher'): 3.092276467451394e-07,
  ('Michel', 'Bréal'): 9.276829402354181e-07,
  ("l'École", 'pratique'): 1.3915244103531272e-06,
  ('grammaire', 'comparée,'): 4.6384147011770903e-07,
  ('faculté', 'des', 'lettres'): 1.5461382337256967e-06,
  ('suit', 'notamment', 'les', 'cours'): 3.092276467451394e-07,
  ("l'École", 'pratique', 'des', 'hautes', 'études.'): 6.184552934902788e-07,
  (')', '.'): 0.007767334644767783,
  ('.', 'En'): 0.00010652892430370051,
  ('grammaire', 'comparée'): 1.8553658804708361e-06,
  ('langues', 'indo-européennes.'): 2.319207350588545e-06,
  ('langues', 'orientales.'): 4.6384147011770903e-07,
  ('thèse', 'pour', 'le', 'doctorat'): 3.092276467451394e-07,
  ('chaire', 'de', 'grammaire', 'comparée'): 3.092276467451394e-07,
  ("l'École", 'des', 'langues', 'orientales.'): 3.092276467451394e-07,
  ('occu

In [67]:
%%time
RMWE_en = LocalMaxsExtractor(directory="/Users/riccardo/Documents/PAD/Module_2/Progetto/EN6.0Mw", 
                          max_n=8, 
                          language='english', 
                          p=2, 
                          glue='SCP')

CPU times: user 11min 41s, sys: 17min 8s, total: 28min 49s
Wall time: 50min 58s


In [68]:
RMWE_en

{'EN6.0Mw': {('stateless', 'societies,'): 3.151575165146478e-07,
  ('even', 'though'): 2.9624806552376895e-05,
  ('regarded', 'as', 'the', 'founder'): 6.303150330292956e-07,
  ('de', 'la'): 4.034016211387492e-05,
  ('La', 'Coruña'): 4.727362747719717e-07,
  ('&', 'apos'): 0.00048140310647612456,
  ('sometimes', 'called'): 2.2691341189054643e-05,
  ('leading', 'figure'): 1.8909450990878869e-06,
  ('United', 'States'): 0.00038149817374098115,
  ('become', 'standard.'): 4.727362747719717e-07,
  ('United', 'States', 'and', 'Canada'): 2.6788388903745062e-06,
  ('workers', 'were', 'killed.'): 4.727362747719717e-07,
  ('targeted', 'killings'): 4.727362747719717e-07,
  ('best', 'known'): 2.4424707529885207e-05,
  ('Russian', 'counterparts'): 3.151575165146478e-07,
  ('anarchist', 'federations'): 3.151575165146478e-07,
  ('inside', 'and', 'outside'): 1.4182088243159152e-06,
  ('International', 'of', 'Anarchist'): 4.727362747719717e-07,
  ('fascist', 'regimes'): 3.151575165146478e-07,
  ('Benito

---
# Part II - Implicit Relevant Expression

Implementation of automatic extractor of Explicit and Implicit Kewords from documents. Consider the following steps:

1. Extract Relevant Expressions (REs) from a set of several documents, by using LocalMaxs extractor you have implemented. Create adequate criteria to select the most informative REs.
2. Take the most 10 ‐15 informative REs of each document and consider them as the Explicit Keywords of the document.
3. Calculate the similarity between REs and use that to find the most correlated REs wich are semantically close to the Explicit keywords of each document and, although not explicitly written in the document, they may be used as its Implicit Keywords.

## Extract most informative REs from each doc

We extract the 10 most informative REs for each document by taking the first 10 with the highest according to their **median  length of words**

In [4]:
from statistics import median

def MostInformativeREs(d):
    """
    INPUT:
    d: dictionary conatining -> doc_i: {RE_1: P(RE_1), RE_2: P(RE_2), ..., RE_n: P(RE_n)}
    
    OUTPUT:
    informative_d: dictionary which contains for each doc the 10 REs with highest median length of words
    """
    informative_d = {}
    for doc, REs in d.items():
        if len(REs) > 10:
            temp = []  # store tuples of REs and median length of words
            for RE in REs.keys():
                lengths = [len(w) for w in RE]  # compute the length of the words in the RE and store it in lengths
                median_length = median(lengths) # computhe the median of lengths
                temp.append((RE,median_length)) # append to temp a tuple (RE, median length)
            sorted_temp = sorted(temp, key=lambda x: x[1], reverse=True) # sort temp on median_length in descending order
            informative_d[doc] = {" ".join(re[0]): REs[re[0]] for re in sorted_temp[:10]} # add to informative_d 10 most informative REs
        elif len(REs) > 0:
            informative_d[doc] = {" ".join(re) : p for re, p in REs.items()}
            
    return informative_d 

In [5]:
MI_RMWE = MostInformativeREs(RMWE)

In [6]:
MI_RMWE['fil_1']

{'en.wikipedia.org /': 0.0007419773696902244,
 'E-mail advertising': 0.0003709886848451122,
 'produces its "conjugate': 0.0003709886848451122,
 'Orthodox Church,': 0.0003709886848451122,
 'national average': 0.0003709886848451122,
 'maximum transfer': 0.0003709886848451122,
 'Southeast Asia.': 0.0003709886848451122,
 'adverse events': 0.0003709886848451122,
 'always respect': 0.0003709886848451122,
 '19th century,': 0.0005564830272676684}

## Compute the Prob of uni-gram (1-gram)

Compute the prob of uni-gram (1-gram) for each document in the corpus

In [7]:
import re

def prob(directory):
    """
    Computes the probability of unigram of each doc in the specified directory 
    INPUT
    - directory: directory path
    - d: dictionary of relevant expressions returned by the LocalMaxAlgorithm
    OUTPUT
    - P: dictionary with key=file_name, values is a dict containing singletons (keys) and associated p (values)
    """
    
    characters = r';:!<>&()\[/]'
    P = {} 
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as fp:
            buffer = fp.readlines()
            mod_buffer = ""
            for line in buffer:
                mod_buffer += re.sub(f'([{re.escape(characters)}])', r' \1 ', line)
            singletons = mod_buffer.split() # split the buffer in singletons
            N = len(singletons)             # N is the number of words in the document
            P[filename] = {w: f/N for w, f in Counter(singletons).items()} # compute the probabilitites
        
    return P

In [8]:
prob_1gram = prob("/Users/riccardo/Documents/PAD/Module_2/Progetto/new_c")

In [9]:
prob_1gram['fil_1']

{'Greek': 0.0011129660545353367,
 'Christian': 0.0001854943424225561,
 'scribes': 0.0001854943424225561,
 'played': 0.0005564830272676684,
 'a': 0.017992951214987944,
 'crucial': 0.0001854943424225561,
 'role': 0.0001854943424225561,
 'in': 0.022630309775551846,
 'the': 0.0667779632721202,
 'preservation': 0.0001854943424225561,
 'of': 0.04340567612687813,
 'Aristotle': 0.0003709886848451122,
 'by': 0.005750324615099239,
 'copying': 0.0001854943424225561,
 'all': 0.0011129660545353367,
 'extant': 0.0001854943424225561,
 'language': 0.0001854943424225561,
 'manuscripts': 0.0001854943424225561,
 'corpus.': 0.0001854943424225561,
 'The': 0.008532739751437582,
 'first': 0.0011129660545353367,
 'Christians': 0.0001854943424225561,
 'to': 0.020404377666481174,
 'comment': 0.0001854943424225561,
 'extensively': 0.0001854943424225561,
 'on': 0.005564830272676683,
 'were': 0.003709886848451122,
 'John': 0.0012984603969578928,
 'Philoponus,': 0.0001854943424225561,
 'Elias,': 0.00018549434242255

---
## Tf-Idf

In [10]:
def TfIdf(d, t):
    """
    Compute the Tf-Idf for each unigram in d.
    return a dict which contains:
    {doc_i: {w1: p(w1), w2: p(w2), ...}}
    prune the unigram w with a tf_idf under the threshold t
    INPUT:
    - d: dictionary with key=file_name, values is a dict containing singletons (keys) and associated p (values) 
    - t: threshold on the Tf-Idf
    OUTPUT:
    - out = {doc_i: {w1: p(w1), w2: p(w2), ...}}
    """
    
    from math import log
    
    def count(d):
        n_doc = {}   # dictionary where to store the number of doc where f(w)>0
        seen = set() # set that store the documents already processed
        for curr_doc, curr_unigrams in d.items():
            for w, p in curr_unigrams.items():
                if w in n_doc:
                    continue
                else:
                    count = 0
                    for other_doc, other_unigrams in d.items():
                        if other_doc not in seen and w in other_unigrams:
                            count+=1
                    n_doc[w] = count
            seen.add(curr_doc)
        return n_doc
    
    temp = count(d) # dictionary with unigram : n. doc where f(unigram)>0
    N = len(d)      # tot number of doc
    
    out = {} # output dictionary doc: {unigram1: tf_idf, unigram2: tf_idf, ...}
    
    for doc, unigrams in d.items():
        out[doc] = {}
        for w, p in unigrams.items():
            tf_idf = p * log(N/temp[w])
            if tf_idf >= t:
                out[doc][w] = p
            
    return out

In [11]:
%%time
x = TfIdf(d=prob_1gram, t=0.005)

CPU times: user 145 ms, sys: 6.08 ms, total: 151 ms
Wall time: 160 ms


In [16]:
x['fil_4']

{'Possible': 0.002702702702702703,
 'reflexes': 0.002702702702702703,
 'Insular': 0.002702702702702703,
 'Celtic.': 0.002702702702702703,
 'Chimpanzees': 0.002702702702702703,
 'facial': 0.005405405405405406,
 'expressions,': 0.002702702702702703,
 'postures': 0.002702702702702703,
 'sounds': 0.002702702702702703,
 'communicate': 0.002702702702702703,
 'other.': 0.002702702702702703,
 'Chimps': 0.005405405405405406,
 'expressive': 0.002702702702702703,
 'faces': 0.002702702702702703,
 'close-up': 0.002702702702702703,
 'communications.': 0.002702702702702703,
 'When': 0.008108108108108109,
 'frightened,': 0.002702702702702703,
 '"full': 0.002702702702702703,
 'closed': 0.002702702702702703,
 'grin"': 0.002702702702702703,
 'individuals': 0.002702702702702703,
 'fearful,': 0.002702702702702703,
 'well.': 0.002702702702702703,
 'expressions': 0.002702702702702703,
 '"lip': 0.002702702702702703,
 'flip",': 0.002702702702702703,
 '"pout",': 0.002702702702702703,
 '"sneer",': 0.002702702702

---
## Merge Dictionaries
Merge the two dictionaries containing unigram:prob and RMWE:prob into one dictionary

In [17]:
new_d = {} # new dictionary with both unigram and multigram

# MI_RMWE contains the RMWE
# x contains one-gram with their P

for doc, info in x.items(): 
    if doc in MI_RMWE:
        if len(info)>0: 
            new_d[doc] = {**info, **MI_RMWE[doc]}
        elif len(info)==0: 
            new_d[doc] = MI_RMWE[doc]
        else: 
            continue
    else:
        new_d[doc] = info
        

In [19]:
new_d['fil_2']

{'crane': 0.002306805074971165,
 'football': 0.00461361014994233,
 'clubs': 0.002306805074971165,
 'Neftchi': 0.002306805074971165,
 'Baku': 0.00461361014994233,
 'Premier': 0.002306805074971165,
 '"Orient"': 0.002306805074971165,
 'mast': 0.002306805074971165,
 'collapsed': 0.002306805074971165,
 'ship': 0.002306805074971165,
 'casualties.': 0.002306805074971165,
 'Captain': 0.0034602076124567475,
 'fire': 0.002306805074971165,
 '"Tonnant"': 0.002306805074971165,
 'struck': 0.002306805074971165,
 'cannonball': 0.002306805074971165,
 'ship,': 0.002306805074971165,
 'line,': 0.002306805074971165,
 'solitary': 0.002306805074971165,
 'nests': 0.002306805074971165,
 'Euronymous': 0.002306805074971165,
 'Norwegian': 0.002306805074971165,
 'label': 0.002306805074971165,
 'Switzerland': 0.00461361014994233,
 'checks': 0.002306805074971165,
 'crossings.': 0.002306805074971165,
 'customs': 0.002306805074971165,
 'pilots': 0.002306805074971165,
 'football clubs': 0.002306805074971165,
 'casualti

---
## Compute the Average Probabilities

In [20]:
def avgProb(d):
    """
    Compute the average probabilities of each word in the 
    set of documents d over the set of documents d itself.
    INPUT
    - d = {doc_i: {w_1i: p_1i, w_2i: p_2i, ...}}
    OUTPUT
    - avg_prob = {w1: P(w1,.), w2: P(w2,.), ...}
    """
    D = len(d)    # number of documents in d
    avg_prob = {} # output dicitonary with w : average probability
    seen = set()  # set of documents already seen
    
    for curr_doc, curr_info in d.items():
        for curr_w, curr_p in curr_info.items():
            if curr_w not in avg_prob:
                sum_p = curr_p
                for doc, info in d.items():
                    if doc not in seen and curr_w in info:
                        sum_p += info[curr_w]
            avg_prob[curr_w] = sum_p/D
        seen.add(curr_doc)
    
    return avg_prob

In [21]:
%%time

avg_prob = avgProb(new_d)

CPU times: user 4.23 ms, sys: 46 µs, total: 4.28 ms
Wall time: 4.4 ms


---
## Compute the Correlation

In [61]:
def corr(dExplicit, dImplicit, dAvgProb):
    """
    Compute the correlation between the explicit keyword contained in dExplicit 
    and the unigram, multiword expressions contained in dImplicit
    INPUT
    - dExplicit: contains {doc: RMWE: p}
    - dImplicit: contains {doc: w: p} with w both RMWE and unigram
    - dAvgProb contains average probabilities of both unigram and RMWE {w: avg_prob}
    OUTPUT
    - d = doc_i : {RMWEi: {w1:coor, w2:corr, w3:corr}}
    """
    d = {}         # dict d = doc_i : {RMWEi: {w1:coor, w2:corr, w3:corr}}
    D = len(dA)-1  # number of doc in the corpus 
    for curr_doc, curr_info in dExplicit.items():
        d[curr_doc] = {}
        for rmwe in curr_info.keys():  # rmwe are the implicit keywords (Relevant Multiword Expressions)
            d[curr_doc][rmwe] = {}
            for other_doc, other_info in dImplicit.items():
                for w in other_info.keys(): # w are the implicit descriptors
                    cov_rmwe_w = 0 # Cov(rmwe, w)
                    var_rmwe = 0   # Var(rmwe)
                    var_w = 0      # Var(w)
                    for doc, info in dImplicit.items():
                        if rmwe in info and w in info and curr_doc!=doc:
                            cov_rmwe_w += (info[rmwe]-dAvgProb[rmwe])*(info[w]-dAvgProb[w]) # update Cov(rmwe, w)
                            var_rmwe += (info[rmwe]-dAvgProb[rmwe])**2                      # update Var(rmwe)
                            var_w += (info[w]-dAvgProb[w])**2                               # update Var(w)
                    den = (((var_rmwe/D)**(1/2))*((var_w/D)**(1/2)))   # compute the denominatore of the Correlation
                    if den != 0:
                        corr_rmwe_w = (cov_rmwe_w/D)/den    # compute the Correlation, Corr(rmwe, w)
                        d[curr_doc][rmwe][w] = corr_rmwe_w
                    else:
                        d[curr_doc][rmwe][w] = 0
    return d
    

In [62]:
%%time

temp = corr(dA=MI_RMWE, dB=new_d, dAvgProb=avg_prob)

RMWE:en.wikipedia.org /, w:en.wikipedia.org /, corr:1.0, fil_rmwe:fil_36, fil_w:fil_28
RMWE:en.wikipedia.org /, w:en.wikipedia.org /, corr:1.0, fil_rmwe:fil_36, fil_w:fil_28
RMWE:en.wikipedia.org /, w:E-mail advertising, corr:1.0, fil_rmwe:fil_36, fil_w:fil_28
RMWE:en.wikipedia.org /, w:produces its "conjugate, corr:1.0, fil_rmwe:fil_36, fil_w:fil_28
RMWE:en.wikipedia.org /, w:Orthodox Church,, corr:1.0, fil_rmwe:fil_36, fil_w:fil_28
RMWE:en.wikipedia.org /, w:national average, corr:1.0, fil_rmwe:fil_36, fil_w:fil_28
RMWE:en.wikipedia.org /, w:maximum transfer, corr:1.0, fil_rmwe:fil_36, fil_w:fil_28
RMWE:en.wikipedia.org /, w:Southeast Asia., corr:1.0, fil_rmwe:fil_36, fil_w:fil_28
RMWE:en.wikipedia.org /, w:adverse events, corr:1.0, fil_rmwe:fil_36, fil_w:fil_28
RMWE:en.wikipedia.org /, w:always respect, corr:1.0, fil_rmwe:fil_36, fil_w:fil_28
RMWE:en.wikipedia.org /, w:19th century,, corr:1.0, fil_rmwe:fil_36, fil_w:fil_28
RMWE:en.wikipedia.org /, w:seed, corr:1.0, fil_rmwe:fil_36, 

RMWE:United States, w:Filk, corr:1.0000000000000002, fil_rmwe:fil_14, fil_w:fil_28
RMWE:United States, w:filker, corr:1.0000000000000002, fil_rmwe:fil_14, fil_w:fil_28
RMWE:United States, w:Myanmar, corr:1.0000000000000002, fil_rmwe:fil_14, fil_w:fil_28
RMWE:United States, w:Aung, corr:1.0000000000000002, fil_rmwe:fil_14, fil_w:fil_28
RMWE:United States, w:Ne, corr:1.0000000000000002, fil_rmwe:fil_14, fil_w:fil_28
RMWE:United States, w:constant, corr:1.0000000000000002, fil_rmwe:fil_14, fil_w:fil_28
RMWE:United States, w:velocity, corr:1.0000000000000002, fil_rmwe:fil_14, fil_w:fil_28
RMWE:United States, w:net, corr:0.9999999999999999, fil_rmwe:fil_14, fil_w:fil_28
RMWE:United States, w:inertia, corr:1.0000000000000002, fil_rmwe:fil_14, fil_w:fil_28
RMWE:United States, w:"natural, corr:1.0000000000000002, fil_rmwe:fil_14, fil_w:fil_28
RMWE:United States, w:rest"., corr:1.0000000000000002, fil_rmwe:fil_14, fil_w:fil_28
RMWE:United States, w:formal, corr:1.0, fil_rmwe:fil_14, fil_w:fil_2

In [60]:
MI_RMWE['fil_14']

{'"Athletics" written': 0.0005735589331803843,
 'aspirated ( pronounced': 0.0005735589331803843,
 '"macaroni products"': 0.0005735589331803843,
 'phonemes. For example,': 0.0005735589331803843,
 'alternate jersey': 0.0005735589331803843,
 'maintaining law': 0.0005735589331803843,
 'designed using': 0.0005735589331803843,
 'United States': 0.0005735589331803843,
 'Arctic Circle': 0.0005735589331803843,
 'DNA templates': 0.0005735589331803843}

---
# Stop Words

In [13]:
MaxXaxisValStopWords = 4000
DeltaX = 4

'''
Riccardo, this receives a list of all trigrams that donnot include special characters, this is very important to be sure
'''

def stopwords(trigrams):
    stopWordsCandidates = {}
    for i in range(len(trigrams)):
        if(trigrams[i][0] not in stopWordsCandidates):
            stopWordsCandidates[trigrams[i][0]]=1
        else:
            stopWordsCandidates[trigrams[i][0]]+=1
        if(trigrams[i][2] not in stopWordsCandidates):
            stopWordsCandidates[trigrams[i][2]]=1
        else:
            stopWordsCandidates[trigrams[i][2]]+=1
    stopWordsCandidatesOrdered = calculateRatioOcurrSyllablesWord(stopWordsCandidates)
    prev=float('inf')
    StopWords=[]
    ElbowFound=False
    for b, word in enumerate(stopWordsCandidatesOrdered):
        if b < MaxXaxisValStopWords:
            if not ElbowFound:
                if b % DeltaX == 0:
                    val = stopWordsCandidates[word]
                    if prev - val < DeltaX:
                        prev = val
                        ElbowFound = True
                    else:
                        prev = val
                StopWords.append(word)
            else:
                break
        else:
            break

    return StopWords

def calculateRatioOcurrSyllablesWord(stopWordsCandidates):
    for word in stopWordsCandidates:
        ocurr=stopWordsCandidates[word]
        syl=countSyllables(word)
        if syl==0:
            syl=1
        stopWordsCandidates[word]= ocurr/syl
    return dict(sorted(stopWordsCandidates.items(), key=lambda item: item[1], reverse=True))

def countSyllables(word):
    vowels = 'aeiouyàáãâíìéèêóòúù'
    count = 0
    prev_char = None

    for char in word.lower():
        if char in vowels and (prev_char is None or prev_char not in vowels):
            count += 1
        prev_char = char

    if word.lower().endswith('e'):
        count -= 1

    return count