In [1]:
def get_lines_with_word(word: str, dictionary: dict, file_name: str):
    """Return every line of a tokenized corpus that contains ``word``.

    Args:
        word: The word to look for; it must be a key of ``dictionary``.
        dictionary: Mapping from a word to its token.
        file_name: Path of the tokenized corpus, one line per entry, each
            line being whitespace-separated integer tokens.

    Returns:
        A list with one element per matching line, in file order. Each
        element is the whole line as a list of ints.

    Raises:
        KeyError: If ``word`` is not in ``dictionary``. This is raised
            before the corpus is opened.
        ValueError: If a token in the corpus cannot be parsed as an int.
        OSError: If the corpus file cannot be opened.
    """
    # Resolve the token once instead of on every token comparison.
    target = dictionary[word]
    phrases = []

    # The context manager closes the file even when a malformed token raises.
    with open(file_name) as corpus_file:
        for line in corpus_file:
            # Parse the line a single time; the parsed list is what we store.
            tokens = [int(token) for token in line.split()]
            if target in tokens:
                phrases.append(tokens)

    return phrases

In [2]:
def update_co_occ(co_occurrences: dict, window_size: int, corpus: list, clue: int):
    """Count, in place, the tokens found around every occurrence of ``clue``.

    Args:
        co_occurrences: Sparse co-occurrence matrix keyed by
            ``(clue, neighbour)`` tuples. It is mutated by this call.
        window_size: Number of positions inspected on each side of ``clue``.
        corpus: Tokenized lines, each one an indexable sequence of tokens.
        clue: Token whose neighbours are counted.

    Returns:
        None. Neighbours equal to ``clue`` or to ``0`` are never counted.
    """
    for tokens in corpus:
        length = len(tokens)
        for position, token in enumerate(tokens):
            if token != clue:
                continue

            # Clamp the sliding window to the boundaries of the line.
            start = max(0, position - window_size)
            stop = min(length, position + window_size + 1)

            for neighbour_position in range(start, stop):
                neighbour = tokens[neighbour_position]
                if neighbour == clue or neighbour == 0:
                    continue
                pair = (clue, neighbour)
                co_occurrences[pair] = co_occurrences.get(pair, 0) + 1

In [3]:
import math

def calculate_pmi(co_occ, distribution, weights, default_count=1000000):
    """Merge weighted co-occurrence counts into a sparse PMI-style matrix.

    The counts of each corpus are multiplied by that corpus' weight and
    summed per ``(clue, solution)`` pair. Each total ``v`` is then scored as
    ``log(v / (count(clue) * count(solution)))``.

    Args:
        co_occ: List of sparse co-occurrence matrices, one per corpus. Each
            is a dict mapping a ``(clue, solution)`` tuple to a count.
        distribution: Mapping from a token to the number of times it
            appears.
        weights: One numeric weight per corpus, aligned with ``co_occ``.
        default_count: Count assumed for a token that is missing from
            ``distribution``.

    Returns:
        A new dict mapping each ``(clue, solution)`` pair to its score. The
        input matrices are not modified.

    Raises:
        IndexError: If ``weights`` has fewer entries than ``co_occ``.
        ValueError: If a weighted total is not positive, so its logarithm
            is undefined.
        ZeroDivisionError: If one of the counts used as denominator is 0.
    """
    weighted_counts = {}
    for corpus_index, matrix in enumerate(co_occ):
        # The weight is constant for the whole corpus, so fetch it once.
        weight = weights[corpus_index]
        for pair, count in matrix.items():
            weighted_counts[pair] = weighted_counts.get(pair, 0) + count * weight

    pmi = {}
    for (c, s), total in weighted_counts.items():
        clue_count = distribution.get(c, default_count)
        solution_count = distribution.get(s, default_count)
        pmi[(c, s)] = math.log(total / (clue_count * solution_count))

    return pmi

In [4]:
def calculate_solutions(pmi: dict):
    """Rank candidate solutions by their mean PMI over five clues.

    Args:
        pmi: Sparse PMI matrix mapping a ``(clue, solution)`` tuple to a
            score.

    Returns:
        A list of ``(solution, mean_score)`` tuples sorted by descending
        score. The mean is always taken over five slots: every slot for
        which a solution has no pair in ``pmi`` is filled with the lowest
        score found in ``pmi`` (or 0 when no score is negative).
    """
    # Lowest score seen, never above zero; used to pad the missing pairs.
    floor = min([0, *pmi.values()])

    totals = {}
    hits = {}
    for (_, candidate), score in pmi.items():
        try:
            totals[candidate] += score
            hits[candidate] += 1
        except KeyError:
            # First pair seen for this candidate.
            totals[candidate] = score
            hits[candidate] = 1

    ranking = [
        (candidate, (total + floor * (5 - hits[candidate])) / 5)
        for candidate, total in totals.items()
    ]
    ranking.sort(key=lambda entry: entry[1], reverse=True)

    return ranking

In [9]:
# Resources to calculate PMI denominator

# Whitelist (as distribution): every line is "<word>, <count>".
# The context manager closes the file even if a line fails to parse, and
# only the line terminator is stripped, so a final line without a trailing
# newline keeps the last digit of its count. Blank lines are skipped.
with open('Corpus/whitelist.txt') as distribution_file:
    distribution = {
        fields[0]: int(fields[1])
        for fields in (
            line.rstrip('\n').split(', ')
            for line in distribution_file
            if line.strip()
        )
    }

# word -> token; tokens are numbered from 1 following the whitelist order.
# NOTE(review): presumably 0 is left free for words outside the whitelist - confirm.
dictionary = {t: i for i, t in enumerate(distribution, start=1)}
# token -> word
inverted_dictionary = {i: t for t, i in dictionary.items()}
# Re-key the distribution by token: token -> number of appearances.
distribution = {dictionary[t]: c for t, c in distribution.items()}

In [13]:
def ghigliottina(clues: list):
    """Try to find the solution of a Ghigliottina quiz from its clues.

    Each clue is lower-cased; clues missing from the module-level
    ``dictionary`` are ignored. For every remaining clue, the lines that
    contain it are read from each tokenized corpus and its co-occurrences
    are counted with that corpus' window size. The per-corpus counts are
    then weighted, turned into PMI scores and ranked.

    Args:
        clues: The clue words, as strings.

    Returns:
        At most five ``(solution_token, score)`` tuples, best first. The
        solutions are tokens, not words.
    """
    # (tokenized corpus path, sliding window size, weight), one row per corpus.
    corpora = (
        ('Corpus/paisa_tokenization.txt', 3, 1),
        ('Corpus/collocazioni_tokenization.txt', 10, 100),
        ('Corpus/polirematics_tokenization.txt', 10, 200),
        ('Corpus/sayings_tokenization.txt', 10, 100),
        ('Corpus/songs_tokenization.txt', 10, 50),
    )
    # One sparse co-occurrence matrix per corpus, in the same order.
    co_occ = [{} for _ in corpora]

    lowered_clues = [c.lower() for c in clues]
    for clue in lowered_clues:
        if clue not in dictionary:
            continue

        for (path, window, _), matrix in zip(corpora, co_occ):
            lines = get_lines_with_word(clue, dictionary, path)
            update_co_occ(matrix, window, lines, dictionary[clue])

    weights = [weight for _, _, weight in corpora]
    ranking = calculate_solutions(calculate_pmi(co_occ, distribution, weights))

    return ranking[:5]