# Impementazione dell'esercizio sulla test segmentation

## Step algoritmo:

- Dividere il file per righe, le righe guideranno l'esercizio in quanto ai tali ci sarà associata la riga di riferimento;
- Contare il valore di co-occorrenza per ogni frase:
  - Il valore di co-occorrenza è la somma dei valori di co-occorrenza per ogni parola in una frase;

### Tagli corretti (ita)
* 36/37 - taglio arg 1/2
* 56/57 - taglio arg 2/3

### Tagli corretti (en)

- 59/60
- 102/103

In [1]:
from nltk.corpus import stopwords
from collections import Counter
from gensim.test.utils import simple_preprocess
from gensim.corpora.dictionary import Dictionary
from gensim import models
from pprint import pprint
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random

In [2]:
def remove_stop_words_ita(phrase):
    stop_words = stopwords.words('italian')
    phrase = phrase.split()
    phrase = [word for word in phrase if word not in stop_words]
    return phrase

def get_text_from_file(path):
    file = []
    stop_words = set(stopwords.words('english'))
    with open (path, 'r') as f:
        for row in f:
            filtered_s = [w for w in word_tokenize(row) if not w.lower() in stop_words]
            file.append(simple_preprocess(str(filtered_s), deacc=True))
    f.close()
    return file

def cooccurrence(text):
    '''
    Calculates the cooccurrence value of a sequence of words. 
    This value correspond to the sum of the occurrences of the n most frequently used words in the word list.
    '''
    score = 0
    c = Counter()
    most_common = []
    for row in text:
        c.update(row)
        
    most_common = c.most_common(3)
    # print(most_common)
    
    for el in most_common:
        score = score + el[1]
        
    return score

def extract_segment(file, start, end):
    '''
    Given the first and the last line, extract the segment.
    '''
    segment = []
    for i in range(start, end):
        segment.append(file[i])
    return segment
    

### Data organization

Organizziamo le parole in una lista di liste, in qui ogni lista corrisponderà alle parole di una riga tokenizzate.

In [3]:
file = get_text_from_file('../res/segmentation_eng.txt')
c = Counter()
num_lines = sum(1 for line in open('../res/segmentation_eng.txt')) # Number of lines in the file

for row in file:
    c.update(row)
    
print(c.most_common(30))

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 3133: character maps to <undefined>

### Paramethers tuning

Calcolo dei valori standard per i tagli iniziali, per farlo divido il totale delle linee per il numero di topic.

In [None]:
cut = []
n_topic = 3

cut.append(0)
for k in range(1, n_topic):
    cut.append(num_lines // n_topic * k)
cut.append(num_lines)

print(cut)

[0, 60, 120, 181]


### Testing the algorithm

Calcolo il valore di cooccorrenza per ogni segmento

In [None]:
scores = []
cut = [0 , 3, 45, num_lines]
# Extract the segments
for i in range(len(cut)-1):
    print(cut[i])
    text = extract_segment(file, cut[i], cut[i+1])
    scores.append(cooccurrence(text))
    
print(scores)


0
3
45
[7, 64, 105]


### First implementation of the complete algorithm - DIDN'T WORK

Prepare data for the algorithm:

In [None]:
max_scores, scores = [0, 0, 0], [0, 0, 0]
sum_score = 0
max = 0
# cut = [0, 10, 45, num_lines]

#* Questi parametri sono legati alle singole iterazioni, quindi alla ricerca dei tagli legati ai singoli segmenti
limit = [0, 0, 0] # Used to avoid to go to far away from the cut
direction = [1, 1, 1] # Indicate the direction of the search, 1 mean "top" and -1 mean "bottom"

cut = [0, 40, 120, num_lines] #* Real: [59, 102]

Prendiamo la somma di tutti gli score (dei 3 segmenti) e cerchiamo di massimizzarla

In [None]:
# Extract the segments
for f in range(2000):    
    
    for i in range(len(cut)-1):
        text = extract_segment(file, cut[i], cut[i+1])
        scores[i] = cooccurrence(text)
        
    sum_scores = sum(scores)
    if(sum_scores > max):
        max = sum_scores
    
    # Check the score and update the max for each segment
    for k in range(len(cut)-1):
        
        # Find the cut for the first segments wich is special because have just 1 boundary
        if k == 0:
            if(scores[k] > max_scores[k]):
                max_scores[k] = scores[k]
                limit[k] = 0
            # Update the value of cut --> Try a direction
            elif limit[k] < 5:
                cut[1] = cut[1] + (1 * direction[k])
                limit[k] = limit[k] + 1
            # Change direction of search --> Wrong direction
            else:
                direction[k] *= -1
                limit[k] = 0
        else:     
            if scores[k] > max_scores[k]:
                max_scores[k] = scores[k]
                limit[k] = 0
            elif limit[k] < 5:
                cut[k] = cut[k] + (1 * direction[k])
                limit[k] = limit[k] + 1
            else: # Update the cut point
                direction[k] = direction[k] * -1
                limit[k] = 0
                cut[k] = cut[k] + (1 * direction[k])
        
# print(scores)

print(f'\n The Max values is {max_scores} at lines {cut}')



 The Max values is [70, 95, 47] at lines [0, 45, 123, 181]


### Second implementation of the complete algorithm: Maximise the sum of the scores

In [None]:
# Extract the segments
x , y, max = 0, 0, 0
max_scores = [(0, 0), (0, 0), (0, 0)]
max_cut = []

for f in range(20000): 
    
    y = random.randint(2, num_lines-1)
    x = random.randint(1, num_lines-2)
    
    while x > y:
        y = random.randint(2, num_lines-1)
        x = random.randint(1, num_lines-2)
    
    cut = [0, x, y, num_lines]
    
    for i in range(len(cut)-1):
        text = extract_segment(file, cut[i], cut[i+1])
        scores[i] = cooccurrence(text)
        
    sum_scores = sum(scores)
    if(sum_scores > max):
        max = sum_scores
        max_cut = cut
    
    for d in range(len(scores)):
        max_scores[d] = (cut[d], scores[d])
        
        
# print(scores)

print(f'\n Best cut at lines \n{max_cut}\n the max sum is \n{max}\n')



 Best cut at lines 
[0, 59, 102, 181]
 the max sum is 
231



### Find the best cut for first segment - Funziona solo con un punto di partenza inferiore al toglio corretto (59) in quanto non c'è un limite "destro"


In [None]:
scores = []
scores.append(0)
max_scores = (0, 0)
cut = [0, 10, 45, num_lines]
limit = 0 # Used to avoid to go to far away from the cut
direction = 1 # Indicate the direction of the search, 1 mean "top" and -1 mean "bottom"

# Extract the segments
for f in range(1000):
    text = extract_segment(file, cut[0], cut[1])
    scores[0] = cooccurrence(text)
    
    # print(scores)
        
    # Save the max value and the cut line --> Good value found
    if(scores[0] > max_scores[1]):
        max_scores = (cut[1], scores[0])
        limit = 0
        
    # Update the value of cut --> Try a direction
    elif limit < 5:
        cut[1] = cut[1] + (1 * direction)
        limit = limit + 1
    
    # Change direction of search --> Wrong direction
    else:
        direction *= -1
        limit = 0

    
# print(scores)

print(f'\n The Max value is: {max_scores[1]} at line {max_scores[0]}')



 The Max value is: 80 at line 59
