# IDL 10 : Construction d'un HMM pour l'étiquetage morphosyntaxique automatique

In [1]:
class HMM:
	def __init__(self, initial_prob, transition_prob, emission_prob):
		# Initialisation des probabilités
		self.initial_prob = initial_prob
		self.transition_prob = transition_prob
		self.emission_prob = emission_prob

	def initial(self, tag):
		# Retourne la probabilité initiale pour une étiquette
		return self.initial_prob.get(tag, 0.0)

	def transition(self, tag_p, tag_c):
		# Retourne la probabilité de transition d'une étiquette à une autre
		if tag_p in self.transition_prob and tag_c in self.transition_prob[tag_p]:
			return self.transition_prob[tag_p][tag_c]
		else:
			return 0.0

	def emission(self, tag, token):
		# Retourne la probabilité d'émission d'un token étant donné une étiquette
		if tag in self.emission_prob and token in self.emission_prob[tag]:
			return self.emission_prob[tag][token]
		else:
			return 0.0

In [2]:
def initials(hmm, token):
	initial_probs = {}
	for tag in hmm.emission_prob:
		prob = hmm.initial(tag) * hmm.emission(tag, token)
		initial_probs[tag] = {'probabilité': prob, 'depuis': None}
	return initial_probs

def best_transitions(hmm, probas_preced, token):
	meilleures_transitions = {}
	for etiquette in hmm.emission_prob:
		meilleure_prob = 0
		meilleure_etiquette_prec = None
		for etiqu_prec, val in probas_preced.items():
			prob_transition = hmm.transition(etiqu_prec, etiquette)
			prob_totale = prob_transition * val['probabilité']
			if prob_totale > meilleure_prob:
				meilleure_prob = prob_totale
				meilleure_etiquette_prec = etiqu_prec

		meilleure_prob *= hmm.emission(etiquette, token)
		meilleures_transitions[etiquette] = {'probabilité': meilleure_prob, 'depuis': meilleure_etiquette_prec}
	return meilleures_transitions

def viterbi_matrix(hmm, words):
	viterbi = [initials(hmm, words[0])]
	for word in words[1:]:
		current_probs = best_transitions(hmm, viterbi[-1], word)
		viterbi.append(current_probs)
	return viterbi

def viterbi(hmm, sentence):
	matrix = viterbi_matrix(hmm, sentence)
	last_word_probs = matrix[-1]
	max_prob = 0
	max_tag = None
	for tag, info in last_word_probs.items():
		if info["probabilité"] > max_prob:
			max_prob = info["probabilité"]
			max_tag = tag
	if max_tag is None:
		max_tag = 'NOUN'
	sequence = [max_tag]
	for i in range(len(sentence) - 2, -1, -1):
		prev_tag = matrix[i + 1][sequence[0]]['depuis']
		if prev_tag is None:
			prev_tag = sequence[0]
		sequence.insert(0, prev_tag)
	return sequence, max_prob

## Exercice 2 : créer un HMM depuis un jeu d'entraînement

### a) Test lire 

In [3]:
def lire(path):
	sentences = []
	with open(path, 'r', encoding='utf-8') as file:
		for line in file:
			if line.strip():
				tokens = line.strip().split()
				sentence = [token.split('/') for token in tokens]
				sentences.append(sentence)
	return sentences

In [4]:
sent = lire("test_lire.txt")
assert sent == [[['Que', 'SCONJ'], ['la', 'DET'], ['lumière', 'NOUN'], ['soit', 'VERB'], ['!', 'PUNCT']]], sent

In [5]:
print(sent)

[[['Que', 'SCONJ'], ['la', 'DET'], ['lumière', 'NOUN'], ['soit', 'VERB'], ['!', 'PUNCT']]]


In [6]:
def depuis_corpus(corpus):
    initials = {}
    emissions = {}
    transitions = {}

    total_initials = 0
    total_transitions = {}

    for sentence in corpus:
        for i, token in enumerate(sentence):
            try:
                word, tag = token
            except ValueError:
                print(f"Erreur de format pour le token: {token} dans la phrase: {sentence}")
                continue

            if i == 0:
                initials[tag] = initials.get(tag, 0) + 1
                total_initials += 1

            if tag not in emissions:
                emissions[tag] = {}
            emissions[tag][word] = emissions[tag].get(word, 0) + 1

            if i > 0:
                prev_tag = sentence[i-1][1]
                if prev_tag not in transitions:
                    transitions[prev_tag] = {}
                    total_transitions[prev_tag] = 0
                transitions[prev_tag][tag] = transitions[prev_tag].get(tag, 0) + 1
                total_transitions[prev_tag] += 1

    for tag in initials:
        initials[tag] = initials[tag] / total_initials
    for tag in emissions:
        total = sum(emissions[tag].values())
        for word in emissions[tag]:
            emissions[tag][word] = emissions[tag][word] / total
    for tag in transitions:
        total = sum(transitions[tag].values())
        for next_tag in transitions[tag]:
            transitions[tag][next_tag] = transitions[tag][next_tag] / total

    return HMM(initials, transitions, emissions)

In [7]:
train = lire("sequoia/fr_sequoia-ud-train.line.txt")
my_hmm = depuis_corpus(train)
sentence = ["Le", "professeur", "Gaston", "parle"]
path, probs = viterbi(my_hmm, sentence)

assert path == ['DET', 'NOUN', 'PROPN', 'VERB'], path

print("proba =", probs)

Erreur de format pour le token: ['', '', 'ADP'] dans la phrase: [['Parti', 'VERB'], ['de', 'ADP'], ['Cholet', 'PROPN'], ['(', 'PUNCT'], ['Maine', 'PROPN'], ['et', 'CCONJ'], ['Loire', 'PROPN'], [')', 'PUNCT'], ['le', 'DET'], ['chauffeur', 'NOUN'], ["n'", 'ADV'], ['aura', 'AUX'], ['mis', 'VERB'], ['finalement', 'ADV'], ["qu'", 'ADV'], ['une', 'DET'], ['journée', 'NOUN'], ['et', 'CCONJ'], ['demi', 'NOUN'], [',', 'PUNCT'], ['avec', 'ADP'], ['des', 'DET'], ['pointes', 'NOUN'], ['de', 'ADP'], ['vitesse', 'NOUN'], ['à', 'ADP'], ['70', 'NUM'], ['km', 'NOUN'], ['', '', 'ADP'], ['h', 'NOUN'], ['(', 'PUNCT'], ['moins', 'ADP'], ['la', 'DET'], ['nuit', 'NOUN'], ['de', 'ADP'], ['repos', 'NOUN'], [')', 'PUNCT'], ['pour', 'ADP'], ['couvrir', 'VERB'], ['une', 'DET'], ['distance', 'NOUN'], ['de', 'ADP'], ['600', 'NUM'], ['km', 'NOUN'], ['.', 'PUNCT']]
Erreur de format pour le token: ['', '', 'SYM'] dans la phrase: [['EMEA', 'PROPN'], ['', '', 'SYM'], ['H', 'C', '562', 'NOUN']]
Erreur de format pour le t

In [8]:
dev = lire("sequoia/fr_sequoia-ud-dev.line.txt")
total = 0
correct = 0
for sentence in dev:
    tokens = []
    gold = []
    for item in sentence:
        if len(item) == 2:
            token, tag = item
            tokens.append(token)
            gold.append(tag)
        else:
            print(f"Skipping malformatted token: {item} in sentence: {sentence}")
            break

    if len(tokens) == len(sentence):
        total += len(sentence)
        guess, prob = viterbi(my_hmm, tokens)
        correct += sum(guess[i] == gold[i] for i in range(len(gold)))

print("L'accuracy du HMM sur le corpus de développement est de :", 100 * correct / total, "%")

Skipping malformatted token: ['', '', 'CCONJ'] in sentence: [['Traitement', 'NOUN'], ['des', 'ADP+DET'], ['patients', 'NOUN'], ['atteints', 'VERB'], ["d'", 'ADP'], ['un', 'DET'], ['syndrome', 'NOUN'], ['coronarien', 'ADJ'], ['aigu', 'ADJ'], ['(', 'PUNCT'], ['angor', 'NOUN'], ['instable', 'ADJ'], ['', '', 'CCONJ'], ['infarctus', 'NOUN'], ['du', 'ADP+DET'], ['myocarde', 'NOUN'], ['sans', 'ADP'], ['sus', 'NOUN'], ['décalage', 'NOUN'], ['du', 'ADP+DET'], ['segment', 'NOUN'], ['ST', 'NOUN'], ['(', 'PUNCT'], ['AI', 'NOUN'], ['', '', 'SYM'], ['IDM', 'NOUN'], ['ST-', 'ADJ'], [')', 'PUNCT'], [')', 'PUNCT'], ['devant', 'VERB'], ['bénéficier', 'VERB'], ["d'", 'ADP'], ['une', 'DET'], ['intervention', 'NOUN'], ['urgente', 'ADJ'], ['ou', 'CCONJ'], ['précoce', 'ADJ'], ['.', 'PUNCT']]
Skipping malformatted token: ['', '', 'ADP'] in sentence: [['La', 'DET'], ['dose', 'NOUN'], ['recommandée', 'VERB'], ["d'", 'ADP'], ['Angiox', 'PROPN'], [',', 'PUNCT'], ['pour', 'ADP'], ['les', 'DET'], ['patients', 'NOUN

In [9]:
class HMM:
    def __init__(self, initial_prob, transition_prob, emission_prob, backoff):
        self.initial_prob = initial_prob
        self.transition_prob = transition_prob
        self.emission_prob = emission_prob
        self.backoff = backoff

    def initial(self, tag):
        return self.initial_prob.get(tag, 0.0)

    def transition(self, tag_p, tag_c):
        return self.transition_prob.get(tag_p, {}).get(tag_c, 0.0)

    def emission(self, tag, token):
        return self.emission_prob.get(tag, {}).get(token, self.backoff)

In [10]:
def depuis_corpus(corpus):
    initials = {}
    emissions = {}
    transitions = {}
    n = 0
    vocabulaire = set()

    for sentence in corpus:
        for item in sentence:
            if len(item) == 2:
                word, tag = item
                n += 1
                vocabulaire.add(word)

                if sentence.index(item) == 0:
                    initials[tag] = initials.get(tag, 0) + 1
                
                if tag not in emissions:
                    emissions[tag] = {}
                emissions[tag][word] = emissions[tag].get(word, 0) + 1
                
                if sentence.index(item) > 0:
                    prev_tag = sentence[sentence.index(item)-1][1]
                    if prev_tag not in transitions:
                        transitions[prev_tag] = {}
                    transitions[prev_tag][tag] = transitions[prev_tag].get(tag, 0) + 1
            else:
                print(f"Skipping malformatted token: {item} in sentence: {sentence}")
                continue

    V = len(vocabulaire) + len(emissions)
    backoff = 1 / (n + V)

    for tag in initials:
        initials[tag] = initials[tag] / sum(initials.values())
    for tag in emissions:
        total = sum(emissions[tag].values())
        for word in emissions[tag]:
            emissions[tag][word] = emissions[tag][word] / total
    for tag in transitions:
        total = sum(transitions[tag].values())
        for next_tag in transitions[tag]:
            transitions[tag][next_tag] = transitions[tag][next_tag] / total

    return HMM(initials, transitions, emissions, backoff)

In [12]:
train = lire("sequoia/fr_sequoia-ud-train.line.txt")
dev = lire("sequoia/fr_sequoia-ud-dev.line.txt")
my_hmm = depuis_corpus(train)

total = 0
correct = 0
for sentence in dev:
	filtered_sentence = [item for item in sentence if len(item) == 2]
	tokens = [token for token, tag in filtered_sentence]
	gold = [tag for token, tag in filtered_sentence]
	if not tokens:
		continue
	total += len(filtered_sentence)
	guess, prob = viterbi(my_hmm, tokens)
	correct += sum(guess[i] == gold[i] for i in range(len(gold)))

if total > 0:
	print("L'accuracy du HMM sur le corpus de développement est de :", 100*correct / total, "%")
else:
	print("Aucune donnée valide pour calculer l'accuracy.")

Skipping malformatted token: ['', '', 'ADP'] in sentence: [['Parti', 'VERB'], ['de', 'ADP'], ['Cholet', 'PROPN'], ['(', 'PUNCT'], ['Maine', 'PROPN'], ['et', 'CCONJ'], ['Loire', 'PROPN'], [')', 'PUNCT'], ['le', 'DET'], ['chauffeur', 'NOUN'], ["n'", 'ADV'], ['aura', 'AUX'], ['mis', 'VERB'], ['finalement', 'ADV'], ["qu'", 'ADV'], ['une', 'DET'], ['journée', 'NOUN'], ['et', 'CCONJ'], ['demi', 'NOUN'], [',', 'PUNCT'], ['avec', 'ADP'], ['des', 'DET'], ['pointes', 'NOUN'], ['de', 'ADP'], ['vitesse', 'NOUN'], ['à', 'ADP'], ['70', 'NUM'], ['km', 'NOUN'], ['', '', 'ADP'], ['h', 'NOUN'], ['(', 'PUNCT'], ['moins', 'ADP'], ['la', 'DET'], ['nuit', 'NOUN'], ['de', 'ADP'], ['repos', 'NOUN'], [')', 'PUNCT'], ['pour', 'ADP'], ['couvrir', 'VERB'], ['une', 'DET'], ['distance', 'NOUN'], ['de', 'ADP'], ['600', 'NUM'], ['km', 'NOUN'], ['.', 'PUNCT']]
Skipping malformatted token: ['', '', 'SYM'] in sentence: [['EMEA', 'PROPN'], ['', '', 'SYM'], ['H', 'C', '562', 'NOUN']]
Skipping malformatted token: ['H', 'C'