In [21]:
"""
    Creating conditional model & predicting  
"""

import numpy as np
from nltk.stem import PorterStemmer as stemmer
import pickle 

class WordConditionalModel:
    """Conditional model over stemmed, lower-cased words.

    Training data is turned into two 0/1 matrices whose columns are vocabulary
    words and whose rows come in groups of ``_TAGS_PER_WORD`` per sentence:

    * the word matrix marks which words occur in the sentence (all rows of the
      sentence's group are identical);
    * the tag matrix marks, for row ``m`` of the group, the words whose ``m``-th
      tag is ``"B"`` or ``"I"``.

    ``fit`` then stores, for every ordered pair ``(w_i, w_j)``, the fraction of
    rows containing both words in which ``w_i`` is tagged.  ``predict`` scores a
    word by summing those fractions over the known words of its sentence and
    dividing by the sentence length.
    """

    # Number of tag slots read from each word's label string (and rows reserved
    # per sentence).  Presumably one slot per annotator -- TODO confirm against
    # the data reader.
    _TAGS_PER_WORD = 9

    def __init__(self):
        self.__words_matrix = 0
        self.__bios_matrix = 0
        self.__model = 0
        self.__words = []
        # word -> column index; mirrors ``__words`` so lookups are O(1).
        self.__word_index = {}
        self.__none_words_vocab = []
        # Created on first use so building a model object stays cheap.
        self.__stemmer = None

    def __normalize(self, word):
        """Return the lower-cased stem of ``word`` using one shared stemmer."""
        if self.__stemmer is None:
            self.__stemmer = stemmer()
        return self.__stemmer.stem(word.lower())

    def __build_matrix(self, number_of_sentences, number_of_words):
        """Return a zero-filled float matrix of the requested shape."""
        return np.zeros((number_of_sentences, number_of_words))

    def __represent_sentences_as_matrix(self, words_lsts):
        """Build the word-occurrence matrix for the training sentences.

        Sentence ``i`` owns rows ``i * _TAGS_PER_WORD`` up to (excluding)
        ``(i + 1) * _TAGS_PER_WORD``; each of them gets a 1 in the column of
        every word of that sentence.
        """
        rows = self._TAGS_PER_WORD
        matrix = self.__build_matrix(len(words_lsts) * rows, len(self.__words))
        for i, sentence in enumerate(words_lsts):
            for word in sentence:
                column = self.__word_index[self.__normalize(word)]
                matrix[i * rows:(i + 1) * rows, column] = 1
        return matrix

    def __represent_bios_as_matrix(self, bios_lsts, words_lsts):
        """Build the tag matrix aligned with the word-occurrence matrix.

        ``bios_lsts[i][j]`` is the label string of word ``j`` in sentence ``i``.
        Its tags are read at even offsets (0, 2, 4, ...), i.e. tags are expected
        to be single characters separated by one character.
        """
        rows = self._TAGS_PER_WORD
        matrix = self.__build_matrix(len(words_lsts) * rows, len(self.__words))
        for i, sentence in enumerate(words_lsts):
            for j, word in enumerate(sentence):
                column = self.__word_index[self.__normalize(word)]
                tags = bios_lsts[i][j]
                for m in range(rows):
                    if tags[m * 2] in ("I", "B"):
                        matrix[i * rows + m][column] = 1
        return matrix

    def __calculate_condetional_probs(self, w_i, w_j):
        """Store in ``__model[w_i][w_j]`` the tagged share of joint occurrences.

        The value is (rows holding both words where ``w_i`` is tagged) divided
        by (rows holding both words), or 0 when the words never co-occur.
        """
        together = self.__words_matrix[:, w_i] * self.__words_matrix[:, w_j]
        occurrences = together.sum()
        tagged = (together * self.__bios_matrix[:, w_i]).sum()
        self.__model[w_i][w_j] = (tagged / occurrences) if occurrences else 0

    def fit(self, words_lsts, bios_lsts):
        """Learn the pairwise table from tokenised sentences and their labels.

        Args:
            words_lsts: list of sentences, each a list of word strings.
            bios_lsts: label strings parallel to ``words_lsts``.

        Prints progress (every 100th vocabulary index) while filling the table.
        """
        print("training WordConditionalModel....")
        self.__words = []
        self.__word_index = {}
        # Vocabulary in order of first appearance; the dict gives O(1)
        # membership tests instead of scanning the list for every token.
        for sentence in words_lsts:
            for word in sentence:
                token = self.__normalize(word)
                if token not in self.__word_index:
                    self.__word_index[token] = len(self.__words)
                    self.__words.append(token)

        vocab_size = len(self.__words)
        self.__model = self.__build_matrix(vocab_size, vocab_size)
        self.__words_matrix = self.__represent_sentences_as_matrix(words_lsts)
        self.__bios_matrix = self.__represent_bios_as_matrix(bios_lsts, words_lsts)
        print("vocab lenghts: " , len(self.__words))
        for i in range(vocab_size):
            if i % 100 == 0:
                print(i , end = ' ')
            for j in range(vocab_size):
                self.__calculate_condetional_probs(i, j)
        print()

    def predict(self, word_lsts, words_id):
        """Score every word of every sentence.

        Args:
            word_lsts: list of sentences, each a list of word strings.
            words_id: identifiers parallel to ``word_lsts``; used only to report
                out-of-vocabulary words.

        Returns:
            A list parallel to ``word_lsts``.  A known word gets the sum of its
            table entries against every known word of the sentence (itself
            included), divided by the sentence length; an unknown word gets 0.

        Side effect: prints the ids of the out-of-vocabulary words.
        """
        self.__get_id_of_none_wordvocab(word_lsts, words_id)
        print(self.__none_words_vocab)
        predictions = []
        for sentence in word_lsts:
            # Stem each word once per sentence; None marks unknown words.
            columns = [self.__word_index.get(self.__normalize(word)) for word in sentence]
            known_columns = [column for column in columns if column is not None]
            prediction = []
            for w_i in columns:
                if w_i is None:
                    prediction.append(0)
                    continue
                s = 0
                for w_j in known_columns:
                    s += self.__model[w_i][w_j]
                prediction.append(s / len(sentence))
            predictions.append(prediction)
        return predictions

    def __get_id_of_none_wordvocab(self, word_lsts, words_id):
        """Collect into ``__none_words_vocab`` the ids of words missing from the vocabulary."""
        self.__none_words_vocab = [
            words_id[i][j]
            for i, sentence in enumerate(word_lsts)
            for j, word in enumerate(sentence)
            if self.__normalize(word) not in self.__word_index
        ]

    def save(self, path):
        """Pickle the vocabulary, the pairwise table and the last OOV ids to ``path``."""
        model = {"__words":self.__words, "__model":self.__model, "__none_words":self.__none_words_vocab}
        # Context manager guarantees the file is flushed and closed.
        with open(path, "wb") as handle:
            pickle.dump(model, handle)

    def load(self, path):
        """Restore the state written by ``save`` from ``path``.

        NOTE: unpickling can execute arbitrary code; only load trusted files.
        """
        with open(path, "rb") as handle:
            model = pickle.load(handle)
        self.__words, self.__model, self.__none_words_vocab = model["__words"], model["__model"], model["__none_words"]
        # Rebuild the lookup table; the first position wins, like list.index.
        self.__word_index = {}
        for position, word in enumerate(self.__words):
            self.__word_index.setdefault(word, position)

In [3]:
import Read_data_and_Write_results as rw

In [38]:
# Training split: keep the first three of the six values returned by read_data
# (presumably word ids, tokenised sentences and label strings -- confirm against
# Read_data_and_Write_results); the remaining three are discarded.
words_id, word_lsts, bio_lsts, _, _, _ = rw.read_data("datasets/just_train.txt")
# Test split: same layout, but the fifth value is kept as `truth`, the reference
# scores the predictions are compared with further down.
word_id, word_list, bio_list, _, truth, _ = rw.read_data("datasets/just_test.txt")

In [40]:
# Fresh, untrained model; it holds no vocabulary until fit() is called.
model = WordConditionalModel()

In [41]:
# Train on the training split; fit() prints the vocabulary size and progress.
model.fit(word_lsts, bio_lsts)

training WordConditionalModel....
vocab lenghts:  15
0 


In [42]:
# Score the test split; predict() also prints the ids of test words that are
# not in the training vocabulary (those words receive a score of 0).
prediction = model.predict(word_list, word_id)

['Q_4_0', 'Q_4_1', 'Q_4_2', 'Q_4_4', 'Q_4_6', 'Q_4_7', 'Q_4_8', 'Q_4_9', 'Q_4_10']


In [54]:
# Predicted scores: one inner list per test sentence, one value per word.
print(prediction)

[[0.8888888888888888, 0.3703703703703704, 0.7777777777777778], [0.04444444444444444, 0.33333333333333337, 0.0, 0.6666666666666667, 0.0, 0.04444444444444444, 0.33333333333333337, 0.0, 0.7777777777777778, 0.11111111111111113], [0.0, 0.0, 0.5555555555555555, 0.0, 0.22222222222222227, 0.22222222222222227, 0.5555555555555555, 0.11111111111111113], [0, 0, 0, 0.018518518518518517, 0, 0.037037037037037035, 0, 0, 0, 0, 0, 0.027777777777777776]]


In [55]:
# Reference scores in the same nested layout; the values are stored as strings.
print(truth)

[['0.8888888888888888', '0.4444444444444444', '0.7777777777777778'], ['0.0', '0.3333333333333333', '0.0', '0.6666666666666666', '0.0', '0.0', '0.1111111111111111', '0.0', '0.7777777777777778', '0.1111111111111111'], ['0.0', '0.0', '0.5555555555555556', '0.0', '0.2222222222222222', '0.2222222222222222', '0.5555555555555556', '0.1111111111111111'], ['0.0', '0.0', '0.0', '0.0', '0.2222222222222222', '0.1111111111111111', '0.3333333333333333', '0.0', '0.7777777777777778', '0.7777777777777778', '0.6666666666666666', '0.2222222222222222']]


In [63]:
# Show each predicted score beside its reference score and their absolute difference.
for row, predicted_row in enumerate(prediction):
    for col, predicted in enumerate(predicted_row):
        reference = truth[row][col]
        error = abs(float(predicted) - float(reference))
        print("prediction[{}] - truth[{}] = {}".format(predicted, reference, error))

prediction[0.8888888888888888] - truth[0.8888888888888888] = 0.0
prediction[0.3703703703703704] - truth[0.4444444444444444] = 0.07407407407407401
prediction[0.7777777777777778] - truth[0.7777777777777778] = 0.0
prediction[0.04444444444444444] - truth[0.0] = 0.04444444444444444
prediction[0.33333333333333337] - truth[0.3333333333333333] = 5.551115123125783e-17
prediction[0.0] - truth[0.0] = 0.0
prediction[0.6666666666666667] - truth[0.6666666666666666] = 1.1102230246251565e-16
prediction[0.0] - truth[0.0] = 0.0
prediction[0.04444444444444444] - truth[0.0] = 0.04444444444444444
prediction[0.33333333333333337] - truth[0.1111111111111111] = 0.22222222222222227
prediction[0.0] - truth[0.0] = 0.0
prediction[0.7777777777777778] - truth[0.7777777777777778] = 0.0
prediction[0.11111111111111113] - truth[0.1111111111111111] = 2.7755575615628914e-17
prediction[0.0] - truth[0.0] = 0.0
prediction[0.0] - truth[0.0] = 0.0
prediction[0.5555555555555555] - truth[0.5555555555555556] = 1.1102230246251565e

In [72]:
def visualization_of_errors(prediction, truth, range1, range2, range3, range4):
    """Print how many absolute prediction errors fall into four bands.

    The bands are ``error <= range1``, ``(range1, range2]``,
    ``(range2, range3]`` and ``(range3, range4]``; an error is counted in the
    first band it matches, and errors matching no band are not counted.

    Args:
        prediction: nested list of predicted scores (anything ``float`` accepts).
        truth: nested list of reference scores, indexed like ``prediction``.
        range1, range2, range3, range4: upper bounds of the four bands.
    """
    bounds = (range1, range2, range3, range4)
    tallies = [0, 0, 0, 0]
    for row in range(len(prediction)):
        for col in range(len(prediction[row])):
            error = abs(float(prediction[row][col]) - float(truth[row][col]))
            if error <= bounds[0]:
                tallies[0] += 1
                continue
            # Remaining bands are half-open on the left, closed on the right.
            for band in range(1, len(bounds)):
                if bounds[band - 1] < error <= bounds[band]:
                    tallies[band] += 1
                    break
    templates = (
        "counter1 = [{}]",
        "counter2 = [{}]",
        "counter3 = [{}]",
        "counter4 = [{}]",
    )
    for template, tally in zip(templates, tallies):
        print(template.format(tally))

In [73]:
# Count errors that are exactly 0, in (0, 0.25], in (0.25, 0.5] and in
# (0.5, 0.75]; errors above 0.75 are not counted by the function.
visualization_of_errors(prediction, truth, 0, 0.25, 0.5, 0.75)

counter1 = [13]
counter2 = [16]
counter3 = [1]
counter4 = [1]
