In [1]:
from scripts.nlp_jarno import Bagofwords, BPETokenizer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

define functions to extract abstracts from files and convert tokens

In [2]:
def extract_abstracts(filepath):
    """
    reads file of format [paper title]: paper abstract \n
    :param filepath: string path to file
    :return: dictionary where key: string title of paper, value: string abstract of paper
    """
    output_dict = {}
    with open(filepath, "r") as train:
        for line in train.read().split("\n"):
            header, abstract = line.split("]:")
            abstract = abstract.replace(" ", "_")
            output_dict[header[1:]] = [char for char in abstract]
    return output_dict

In [3]:
def get_tokens(text, max_merges=10):
    """

    :param text: list of text
    :param max_merges: number of times merges take place
    :return: token keys
    """
    tokenizer = BPETokenizer()
    corpus = text
    vocab = tokenizer.init_vocab(corpus)
    pair_count = tokenizer.generate_pair_count(corpus)

    merges = []
    tries = 0

    for i in range(0, max_merges):
        highest = tokenizer.find_highest_pair(pair_count)
        if highest is None:
            break

        corpus, pair_count = tokenizer.update_corpus(corpus, highest, pair_count)
        vocab = tokenizer.update_vocab(highest, vocab)
        merges.append(highest[0])

        tries += 1

    return vocab

Er zijn twee bestanden met abstracts:
tumor_abstracts.txt
other_abstracts.txt

in de tumor_abstracts staan abstacts die gaan over tumoren en in other_abstracts staan abstracts die gaan over senecance / cell aging.

In [4]:
tumor_abstacts = extract_abstracts("../tumor_abstracts.txt")
other_abstracts = extract_abstracts("../other_abstracts.txt")

X = list(tumor_abstacts.values()) + list(other_abstracts.values())
y = [int(i < len(tumor_abstacts)) for i in range(0, len(X))]
y

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]

Split de X en y met train test split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

definieer de tokens met get_tokens

In [6]:
freq = get_tokens(X_train, 1000)
token_input = {i: tok for i, tok in enumerate(freq)}

### gebruik tf_idf

In [7]:
bag = Bagofwords(token_input, "tf_idf")
bag.fit("".join(all_chars) for all_chars in X_train)

In [8]:
encripted_X_train = []
for x_i in X_train:
    encripted_X_train.append(bag.create_bag(x_i))
encripted_X_train[0]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -0.056099669420363846,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -0.30280688134444295,
 0,
 0,
 0,
 0,
 0,
 0,
 -0.2370227479098684,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -0.06008078183714413,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -0.05984765999685652,
 0,
 0,
 -0.28769601934798017,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -0.21666210238080955,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -0.006893002310586493,
 0,
 -0.018378530529570485,
 0,
 0,
 0,
 0,
 -0.008717761848493636,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -0.2831546929864987,
 0,
 0,
 0,
 0,
 0,
 0,
 

In [9]:
model_tf_idf = GaussianNB()
model_tf_idf.fit(encripted_X_train, y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [10]:
encripted_X_test = []
for x_i in X_test:
    encripted_X_test.append(bag.create_bag(x_i))
   

predicted = model_tf_idf.predict(encripted_X_test)
    
predicted, y_test

(array([1, 0, 1, 0]), [1, 1, 1, 1])

### Gebruik frequency

In [11]:
bag = Bagofwords(token_input, "frequency")
bag.fit("".join(all_chars) for all_chars in X)
encripted_X_train = []
for x_i in X_train:
    encripted_X_train.append(bag.create_bag(x_i))
model_frequency = GaussianNB()
model_frequency.fit(encripted_X_train, y_train)
encripted_X_test = []
for x_i in X_test:
    encripted_X_test.append(bag.create_bag(x_i))
   
predicted = model_frequency.predict(encripted_X_test)
    
predicted, y_test

(array([1, 0, 1, 1]), [1, 1, 1, 1])

### Gebruik multi_hot

In [12]:
bag = Bagofwords(token_input, "frequency")
bag.fit("".join(all_chars) for all_chars in X)
encripted_X_train = []
for x_i in X_train:
    encripted_X_train.append(bag.create_bag(x_i))
model_multi_hot = GaussianNB()
model_multi_hot.fit(encripted_X_train, y_train)
encripted_X_test = []
for x_i in X_test:
    encripted_X_test.append(bag.create_bag(x_i))
   
predicted = model_multi_hot.predict(encripted_X_test)
    
predicted, y_test

(array([1, 0, 1, 1]), [1, 1, 1, 1])

Er is een verschil tussen de incriptie methodes. 