In [1]:
import operator
import numpy as np
from sklearn.model_selection import KFold
import gensim
from gensim.models.keyedvectors import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics import classification_report



In [2]:
kf = KFold(n_splits=3, shuffle=True)

In [3]:
glove_model = KeyedVectors.load_word2vec_format("D:/wiki.de.vec", binary=False)

In [4]:
glove_model.init_sims(replace=True)

In [5]:
def decapitalize(word):
    if len(word) > 1:
        word = word[0].lower() + word[1:]
    return word

In [6]:
#no suffixes
def all_dist(vector_1,vector_2):
    cur_dist = 0
    for i in range((len(vector_1))):
        if i == 0 or i == 2:
            try:
                ith_dist = glove_model.similarity(decapitalize(vector_1[i]), decapitalize(vector_2[i]))
                cur_dist = (1-ith_dist)*(1-ith_dist)
            except:
                ith_dist = 0
                cur_dist = (1-ith_dist)*(1-ith_dist)
        else:
            if vector_1[i] != vector_2[i]:
                cur_dist += 1
    return cur_dist

In [7]:
file = open("C:/Users/1/Desktop/compounds_final.txt", "r", encoding="utf-8")
lines = file.readlines()

In [8]:
#no suffixes
matrix = []
for line in lines:
    vectors = []
    types = []
    line = line.strip("\n")
    items = line.split("/")
    p1 = [items[1][:1]] + [items[1][:2]] + [items[1][:3]]
    p2 = [items[4][:1]] + [items[4][:2]] + [items[4][:3]]
    s1 = [items[1][-1]] + [items[1][-2:]] + [items[1][-3:]]
    s2 = [items[4][-1]] + [items[4][-2:]] + [items[4][-3:]]
    items_new = items[1:3] + p1 + s1 + items[4:6] + p2 + s2 + [items[7]]
    for item in items_new:
        vectors.append(item)
    types.append(items[-1])
    tp = (vectors,types)
    matrix.append(tp)

In [9]:
for item in matrix[:2]:
    print (item)

(['Engel', '<NN>', 'E', 'En', 'Eng', 'l', 'el', 'gel', 'Bild', '<+NN>', 'B', 'Bi', 'Bil', 'd', 'ld', 'ild', 's'], ['N'])
(['Wolke', '<NN>', 'W', 'Wo', 'Wol', 'e', 'ke', 'lke', 'Schatten', '<+NN>', 'S', 'Sc', 'Sch', 'n', 'en', 'ten', 'n'], ['NN'])


In [10]:
matrixX = []
matrixY = []
for item in matrix:
    matrixX.append(item[0])
    matrixY.append(item[1][0])

In [11]:
X = np.array(matrixX)
y = np.array(matrixY)

In [12]:
kf.get_n_splits(X)

3

In [13]:
all_predicts = []
tests = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    predictions = []
    for vector in X_test:
        distances = []
        types = []
        for x in range(len(X_train)):
            distance = all_dist(X_train[x], vector)
            d_t = (distance, y_train[x])
            distances.append(d_t)
            types.append(y_train[x])
        distances_sort = sorted(distances)
        types_sort = []
        for item in distances_sort[:3]:
            types_sort.append(item[1])
        maxx = max(set(types_sort), key=types_sort.count)
        predictions.append(maxx)
    y_test = y_test.tolist()
    all_predicts.extend(predictions)
    tests.extend(y_test)
    
print(classification_report(tests, all_predicts))

             precision    recall  f1-score   support

         AN       0.45      0.62      0.52      1349
          N       0.43      0.29      0.34      1250
         NN       0.22      0.19      0.20       540

avg / total       0.40      0.41      0.40      3139



In [14]:
#with suffixes
def all_dist(vector_1,vector_2):
    cur_dist = 0
    for i in range((len(vector_1))):
        if i == 0 or i == 3:
            try:
                ith_dist = glove_model.similarity(decapitalize(vector_1[i]), decapitalize(vector_2[i]))
                cur_dist = (1-ith_dist)*(1-ith_dist)
            except:
                ith_dist = 0
                cur_dist = (1-ith_dist)*(1-ith_dist)
        else:
            if vector_1[i] != vector_2[i]:
                cur_dist += 1
    return cur_dist

In [24]:
#with suffixes
matrix = []
for line in lines:
    vectors = []
    types = []
    line = line.strip("\n")
    items = line.split("/")
    p1 = [items[1][:1]] + [items[1][:2]] + [items[1][:3]]
    p2 = [items[4][:1]] + [items[4][:2]] + [items[4][:3]]
    s1 = [items[1][-1]] + [items[1][-2:]] + [items[1][-3:]]
    s2 = [items[4][-1]] + [items[4][-2:]] + [items[4][-3:]]
    items_new = items[1:4] + p1 + s1 + items[4:7] + p2 + s2 + [items[7]]
    for item in items_new:
        vectors.append(item)
    types.append(items[-1])
    tp = (vectors,types)
    matrix.append(tp)

In [25]:
for item in matrix[:2]:
    print (item)

(['Engel', '<NN>', 'no', 'E', 'En', 'Eng', 'l', 'el', 'gel', 'Bild', '<+NN>', 'no', 'B', 'Bi', 'Bil', 'd', 'ld', 'ild', 's'], ['N'])
(['Wolke', '<NN>', 'no', 'W', 'Wo', 'Wol', 'e', 'ke', 'lke', 'Schatten', '<+NN>', 'no', 'S', 'Sc', 'Sch', 'n', 'en', 'ten', 'n'], ['NN'])


In [26]:
matrixX = []
matrixY = []
for item in matrix:
    matrixX.append(item[0])
    matrixY.append(item[1][0])

In [27]:
X = np.array(matrixX)
y = np.array(matrixY)

In [28]:
kf.get_n_splits(X)

3

In [29]:
all_predicts = []
tests = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    predictions = []
    for vector in X_test:
        distances = []
        types = []
        for x in range(len(X_train)):
            distance = all_dist(X_train[x], vector)
            d_t = (distance, y_train[x])
            distances.append(d_t)
            types.append(y_train[x])
        distances_sort = sorted(distances)
        types_sort = []
        for item in distances_sort[:3]:
            types_sort.append(item[1])
        maxx = max(set(types_sort), key=types_sort.count)
        predictions.append(maxx)
    y_test = y_test.tolist()
    all_predicts.extend(predictions)
    tests.extend(y_test)
    
print(classification_report(tests, all_predicts))

             precision    recall  f1-score   support

         AN       0.44      0.62      0.52      1349
          N       0.39      0.24      0.30      1250
         NN       0.23      0.20      0.21       540

avg / total       0.39      0.40      0.38      3139

