In [26]:
# Standard python modules
from functools import lru_cache
from os.path import expanduser, join

# Extra python modules
from bert_serving.client import BertClient
from bpemb import BPEmb
from IPython.display import display
#from gensim.models.fasttext import FastText
import numpy as np
from numpy.linalg import norm
import pandas as pd
import requests

import json
import csv
import itertools
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# INSTALL
#pip install bert-serving-server  # server
#pip install bert-serving-client  # client, independent of `bert-serving-server`

In [3]:
# The data we will work on is from the `dasem` toolbox available at https://github.com/fnielsen/dasem
# Only the four_words_2.csv file is necessary here, - not the rest of `dasem`.
#filename_four_words = expanduser(r'/home/s165541/Bert/four_words_english.csv')
#four_words = pd.read_csv(filename_four_words)

wordToSimilarWordsMap = {}
with open(r'C:/Users/thina/Documents/Matematik_og_teknologi/Advanced Machine Learning/FastText/four_words_english.json', "r") as read_file:
    data = json.load(read_file)
    for wordObj in data:
        wordToSimilarWordsMap[wordObj['word']] = wordObj['lexeme']

four_words_csv = []
with open(r'C:/Users/thina/Documents/Matematik_og_teknologi/Advanced Machine Learning/FastText/four_words_english.csv', newline='') as csvfile:
    data_csv = csv.reader(csvfile, delimiter='.', quotechar='|')
    for row in data_csv:
        words = row[0].split(",")
        words = [word.strip() for word in words]
        four_words_csv.append(words)  

four_words_csv.pop(0)
# The fasttext model is available from https://fasttext.cc/docs/en/crawl-vectors.html
#filename_fasttext_model = expanduser(join('~', 'data', 'fasttext', 'cc.da.300.bin'))

# multi_cased_L-12_H-768_A-12 may be be downloaded from
# https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
#https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip

#import zipfile
#with zipfile.ZipFile('/home/s165541/Bert/multi_cased_L-12_H-768_A-12.zip', 'r') as zip_ref:
#    zip_ref.extractall('/home/s165541/Bert')


# BPEmb are downloaded automatically

['word1', 'word2', 'word3', 'word4']

In [4]:
# Run in terminal 
# python start-bert-as-service.py -model_dir ./wwm_uncased_L-24_H-1024_A-16 -num_worker=1

In [5]:
bc = BertClient(ip='localhost')

In [6]:
print(bc.encode(wordToSimilarWordsMap['train']).shape)
print(four_words_csv[82])
print(wordToSimilarWordsMap[four_words_csv[82][1]])
print(four_words_csv[85][1].split())
print(wordToSimilarWordsMap[four_words_csv[85][0]])

(4, 1024)
['white', 'black', 'red', 'apple']
['blackest', 'blacker', 'black', 'കറുപ്പ്']
['kitchen']


KeyError: 'dining room'

# Knowledge Graph

In [30]:
corrcoef_outliers, cov_outliers, dot_outliers, cosine_outliers, svd_outliers, pca_outliers = [], [], [], [], [], []
for i in range(len(four_words_csv)): 
    vectors = np.zeros((4,1024))
    for j in range(4): 
        word = four_words_csv[i][j]
        if len(word.split()) is 1: 
            vectors[j,:] = np.mean(bc.encode(wordToSimilarWordsMap[word]), axis=0)
        elif len(word.split()) is 2: 
            word = word.split()
            vectors[j,:] = np.mean(bc.encode(wordToSimilarWordsMap[word[0]])+bc.encode(wordToSimilarWordsMap[word[1]]), axis=0)
        
    R = np.corrcoef(vectors)
    indices = np.argsort(R.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    corrcoef_outliers.append(outlier)

    C = np.cov(vectors)
    indices = np.argsort(C.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    cov_outliers.append(outlier)

    D = np.dot(vectors, vectors.T)
    indices = np.argsort(D.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    dot_outliers.append(outlier)
    
    CO = np.dot(vectors, vectors.T)
    square_mag = np.diag(CO)
    inv_square_mag = 1 / square_mag
    inv_square_mag[np.isinf(inv_square_mag)] = 0
    inv_mag = np.sqrt(inv_square_mag)
    cosine = CO * inv_mag
    cosine = cosine.T * inv_mag
    indices = np.argsort(cosine.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    cosine_outliers.append(outlier)
    
    
    vecs = np.transpose(vectors)
    MeanVector = np.mean(vecs, axis = 1)[:,np.newaxis]
    vecs = np.concatenate((vecs,MeanVector), axis = 1)
    
    # SVD
    U, s, Vh = np.linalg.svd(vecs,full_matrices=False)
    Dist = np.zeros(4)

    for j in range(4):
        Dist[j] = np.linalg.norm(U[4,0]-U[j,0])+np.linalg.norm(U[4,1]-U[j,1])
    outlier = four_words_csv[i][np.where(Dist == max(Dist))[0][0]]
    svd_outliers.append(outlier)
    
    
    # PCA
    pca_vecs = PCA(n_components=2, svd_solver = 'arpack')
    X_pca_vecs = pca_vecs.fit_transform(vecs) # Calculate PCA of vector

    Dist_pca_vecs = np.zeros(4) # Calculate distance 
    for j in range(4):
        Dist_pca_vecs[j] = np.linalg.norm(X_pca_vecs[4,0]-X_pca_vecs[j,0])+np.linalg.norm(X_pca_vecs[4,1]-X_pca_vecs[j,1])
    outlier = four_words_csv[i][np.where(Dist_pca_vecs == max(Dist_pca_vecs))[0][0]]
    pca_outliers.append(outlier)
    
    print('Progress', i)
    

TypeError: must be str, not list

In [16]:
sum_corr = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == corrcoef_outliers[i]:
        sum_corr += 1
    
sum_corr/len(four_words_csv)

0.4262295081967213

In [17]:
sum_cov = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == cov_outliers[i]:
        sum_cov += 1
    
sum_cov/len(four_words_csv)

0.2786885245901639

In [18]:
sum_dot = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == dot_outliers[i]:
        sum_dot += 1
    
sum_dot/len(four_words_csv)

0.2786885245901639

In [19]:
sum_svd = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == svd_outliers[i]:
        sum_svd += 1
    
sum_svd/len(four_words_csv)

0.23497267759562843

In [20]:
sum_pca = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == pca_outliers[i]:
        sum_pca += 1
    
sum_pca/len(four_words_csv)

0.3333333333333333

In [21]:
sum_cosine = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == cosine_outliers[i]:
        sum_cosine += 1
    
sum_cosine/len(four_words_csv)

0.4262295081967213

In [None]:
pd.set_option("display.max_rows", None, "display.max_columns", None)
pd.DataFrame({'Word 1': np.array(four_words_csv)[:,0], 'Word 2': np.array(four_words_csv)[:,1], 'Word 3': np.array(four_words_csv)[:,2], 'Word 4': np.array(four_words_csv)[:,3], 'corrcoef': corrcoef_outliers, 
            'cov':cov_outliers, 'dot':dot_outliers,'SVD':svd_outliers, 'PCA':pca_outliers , 'cosine': cosine_outliers})

# Four Words 

In [29]:
corrcoef_outliers, cov_outliers, dot_outliers, cosine_outliers, svd_outliers, pca_outliers = [], [], [], [], [], []
for i in range(len(four_words_csv)): 
    vectors = bc.encode(four_words_csv[i])
    
    R = np.corrcoef(vectors)
    indices = np.argsort(R.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    corrcoef_outliers.append(outlier)

    C = np.cov(vectors)
    indices = np.argsort(C.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    cov_outliers.append(outlier)

    D = np.dot(vectors, vectors.T)
    indices = np.argsort(D.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    dot_outliers.append(outlier)
    
    CO = np.dot(vectors, vectors.T)
    square_mag = np.diag(CO)
    inv_square_mag = 1 / square_mag
    inv_square_mag[np.isinf(inv_square_mag)] = 0
    inv_mag = np.sqrt(inv_square_mag)
    cosine = CO * inv_mag
    cosine = cosine.T * inv_mag
    indices = np.argsort(cosine.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    cosine_outliers.append(outlier)
    
    
    vecs = np.transpose(vectors)
    MeanVector = np.mean(vecs, axis = 1)[:,np.newaxis]
    vecs = np.concatenate((vecs,MeanVector), axis = 1)
    
    # SVD
    U, s, Vh = np.linalg.svd(vecs,full_matrices=False)
    Dist = np.zeros(4)

    for j in range(4):
        Dist[j] = np.linalg.norm(U[4,0]-U[j,0])+np.linalg.norm(U[4,1]-U[j,1])
    outlier = four_words_csv[i][np.where(Dist == max(Dist))[0][0]]
    svd_outliers.append(outlier)
    
    
    # PCA
    pca_vecs = PCA(n_components=2, svd_solver = 'arpack')
    X_pca_vecs = pca_vecs.fit_transform(vecs) # Calculate PCA of vector

    Dist_pca_vecs = np.zeros(4) # Calculate distance 
    for j in range(4):
        Dist_pca_vecs[j] = np.linalg.norm(X_pca_vecs[4,0]-X_pca_vecs[j,0])+np.linalg.norm(X_pca_vecs[4,1]-X_pca_vecs[j,1])
    outlier = four_words_csv[i][np.where(Dist_pca_vecs == max(Dist_pca_vecs))[0][0]]
    pca_outliers.append(outlier)
    
    print('Progress', i)
    

TypeError: must be str, not list

In [23]:
sum_corr = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == corrcoef_outliers[i]:
        sum_corr += 1
    
print('Outlier detection using correlation coefficients', sum_corr/len(four_words_csv))

sum_cov = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == cov_outliers[i]:
        sum_cov += 1
    
print('Outlier detection using covarians', sum_cov/len(four_words_csv))

sum_dot = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == dot_outliers[i]:
        sum_dot += 1
    
print('Outlier detection using dot', sum_dot/len(four_words_csv))

sum_svd = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == svd_outliers[i]:
        sum_svd += 1
    
print('Outlier detection using SVD', sum_svd/len(four_words_csv))

sum_pca = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == pca_outliers[i]:
        sum_pca += 1
    
print('Outlier detection using PCA', sum_pca/len(four_words_csv))

sum_cosine = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == cosine_outliers[i]:
        sum_cosine += 1
    
print('Outlier detection using cosine', sum_cosine/len(four_words_csv))

Outlier detection using correlation coefficients 0.40437158469945356
Outlier detection using covarians 0.28415300546448086
Outlier detection using dot 0.28415300546448086
Outlier detection using SVD 0.24043715846994534
Outlier detection using PCA 0.32786885245901637
Outlier detection using cosine 0.40437158469945356


In [27]:
pd.set_option("display.max_rows", None, "display.max_columns", None)
pd.DataFrame({'Word 1': np.array(four_words_csv)[:,0], 'Word 2': np.array(four_words_csv)[:,1], 'Word 3': np.array(four_words_csv)[:,2], 'Word 4': np.array(four_words_csv)[:,3], 'corrcoef': corrcoef_outliers, 
            'cov':cov_outliers, 'dot':dot_outliers,'SVD':svd_outliers, 'PCA':pca_outliers , 'cosine': cosine_outliers})

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,corrcoef,cov,dot,SVD,PCA,cosine
0,train,car,plane,exercise,car,car,car,exercise,plane,car
1,brother,sister,father,iron,sister,sister,sister,sister,sister,sister
2,stool,chair,couch,door,door,door,door,couch,couch,door
3,tree,grass,bush,clock,grass,grass,grass,bush,clock,grass
4,poster,photo,picture,camera,poster,camera,camera,picture,camera,poster
5,weight,height,length,meter,height,height,height,length,length,height
6,worm,earthworm,cockroach,tiger,cockroach,worm,worm,cockroach,cockroach,cockroach
7,cereals,milk,butter,knife,cereals,cereals,cereals,butter,butter,cereals
8,lunch,dinner,breakfast,kitchen,kitchen,kitchen,kitchen,kitchen,kitchen,kitchen
9,basketball,baseball,football,strategy,strategy,football,football,baseball,football,strategy
