In [1]:
# Standard python modules
from functools import lru_cache
from os.path import expanduser, join

# Extra python modules
from bert_serving.client import BertClient
from bpemb import BPEmb
from IPython.display import display
#from gensim.models.fasttext import FastText
import numpy as np
from numpy.linalg import norm
import pandas as pd
import requests

import json
import csv
import itertools
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# INSTALL
#pip install bert-serving-server  # server
#pip install bert-serving-client  # client, independent of `bert-serving-server`

In [3]:
# The data we will work on is from the `dasem` toolbox available at https://github.com/fnielsen/dasem
# Only the four_words_2.csv file is necessary here, - not the rest of `dasem`.
#filename_four_words = expanduser(r'/home/s165541/Bert/four_words_english.csv')
#four_words = pd.read_csv(filename_four_words)

wordToSimilarWordsMap = {}
with open(r'C:/Users/thina/Documents/Matematik_og_teknologi/Advanced Machine Learning/FastText/four_words_english.json', "r") as read_file:
    data = json.load(read_file)
    for wordObj in data:
        wordToSimilarWordsMap[wordObj['word']] = wordObj['lexeme']

four_words_csv = []
with open(r'C:/Users/thina/Documents/Matematik_og_teknologi/Advanced Machine Learning/FastText/four_words_english.csv', newline='') as csvfile:
    data_csv = csv.reader(csvfile, delimiter='.', quotechar='|')
    for row in data_csv:
        words = row[0].split(",")
        words = [word.strip() for word in words]
        four_words_csv.append(words)  

four_words_csv.pop(0)
# The fasttext model is available from https://fasttext.cc/docs/en/crawl-vectors.html
#filename_fasttext_model = expanduser(join('~', 'data', 'fasttext', 'cc.da.300.bin'))

# multi_cased_L-12_H-768_A-12 may be be downloaded from
# https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
#https://storage.googleapis.com/bert_models/2019_05_30/wwm_uncased_L-24_H-1024_A-16.zip

#import zipfile
#with zipfile.ZipFile('/home/s165541/Bert/multi_cased_L-12_H-768_A-12.zip', 'r') as zip_ref:
#    zip_ref.extractall('/home/s165541/Bert')


# BPEmb are downloaded automatically

['word1', 'word2', 'word3', 'word4']

In [4]:
# Run in terminal 
# python start-bert-as-service.py -model_dir ./multi_cased_L-12_H-768_A-12 -num_worker=1

In [5]:
bc = BertClient(ip='localhost')

In [6]:
corrcoef_outliers, cov_outliers, dot_outliers, cosine_outliers, svd_outliers, pca_outliers = [], [], [], [], [], []
for i in range(len(four_words_csv)): 
    vectors = np.zeros((4,1024))
    for j in range(4): 
        word = four_words_csv[i][j]
        if len(word.split()) is 1: 
            vectors[j,:] = np.mean(bc.encode(['similarity '+word]), axis=0)
        elif len(word.split()) is 2: 
            word = word.split()
            vectors[j,:] = np.mean(bc.encode(['similarity '+word[0]])+bc.encode(['similarity '+word[1]]), axis=0)
        
    R = np.corrcoef(vectors)
    indices = np.argsort(R.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    corrcoef_outliers.append(outlier)

    C = np.cov(vectors)
    indices = np.argsort(C.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    cov_outliers.append(outlier)

    D = np.dot(vectors, vectors.T)
    indices = np.argsort(D.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    dot_outliers.append(outlier)
    
    CO = np.dot(vectors, vectors.T)
    square_mag = np.diag(CO)
    inv_square_mag = 1 / square_mag
    inv_square_mag[np.isinf(inv_square_mag)] = 0
    inv_mag = np.sqrt(inv_square_mag)
    cosine = CO * inv_mag
    cosine = cosine.T * inv_mag
    indices = np.argsort(cosine.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    cosine_outliers.append(outlier)
    
    
    vecs = np.transpose(vectors)
    MeanVector = np.mean(vecs, axis = 1)[:,np.newaxis]
    vecs = np.concatenate((vecs,MeanVector), axis = 1)
    
    # SVD
    U, s, Vh = np.linalg.svd(vecs,full_matrices=False)
    Dist = np.zeros(4)

    for j in range(4):
        Dist[j] = np.linalg.norm(U[4,0]-U[j,0])+np.linalg.norm(U[4,1]-U[j,1])
    outlier = four_words_csv[i][np.where(Dist == max(Dist))[0][0]]
    svd_outliers.append(outlier)
    
    
    # PCA
    pca_vecs = PCA(n_components=2, svd_solver = 'arpack')
    X_pca_vecs = pca_vecs.fit_transform(vecs) # Calculate PCA of vector

    Dist_pca_vecs = np.zeros(4) # Calculate distance 
    for j in range(4):
        Dist_pca_vecs[j] = np.linalg.norm(X_pca_vecs[4,0]-X_pca_vecs[j,0])+np.linalg.norm(X_pca_vecs[4,1]-X_pca_vecs[j,1])
    outlier = four_words_csv[i][np.where(Dist_pca_vecs == max(Dist_pca_vecs))[0][0]]
    pca_outliers.append(outlier)
    
    print('Progress', i)
    

Progress 0
Progress 1
Progress 2
Progress 3
Progress 4
Progress 5
Progress 6
Progress 7
Progress 8
Progress 9
Progress 10
Progress 11
Progress 12
Progress 13
Progress 14
Progress 15
Progress 16
Progress 17
Progress 18
Progress 19
Progress 20
Progress 21
Progress 22
Progress 23
Progress 24
Progress 25
Progress 26
Progress 27
Progress 28
Progress 29
Progress 30
Progress 31
Progress 32
Progress 33
Progress 34
Progress 35
Progress 36
Progress 37
Progress 38
Progress 39
Progress 40
Progress 41
Progress 42
Progress 43
Progress 44
Progress 45
Progress 46
Progress 47
Progress 48
Progress 49
Progress 50
Progress 51
Progress 52
Progress 53
Progress 54
Progress 55
Progress 56
Progress 57
Progress 58
Progress 59
Progress 60
Progress 61
Progress 62
Progress 63
Progress 64
Progress 65
Progress 66
Progress 67
Progress 68
Progress 69
Progress 70
Progress 71
Progress 72
Progress 73
Progress 74
Progress 75
Progress 76
Progress 77
Progress 78
Progress 79
Progress 80
Progress 81
Progress 82
Progress 83
Pr

In [7]:
sum_corr = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == corrcoef_outliers[i]:
        sum_corr += 1
    
sum_corr/len(four_words_csv)

0.6174863387978142

In [8]:
sum_cov = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == cov_outliers[i]:
        sum_cov += 1
    
sum_cov/len(four_words_csv)

0.3333333333333333

In [9]:
sum_dot = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == dot_outliers[i]:
        sum_dot += 1
    
sum_dot/len(four_words_csv)

0.3333333333333333

In [10]:
sum_svd = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == svd_outliers[i]:
        sum_svd += 1
    
sum_svd/len(four_words_csv)

0.08196721311475409

In [11]:
sum_pca = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == pca_outliers[i]:
        sum_pca += 1
    
sum_pca/len(four_words_csv)

0.0273224043715847

In [12]:
sum_cosine = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == cosine_outliers[i]:
        sum_cosine += 1
    
sum_cosine/len(four_words_csv)

0.6174863387978142

In [13]:
pd.set_option("display.max_rows", None, "display.max_columns", None)
pd.DataFrame({'Word 1': np.array(four_words_csv)[:,0], 'Word 2': np.array(four_words_csv)[:,1], 'Word 3': np.array(four_words_csv)[:,2], 'Word 4': np.array(four_words_csv)[:,3], 'corrcoef': corrcoef_outliers, 
            'cov':cov_outliers, 'dot':dot_outliers,'SVD':svd_outliers, 'PCA':pca_outliers , 'cosine': cosine_outliers})

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,corrcoef,cov,dot,SVD,PCA,cosine
0,train,car,plane,exercise,car,plane,plane,car,train,car
1,brother,sister,father,iron,iron,sister,sister,brother,sister,iron
2,stool,chair,couch,door,door,couch,couch,couch,stool,door
3,tree,grass,bush,clock,clock,tree,tree,tree,tree,clock
4,poster,photo,picture,camera,camera,camera,camera,poster,poster,camera
5,weight,height,length,meter,meter,meter,meter,height,weight,meter
6,worm,earthworm,cockroach,tiger,cockroach,worm,worm,cockroach,worm,cockroach
7,cereals,milk,butter,knife,knife,knife,knife,cereals,milk,knife
8,lunch,dinner,breakfast,kitchen,kitchen,kitchen,kitchen,lunch,lunch,kitchen
9,basketball,baseball,football,strategy,strategy,baseball,baseball,baseball,basketball,strategy


## Using sentences

In [14]:
corrcoef_outliers, cov_outliers, dot_outliers, cosine_outliers, svd_outliers, pca_outliers = [], [], [], [], [], []
for i in range(len(four_words_csv)): 
    vectors = np.zeros((4,1024))
    word0 = four_words_csv[i][0].split()[0]
    word1 = four_words_csv[i][1].split()[0]
    word2 = four_words_csv[i][2].split()[0]
    word3 = four_words_csv[i][3].split()[0]
    
    vectors[0,:] = bc.encode(['How similar is '+word0+' to '+word1+' , '+word2+' and '+word3])
    vectors[1,:] = bc.encode(['How similar is '+word1+' to '+word0+' , '+word2+' and '+word3])
    vectors[2,:] = bc.encode(['How similar is '+word2+' to '+word0+' , '+word1+' and '+word3])
    vectors[3,:] = bc.encode(['How similar is '+word3+' to '+word0+' , '+word1+' and '+word2])
    
    
    R = np.corrcoef(vectors)
    indices = np.argsort(R.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    corrcoef_outliers.append(outlier)

    C = np.cov(vectors)
    indices = np.argsort(C.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    cov_outliers.append(outlier)

    D = np.dot(vectors, vectors.T)
    indices = np.argsort(D.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    dot_outliers.append(outlier)
    
    CO = np.dot(vectors, vectors.T)
    square_mag = np.diag(CO)
    inv_square_mag = 1 / square_mag
    inv_square_mag[np.isinf(inv_square_mag)] = 0
    inv_mag = np.sqrt(inv_square_mag)
    cosine = CO * inv_mag
    cosine = cosine.T * inv_mag
    indices = np.argsort(cosine.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    cosine_outliers.append(outlier)
    
    
    vecs = np.transpose(vectors)
    MeanVector = np.mean(vecs, axis = 1)[:,np.newaxis]
    vecs = np.concatenate((vecs,MeanVector), axis = 1)
    
    # SVD
    U, s, Vh = np.linalg.svd(vecs,full_matrices=False)
    Dist = np.zeros(4)

    for j in range(4):
        Dist[j] = np.linalg.norm(U[4,0]-U[j,0])+np.linalg.norm(U[4,1]-U[j,1])
    outlier = four_words_csv[i][np.where(Dist == max(Dist))[0][0]]
    svd_outliers.append(outlier)
    
    
    # PCA
    pca_vecs = PCA(n_components=2, svd_solver = 'arpack')
    X_pca_vecs = pca_vecs.fit_transform(vecs) # Calculate PCA of vector

    Dist_pca_vecs = np.zeros(4) # Calculate distance 
    for j in range(4):
        Dist_pca_vecs[j] = np.linalg.norm(X_pca_vecs[4,0]-X_pca_vecs[j,0])+np.linalg.norm(X_pca_vecs[4,1]-X_pca_vecs[j,1])
    outlier = four_words_csv[i][np.where(Dist_pca_vecs == max(Dist_pca_vecs))[0][0]]
    pca_outliers.append(outlier)
    
    print('Progress', i)
    

Progress 0
Progress 1
Progress 2
Progress 3
Progress 4
Progress 5
Progress 6
Progress 7
Progress 8
Progress 9
Progress 10
Progress 11
Progress 12
Progress 13
Progress 14
Progress 15
Progress 16
Progress 17
Progress 18
Progress 19
Progress 20
Progress 21
Progress 22
Progress 23
Progress 24
Progress 25
Progress 26
Progress 27
Progress 28
Progress 29
Progress 30
Progress 31
Progress 32
Progress 33
Progress 34
Progress 35
Progress 36
Progress 37
Progress 38
Progress 39
Progress 40
Progress 41
Progress 42
Progress 43
Progress 44
Progress 45
Progress 46
Progress 47
Progress 48
Progress 49
Progress 50
Progress 51
Progress 52
Progress 53
Progress 54
Progress 55
Progress 56
Progress 57
Progress 58
Progress 59
Progress 60
Progress 61
Progress 62
Progress 63
Progress 64
Progress 65
Progress 66
Progress 67
Progress 68
Progress 69
Progress 70
Progress 71
Progress 72
Progress 73
Progress 74
Progress 75
Progress 76
Progress 77
Progress 78
Progress 79
Progress 80
Progress 81
Progress 82
Progress 83
Pr

In [15]:
sum_corr = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == corrcoef_outliers[i]:
        sum_corr += 1
    
print('Outlier detection using correlation coefficients', sum_corr/len(four_words_csv))

sum_cov = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == cov_outliers[i]:
        sum_cov += 1
    
print('Outlier detection using covarians', sum_cov/len(four_words_csv))

sum_dot = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == dot_outliers[i]:
        sum_dot += 1
    
print('Outlier detection using dot', sum_dot/len(four_words_csv))

sum_svd = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == svd_outliers[i]:
        sum_svd += 1
    
print('Outlier detection using SVD', sum_svd/len(four_words_csv))

sum_pca = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == pca_outliers[i]:
        sum_pca += 1
    
print('Outlier detection using PCA', sum_pca/len(four_words_csv))

sum_cosine = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == cosine_outliers[i]:
        sum_cosine += 1
    
print('Outlier detection using cosine', sum_cosine/len(four_words_csv))

Outlier detection using correlation coefficients 0.825136612021858
Outlier detection using covarians 0.2677595628415301
Outlier detection using dot 0.2677595628415301
Outlier detection using SVD 0.08743169398907104
Outlier detection using PCA 0.01639344262295082
Outlier detection using cosine 0.825136612021858


In [16]:
corrcoef_outliers, cov_outliers, dot_outliers, cosine_outliers, svd_outliers, pca_outliers = [], [], [], [], [], []
for i in range(len(four_words_csv)): 
    vectors = np.zeros((4,1024))
    word0 = four_words_csv[i][0].split()[0]
    word1 = four_words_csv[i][1].split()[0]
    word2 = four_words_csv[i][2].split()[0]
    word3 = four_words_csv[i][3].split()[0]
    
    vectors[0,:] = bc.encode(['Does '+word0+' match with '+word1+' , '+word2+' and '+word3])
    vectors[1,:] = bc.encode(['Does '+word1+' match with '+word0+' , '+word2+' and '+word3])
    vectors[2,:] = bc.encode(['Does '+word2+' match with '+word0+' , '+word1+' and '+word3])
    vectors[3,:] = bc.encode(['Does '+word3+' match with '+word0+' , '+word1+' and '+word2])
    
    
    R = np.corrcoef(vectors)
    indices = np.argsort(R.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    corrcoef_outliers.append(outlier)

    C = np.cov(vectors)
    indices = np.argsort(C.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    cov_outliers.append(outlier)

    D = np.dot(vectors, vectors.T)
    indices = np.argsort(D.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    dot_outliers.append(outlier)
    
    CO = np.dot(vectors, vectors.T)
    square_mag = np.diag(CO)
    inv_square_mag = 1 / square_mag
    inv_square_mag[np.isinf(inv_square_mag)] = 0
    inv_mag = np.sqrt(inv_square_mag)
    cosine = CO * inv_mag
    cosine = cosine.T * inv_mag
    indices = np.argsort(cosine.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    cosine_outliers.append(outlier)
    
    
    vecs = np.transpose(vectors)
    MeanVector = np.mean(vecs, axis = 1)[:,np.newaxis]
    vecs = np.concatenate((vecs,MeanVector), axis = 1)
    
    # SVD
    U, s, Vh = np.linalg.svd(vecs,full_matrices=False)
    Dist = np.zeros(4)

    for j in range(4):
        Dist[j] = np.linalg.norm(U[4,0]-U[j,0])+np.linalg.norm(U[4,1]-U[j,1])
    outlier = four_words_csv[i][np.where(Dist == max(Dist))[0][0]]
    svd_outliers.append(outlier)
    
    
    # PCA
    pca_vecs = PCA(n_components=2, svd_solver = 'arpack')
    X_pca_vecs = pca_vecs.fit_transform(vecs) # Calculate PCA of vector

    Dist_pca_vecs = np.zeros(4) # Calculate distance 
    for j in range(4):
        Dist_pca_vecs[j] = np.linalg.norm(X_pca_vecs[4,0]-X_pca_vecs[j,0])+np.linalg.norm(X_pca_vecs[4,1]-X_pca_vecs[j,1])
    outlier = four_words_csv[i][np.where(Dist_pca_vecs == max(Dist_pca_vecs))[0][0]]
    pca_outliers.append(outlier)
    
    print('Progress', i)
    

Progress 0
Progress 1
Progress 2
Progress 3
Progress 4
Progress 5
Progress 6
Progress 7
Progress 8
Progress 9
Progress 10
Progress 11
Progress 12
Progress 13
Progress 14
Progress 15
Progress 16
Progress 17
Progress 18
Progress 19
Progress 20
Progress 21
Progress 22
Progress 23
Progress 24
Progress 25
Progress 26
Progress 27
Progress 28
Progress 29
Progress 30
Progress 31
Progress 32
Progress 33
Progress 34
Progress 35
Progress 36
Progress 37
Progress 38
Progress 39
Progress 40
Progress 41
Progress 42
Progress 43
Progress 44
Progress 45
Progress 46
Progress 47
Progress 48
Progress 49
Progress 50
Progress 51
Progress 52
Progress 53
Progress 54
Progress 55
Progress 56
Progress 57
Progress 58
Progress 59
Progress 60
Progress 61
Progress 62
Progress 63
Progress 64
Progress 65
Progress 66
Progress 67
Progress 68
Progress 69
Progress 70
Progress 71
Progress 72
Progress 73
Progress 74
Progress 75
Progress 76
Progress 77
Progress 78
Progress 79
Progress 80
Progress 81
Progress 82
Progress 83
Pr

In [17]:
sum_corr = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == corrcoef_outliers[i]:
        sum_corr += 1
    
print('Outlier detection using correlation coefficients', sum_corr/len(four_words_csv))

sum_cov = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == cov_outliers[i]:
        sum_cov += 1
    
print('Outlier detection using covarians', sum_cov/len(four_words_csv))

sum_dot = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == dot_outliers[i]:
        sum_dot += 1
    
print('Outlier detection using dot', sum_dot/len(four_words_csv))

sum_svd = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == svd_outliers[i]:
        sum_svd += 1
    
print('Outlier detection using SVD', sum_svd/len(four_words_csv))

sum_pca = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == pca_outliers[i]:
        sum_pca += 1
    
print('Outlier detection using PCA', sum_pca/len(four_words_csv))

sum_cosine = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == cosine_outliers[i]:
        sum_cosine += 1
    
print('Outlier detection using cosine', sum_cosine/len(four_words_csv))

Outlier detection using correlation coefficients 0.7377049180327869
Outlier detection using covarians 0.5846994535519126
Outlier detection using dot 0.5846994535519126
Outlier detection using SVD 0.09289617486338798
Outlier detection using PCA 0.12021857923497267
Outlier detection using cosine 0.7377049180327869


In [18]:
corrcoef_outliers, cov_outliers, dot_outliers, cosine_outliers, svd_outliers, pca_outliers = [], [], [], [], [], []
for i in range(len(four_words_csv)): 
    vectors = np.zeros((4,1024))
    word0 = four_words_csv[i][0].split()[0]
    word1 = four_words_csv[i][1].split()[0]
    word2 = four_words_csv[i][2].split()[0]
    word3 = four_words_csv[i][3].split()[0]
    
    vectors[0,:] = np.mean(bc.encode(['How similar is '+word0+' to '+word1])+bc.encode(['How similar is '+word0+' to '+word2])+bc.encode(['How similar is '+word0+' to '+word3]), axis=0)                       
    vectors[1,:] = np.mean(bc.encode(['How similar is '+word1+' to '+word0])+bc.encode(['How similar is '+word1+' to '+word2])+bc.encode(['How similar is '+word1+' to '+word3]), axis=0)
    vectors[2,:] = np.mean(bc.encode(['How similar is '+word2+' to '+word0])+bc.encode(['How similar is '+word2+' to '+word1])+bc.encode(['How similar is '+word2+' to '+word3]), axis=0)
    vectors[3,:] = np.mean(bc.encode(['How similar is '+word3+' to '+word0])+bc.encode(['How similar is '+word3+' to '+word1])+bc.encode(['How similar is '+word3+' to '+word2]), axis=0)
    
    
    R = np.corrcoef(vectors)
    indices = np.argsort(R.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    corrcoef_outliers.append(outlier)

    C = np.cov(vectors)
    indices = np.argsort(C.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    cov_outliers.append(outlier)

    D = np.dot(vectors, vectors.T)
    indices = np.argsort(D.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    dot_outliers.append(outlier)
    
    CO = np.dot(vectors, vectors.T)
    square_mag = np.diag(CO)
    inv_square_mag = 1 / square_mag
    inv_square_mag[np.isinf(inv_square_mag)] = 0
    inv_mag = np.sqrt(inv_square_mag)
    cosine = CO * inv_mag
    cosine = cosine.T * inv_mag
    indices = np.argsort(cosine.sum(axis=0))
    outlier = four_words_csv[i][indices[0]]
    cosine_outliers.append(outlier)
    
    
    vecs = np.transpose(vectors)
    MeanVector = np.mean(vecs, axis = 1)[:,np.newaxis]
    vecs = np.concatenate((vecs,MeanVector), axis = 1)
    
    # SVD
    U, s, Vh = np.linalg.svd(vecs,full_matrices=False)
    Dist = np.zeros(4)

    for j in range(4):
        Dist[j] = np.linalg.norm(U[4,0]-U[j,0])+np.linalg.norm(U[4,1]-U[j,1])
    outlier = four_words_csv[i][np.where(Dist == max(Dist))[0][0]]
    svd_outliers.append(outlier)
    
    
    # PCA
    pca_vecs = PCA(n_components=2, svd_solver = 'arpack')
    X_pca_vecs = pca_vecs.fit_transform(vecs) # Calculate PCA of vector

    Dist_pca_vecs = np.zeros(4) # Calculate distance 
    for j in range(4):
        Dist_pca_vecs[j] = np.linalg.norm(X_pca_vecs[4,0]-X_pca_vecs[j,0])+np.linalg.norm(X_pca_vecs[4,1]-X_pca_vecs[j,1])
    outlier = four_words_csv[i][np.where(Dist_pca_vecs == max(Dist_pca_vecs))[0][0]]
    pca_outliers.append(outlier)
    
    print('Progress', i)
    

Progress 0
Progress 1
Progress 2
Progress 3
Progress 4
Progress 5
Progress 6
Progress 7
Progress 8
Progress 9
Progress 10
Progress 11
Progress 12
Progress 13
Progress 14
Progress 15
Progress 16
Progress 17
Progress 18
Progress 19
Progress 20
Progress 21
Progress 22
Progress 23
Progress 24
Progress 25
Progress 26
Progress 27
Progress 28
Progress 29
Progress 30
Progress 31
Progress 32
Progress 33
Progress 34
Progress 35
Progress 36
Progress 37
Progress 38
Progress 39
Progress 40
Progress 41
Progress 42
Progress 43
Progress 44
Progress 45
Progress 46
Progress 47
Progress 48
Progress 49
Progress 50
Progress 51
Progress 52
Progress 53
Progress 54
Progress 55
Progress 56
Progress 57
Progress 58
Progress 59
Progress 60
Progress 61
Progress 62
Progress 63
Progress 64
Progress 65
Progress 66
Progress 67
Progress 68
Progress 69
Progress 70
Progress 71
Progress 72
Progress 73
Progress 74
Progress 75
Progress 76
Progress 77
Progress 78
Progress 79
Progress 80
Progress 81
Progress 82
Progress 83
Pr

In [19]:
sum_corr = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == corrcoef_outliers[i]:
        sum_corr += 1
    
print('Outlier detection using correlation coefficients', sum_corr/len(four_words_csv))

sum_cov = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == cov_outliers[i]:
        sum_cov += 1
    
print('Outlier detection using covarians', sum_cov/len(four_words_csv))

sum_dot = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == dot_outliers[i]:
        sum_dot += 1
    
print('Outlier detection using dot', sum_dot/len(four_words_csv))

sum_svd = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == svd_outliers[i]:
        sum_svd += 1
    
print('Outlier detection using SVD', sum_svd/len(four_words_csv))

sum_pca = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == pca_outliers[i]:
        sum_pca += 1
    
print('Outlier detection using PCA', sum_pca/len(four_words_csv))

sum_cosine = 0
for i in range(len(four_words_csv)):
    if four_words_csv[i][3] == cosine_outliers[i]:
        sum_cosine += 1
    
print('Outlier detection using cosine', sum_cosine/len(four_words_csv))

Outlier detection using correlation coefficients 0.6994535519125683
Outlier detection using covarians 0.09836065573770492
Outlier detection using dot 0.09836065573770492
Outlier detection using SVD 0.03825136612021858
Outlier detection using PCA 0.00546448087431694
Outlier detection using cosine 0.6994535519125683
