# 1.2 Experiment: toxic comments data

## A. Import modules and functions
## B. Load & prepare data
## C. Load test data
## D. Experiment:a
### 1) SVD
### 2) word2vec
### 3) Bitmap
### 4) Poincare
## E. Results

# A. Import modules and functions

In [2]:
# import libs
import numpy as np
import time
import gensim
from collections import Counter
from itertools import combinations
import pandas as pd
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from math import log, isnan
import random
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
try:
    maketrans = ''.maketrans
except AttributeError:
    # fallback for Python 2
    from string import maketrans
    
import nltk


# import functions

from search_algorithm import cosine_distance, a_nn, calc_cutoff, power_means, generate_transactions, generate_patterns
from embeddings_script import generate_svd, generate_word2vec, generate_poincare, prepare_poincare_relations, prepare_poincare_relationsA, prepare_poincare_relationsB
from bitmap_index import gen_index, search



Using TensorFlow backend.


In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def text_transformation(input_text):
    stemmer = LancasterStemmer()
    stop_words = set(stopwords.words("english"))
    
    
    result = input_text.lower().replace('\n',' ').translate(maketrans("","", string.punctuation)).strip().split(" ")
    
    result = [stemmer.stem(str(i)) for i in result if not i in stop_words]
    return result

# B. Load Data

In [5]:
dataset = pd.read_csv('toxic_comments.csv', nrows = 5000)
dataset['text'] = dataset.apply(lambda x: text_transformation(x['comment_text']), axis = 1)

data_dict = {}

for i,row in dataset.iterrows():
    data_dict[row['id']] = row['text']



# C. Generate test data

In [6]:
a_time = time.time()

patterns_dict = {}
i = 0
for key in data_dict.keys():
    patterns_dict['p_'+str(i)] = data_dict[key][:1]
    i+=1
    if len(data_dict[key]) > 2:
        patterns_dict['p_'+str(i)] = data_dict[key][:2]
        i+=1
    if len(data_dict[key]) >3:
        patterns_dict['p_'+str(i)] = data_dict[key][:len(data_dict[key])-2]
        i+=1

for key in patterns_dict.keys():
    if str(patterns_dict[key]) == 'nan':
        patterns_dict.pop(key)


# for key in data_dict.keys():
#     if str(data_dict[key]) == 'nan':
#         data_dict.pop(key)
        
        
        
patterns_1 = [key for key in patterns_dict.keys() if len(patterns_dict[key]) == 1]
patterns_2 = [key for key in patterns_dict.keys() if len(patterns_dict[key]) == 2]
patterns_3 = [key for key in patterns_dict.keys() if len(patterns_dict[key]) > 2]

experimental_data = patterns_dict
experimental_data.update(data_dict)


to_delete = []
for key in experimental_data.keys():
    if experimental_data[key] == []:
        to_delete.append(key)
        
for key in to_delete:
    experimental_data.pop(key)

print("Processing took {} s".format(time.time()-a_time))

Processing took 0.20299744606018066 s


# D. Experiment

## 1) SVD

In [7]:
a_time = time.time()

svd = generate_svd(experimental_data, n_dim = 300)

p_data = {}
p_svd = {}
t_data = {}
t_svd = {}

for key in svd.keys():
    if str(key)[0] == 'p':
        p_svd[key] = svd[key]
        p_data[key] = experimental_data[key]
    else:
        t_svd[key] = svd[key]
        t_data[key] = experimental_data[key]

svd_cutoff = calc_cutoff(p_data,p_svd,t_data,t_svd)

print(svd_cutoff)

print("Processing took {} s".format(time.time()-a_time))


0.3016855185060599
Processing took 820.5123383998871 s


In [8]:
test_patterns1 = patterns_1[:5000] #75%/10k;59%/10k
test_patterns2 = patterns_2[:len(test_patterns1)] #65%/10k;23%/10k
test_patterns3 = patterns_3[:len(test_patterns1)] #86%/10k;43%/10k

# test_patterns = test_patterns1

# a_time = time.time()

# length = 0
# correct = 0
# for pattern in test_patterns:
#     for transaction in t_svd.keys():
#         if cosine_distance(p_svd[pattern],t_svd[transaction]) < svd_cutoff:
#             if set(p_data[pattern]) <= set(t_data[transaction]):
#                 correct += 1
            
#             break
            
#     length += 1

# print("Searching quality: {}".format(float(correct)/float(length)))            
# print("Processing took {} s".format(time.time()-a_time))


# test_patterns = test_patterns2

# a_time = time.time()

# length = 0
# correct = 0
# for pattern in test_patterns:
#     for transaction in t_svd.keys():
#         if cosine_distance(p_svd[pattern],t_svd[transaction]) < svd_cutoff:
#             if set(p_data[pattern]) <= set(t_data[transaction]):
#                 correct += 1
            
#             break
            
#     length += 1

# print("Searching quality: {}".format(float(correct)/float(length)))            
# print("Processing took {} s".format(time.time()-a_time))


test_patterns = test_patterns3

a_time = time.time()

length = 0
correct = 0
for pattern in test_patterns:
    for transaction in t_svd.keys():
        if cosine_distance(p_svd[pattern],t_svd[transaction]) < svd_cutoff:
            if set(p_data[pattern]) <= set(t_data[transaction]):
                correct += 1
            
            break
            
    length += 1

print("Searching quality: {}".format(float(correct)/float(length)))            
print("Processing took {} s".format(time.time()-a_time))


# Searching quality: 0.6741348269653931
# Processing took 3329.4174551963806 s
# Searching quality: 0.2996599319863973
# Processing took 3166.5946822166443 s
# Searching quality: 0.7632149046793761
# Processing took 3478.0479485988617 s

Searching quality: 0.7632149046793761
Processing took 3478.0479485988617 s


## 2) word2vec

In [6]:
a_time = time.time()


word2vec = generate_word2vec(experimental_data)

print("Model generation took {} s".format(time.time()-a_time))

p_data = {}
p_word2vec = {}
t_data = {}
t_word2vec = {}

for key in word2vec.keys():
    if str(key)[0] == 'p':
        p_word2vec[key] = word2vec[key]
        p_data[key] = experimental_data[key]
    else:
        t_word2vec[key] = word2vec[key]
        t_data[key] = experimental_data[key]

word2vec_cutoff = calc_cutoff(p_data,p_word2vec,t_data,t_word2vec)

print(word2vec_cutoff)

print("Processing took {} s".format(time.time()-a_time))

# Last run time:
# 5 s

Model generation took 90.14730715751648 s
0.5203972983027424
Processing took 93.18852305412292 s


In [7]:

word2vec_cutoff = calc_cutoff(p_data,p_word2vec,t_data,t_word2vec)

print(word2vec_cutoff)

0.5319072774727861


In [8]:


test_patterns1 = patterns_1[:10000] #75%/10k;59%/10k
test_patterns2 = patterns_2[:len(test_patterns1)] #65%/10k;23%/10k
test_patterns3 = patterns_3[:len(test_patterns1)] #86%/10k;43%/10k

test_patterns = test_patterns1

a_time = time.time()

length = 0
correct = 0
empty_run = 0
for pattern in test_patterns:
    found = False
    for transaction in t_word2vec.keys():
        if cosine_distance(p_word2vec[pattern],t_word2vec[transaction]) < word2vec_cutoff:
            if set(p_data[pattern]) <= set(t_data[transaction]):
                correct += 1
            found = True
            break
    if not found:
        empty_run+=1
    length += 1

print("Searching quality: {}".format(float(correct)/(float(length)-float(empty_run))))
print("Empty runs: {} Correct: {} TOTAL: {} ratio correct/total: {}".format(float(empty_run),float(correct),float(length),float(correct)/float(length)))            
print("Processing took {} s".format(time.time()-a_time))


test_patterns = test_patterns2

a_time = time.time()

length = 0
correct = 0
empty_run = 0
for pattern in test_patterns:
    for transaction in t_word2vec.keys():
        if cosine_distance(p_word2vec[pattern],t_word2vec[transaction]) < word2vec_cutoff:
            if set(p_data[pattern]) <= set(t_data[transaction]):
                correct += 1
            found = True
            break
            
    if not found:
        empty_run+=1
    length += 1

print("Searching quality: {}".format(float(correct)/(float(length)-float(empty_run))))
print("Empty runs: {} Correct: {} TOTAL: {} ratio correct/total: {}".format(float(empty_run),float(correct),float(length),float(correct)/float(length)))            
print("Processing took {} s".format(time.time()-a_time))



test_patterns = test_patterns3

a_time = time.time()

length = 0
correct = 0
empty_run = 0
for pattern in test_patterns:
    for transaction in t_word2vec.keys():
        if cosine_distance(p_word2vec[pattern],t_word2vec[transaction]) < word2vec_cutoff:
            if set(p_data[pattern]) <= set(t_data[transaction]):
                correct += 1
            found = True
            break
            
    if not found:
        empty_run+=1
                        
    length += 1

print("Searching quality: {}".format(float(correct)/(float(length)-float(empty_run))))
print("Empty runs: {} Correct: {} TOTAL: {} ratio correct/total: {}".format(float(empty_run),float(correct),float(length),float(correct)/float(length)))              
print("Processing took {} s".format(time.time()-a_time))


# Searching quality: 0.921667628390075
# Empty runs: 3067.0 Correct: 6389.0 TOTAL: 9999.0 ratio correct/total: 0.638963896389639
# Processing took 525.1704280376434 s
# Searching quality: 0.23445026178010472
# Empty runs: 0.0 Correct: 2239.0 TOTAL: 9550.0 ratio correct/total: 0.23445026178010472
# Processing took 430.62952637672424 s
# Searching quality: 0.2741135180927024
# Empty runs: 0.0 Correct: 2265.0 TOTAL: 8263.0 ratio correct/total: 0.2741135180927024
# Processing took 186.3489751815796 s

# Searching quality: 0.9070177648985763
# Empty runs: 2062.0 Correct: 7199.0 TOTAL: 9999.0 ratio correct/total: 0.7199719971997199
# Processing took 417.22366738319397 s
# Searching quality: 0.14607329842931938
# Empty runs: 0.0 Correct: 1395.0 TOTAL: 9550.0 ratio correct/total: 0.14607329842931938
# Processing took 223.90713953971863 s
# Searching quality: 0.09972165073217959
# Empty runs: 0.0 Correct: 824.0 TOTAL: 8263.0 ratio correct/total: 0.09972165073217959
# Processing took 72.7106556892395 s

Searching quality: 0.904968644476604
Empty runs: 1707.0 Correct: 7504.0 TOTAL: 9999.0 ratio correct/total: 0.7504750475047505
Processing took 377.24962401390076 s
Searching quality: 0.15081508150815082
Empty runs: 0.0 Correct: 1508.0 TOTAL: 9999.0 ratio correct/total: 0.15081508150815082
Processing took 229.97160744667053 s
Searching quality: 0.0560302866414278
Empty runs: 0.0 Correct: 518.0 TOTAL: 9245.0 ratio correct/total: 0.0560302866414278
Processing took 43.96140766143799 s


## 3) Bitmap index

In [None]:
a_time = time.time()

t_index, i_index = gen_index(experimental_data, d = 64)

print("Index generated")

print("Processing took {} s".format(time.time()-a_time))

# Last run time:
# 80 s


In [None]:

a_time = time.time()


test_dict = {}
test_i = {}
pattern_dict = {}
pattern_i = {}

# test_keys = [key for key in data_dict.keys() if str(key)[0] == 't']

for key in t_index.keys():
    if str(key)[0] != 'p':
        test_dict[key] = t_index[key]
        test_i[key] = i_index[key]
    elif str(key)[0] == 'p':
        pattern_dict[key] = t_index[key]
        pattern_i[key] = i_index[key]
    

print("Processing took {} s".format(time.time() - a_time))

In [None]:
test_patterns1 = patterns_1[:10000] #75%/10k;59%/10k
test_patterns2 = patterns_2[:len(test_patterns1)] #65%/10k;23%/10k
test_patterns3 = patterns_3[:len(test_patterns1)] #86%/10k;43%/10k

test_patterns = test_patterns1

a_time = time.time()

count = 0
correct = 0
real_correct = 0
for key in test_patterns:
    for t_key in test_dict.keys():
        trigger = False
        if set(experimental_data[key]) <= set(experimental_data[t_key]):
            trigger = True
        if search(pattern_dict[key],pattern_i[key],test_dict[t_key],test_i[t_key]):
            correct +=1
            if trigger:
                real_correct += 1
            break
    count+=1

            
print("Searching quality: {} {}".format(float(correct)/float(count),float(real_correct)/float(count)))            
print("Processing took {} s".format(time.time()-a_time))


test_patterns = test_patterns2

a_time = time.time()


count = 0
correct = 0
real_correct = 0
for key in test_patterns:
    for t_key in test_dict.keys():
        trigger = False
        if set(experimental_data[key]) <= set(experimental_data[t_key]):
            trigger = True
        if search(pattern_dict[key],pattern_i[key],test_dict[t_key],test_i[t_key]):
            correct +=1
            if trigger:
                real_correct += 1
            break
    count+=1

            
print("Searching quality: {} {}".format(float(correct)/float(count),float(real_correct)/float(count)))            
print("Processing took {} s".format(time.time()-a_time))

test_patterns = test_patterns3

a_time = time.time()


count = 0
correct = 0
real_correct = 0
for key in test_patterns:
    for t_key in test_dict.keys():
        trigger = False
        if set(experimental_data[key]) <= set(experimental_data[t_key]):
            trigger = True
        if search(pattern_dict[key],pattern_i[key],test_dict[t_key],test_i[t_key]):
            correct +=1
            if trigger:
                real_correct += 1
            break
    count+=1

            
print("Searching quality: {} {}".format(float(correct)/float(count),float(real_correct)/float(count)))            
print("Processing took {} s".format(time.time()-a_time))

# Searching quality: 1.0 1.0
# Processing took 40.4065318107605 s
# Searching quality: 1.0 1.0
# Processing took 104.36041164398193 s
# Searching quality: 1.0 1.0
# Processing took 156.75257325172424 s


## 4) Poincare

In [14]:
#load embeddings
poincare_embeddings = pd.read_csv("text_data_poincare_A_full_10d.tsv", sep="\t",header=None)

poincare_rels_dict = {}

for _,row in poincare_embeddings.iterrows():
#     print(row[0])
#     print([row[x] for x in range(1,11)])
#     break
    poincare_rels_dict[row[0]] = [row[x] for x in range(1,11)]


In [10]:
poincare_embeddings.shape

(196141, 11)

In [22]:
p_keys = list(set([key for key in poincare_rels_dict.keys()]))
exp_keys = list(set([obj for key in experimental_data.keys() for obj in experimental_data[key]]))
difference = list(set(exp_keys)-set(p_keys))
difference

['', 'nan', 'null']

In [23]:
a_time = time.time()
poincare = {}
for key in experimental_data.keys():
    if key != '':
        poincare[key] = power_means([poincare_rels_dict[obj] for obj in experimental_data[key] if obj not in difference])
    
    if poincare[key] == []:
        poincare.pop(key)
    if str(poincare[key]) == 'nan':
        poincare.pop(key)

patterns_dict = {}
poincare_patterns = {}
t_data = {}
poincare_transactions = {}

for key in poincare.keys():
    if str(key)[0] == 'p':
        poincare_patterns[key] = poincare[key]
        patterns_dict[key] = experimental_data[key]
    else:
        poincare_transactions[key] = poincare[key]
        t_data[key] = experimental_data[key]

        
    

poincare_cutoff = calc_cutoff(patterns_dict,poincare_patterns,t_data,poincare_transactions)

print(poincare_cutoff)

print("Processing took {} s".format(time.time()-a_time))

  import sys
  return np.power(np.power(data,p).mean(axis=0), 1/p)
  ret = ret.dtype.type(ret / rcount)
  import sys


0.4390734031031121
Processing took 9.89581561088562 s


In [25]:
test_patterns1 = [key for key in poincare_patterns.keys() if len(experimental_data[key]) == 1][:5000] #75%/10k;59%/10k
test_patterns2 = [key for key in poincare_patterns.keys() if len(experimental_data[key]) == 2][:len(patterns_1)] #65%/10k;23%/10k
test_patterns3 = [key for key in poincare_patterns.keys() if len(experimental_data[key]) == 3][:len(patterns_1)] #86%/10k;43%/10k

test_patterns = test_patterns1

a_time = time.time()

length = 0
correct = 0
empty_run = 0
for pattern in test_patterns:
    for transaction in poincare_transactions.keys():
        found = False
        if cosine_distance(poincare_patterns[pattern],poincare_transactions[transaction]) < poincare_cutoff:
            if set(patterns_dict[pattern]) <= set(data_dict[transaction]):
                correct += 1
            found = True
            break
    if not found:
        empty_run+=1
            
    length += 1


print("Searching quality: {}".format(float(correct)/(float(length)-float(empty_run))))
print("Empty runs: {} Correct: {} TOTAL: {} ratio correct/total: {}".format(float(empty_run),float(correct),float(length),float(correct)/float(length)))              
print("Processing took {} s".format(time.time()-a_time))


test_patterns = test_patterns2

a_time = time.time()

correct = 0
empty_run = 0
for pattern in test_patterns:
    for transaction in poincare_transactions.keys():
        found = False
        if cosine_distance(poincare_patterns[pattern],poincare_transactions[transaction]) < poincare_cutoff:
            if set(patterns_dict[pattern]) <= set(data_dict[transaction]):
                correct += 1
            found = True
            break
    if not found:
        empty_run+=1
            
    length += 1


print("Searching quality: {}".format(float(correct)/(float(length)-float(empty_run))))
print("Empty runs: {} Correct: {} TOTAL: {} ratio correct/total: {}".format(float(empty_run),float(correct),float(length),float(correct)/float(length)))              
print("Processing took {} s".format(time.time()-a_time))

test_patterns = test_patterns3

a_time = time.time()

correct = 0
empty_run = 0
for pattern in test_patterns:
    for transaction in poincare_transactions.keys():
        found = False
        if cosine_distance(poincare_patterns[pattern],poincare_transactions[transaction]) < poincare_cutoff:
            if set(patterns_dict[pattern]) <= set(data_dict[transaction]):
                correct += 1
            found = True
            break
    if not found:
        empty_run+=1
            
    length += 1


print("Searching quality: {}".format(float(correct)/(float(length)-float(empty_run))))
print("Empty runs: {} Correct: {} TOTAL: {} ratio correct/total: {}".format(float(empty_run),float(correct),float(length),float(correct)/float(length)))              
print("Processing took {} s".format(time.time()-a_time))


# C, d == 10
# Searching quality: 0.0762
# Processing took 0.2690761089324951 s
# Searching quality: 0.013961072047284214
# Processing took 0.4956812858581543 s
# Searching quality: 0.005426275696545967
# Processing took 0.4933924674987793 s

# A, d == 10
# Searching quality: 0.0914
# Empty runs: 0.0 Correct: 457.0 TOTAL: 5000.0 ratio correct/total: 0.0914
# Processing took 1.212188959121704 s
# Searching quality: 0.014467631175411694
# Empty runs: 0.0 Correct: 217.0 TOTAL: 14999.0 ratio correct/total: 0.014467631175411694
# Processing took 2.1888270378112793 s
# Searching quality: 0.00013034410844629823
# Empty runs: 0.0 Correct: 2.0 TOTAL: 15344.0 ratio correct/total: 0.00013034410844629823
# Processing took 0.05595088005065918 s

Searching quality: 0.0914
Empty runs: 0.0 Correct: 457.0 TOTAL: 5000.0 ratio correct/total: 0.0914
Processing took 1.212188959121704 s
Searching quality: 0.014467631175411694
Empty runs: 0.0 Correct: 217.0 TOTAL: 14999.0 ratio correct/total: 0.014467631175411694
Processing took 2.1888270378112793 s
Searching quality: 0.00013034410844629823
Empty runs: 0.0 Correct: 2.0 TOTAL: 15344.0 ratio correct/total: 0.00013034410844629823
Processing took 0.05595088005065918 s
