# 2. Experiment: toxic comments data

## A. Import modules 
## B. Import functions 
## C. Load  data
## D. Generate embeddings
### 1) SVD
### 2) word2vec
### 3) Poincare
## E. Results

# A. Import modules

In [40]:
# import libs
import numpy as np
import time
import gensim
from collections import Counter
from itertools import combinations
import pandas as pd
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from math import log, isnan
import random
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier

from gensim.models import word2vec
try:
    maketrans = ''.maketrans
except AttributeError:
    # fallback for Python 2
    from string import maketrans
    
import nltk

results = []

In [41]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# A. Import functions

In [42]:
def text_transformation(input_text):
    stemmer = LancasterStemmer()
    stop_words = set(stopwords.words("english"))
    
    
    result = input_text.lower().replace('\n',' ').translate(maketrans("","", string.punctuation)).strip().split(" ")
    
    result = [stemmer.stem(str(i)) for i in result if not i in stop_words]
    return result

def generate_svd(transactions_dict, n_dim = 300, negative = False):
	data_list = [v for _,v in transactions_dict.items()]

	unigrams_cnt = Counter()
	bigrams_cnt = Counter()
	for text in data_list:
		for x in text:
			unigrams_cnt[x] += 1
		for x, y in map(sorted, combinations(text, 2)):
			bigrams_cnt[(x, y)] += 1

	id2uni = {}
	uni2id = {}
	it = 0

	for uni,_ in unigrams_cnt.items():
		id2uni[it] = uni
		uni2id[uni] = it
		it +=1


	sum_uni = float(sum(unigrams_cnt.values()))
	sum_bi = float(sum(bigrams_cnt.values()))

	data, rows, cols = [], [], []
	for (x, y), n in bigrams_cnt.items():
		rows.append(uni2id[x])
		cols.append(uni2id[y])
		data.append(log((n / sum_bi) / (unigrams_cnt[x] / sum_uni) / (unigrams_cnt[y] / sum_uni)))
	PMI = csc_matrix((data, (cols, rows)), shape = (len(unigrams_cnt), len(unigrams_cnt)))
	U,_,_ = svds(PMI, k = n_dim)
	norms = np.sqrt(np.sum(np.square(U), axis=1, keepdims=True))
	U /= np.maximum(norms, 1e-7)
    
    

	result_t_dict = {}

	for key in transactions_dict.keys():
		for product in transactions_dict[key]:
			temp = [U[uni2id[product]] for product in transactions_dict[key]]
			result_t_dict[key] = power_means([x for x in temp])

   
	return result_t_dict

def generate_word2vec(transactions_dict, n_dim = 300, n_workers = 10, n_epochs = 20, negative = False):
	data_list = [v for _,v in transactions_dict.items()]
	window_size = max([len(x) for x in data_list])

	model = word2vec.Word2Vec(data_list, size = n_dim, window = window_size, min_count = 1, workers = n_workers)

	model.train(data_list, total_examples = len(data_list), epochs = n_epochs)

    
	result_t_dict = {}
	result_p_dict = {}

	for key in transactions_dict.keys():
		result_t_dict[key] = power_means([model[product] for product in transactions_dict[key]])

	return result_t_dict

def power_means(list_of_vectors, p = 1):
	data = np.array(list_of_vectors)

	return np.power(np.power(data,p).mean(axis=0), 1/p)

# B. Load data

In [43]:
dataset = pd.read_csv('toxic_comments.csv', nrows = 10000)
dataset['text'] = dataset.apply(lambda x: text_transformation(x['comment_text']), axis = 1)

data_dict = {}
score_dict = {}

for i,row in dataset.iterrows():
    data_dict[row['id']] = row['text']
    score_dict[row['id']] = row['toxic']

    
keys = list(data_dict.keys())
for key in keys:
    if data_dict[key] == []:
        data_dict.pop(key)
        score_dict.pop(key)

# C. Generate embeddings

## 1) SVD

In [81]:
svd_dim = 100

svd_embeddings = generate_svd(data_dict, n_dim = svd_dim)

X_svd = [svd_embeddings[key] for key in score_dict.keys()]
Y_svd = [score_dict[key] for key in score_dict.keys()]

## 2) word2vec

In [86]:
word2vec_dim = 20

word2vec_embeddings = generate_word2vec(data_dict, n_dim = word2vec_dim)

X_word2vec = [word2vec_embeddings[key] for key in score_dict.keys()]
Y_word2vec = [score_dict[key] for key in score_dict.keys()]



## 3) Poincare

In [103]:
poincare_dataset = pd.read_csv("text_data_poincare_A_10k_50d.tsv", sep="\t",header=None)

poincare_rels_dict = {}
poincare_dim = 50

for _,row in poincare_dataset.iterrows():
    poincare_rels_dict[row[0]] = [row[x] for x in range(1,poincare_dim+1)]

diff_key = list(set([obj for key in data_dict.keys() for obj in data_dict[key]])-set([key for key in poincare_rels_dict.keys()]))


In [104]:
  
poincare_embeddings = {}
for key in data_dict.keys():
    if key != '':
        poincare_embeddings[key] = power_means([poincare_rels_dict[obj] for obj in data_dict[key] if obj not in diff_key])
    
    if poincare_embeddings[key] == []:
        poincare_embeddings.pop(key)
        score_dict.pop(key)
    if str(poincare_embeddings[key]) == 'nan':
        poincare_embeddings.pop(key)
        if key in score_dict.keys():
            score_dict.pop(key)

  
  ret = ret.dtype.type(ret / rcount)
  


In [105]:
X_poincare = [poincare_embeddings[key] for key in score_dict.keys()]
Y_poincare = [score_dict[key] for key in score_dict.keys()]

# D. Results

# 0) Dummy classifier

In [68]:
_,_,Y_svd_train,Y_svd_test = train_test_split(X_svd,Y_svd,test_size=0.3,stratify=Y_svd,random_state=111)

d_clf = DummyClassifier(strategy='prior', random_state=0).fit([0 for i in range(len(Y_svd_train))],Y_svd_train)

scoretrain = d_clf.score([1 for i in range(len(Y_svd_train))],Y_svd_train)
scoretest  = d_clf.score([0 for i in range(len(Y_svd_test))],Y_svd_test)

# print("Dummy classifier training score :{:2f} , Test Score: {:2f} \n".format(scoretrain,scoretest))

results.append({"method":"dummy", "test_score":scoretest, "train_score":scoretrain,"dim":0})

## 1) SVD

In [83]:
X_svd_train,X_svd_test,Y_svd_train,Y_svd_test = train_test_split(X_svd,Y_svd,test_size=0.3,stratify=Y_svd,random_state=111)

SVC_svd = SVC(kernel='linear').fit(X_svd_train,Y_svd_train)

scoretrain = SVC_svd.score(X_svd_train,Y_svd_train)
scoretest  = SVC_svd.score(X_svd_test,Y_svd_test)

# print("Linear SVM training score :{:2f} , Test Score: {:2f} \n".format(scoretrain,scoretest))on

results.append({"method":"svd", "test_score":scoretest, "train_score":scoretrain,"dim":svd_dim})

## 2) word2vec

In [87]:
X_word2vec_train,X_word2vec_test,Y_word2vec_train,Y_word2vec_test = train_test_split(X_word2vec,Y_word2vec,test_size=0.3,stratify=Y_word2vec,random_state=111)


SVC_word2vec = SVC(kernel='linear').fit(X_word2vec_train,Y_word2vec_train)

scoretrain = SVC_word2vec.score(X_word2vec_train,Y_word2vec_train)
scoretest  = SVC_word2vec.score(X_word2vec_test,Y_word2vec_test)

# print("Linear SVM training score :{:2f} , Test Score: {:2f} \n".format(scoretrain,scoretest))

results.append({"method":"word2vec", "test_score":scoretest, "train_score":scoretrain,"dim":word2vec_dim})

## 3) Poincare

In [106]:
X_poincare_train,X_poincare_test,Y_poincare_train,Y_poincare_test = train_test_split(X_poincare,Y_poincare,test_size=0.3,stratify=Y_poincare,random_state=111)


SVC_poincare = SVC(kernel='linear').fit(X_poincare_train,Y_poincare_train)

scoretrain = SVC_poincare.score(X_poincare_train,Y_poincare_train)
scoretest  = SVC_poincare.score(X_poincare_test,Y_poincare_test)


# print("Linear SVM training score :{:2f} , Test Score: {:2f} \n".format(scoretrain,scoretest))

results.append({"method":"poincare", "test_score":scoretest, "train_score":scoretrain,"dim":poincare_dim})

# Results

In [107]:
df_results = pd.DataFrame(results)

df_results


Unnamed: 0,dim,method,test_score,train_score
0,0,dummy,0.903,0.902843
1,10,svd,0.903,0.902843
2,10,word2vec,0.924667,0.920703
3,10,poincare,0.922333,0.918834
4,0,dummy,0.903,0.902843
5,0,dummy,0.903,0.902843
6,0,dummy,0.903,0.902843
7,0,dummy,0.903,0.902843
8,0,dummy,0.903,0.902843
9,0,dummy,0.903,0.902843


In [108]:
df_results.to_csv('experiment2_results.csv',index=None)