<a href="https://colab.research.google.com/github/SebastianJia/nlp_research_conceptor/blob/master/Re_implement_CN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data
  

In [0]:
import numpy as np
import scipy, requests, codecs, os, re, nltk, itertools, csv
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering, KMeans
import tensorflow as tf
from scipy.stats import spearmanr
import pandas as pd
import functools as ft
import os
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
!pip install -q gdown
!gdown https://drive.google.com/uc?id=1U_UGB2vyTuTIcbV_oeDtJCtAtlFMvXOM # download a small subset of glove
!gdown https://drive.google.com/uc?id=1j_b4TRpL3f0HQ8mV17_CtOXp862YjxxB # download a small subset of word2vec
!ls

Downloading...
From: https://drive.google.com/uc?id=1U_UGB2vyTuTIcbV_oeDtJCtAtlFMvXOM
To: /content/small_glove.txt
333MB [00:04, 82.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1j_b4TRpL3f0HQ8mV17_CtOXp862YjxxB
To: /content/small_word2vec.txt
267MB [00:02, 90.9MB/s]
sample_data  small_glove.txt  small_word2vec.txt


In [0]:
!wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M.vec.zip
!unzip wiki-news-300d-1M.vec.zip
!ls

--2019-01-17 02:21:55--  https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki-news-300d-1M.vec.zip
Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 52.219.24.21
Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|52.219.24.21|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2019-01-17 02:22:28 (20.6 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]

Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   
sample_data	 small_word2vec.txt	wiki-news-300d-1M.vec.zip
small_glove.txt  wiki-news-300d-1M.vec


In [0]:
!du -h wiki-news-300d-1M.vec

2.2G	wiki-news-300d-1M.vec


# Load Fasttext, small GloVe and small word2vec data

In [0]:
import gensim

from gensim.models.keyedvectors import KeyedVectors

fasttext = KeyedVectors.load_word2vec_format('/content/' + 'wiki-news-300d-1M.vec')

In [0]:
!python -m gensim.scripts.glove2word2vec -i small_glove.txt -o small_glove_w2v.txt
!python -m gensim.scripts.glove2word2vec -i small_word2vec.txt -o small_w2v_w2v.txt

2019-01-17 02:26:37,164 - glove2word2vec - INFO - running /usr/local/lib/python3.6/dist-packages/gensim/scripts/glove2word2vec.py -i small_glove.txt -o small_glove_w2v.txt
2019-01-17 02:26:37,425 - glove2word2vec - INFO - converting 128607 vectors from small_glove.txt to small_glove_w2v.txt
2019-01-17 02:26:38,625 - glove2word2vec - INFO - Converted model with 128607 vectors and 300 dimensions
2019-01-17 02:26:40,325 - glove2word2vec - INFO - running /usr/local/lib/python3.6/dist-packages/gensim/scripts/glove2word2vec.py -i small_word2vec.txt -o small_w2v_w2v.txt
2019-01-17 02:26:40,539 - glove2word2vec - INFO - converting 76078 vectors from small_word2vec.txt to small_w2v_w2v.txt
2019-01-17 02:26:41,451 - glove2word2vec - INFO - Converted model with 76078 vectors and 300 dimensions


In [0]:
glove = KeyedVectors.load_word2vec_format('/content/' + 'small_glove_w2v.txt')
w2v = KeyedVectors.load_word2vec_format('/content/' + 'small_w2v_w2v.txt')

# Post-processing with CN

In [0]:
import io
def cn_mat(pre_cn_f_name, alpha):
  pre_cn_data = eval(pre_cn_f_name)
  #word_pairs = set(list(cn_data.keys()))
  cn_mat = []
  for word in pre_cn_data.vocab:
    cn_mat.append(pre_cn_data[word])
  word_vec = np.array(cn_mat, dtype = float).T
  num_word = word_vec.shape[1]
  num_vec = word_vec.shape[0]
  print(num_word, num_vec)
  corr_mat = np.dot(word_vec, word_vec.T)/num_word
  #print('got corr_mat')
  concept_mat = corr_mat @ np.linalg.inv(corr_mat + alpha ** (-2) * np.eye(num_vec))
  #print('got concep_mat')
  new_mat = ((np.eye(num_vec)-concept_mat)@word_vec).T
  #print('got new_mat')
  return new_mat
  
cn_fasttext_mat = cn_mat('fasttext', alpha = 2)
print('CN preprocess done for fasttext data')
cn_glove_mat = cn_mat('glove', alpha = 2)
print('CN preprocess done for glove data')
cn_w2v_mat = cn_mat('w2v', alpha =2)
print('CN preprocess done for w2v data')

999994 300
CN preprocess done for fasttext data
128607 300
CN preprocess done for glove data
76078 300
CN preprocess done for w2v data


# Experiment 1: Word similarity evaluation

#Load word similarity text data

In [0]:
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/wordSimData/EN-MEN-TR-3k.txt
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/wordSimData/EN-MTurk-287.txt
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/wordSimData/EN-RG-65.txt
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/wordSimData/EN-RW-STANFORD.txt
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/wordSimData/EN-SIMLEX-999.txt
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/wordSimData/EN-SimVerb-3500.txt
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/wordSimData/EN-WS-353-ALL.txt
!ls

--2019-01-17 02:37:22--  https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/wordSimData/EN-MEN-TR-3k.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53593 (52K) [text/plain]
Saving to: ‘EN-MEN-TR-3k.txt’


2019-01-17 02:37:23 (2.03 MB/s) - ‘EN-MEN-TR-3k.txt’ saved [53593/53593]

--2019-01-17 02:37:24--  https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/wordSimData/EN-MTurk-287.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7218 (7.0K) [text/plain]
Saving to: ‘EN-MTurk

In [0]:
!pwd

/content


#Compare word similarity scores and calculate Spearman Correlation

In [0]:
def get_sim(data_f_name, cn_f_name, cn_mat, alpha):
  cn_data = eval(cn_f_name)
  #word_pairs = set(list(cn_data.keys()))
  fin = io.open(data_f_name, 'r', encoding='utf-8', newline='\n', errors='ignore')
  dataset = []
  word_vec = []
  keys = []
  ls_word = list(cn_data.vocab)
  for line in fin:
    
    tokens = line.rstrip().split()
    if tokens[0] in cn_data.vocab and tokens[1] in cn_data.vocab:
      dataset.append(((tokens[0], tokens[1]), float(tokens[2])))
      id1 = ls_word.index(tokens[0])
      id2 = ls_word.index(tokens[1])
      word_vec.append(cn_mat[id1])
      word_vec.append(cn_mat[id2])
      keys.append(tokens[0])
      keys.append(tokens[1])
  dataset.sort(key = lambda score: -score[1]) #sort based on score
 # print(cn_data['gem'])
  cn_dataset = {}
  cn_dataset_list = []
  
  for ((word1, word2), score) in dataset:
    #print(word1, word2)
    id1 = ls_word.index(word1)
    id2 = ls_word.index(word2)
    sim_score = 1 - cosine_similarity(cn_mat[id1].reshape(1,-1), cn_mat[id2].reshape(1,-1))
    cn_dataset[(word1, word2)] = sim_score
    cn_dataset_list.append(((word1, word2),sim_score))
  cn_dataset_list.sort(key = lambda score: score[1])
  spearman_list1=[]
  spearman_list2=[]
  for pos_1, (pair, score_1) in enumerate(dataset):
    score_2 = cn_dataset[pair]
    pos_2 = cn_dataset_list.index((pair, score_2))
    spearman_list1.append(pos_1)
    spearman_list2.append(pos_2)
  rho = spearmanr(spearman_list1, spearman_list2)
  return rho[0] 



In [0]:
dataSets = ['EN-RG-65.txt', 'EN-WS-353-ALL.txt', 'EN-RW-STANFORD.txt', 'EN-MEN-TR-3k.txt', 'EN-MTurk-287.txt', 'EN-SIMLEX-999.txt', 'EN-SimVerb-3500.txt']
for dataset in dataSets:
    dataSetAddress = '/content/'+  dataset
    print('evaluating the data set', dataSetAddress)
    print('Fasttext ', 'GloVe ', 'w2v ')
    print("%.4f" % get_sim(dataSetAddress, 'fasttext',cn_fasttext_mat, alpha =2), "%.4f" % get_sim(dataSetAddress, 'glove', cn_glove_mat, alpha =2), "%.4f" % get_sim(dataSetAddress, 'w2v', cn_w2v_mat, alpha =2))

evaluating the data set /content/EN-RG-65.txt
Fasttext  GloVe  w2v 
0.8670 0.7913 0.7972
evaluating the data set /content/EN-WS-353-ALL.txt
Fasttext  GloVe  w2v 
0.7335 0.7886 0.6926
evaluating the data set /content/EN-RW-STANFORD.txt
Fasttext  GloVe  w2v 
0.5369 0.5898 0.5804
evaluating the data set /content/EN-MEN-TR-3k.txt
Fasttext  GloVe  w2v 
0.8064 0.8339 0.7869
evaluating the data set /content/EN-MTurk-287.txt
Fasttext  GloVe  w2v 
0.7110 0.7116 0.6662
evaluating the data set /content/EN-SIMLEX-999.txt
Fasttext  GloVe  w2v 
0.4567 0.4858 0.4684
evaluating the data set /content/EN-SimVerb-3500.txt
Fasttext  GloVe  w2v 
0.3654 0.3632 0.3830


# STS Benchmark

#Load STS datasets

In [0]:
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-dev.csv
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-mt.csv
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-other.csv
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-test.csv
!wget https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-train.csv

--2019-01-17 03:15:12--  https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-dev.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 255680 (250K) [text/plain]
Saving to: ‘sts-dev.csv’


2019-01-17 03:15:12 (4.82 MB/s) - ‘sts-dev.csv’ saved [255680/255680]

--2019-01-17 03:15:14--  https://raw.githubusercontent.com/liutianlin0121/Conceptor-Negation-WV/master/data/stsbenchmark/sts-mt.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 513141 (501K) [text/plain]
Saving to: ‘sts-mt.csv’


2019-01

In [0]:
!pwd
!ls

/content
EN-MEN-TR-3k.txt     sample_data	  sts-other.csv
EN-MTurk-287.txt     small_glove.txt	  sts-test.csv
EN-RG-65.txt	     small_glove_w2v.txt  sts-train.csv
EN-RW-STANFORD.txt   small_w2v_w2v.txt	  wiki-news-300d-1M.vec
EN-SIMLEX-999.txt    small_word2vec.txt   wiki-news-300d-1M.vec.zip
EN-SimVerb-3500.txt  sts-dev.csv
EN-WS-353-ALL.txt    sts-mt.csv


In [0]:
import io
def load_sts_dataset(fname):
      fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    # For a STS dataset, loads the relevant information: the sentences and their human rated similarity score.
      sent_pairs = []
      for line in fin:
          items = line.rstrip().split('\t')
          if len(items) == 7 or len(items) == 9:
              sent_pairs.append((re.sub("[^0-9]", "", items[2]) + '-' + items[1] , items[5], items[6], float(items[4])))
          elif len(items) == 6 or len(items) == 8:
              sent_pairs.append((re.sub("[^0-9]", "", items[1]) + '-' + items[0] , items[4], items[5], float(items[3])))
          else:
              print('data format is wrong!!!')
      return pd.DataFrame(sent_pairs, columns=["year_task", "sent_1", "sent_2", "sim"])






def load_all_sts_dataset():
    # Loads all of the STS datasets 
    resourceFile = '/content/'
    sts_train = load_sts_dataset(resourceFile + 'sts-train.csv') 
    sts_dev = load_sts_dataset(resourceFile + "sts-dev.csv")
    sts_test = load_sts_dataset(resourceFile + "sts-test.csv")
    sts_other = load_sts_dataset(resourceFile + "sts-other.csv")
    sts_mt = load_sts_dataset(resourceFile +"sts-mt.csv")
    
    sts_all = pd.concat([sts_train, sts_dev, sts_test, sts_other, sts_mt ])
    
    return sts_all

sts_all = load_all_sts_dataset()


# Load dataset by year-task

In [0]:
def load_by_task_year(sts_all):
  sts_task_year = {}
  for i in sts_all['year_task']:
    indices = [index for index, x in enumerate(sts_all['year_task']) if x == i]
    sts_task_year[i] = sts_all.iloc[indices]
  return sts_task_year
sts_year_task = load_by_task_year(sts_all)
print(sts_year_task.keys())
print(sts_year_task['2012-MSRvid'][0:5])

dict_keys(['2012-MSRvid', '2014-images', '2015-images', '2014-deft-forum', '2012-MSRpar', '2014-deft-news', '2013-headlines', '2014-headlines', '2015-headlines', '2016-headlines', '2017-track5.en-en', '2015-answers-forums', '2016-answer-answer', '2012-surprise.OnWN', '2013-FNWN', '2013-OnWN', '2014-OnWN', '2014-tweet-news', '2015-belief', '2016-plagiarism', '2016-question-question', '2012-SMTeuroparl', '2012-surprise.SMTnews', '2016-postediting'])
     year_task                                         sent_1  \
0  2012-MSRvid                         A plane is taking off.   
1  2012-MSRvid                A man is playing a large flute.   
2  2012-MSRvid  A man is spreading shreded cheese on a pizza.   
3  2012-MSRvid                   Three men are playing chess.   
4  2012-MSRvid                    A man is playing the cello.   

                                              sent_2   sim  
0                        An air plane is taking off.  5.00  
1                          A man is

# Load dataset by year

In [0]:
sts_year = {}
def load_by_year(sts_all):
  for year in ['2012', '2013', '2014', '2015', '2016', '2017']:
    indices = [index for index, x in enumerate(sts_all['year_task'])if year in x]
    # store year as dictionary, [year: year-task]
    #year_task = sts_all.iloc[indices]
    sts_year[year] = sts_all.iloc[indices]
  return sts_year
sts_year = load_by_year(sts_all)
print(len(sts_year.keys()))
print(sts_year['2016'][:5])

6
           year_task                                             sent_1  \
5552  2016-headlines  Driver backs into stroller with child, drives off   
5553  2016-headlines   Spain Princess Testifies in Historic Fraud Probe   
5554  2016-headlines  Senate confirms Obama nominee to key appeals c...   
5555  2016-headlines  U.N. rights chief presses Egypt on Mursi deten...   
5556  2016-headlines  US Senate confirms Janet Yellen as US Federal ...   

                                                 sent_2  sim  
5552  Driver backs into mom, stroller with child the...  4.0  
5553   Spain princess testifies in historic fraud probe  5.0  
5554  Senate approves Obama nominee to key appeals c...  5.0  
5555   UN Rights Chief Presses Egypt on Morsi Detention  5.0  
5556  Senate confirms Janet Yellen as next Federal R...  5.0  


# Preparation for STS Evaluation


*   Define Sentence class, which has raw data and tokenized data
*   Get similarity scores based on embeddings



In [0]:
class Sentence:
  def __init__(self, sentence):
    self.raw = sentence
    normalized = sentence.replace("‘", "'").replace("’", "'")
    self.tokens = [token.lower() for token in nltk.word_tokenize(normalized)]

def get_sim(sentences1, sentences2, cn_fname, cn_mat):
  model = eval(cn_fname)
  embeddings = []
  ls_word = list(model.vocab)
  for sent_1, sent_2 in zip(sentences1, sentences2):
    tokens1 = sent_1.tokens
    tokens2 = sent_2.tokens
    tokens1 = [token for token in tokens1 if token in model.vocab and token.islower()]
    tokens2 = [token for token in tokens2 if token in model.vocab and token.islower()]
    ids1 = [ls_word.index(token) for token in tokens1 ]
    ids2 = [ls_word.index(token) for token in tokens2 ]
    embedding1 = np.average([cn_mat[id] for id in ids1], axis = 0)
    embedding2 = np.average([cn_mat[id] for id in ids2], axis = 0)
    if isinstance(embedding1, float) or isinstance(embedding2, float):
      embeddings.append(np.zeros(300))
      embeddings.append(np.zeros(300))
    else:
      embeddings.append(embedding1)
      embeddings.append(embedding2)
  sim_score = [cosine_similarity(embeddings[id*2].reshape(1, -1), embeddings[id*2+1].reshape(1, -1))[0][0] for id in range(len(embeddings)//2)]
  return sim_score
        
  
  


In [0]:
model_list = ['glove', 'w2v', 'fasttext']
pearson_cors = {}
mat = []
for year_task in sts_all['year_task'].unique():
  for model in model_list:
    if model == 'glove':
      mat = cn_glove_mat
    elif model == 'w2v':
      mat = cn_w2v_mat
    elif model == 'fasttext':
      mat = cn_fasttext_mat
        
    sentences1=[Sentence(sent1) for sent1 in sts_year_task[year_task]['sent_1']]
    sentences2=[Sentence(sent2) for sent2 in sts_year_task[year_task]['sent_2']]
    sim = get_sim(sentences1, sentences2, model, mat)
    pearson_correlation = round(scipy.stats.pearsonr(sim, sts_year_task[year_task]['sim'])[0] * 100,2)
    pearson_cors[(model, year_task)] = pearson_correlation
count = 0
for (i,j) in pearson_cors.keys():
  if count % 3 ==0:
    print('')
  count +=1
  print(i, j, pearson_cors[(i,j)])
    

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)



glove 2012-MSRvid 62.5
w2v 2012-MSRvid 75.22
fasttext 2012-MSRvid 66.44

glove 2014-images 65.81
w2v 2014-images 78.24
fasttext 2014-images 63.41

glove 2015-images 71.43
w2v 2015-images 80.48
fasttext 2015-images 71.13

glove 2014-deft-forum 37.57
w2v 2014-deft-forum 42.8
fasttext 2014-deft-forum 40.18

glove 2012-MSRpar 41.19
w2v 2012-MSRpar 40.3
fasttext 2012-MSRpar 45.03

glove 2014-deft-news 69.08
w2v 2014-deft-news 65.57
fasttext 2014-deft-news 64.76

glove 2013-headlines 67.0
w2v 2013-headlines 64.78
fasttext 2013-headlines 67.04

glove 2014-headlines 61.71
w2v 2014-headlines 61.09
fasttext 2014-headlines 63.36

glove 2015-headlines 69.18
w2v 2015-headlines 68.88
fasttext 2015-headlines 69.84

glove 2016-headlines 67.19
w2v 2016-headlines 65.13
fasttext 2016-headlines 66.05

glove 2017-track5.en-en 65.42
w2v 2017-track5.en-en 73.44
fasttext 2017-track5.en-en 61.34

glove 2015-answers-forums 48.62
w2v 2015-answers-forums 53.66
fasttext 2015-answers-forums 45.04

glove 2016-answe