In [0]:
!rm -rf bert
!git clone https://github.com/google-research/bert

Cloning into 'bert'...
remote: Enumerating objects: 336, done.[K
Receiving objects:   0% (1/336)   Receiving objects:   1% (4/336)   Receiving objects:   2% (7/336)   Receiving objects:   3% (11/336)   Receiving objects:   4% (14/336)   Receiving objects:   5% (17/336)   Receiving objects:   6% (21/336)   Receiving objects:   7% (24/336)   Receiving objects:   8% (27/336)   Receiving objects:   9% (31/336)   Receiving objects:  10% (34/336)   Receiving objects:  11% (37/336)   Receiving objects:  12% (41/336)   Receiving objects:  13% (44/336)   Receiving objects:  14% (48/336)   Receiving objects:  15% (51/336)   Receiving objects:  16% (54/336)   Receiving objects:  17% (58/336)   Receiving objects:  18% (61/336)   Receiving objects:  19% (64/336)   Receiving objects:  20% (68/336)   Receiving objects:  21% (71/336)   Receiving objects:  22% (74/336)   Receiving objects:  23% (78/336)   Receiving objects:  24% (81/336)   Receiving objects:  25% (84/336)   R

In [0]:
import sys

sys.path.append('bert/')

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import codecs
import collections
import json
import re
import os
import pprint
import numpy as np
import tensorflow as tf

import modeling
import tokenization

In [0]:
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.74.233.194:8470
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 9611210435262079889),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 15387922455441132777),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 8314216884389194026),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 14294924951444309688),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 10225460494399321503),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:

In [0]:
# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT large model
BERT_MODEL = 'uncased_L-12_H-768_A-12' #@param {type:"string"}
BERT_PRETRAINED_DIR = 'gs://cloud-tpu-checkpoints/bert/' + BERT_MODEL
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
!gsutil ls $BERT_PRETRAINED_DIR

***** BERT pretrained directory: gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12 *****
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_config.json
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_model.ckpt.index
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_model.ckpt.meta
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/checkpoint
gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/vocab.txt


In [0]:
LAYERS = [-1, -2, -3, -4]
NUM_TPU_CORES = 8
MAX_SEQ_LENGTH = 128
BERT_CONFIG = BERT_PRETRAINED_DIR + '/bert_config.json'
CHKPT_DIR = BERT_PRETRAINED_DIR + '/bert_model.ckpt'
VOCAB_FILE = BERT_PRETRAINED_DIR + '/vocab.txt'
INIT_CHECKPOINT = BERT_PRETRAINED_DIR + '/bert_model.ckpt'
BATCH_SIZE = 128

In [0]:
class InputExample(object):

  def __init__(self, unique_id, text_a, text_b=None):
    self.unique_id = unique_id
    self.text_a = text_a
    self.text_b = text_b


In [0]:
class InputFeatures(object):
  """A single set of features of data."""

  def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
    self.unique_id = unique_id
    self.tokens = tokens
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.input_type_ids = input_type_ids

In [0]:
def input_fn_builder(features, seq_length):
  """Creates an `input_fn` closure to be passed to TPUEstimator."""

  all_unique_ids = []
  all_input_ids = []
  all_input_mask = []
  all_input_type_ids = []

  for feature in features:
    all_unique_ids.append(feature.unique_id)
    all_input_ids.append(feature.input_ids)
    all_input_mask.append(feature.input_mask)
    all_input_type_ids.append(feature.input_type_ids)

  def input_fn(params):
    """The actual input function."""
    batch_size = params["batch_size"]

    num_examples = len(features)

    # This is for demo purposes and does NOT scale to large data sets. We do
    # not use Dataset.from_generator() because that uses tf.py_func which is
    # not TPU compatible. The right way to load data is with TFRecordReader.
    d = tf.data.Dataset.from_tensor_slices({
        "unique_ids":
            tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
        "input_ids":
            tf.constant(
                all_input_ids, shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_mask":
            tf.constant(
                all_input_mask,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_type_ids":
            tf.constant(
                all_input_type_ids,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
    })

    d = d.batch(batch_size=batch_size, drop_remainder=False)
    return d

  return input_fn

In [0]:
def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
                     use_one_hot_embeddings):
  """Returns `model_fn` closure for TPUEstimator."""

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    input_type_ids = features["input_type_ids"]

    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=input_type_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    if mode != tf.estimator.ModeKeys.PREDICT:
      raise ValueError("Only PREDICT modes are supported: %s" % (mode))

    tvars = tf.trainable_variables()
    scaffold_fn = None
    (assignment_map,
     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
         tvars, init_checkpoint)
    if use_tpu:

      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    all_layers = model.get_all_encoder_layers()

    predictions = {
        "unique_id": unique_ids,
    }

    for (i, layer_index) in enumerate(layer_indexes):
      predictions["layer_output_%d" % i] = all_layers[layer_index]

    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn

In [0]:
def convert_examples_to_features(examples, seq_length, tokenizer):
  """Loads a data file into a list of `InputBatch`s."""

  features = []
  for (ex_index, example) in enumerate(examples):
    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
      tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
      # Modifies `tokens_a` and `tokens_b` in place so that the total
      # length is less than the specified length.
      # Account for [CLS], [SEP], [SEP] with "- 3"
      _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
    else:
      # Account for [CLS] and [SEP] with "- 2"
      if len(tokens_a) > seq_length - 2:
        tokens_a = tokens_a[0:(seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    input_type_ids = []
    tokens.append("[CLS]")
    input_type_ids.append(0)
    for token in tokens_a:
      tokens.append(token)
      input_type_ids.append(0)
    tokens.append("[SEP]")
    input_type_ids.append(0)

    if tokens_b:
      for token in tokens_b:
        tokens.append(token)
        input_type_ids.append(1)
      tokens.append("[SEP]")
      input_type_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < seq_length:
      input_ids.append(0)
      input_mask.append(0)
      input_type_ids.append(0)

    assert len(input_ids) == seq_length
    assert len(input_mask) == seq_length
    assert len(input_type_ids) == seq_length

    if ex_index < 5:
      tf.logging.info("*** Example ***")
      tf.logging.info("unique_id: %s" % (example.unique_id))
      tf.logging.info("tokens: %s" % " ".join(
          [tokenization.printable_text(x) for x in tokens]))
      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
      tf.logging.info(
          "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))

    features.append(
        InputFeatures(
            unique_id=example.unique_id,
            tokens=tokens,
            input_ids=input_ids,
            input_mask=input_mask,
            input_type_ids=input_type_ids))
  return features

In [0]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum length."""

  # This is a simple heuristic which will always truncate the longer sequence
  # one token at a time. This makes more sense than truncating an equal percent
  # of tokens from each, since if one sequence is very short then each token
  # that's truncated likely contains more information than a longer sequence.
  while True:
    total_length = len(tokens_a) + len(tokens_b)
    if total_length <= max_length:
      break
    if len(tokens_a) > len(tokens_b):
      tokens_a.pop()
    else:
      tokens_b.pop()

In [0]:
def read_sequence(input_sentences):
  examples = []
  unique_id = 0
  for sentence in input_sentences:
    line = tokenization.convert_to_unicode(sentence)
    examples.append(InputExample(unique_id=unique_id, text_a=line))
    unique_id += 1
  return examples
  

In [0]:
def get_features(input_text, dim=768):
#   tf.logging.set_verbosity(tf.logging.INFO)

  layer_indexes = LAYERS

  bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

  tokenizer = tokenization.FullTokenizer(
      vocab_file=VOCAB_FILE, do_lower_case=True)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      tpu_config=tf.contrib.tpu.TPUConfig(
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=is_per_host))

  examples = read_sequence(input_text)

  features = convert_examples_to_features(
      examples=examples, seq_length=MAX_SEQ_LENGTH, tokenizer=tokenizer)

  unique_id_to_feature = {}
  for feature in features:
    unique_id_to_feature[feature.unique_id] = feature

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=INIT_CHECKPOINT,
      layer_indexes=layer_indexes,
      use_tpu=True,
      use_one_hot_embeddings=True)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=True,
      model_fn=model_fn,
      config=run_config,
      predict_batch_size=BATCH_SIZE,
      train_batch_size=BATCH_SIZE)

  input_fn = input_fn_builder(
      features=features, seq_length=MAX_SEQ_LENGTH)

  # Get features
  for result in estimator.predict(input_fn, yield_single_examples=True):
    unique_id = int(result["unique_id"])
    feature = unique_id_to_feature[unique_id]
    output = collections.OrderedDict()
    for (i, token) in enumerate(feature.tokens):
      layers = []
      for (j, layer_index) in enumerate(layer_indexes):
        layer_output = result["layer_output_%d" % j]
        layer_output_flat = np.array([x for x in layer_output[i:(i + 1)].flat])
        layers.append(layer_output_flat)
      output[token] = sum(layers)[:dim]
  
  return output

In [0]:
from sklearn.metrics.pairwise import cosine_similarity
embeddings11 = get_features(["Ram is a very good boy"])
print(embeddings11)
#embeddings22 = get_features(["pop is a cruel girl"])
#print(cosine_similarity(embeddings11['[CLS]'].reshape(1,-1), embeddings22['[CLS]'].reshape(1,-1)) )

INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 0
INFO:tensorflow:tokens: [CLS] ram is a very good boy [SEP]
INFO:tensorflow:input_ids: 101 8223 2003 1037 2200 2204 2879 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INF

In [0]:
#print(np.hstack((embeddings11['[CLS]'].reshape(1,-1), embeddings11['[CLS]'].reshape(1,-1) )).shape)
X = [0]*768
for key, value in embeddings11.items():
  for idx,j in enumerate(value):
    X[idx] += j

Y = [0]*768
for key, value in embeddings22.items():
  for idx,j in enumerate(value):
    Y[idx] += j

print(Y)
print(len(Y))

print(cosine_similarity(np.array(X).reshape(1,-1), np.array(Y).reshape(1,-1)))


[-9.480851113796234, -18.523557901382446, -8.419290542602539, 6.50225467979908, -4.907821550965309, 2.9926143884658813, 3.464988797903061, 20.16760605573654, 4.12293067574501, -12.226728439331055, 8.635327965021133, -3.317184641957283, 4.565897852182388, -0.3121004104614258, -18.207531690597534, 2.8462284058332443, 7.739456862211227, 4.459620181471109, 9.017326444387436, 8.561210632324219, -2.158512830734253, -0.7956739217042923, -10.39866653084755, 7.779000006616116, 3.789206340909004, 7.528216391801834, 0.3372573256492615, -3.619321346282959, 1.9700332283973694, -0.41765378415584564, 0.8599937483668327, 11.448735542595387, 12.164157062768936, 5.212155908346176, -5.316073536872864, -5.271736666560173, -4.745068073272705, 10.856734573841095, -18.757728964090347, 2.6204579696059227, -6.128585442900658, -10.282315254211426, 2.056325316429138, -6.604543536901474, -1.0314731001853943, -12.481860280036926, 2.450008511543274, 6.600971153005958, 12.386284083127975, -19.297965586185455, -7.826

In [0]:
print(len(embeddings['[CLS]']))
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
f = open("seed.txt", "a")

f.write("a"+ str(cosine_similarity(embeddings['[CLS]'].reshape(1,-1), embeddings['[CLS]'].reshape(1,-1) )[0][0])+"\n")
f.write("bbb")
f.close()

768


In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!unzip -uq "/content/drive/My Drive/trainsub.zip" -d "/content/drive/My Drive/Trainsub"

In [0]:
import os
from sklearn.metrics.pairwise import cosine_similarity

foldername = "/content/drive/My Drive/Trainsub/trainsub"
files = os.listdir(foldername)

f = open("trainsubcosine.txt", "a")

for i in files:
  filei = open(foldername+"/"+i)
  datai = filei.read();
  embeddingsi = get_features(datai)
  for j in files:
    if i==j: continue;
    filej = open(foldername+"/"+j)
    dataj = filej.read()
    embeddingsj = get_features(dataj)
    cs = cosine_similarity(embeddingsi['[CLS]'].reshape(1,-1), embeddingsj['[CLS]'].reshape(1,-1) )[0][0]
    f.write(str(i)+" "+str(j)+" "+str(cs)+"\n")
    
f.close()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INFO:tensorflow:Shutdown TPU system.
INFO:tensorflow:prediction_loop marked as finished
INFO:tensorflow:prediction_loop marked as finished
INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 0
INFO:tensorflow:tokens: [CLS] i [SEP]
INFO:tensorflow:input_ids: 101 1045 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_mask: 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 1
INFO:tensorflow:toke

In [0]:
import os
from sklearn.metrics.pairwise import cosine_similarity
import pickle

foldername = "/content/drive/My Drive/Trainsub/trainsub"
files = os.listdir(foldername)

for i in files:
  if(not i.endswith(".txt")): continue
  filei = open(foldername+"/"+i)
  datai = filei.read()
  #print(datai)
  embeddingsi = get_features([datai])
  X = [0]*768
  for key, value in embeddingsi.items():
    #print(key)
    #if(key in ['[CLS]', '[SEP]']): continue
    #print(key, "hello world")
    for idx,j in enumerate(value):
      X[idx] += j
  with open(foldername+"/"+i+".pkl", 'wb') as fp:
    pickle.dump(X, fp)

  #with open(foldername+"/"+i+".pkl", 'rb') as fp:
   # P = pickle.load(fp)
  #print(P)
  #break


<__main__.InputExample object at 0x7f6a80b70a90>
INFO:tensorflow:*** Example ***
INFO:tensorflow:unique_id: 0
INFO:tensorflow:tokens: [CLS] senior member of maha ##z - e - az ##adi , az ##am in ##qi ##lab ##i has said that if resistance leaders really want to bring kashmir out of log - jam and take the movement towards success , it becomes necessary for them to consider national honour and subject ##ivity as a master key and to bring unity within their ranks . resistance leadership across the border should , in voice , demand from india and pakistan to withdraw their forces from both sides of border so that people of united valley could determine their political future in own parliament according to principle of right to self - determination , he said . [SEP]
INFO:tensorflow:input_ids: 101 3026 2266 1997 24404 2480 1011 1041 1011 17207 17190 1010 17207 3286 1999 14702 20470 2072 2038 2056 2008 2065 5012 4177 2428 2215 2000 3288 13329 2041 1997 8833 1011 9389 1998 2202 1996 2929 2875 31

In [0]:
import os
from sklearn.metrics.pairwise import cosine_similarity
import pickle

foldername = "/content/drive/My Drive/Trainsub/trainsub"
files = os.listdir(foldername)
#print(files)
f = open("small.txt", "a")
for i in files:
  if(not i.endswith(".pkl")): continue
  with open(foldername+"/"+i, 'rb') as fp:
    P = pickle.load(fp)
  for j in files:
    if(not j.endswith(".pkl")): continue

    if i==j: continue
    with open(foldername+"/"+j, 'rb') as fq:
      Q = pickle.load(fq)
    cs = cosine_similarity(np.array(P).reshape(1,-1), np.array(Q).reshape(1,-1))[0][0]
    print(i, j, cs)
    f.write(str(i)+" "+str(j)+" "+str(cs))

f.close()
  #print(P)
  #break


1_3.txt.pkl 1_9.txt.pkl 0.9006199591682085
1_3.txt.pkl 1_10.txt.pkl 0.8815057770876091
1_3.txt.pkl 2_1.txt.pkl 0.9067148182167399
1_3.txt.pkl 2_2.txt.pkl 0.9120520742057465
1_3.txt.pkl 2_7.txt.pkl 0.8615628852086248
1_3.txt.pkl 2_8.txt.pkl 0.8919740330813452
1_3.txt.pkl 3_1.txt.pkl 0.8959671933166239
1_3.txt.pkl 3_3.txt.pkl 0.9036656217977492
1_3.txt.pkl 3_8.txt.pkl 0.8778445669589776
1_3.txt.pkl 3_9.txt.pkl 0.8511047401457096
1_3.txt.pkl 1_2.txt.pkl 0.9104169801938213
1_9.txt.pkl 1_3.txt.pkl 0.9006199591682085
1_9.txt.pkl 1_10.txt.pkl 0.9151044559838151
1_9.txt.pkl 2_1.txt.pkl 0.9044149678865228
1_9.txt.pkl 2_2.txt.pkl 0.9154032907454108
1_9.txt.pkl 2_7.txt.pkl 0.837158703770123
1_9.txt.pkl 2_8.txt.pkl 0.9103068117746156
1_9.txt.pkl 3_1.txt.pkl 0.8837502803978033
1_9.txt.pkl 3_3.txt.pkl 0.8701724712358857
1_9.txt.pkl 3_8.txt.pkl 0.9231257987490266
1_9.txt.pkl 3_9.txt.pkl 0.8993397462778129
1_9.txt.pkl 1_2.txt.pkl 0.9202250438836008
1_10.txt.pkl 1_3.txt.pkl 0.8815057770876091
1_10.txt.

In [0]:
import os
from sklearn.metrics.pairwise import cosine_similarity

seedfoldername = "/content/drive/My Drive/Semi/train/PositiveSeed"
nonseedfoldername =  "/content/drive/My Drive/Semi/train/Positive"

seedfiles = os.listdir(seedfoldername)
nonseedfiles = os.listdir(nonseedfoldername)

print(seedfiles)
f = open("seedPos.txt", "a")

for i in seedfiles:
  seedfile = open(seedfoldername+"/"+i)
  dataseedfile = seedfile.read();
  embeddingsseed = get_features(dataseedfile)
  for j in nonseedfiles:
    nonseedfile = open(nonseedfoldername+"/"+j)
    datanonseedfile = nonseedfile.read()
    embeddingsnonseed = get_features(datanonseedfile)
    cs = cosine_similarity(embeddingsseed['[CLS]'].reshape(1,-1), embeddingsnonseed['[CLS]'].reshape(1,-1) )[0][0]
    f.write(str(i)+" "+str(j)+" "+str(cs)+"\n")
    
f.close()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias: