In [4]:
import os
def restart_runtime():
  os.kill(os.getpid(), 9)

In [5]:
%%capture
%%bash
pip install tensorflow_hub
pip install tensorflow_text
pip install tensorflow
pip install tensorflow_recommenders
pip install sklearn
pip install 'scikit_learn~=0.23.0' #for random_gaussian projection matrix
pip install apache-beam[interactive] #tensorflow pipeline library
pip install scann #index library
pip install python-snappy #to speed up TFrecord managment

In [6]:
!git clone https://github.com/UniversalDot/tensorflow

Cloning into 'tensorflow'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 38 (delta 5), reused 21 (delta 4), pack-reused 15[K
Unpacking objects: 100% (38/38), done.
Checking out files: 100% (11/11), done.


In [None]:
restart_runtime()

In [1]:
import numpy as np
import pandas as pd

import os

import tensorflow as tf
import sklearn
import random

2022-09-27 07:51:01.054790: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-27 07:51:01.224902: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-27 07:51:01.977849: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2022-09-27 07:51:01.978016: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open 

### Content based model

Creating a tensorflow model for indexing


0.   Create embeddings and matrix projection preprocess
1.   Function to add and remove elements in the index
2.   Use ScanNN to index the closest embed in the file




In [2]:
import tensorflow as tf 
import tensorflow_hub as hub
import tensorflow_text
import tensorflow_recommenders as tfrc

In [3]:
def load_embedding(url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3'):
  return hub.load(url)

def embed_text(input, model = None):
  if model == None:
    model = load_embedding()
  return model.predict(input)

def random_projection_matrix_gen(original_dim:int, projected_dim:int, save_pickle:bool = False):
  """
  ## Inputs
  - **original_dim**: dimension of the embedding space
  - **project_dm**: dimension of the output shrinked space
  - **save_pickle**: if True saves the matrix weights in a pickle file(default: False)
  
  ## Outputs
  Random projection matrix as np array 
  """
  import pickle
  from sklearn.random_projection import gaussian_random_matrix

  random_projection_matrix = None
  if projected_dim and original_dim > projected_dim:
    random_projection_matrix = gaussian_random_matrix(
        n_components=projected_dim, n_features=original_dim).T
    print("A Gaussian random weight matrix was creates with shape of {}".format(random_projection_matrix.shape))
    
    if save_pickle:
      print('Storing random projection matrix to disk...')
      with open('random_projection_matrix', 'wb') as handle:
        pickle.dump(random_projection_matrix, 
                    handle, protocol=pickle.HIGHEST_PROTOCOL)
  return random_projection_matrix

In [4]:
def generate_embedding(input, embedding_model, random_project_matrix = None):
  
  if isinstance(input, tf.Tensor):
    x = embedding_model.predict(input)['outputs'].numpy()
  else:
    x = embedding_model.predict(tf.constant(input))['outputs'].numpy()
  
  if random_project_matrix is not None:
    x.dot(random_project_matrix)
    
  return x

def embed_fun(array, batch_size, embedding_model, rpm):
    for i in range(0, len(array), batch_size):
      batch = array[i:i+batch_size]
      x = embedding_model(batch).numpy()
      if rpm:
        x = x.dot(rpm)
      yield batch, x


In [5]:
def _format_jobdesc(string:str) -> str:
  formatted_string = string.replace('\xa0', ' ')
  formatted_string = formatted_string.replace('-', '')
  formatted_string = formatted_string.replace('â€¢', '')

  return formatted_string

In [6]:
embedding_model = load_embedding()

2022-09-27 07:51:16.868789: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-27 07:51:17.063038: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-27 07:51:17.063882: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-27 07:51:17.065529: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [7]:
jobdesc = pd.read_csv('/content/tensorflow/dataset/job_desc.csv', index_col = None, names = None)
jobdesc = np.array(list(jobdesc.jobdescription)) 
jobdesc = [_format_jobdesc(s) for s in jobdesc]

tcmalloc: large alloc 2145624064 bytes == 0x39fda000 @  0x7f61fe42a001 0x7f61f04aa1af 0x7f61f0506557 0x7f61f0506d1b 0x7f61f05a7333 0x58eb9c 0x51b4e6 0x5b41c5 0x604133 0x631049 0x58ea5d 0x51ae13 0x4cfb74 0x51b7ef 0x4cfb74 0x51b7ef 0x4cfb74 0x4d19df 0x51b31c 0x58f2a7 0x51740e 0x58f2a7 0x517947 0x5b41c5 0x4ba80a 0x4d29f9 0x5913c6 0x51908c 0x5b4a3e 0x58f49e 0x51837f


In [8]:
%%time
from tqdm import tqdm

s = np.ndarray((0), dtype = str)
emb = np.ndarray((0, 512), dtype = np.float32)
for batch in tqdm(embed_fun(jobdesc, 128, embedding_model, None)):
  s = np.append(s, batch[0])
  emb = np.append(emb, batch[1], axis=0)

99it [00:37,  1.66it/s]tcmalloc: large alloc 1210269696 bytes == 0xc15d2000 @  0x7f61fe42a001 0x7f61f04aa1af 0x7f61f0500c23 0x7f61f05a986d 0x7f61f05aa17f 0x7f61f05aa2d0 0x4ba22b 0x7f61f04eb944 0x58ebef 0x51ae13 0x5b41c5 0x58f49e 0x51837f 0x5b41c5 0x4ba899 0x7f61f04eb944 0x58ebef 0x51ae13 0x5b41c5 0x58f49e 0x51b221 0x5b41c5 0x604133 0x631049 0x58ea5d 0x51ae13 0x5b41c5 0x4ba899 0x51908c 0x5b41c5 0x58f49e
112it [00:46,  1.51it/s]tcmalloc: large alloc 1367605248 bytes == 0x161bd6000 @  0x7f61fe42a001 0x7f61f04aa1af 0x7f61f0500c23 0x7f61f05a986d 0x7f61f05aa17f 0x7f61f05aa2d0 0x4ba22b 0x7f61f04eb944 0x58ebef 0x51ae13 0x5b41c5 0x58f49e 0x51837f 0x5b41c5 0x4ba899 0x7f61f04eb944 0x58ebef 0x51ae13 0x5b41c5 0x58f49e 0x51b221 0x5b41c5 0x604133 0x631049 0x58ea5d 0x51ae13 0x5b41c5 0x4ba899 0x51908c 0x5b41c5 0x58f49e
123it [00:54,  1.15it/s]tcmalloc: large alloc 1546313728 bytes == 0xc15d2000 @  0x7f61fe42a001 0x7f61f04aa1af 0x7f61f0500c23 0x7f61f05a986d 0x7f61f05aa17f 0x7f61f05aa2d0 0x4ba22b 0x7f61f

CPU times: user 1min 30s, sys: 9.1 s, total: 1min 39s
Wall time: 1min 39s





In [9]:
print(s.shape)
print(emb.shape)
s_df = tf.data.Dataset.from_tensor_slices(s)
emb_df = tf.data.Dataset.from_tensor_slices(emb)

(22000,)
(22000, 512)


In [10]:
import tensorflow_recommenders as tfrs

metrics = tfrs.metrics.FactorizedTopK(
  candidates=emb_df.batch(128)
)
task = tfrs.tasks.Retrieval(
  metrics=metrics
)


class QueryModel(tfrs.Model):
  def __init__(self, embedding_model, random_projection_matrix = None, task = None) -> None:
    super(QueryModel, self).__init__()
    self.embedding_model:tf.keras.Model = embedding_model
    self.random_projection_matrix:tf.Tesnor or None = random_projection_matrix
    self.task: tf.keras.layers.Layer or None = task

  #@tf.function(input_signature=[tf.TensorSpec([], tf.string)])
  def __call__(self, input):
    x = self.embedding_model(input)
    if self.random_projection_matrix is not None:
      x = tf.matmul(x, self.random_projection_matrix)
    return x

In [31]:
#rpm = random_projection_matrix_gen(512, 64)

In [11]:
model = QueryModel(embedding_model, None, task)

In [12]:
%%time
import tensorflow_recommenders as tfrs
IndexNN = tfrs.layers.factorized_top_k.ScaNN(model)
IndexNN.index_from_dataset(s_df.batch(256).map(lambda x:(x,model(x))))

Cause: could not parse the source code of <function <lambda> at 0x7f6169412d40>: no matching AST found among candidates:



Cause: could not parse the source code of <function <lambda> at 0x7f6169412d40>: no matching AST found among candidates:

CPU times: user 19min 4s, sys: 12 s, total: 19min 16s
Wall time: 10min 19s


<tensorflow_recommenders.layers.factorized_top_k.ScaNN at 0x7f60d16eecd0>

In [13]:
_, res = IndexNN(tf.constant(['Looking for a python developer experienced in django and flask']))

In [14]:
res

<tf.Tensor: shape=(1, 10), dtype=string, numpy=
array([[b'Looking for Python programmer who will be doing python coding all dayPythonREST APILinuxAWS',
        b'Python Developer  Location: Houston, TXDuration: Fulltime Description:Client is looking for a high quality Python developer with loads of experience building scalable web applications to join its platform team. This is a highly technical position requiring a deep understanding of both Python and Django as well as PL/pgSQL and RDBMS technology.Requirements for this position 4+ years working with Python and Django Expected strong knowledge of Python 3.3+ and the latest Django software Strong knowledge of PL/pgSQL, PostgreSQL Strong knowledge of the Django ORM Strong knowledge of Python data structures and data flows Strong knowledge of the Linux command line and best practices as a Linux server administrator Some asynchronous programming experience  not necessarily in Python. Any further query you may contact at amit@iconma.com,

In [16]:
module_no_signatures_path = os.path.join('/content', 'recommendation_module')
print('Saving model...')
tf.saved_model.save(IndexNN, module_no_signatures_path)

Saving model...




In [17]:
imported = tf.saved_model.load(module_no_signatures_path)

In [19]:
imported(tf.constant(['Looking for a python developer experienced in django and flask']))

(<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
 array([[0.60617656, 0.56061584, 0.50254804, 0.4984687 , 0.47240555,
         0.46601087, 0.45382226, 0.449813  , 0.449813  , 0.4401013 ]],
       dtype=float32)>,
 <tf.Tensor: shape=(1, 10), dtype=string, numpy=
 array([[b'Looking for Python programmer who will be doing python coding all dayPythonREST APILinuxAWS',
         b'Python Developer  Location: Houston, TXDuration: Fulltime Description:Client is looking for a high quality Python developer with loads of experience building scalable web applications to join its platform team. This is a highly technical position requiring a deep understanding of both Python and Django as well as PL/pgSQL and RDBMS technology.Requirements for this position 4+ years working with Python and Django Expected strong knowledge of Python 3.3+ and the latest Django software Strong knowledge of PL/pgSQL, PostgreSQL Strong knowledge of the Django ORM Strong knowledge of Python data structures and data flows

In [23]:
!zip -r /content/model.zip /content/recomendations_module

  adding: content/recomendations_module/ (stored 0%)
  adding: content/recomendations_module/variables/ (stored 0%)
  adding: content/recomendations_module/variables/variables.data-00000-of-00001 (deflated 17%)
  adding: content/recomendations_module/variables/variables.index (deflated 79%)
  adding: content/recomendations_module/assets/ (stored 0%)
  adding: content/recomendations_module/saved_model.pb (deflated 11%)
