In [22]:
!pip install ndjson



In [23]:
import numpy as np
import ndjson
from sklearn.feature_extraction.text import CountVectorizer
from numpy.linalg import svd

In [24]:
f = open('/content/cran_experiment.ndjson')
file = ndjson.reader(f)
#corpus
corpus = [d['title'] for d in file]
corpus

['experimental investigation of the aerodynamics of a\nwing in a slipstream .',
 'simple shear flow past a flat plate in an incompressible fluid of small\nviscosity .',
 'the boundary layer in simple shear flow past a flat plate .',
 'approximate solutions of the incompressible laminar\nboundary layer equations for a plate in shear flow .',
 'one-dimensional transient heat conduction into a double-layer\nslab subjected to a linear heat input for a small time\ninternal .',
 'one-dimensional transient heat flow in a multilayer\nslab .',
 'the effect of controlled three-dimensional roughness\non boundary layer transition at supersonic speeds .',
 'measurements of the effect of two-dimensional and three-dimensional\nroughness elements on boundary layer transition .',
 'transition studies and skin friction measurements on\nan insulated flat plate at a mach number of 5.8 .',
 'the theory of the impact tube at low pressure .',
 'similar solutions in compressible laminar free mixing\nproblems 

In [25]:
#query to be searched
query = ['theory of three dimensional flow in subsonic and supersonic']

In [26]:
vectorizer = CountVectorizer()
td = vectorizer.fit_transform(corpus)

vectorizer.get_feature_names_out()  #vocabulry in ascending order

array(['000', '01', '02', ..., 'zero', 'zone', 'zoom'], dtype=object)

In [27]:
td_mat = td.T.toarray()
print("Term-document matrix\n", td_mat)

Term-document matrix
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [28]:
#SVD of the Term-Document matrix
[u,s,vt] = svd(td_mat,False)
print(u)
print(s)
print(vt)

[[-3.28147049e-03 -2.20692233e-04  3.30412749e-04 ...  2.60801001e-02
   4.26681604e-02  8.06280165e-03]
 [-3.06009282e-03  6.18691577e-03 -4.21341452e-03 ...  7.64734320e-03
  -6.08814238e-03  2.05914210e-02]
 [-1.26428758e-03  1.38508936e-03 -2.03424063e-03 ... -1.26947786e-02
   4.79556231e-02  3.03599277e-02]
 ...
 [-8.25039625e-03  3.04790653e-03  1.09283178e-03 ... -1.12757026e-17
   1.11889664e-16 -6.86299975e-17]
 [-1.14908949e-04 -5.69106190e-04  1.68572165e-03 ... -7.56172836e-03
   1.24016603e-02 -1.65962883e-02]
 [-2.43490601e-04  7.63616206e-04 -2.74698607e-04 ... -5.03416569e-02
   2.66991809e-02  4.23238775e-03]]
[5.99885869e+01 2.58684822e+01 2.44801670e+01 ... 4.23050503e-15
 4.23050503e-15 4.23050503e-15]
[[-3.70907376e-02 -1.98079458e-02 -1.52261733e-02 ... -1.77321570e-02
  -1.40711215e-02 -2.25736772e-02]
 [ 5.37071629e-03 -1.39656033e-02 -6.60808257e-02 ...  2.43348548e-02
   2.17260456e-02 -6.94974656e-04]
 [-1.91540433e-02  1.81860034e-02  1.20911862e-02 ...  1.

In [29]:
#Transforming the query into a vector
q = vectorizer.transform(query)
q = q.toarray()
print(q)

[[0 0 0 ... 0 0 0]]


In [30]:
#Finding the new query vector
q = np.matmul(np.matmul(q,u),np.linalg.inv(np.diag(s)))
print(q)

[[-0.02320735 -0.00431524  0.04513855 ...  0.0389036   0.01993874
  -0.06482973]]


In [31]:
#finding the most similar document for the query vector by taking the dot product between query and 
#each column in the vt matrix and normalize them
similarities = []
for i in range(len(vt)):
    similarities.append(list(np.dot(q,vt[:,i])/(np.linalg.norm(q)*np.linalg.norm(vt[:,i])))[0])

print('Most similar document:')
idx = similarities.index(max(similarities))
corpus[idx]

Most similar document:


'base pressure at subsonic speeds in the presence of\na supersonic jet .'