# SVD in NLP (Working Example)

In [1]:
from numpy import diag
from numpy import dot
from numpy import zeros
from scipy.linalg import svd
import numpy as np
import matplotlib.pyplot as plt
import scipy.linalg as la
import math
from numpy import linalg as LA

Document-Term-Matrix 

$$
\begin{array}{c}
D_m*_n = U S V^T\\
\end{array}
$$
where:
1. U is the doc. concept similarity matrix 
2. V Term concept similarity matrix 
3. S Diagonal element 


\begin{equation*}
\mathbf{𝐷𝑚∗𝑛} =  \begin{vmatrix}
\mathbf{1} & \mathbf{1} & \mathbf{1} & \mathbf{0} & \mathbf{0} \\
\mathbf{2} & \mathbf{2} & \mathbf{2} & \mathbf{0} & \mathbf{0} \\
\mathbf{1} & \mathbf{1} & \mathbf{1} & \mathbf{0} & \mathbf{0} \\
\mathbf{5} & \mathbf{5} & \mathbf{5} & \mathbf{0} & \mathbf{0} \\
\mathbf{0} & \mathbf{0} & \mathbf{0} & \mathbf{2} & \mathbf{2} \\
\mathbf{0} & \mathbf{0} & \mathbf{0} & \mathbf{3} & \mathbf{3} \\
\mathbf{0} & \mathbf{0} & \mathbf{0} & \mathbf{1} & \mathbf{1} 
\end{vmatrix}
\end{equation*}

In [2]:
Doc = np.array([[1, 1,1,0,0], [2, 2,2,0,0],[1, 1,1,0,0],[0, 0,0,2,2],[0, 0,0,3,3],[0, 0,0,1,1]])
Doc

array([[1, 1, 1, 0, 0],
       [2, 2, 2, 0, 0],
       [1, 1, 1, 0, 0],
       [0, 0, 0, 2, 2],
       [0, 0, 0, 3, 3],
       [0, 0, 0, 1, 1]])

So to find the eigenvalues of the above entity we compute matrices $$ \begin{array}{c} Doc * Doc.^T\\ \end{array}
$$
Doc\* Doc.^T and Doc.^T\* Doc.  As previously stated , the eigenvectors of Doc\* Doc.^T  make up the columns of U so we can do the following analysis to find U.




In [3]:
Doc_U=Doc.dot(Doc.T)
Doc_U

array([[ 3,  6,  3,  0,  0,  0],
       [ 6, 12,  6,  0,  0,  0],
       [ 3,  6,  3,  0,  0,  0],
       [ 0,  0,  0,  8, 12,  4],
       [ 0,  0,  0, 12, 18,  6],
       [ 0,  0,  0,  4,  6,  2]])

In [4]:
Doc_v=Doc.T.dot(Doc)
Doc_v

array([[ 6,  6,  6,  0,  0],
       [ 6,  6,  6,  0,  0],
       [ 6,  6,  6,  0,  0],
       [ 0,  0,  0, 14, 14],
       [ 0,  0,  0, 14, 14]])

# Step 1 find  S Diagonal element

In [41]:
results_S_v = la.eig(Doc_v)
results_S_v[0] 
S=np.zeros((6,6), float)
digsig=np.sqrt(results_S_v[0])
indices_diagonal = np.diag_indices(5)

S[indices_diagonal] = digsig
S

  import sys


array([[4.24264069e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 3.47520110e-16, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.29150262e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00]])

# Step 2: find U  the doc. concept similarity matrix

In [42]:
w, U = LA.eig(Doc_U)

print(U)

[[ 4.08248290e-01 -6.30979952e-18  8.39314581e-01  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 8.16496581e-01 -4.47213595e-01 -5.11601494e-01  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 4.08248290e-01  8.94427191e-01  1.83888407e-01  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 -8.45154255e-01
   5.34522484e-01 -3.47209664e-02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  5.07092553e-01
   8.01783726e-01 -2.95128215e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  1.69030851e-01
   2.67261242e-01  9.54826577e-01]]


# Step 3: find V Term concept similarity matrix

In [15]:
w, VT = LA.eig(Doc_v)

print(VT)

[[ 5.77350269e-01 -8.13894371e-01  2.76671174e-17  0.00000000e+00
   0.00000000e+00]
 [ 5.77350269e-01  4.63356089e-01  7.07106781e-01  0.00000000e+00
   0.00000000e+00]
 [ 5.77350269e-01  3.50538283e-01 -7.07106781e-01  0.00000000e+00
   0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  7.07106781e-01
  -7.07106781e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  7.07106781e-01
   7.07106781e-01]]


# Apply SVD Algorithm 

In [7]:
U, s, VT = svd(Doc)

In [17]:
U

array([[ 4.08248290e-01, -6.30979952e-18,  8.39314581e-01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 8.16496581e-01, -4.47213595e-01, -5.11601494e-01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 4.08248290e-01,  8.94427191e-01,  1.83888407e-01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -8.45154255e-01,  5.34522484e-01, -3.47209664e-02],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         5.07092553e-01,  8.01783726e-01, -2.95128215e-01],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.69030851e-01,  2.67261242e-01,  9.54826577e-01]])

In [18]:
VT

array([[ 5.77350269e-01, -8.13894371e-01,  2.76671174e-17,
         0.00000000e+00,  0.00000000e+00],
       [ 5.77350269e-01,  4.63356089e-01,  7.07106781e-01,
         0.00000000e+00,  0.00000000e+00],
       [ 5.77350269e-01,  3.50538283e-01, -7.07106781e-01,
         0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         7.07106781e-01, -7.07106781e-01],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         7.07106781e-01,  7.07106781e-01]])

In [21]:
S=np.zeros((6,6), float)
digsig=np.sqrt(results_S[0])
indices_diagonal = np.diag_indices(5)

S[indices_diagonal] = s
S

array([[5.29150262e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 4.24264069e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 5.01683046e-16, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.82724267e-17,
        0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        9.78300899e-49, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00]])