# Document Embeddings Guide

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Signal that every import above succeeded; a failed import raises before this line runs.
print('Ready')

: 

## Documents

In [None]:
# Show the three toy documents that the rest of the notebook analyses.
corpus = (
    ('Doc1', 'dog dog barks'),
    ('Doc2', 'cat cat meows'),
    ('Doc3', 'dog dog dog cat cat play'),
)
for label, text in corpus:
    print(f'{label}: {text}')

## Matrix A

In [None]:
# Term-document count matrix: one row per vocabulary word, one column per document.
vocab = ['dog', 'barks', 'cat', 'meows', 'play']
docs = ['Doc1', 'Doc2', 'Doc3']

# Counts are written one document at a time (in `vocab` order) and then
# transposed so that the documents end up as columns.
counts_by_doc = [
    [2, 1, 0, 0, 0],  # Doc1
    [0, 0, 2, 1, 0],  # Doc2
    [3, 0, 2, 0, 1],  # Doc3
]
A = np.array(counts_by_doc, dtype=float).T
A_df = pd.DataFrame(data=A, index=vocab, columns=docs)

# Cast to int for display only; A itself stays float.
print('Matrix A:', A_df.astype(int), sep='\n')

## Transpose

In [None]:
# Flip A so that each row is a document and each column a vocabulary word.
At = np.transpose(A)
At_df = pd.DataFrame(data=At, index=docs, columns=vocab)

# Cast to int for display only; At itself keeps A's dtype.
print('Transpose A^T:', At_df.astype(int), sep='\n')

## Similarity Matrix

In [None]:
# Document-by-document matrix: entry (i, j) is the dot product of the
# word-count vectors of document i and document j.
ATA = np.matmul(At, A)
ATA_df = pd.DataFrame(data=ATA, index=docs, columns=docs)

print('A^T x A:', ATA_df.astype(int), sep='\n')

## How A^T x A is Calculated

In [None]:
# Worked dot products behind the first row of A^T x A.
worked_examples = [
    ('Doc1 with Doc1:', '  [2,1,0,0,0] dot [2,1,0,0,0] = 2*2+1*1 = 5'),
    ('Doc1 with Doc2:', '  [2,1,0,0,0] dot [0,0,2,1,0] = 0'),
    ('Doc1 with Doc3:', '  [2,1,0,0,0] dot [3,0,2,0,1] = 2*3 = 6'),
]
for heading, arithmetic in worked_examples:
    print(heading)
    print(arithmetic)
    print()  # blank separator line after every example
print('YOUR INSIGHT: 2 x 3 = 6 is CORRECT!')

## Eigenanalysis

In [None]:
# Eigen-decompose the similarity matrix A^T x A.
# ATA is symmetric (a matrix multiplied by its own transpose), so use the
# symmetric solver `eigh`: it always returns real eigenvalues and orthonormal
# eigenvectors, which the later V x Lambda x V^T reconstruction relies on.
# The general solver `eig` gives neither guarantee.
eigenvalues, eigenvectors = np.linalg.eigh(ATA)

# Order the eigenpairs from the largest eigenvalue to the smallest.
idx = np.argsort(eigenvalues)[::-1]
lam = eigenvalues[idx]
v = eigenvectors[:, idx]  # column i is the eigenvector paired with lam[i]

print('Eigenvalues:')
total = np.sum(lam)
# Iterate over lam itself rather than a hard-coded count of three.
for i, value in enumerate(lam, start=1):
    pct = (value / total) * 100  # this eigenvalue's share of the eigenvalue sum
    print(f'Lambda{i}={value:.1f} ({pct:.0f}%)')

## Verify (A^T x A) x v = lambda x v

In [None]:
# Level 1: check each eigenpair individually against the similarity matrix.
# The matrix being decomposed is ATA (= A^T x A), not the 5x3 term-document
# matrix A, so the printed equation names A^T x A explicitly.
print('Level 1: (A^T x A) x v = lambda x v')
# Columns of v are the eigenvectors, so iterate over the rows of v.T; pairing
# them with lam avoids hard-coding the number of eigenpairs.
for i, (el, ev) in enumerate(zip(lam, v.T), start=1):
    left = ATA @ ev
    right = el * ev
    match = np.allclose(left, right)  # tolerant comparison: floating-point results
    print(f'Pattern {i}: {match}')

## Level 2: (A^T x A) x V = V x Lambda

In [None]:
# Level 2: check all eigenpairs at once in matrix form.
V = v                  # eigenvectors as columns
Lambda = np.diag(lam)  # eigenvalues on the diagonal, in the same order as V's columns
left = ATA @ V
right = V @ Lambda
match = np.allclose(left, right)
# The left-hand matrix is ATA (= A^T x A), not the term-document matrix A,
# so the printed equation names A^T x A explicitly.
print(f'Level 2 ((A^T x A) x V = V x Lambda): {match}')

## Level 3: A^T x A = V x Lambda x V^T

In [None]:
# Level 3: rebuild the similarity matrix from its eigen-decomposition.
# NOTE(review): V.T only stands in for the inverse of V when V's columns are
# orthonormal; if this check prints False, inspect V.T @ V first.
Vt = V.T
reconstructed = V @ Lambda @ Vt
match = np.allclose(reconstructed, ATA)
# The matrix being rebuilt is ATA (= A^T x A), not the term-document matrix A,
# so the printed equation names A^T x A explicitly.
print(f'Level 3 (A^T x A = V x Lambda x V^T): {match}')

## Document Embeddings

In [None]:
# Each column of v is an eigenvector ("pattern") of the docs-by-docs matrix
# ATA, so row j of v already holds document j's coordinate on every pattern.
# Transposing v would put the patterns on the rows and then label them with
# the document names, so v is used as-is: rows = documents, columns = patterns.
embeddings = v
emb_df = pd.DataFrame(embeddings, index=docs, columns=['P1','P2','P3'])
print('Document Embeddings:')
print(emb_df.round(3))
print()
# 15 = entries of the 5x3 matrix A, 9 = entries of the 3x3 embedding table.
print('Compression: 15 numbers to 9 numbers (40% smaller)')

## Visualization

In [None]:
# Show the eigenvalues twice: absolute size (bars) and share of the total (pie).
colors = ['red', 'blue', 'green']
pcts = (lam / np.sum(lam)) * 100

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Left panel: one bar per eigenvalue.
ax1.bar([1, 2, 3], lam, color=colors, alpha=0.7)
ax1.set(ylabel='Eigenvalue', title='Eigenvalues')

# Right panel: each eigenvalue's percentage of the eigenvalue sum.
ax2.pie(pcts, labels=['P1', 'P2', 'P3'], autopct='%1.0f%%', colors=colors)
ax2.set(title='Importance')

fig.tight_layout()
plt.show()

## Summary

In [None]:
# Recap of the pipeline and its headline numbers.
print('COMPLETE WORKFLOW:')
print('Documents -> Matrix -> Transpose -> Similarity')
print('-> Eigenanalysis -> Embeddings')
print()
print('KEY RESULTS:')
# Report the eigenvalues actually computed in the Eigenanalysis cell. The
# previously hard-coded figures (4.0 / 2.0 / 1.0) did not match this notebook:
# A^T x A = [[5,0,6],[0,5,4],[6,4,14]] has eigenvalues 18, 5 and 1.
total = np.sum(lam)
for rank, value in enumerate(lam, start=1):
    print(f'Lambda{rank} = {value:.1f} ({value / total * 100:.0f}%)')
print()
print('YOUR INSIGHT:')
print('2 x 3 = 6 is EXACTLY CORRECT!')
print()
print('You mastered eigenanalysis and embeddings!')