# Zadanie z Singular Value Decomposition

### Pobieranie danych z Bibli 

In [1]:
import re

path = 'files/bible.txt'
bible = open(path,'r')

lines = bible.readlines()    

### Podział teksu na 1000 dokumentow po 31 zdań oraz wyliczenie bag_of_words

In [2]:
documents = [[]]
bag_of_words = set()

index = 0
i = 0
for line in lines:
    p = re.sub('.*:.*\t','',line).lower()
    wordList = re.sub("[^\w]", " ",  p).split()
    documents[index] += wordList
    bag_of_words |= set(wordList)
    if(i == 31):
        index += 1
        i = 0
        documents.append([])
    i += 1
    

### Obliczenie częstotliwości występowania każdego słowa  z bag_of_words dla każdego dokumentu

In [3]:
from collections import Counter

doc_word_frequency = []

for doc in documents:
    dict_bag_of_words = dict.fromkeys(bag_of_words,0)
    for word in doc:
        dict_bag_of_words.update({word:dict_bag_of_words.get(word)+1})
    doc_word_frequency.append(dict_bag_of_words)

### Tworzenie term-by-document matrix

In [4]:
import numpy as np

words_number = len(bag_of_words)
term_by_document_matrix = np.zeros((words_number,1000))
list_bag_of_words = list(bag_of_words)
    
def create_term_by_document_matrix():
    words_number = len(bag_of_words)
    term_by_document_matrix = np.zeros((words_number,1000))
    list_bag_of_words = list(bag_of_words)

    for i in range(1000):
        for index,word in enumerate(list_bag_of_words):
            term_by_document_matrix[index,i] = doc_word_frequency[i].get(word)
            
    return term_by_document_matrix
            
term_by_document_matrix = create_term_by_document_matrix()

### Redukcja znaczenia często występujących słów
(zostanie zastosowana później)

In [5]:
idf_vector = []

N = 1000
for index,word in enumerate(list_bag_of_words):
    nw = 0
    for i in range(N):
        if(term_by_document_matrix[index,i] > 0):
            nw += 1
    if(nw != 0):
        idf_vector.append(np.log10(N/nw))      

### Funkcja znajdująca k najbardziej podobnych dokumentów do podanego zapytania

In [6]:
from numpy import linalg as LA
from heapq import nlargest

def rate_of_similarity(sentence,k):
    query = re.sub("[^\w]", " ",  sentence).lower().split()
    
    query_bag_of_words = dict.fromkeys(bag_of_words,0)
    for word in query:
        if(word in bag_of_words):
            query_bag_of_words.update({word:query_bag_of_words.get(word)+1})
    
    q = []
    for index,word in enumerate(list_bag_of_words):
        q.append(query_bag_of_words.get(word))
    
    similarity_rate = {}
    for i in range(1000):
        dj = term_by_document_matrix[:,[i]]
        q_norm = LA.norm(q)
        dj_norm = LA.norm(dj)
        cosj = np.dot(q,dj)/(q_norm*dj_norm)
        similarity_rate.update({i:cosj})
        
    return nlargest(k, similarity_rate, key=similarity_rate.get)

In [7]:
sentence = "God loves you."
rate_of_similarity(sentence,20)

[954,
 956,
 953,
 929,
 901,
 480,
 915,
 861,
 987,
 957,
 862,
 932,
 934,
 916,
 937,
 481,
 208,
 982,
 983,
 158]

### SVG

In [8]:
U, s, V = np.linalg.svd(term_by_document_matrix, full_matrices=False)
S = np.diag(s)
np.allclose(term_by_document_matrix, np.dot(U, np.dot(S, V)))
term_by_document_matrix = np.dot(U, np.dot(S, V))

sentence = "God loves you."
rate_of_similarity(sentence,20)

[954,
 956,
 953,
 929,
 901,
 480,
 915,
 861,
 987,
 957,
 862,
 932,
 934,
 916,
 937,
 481,
 208,
 982,
 983,
 158]

### Nie ma różnicy w wynikach przed i po procesie odszumiania.

## Wpływ IDF na wyniki

### Przed użyciem IDF

In [9]:
term_by_document_matrix = create_term_by_document_matrix()
sentence = "God loves you."
before_IDF = rate_of_similarity(sentence,20)

### Po użyciu IDF

In [27]:
for i in range(1000):
    for j in range(len(idf_vector)):
        term_by_document_matrix[j,i] *= idf_vector[j]

sentence = "God loves you."      
after_IDF = rate_of_similarity(sentence,20)

[954, 956, 953, 929, 861, 901, 480, 862, 932, 915, 957, 987, 934, 916, 937, 208, 158, 936, 983, 161]


In [28]:
print(before_IDF)

print(after_IDF)

[954, 956, 953, 929, 901, 480, 915, 861, 987, 957, 862, 932, 934, 916, 937, 481, 208, 982, 983, 158]
[954, 956, 953, 929, 861, 901, 480, 862, 932, 915, 957, 987, 934, 916, 937, 208, 158, 936, 983, 161]


### Jak można zauważyć powyżej, otrzymane wyniki niezbyt się różnią. Niestety ciężko subiektywnie ocenić, który sposób wylicza lepsze rozwiązanie problemu.