# 向量空间模型

In [1]:
import re
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np

# Tiny sample corpus for quick experiments.
texts = ["I am happy.", "You are happy.", "I am you"]

In [2]:
# Load the tab-separated file 'new_train.tsv' (row 0 is the header) and rebind
# ``texts`` to the values of its first column.
# NOTE(review): presumably the first column holds the raw text strings --
# confirm against the file's schema.
df = pd.read_csv('new_train.tsv', header=0, sep='\t')
texts = list(np.array(df)[:, 0])

In [3]:
class Bag:
    """Vocabulary mapping each distinct token of a corpus to an integer index.

    Every text is lower-cased and split on runs of non-word characters.
    Non-empty tokens are numbered from 0 in order of first appearance.
    """

    def __init__(self, texts):
        """Tokenize ``texts`` and index every distinct non-empty token.

        Args:
            texts: Iterable of strings.
        """
        self.__texts = []
        # Highest index handed out so far; stays at -1 while the bag is empty.
        self.__len = -1
        self.__value = {}
        for raw in texts:
            tokens = re.split(r'\W+', raw.lower())
            self.__texts.append(tokens)
            for token in tokens:
                # Skip the empty strings produced by splitting and any token
                # that already has an index.
                if token in self.__value or not token:
                    continue
                self.__len += 1
                self.__value[token] = self.__len

    def inquiry(self, word):
        """Return the index of ``word``, or ``None`` when it is not in the bag."""
        return self.__value.get(word)

    def len(self):
        """Return the highest assigned index (vocabulary size minus one).

        An empty bag returns -1.
        """
        return self.__len

    def __str__(self):
        mapping = str(self.__value)
        top_index = str(self.__len)
        return f"<Bag> value: {mapping}; length: {top_index}"

    def value(self):
        """Return the underlying token -> index dictionary."""
        return self.__value


In [4]:
def word2vec(word, bag):
    """Return the one-hot vector of ``word`` over the vocabulary in ``bag``.

    Args:
        word: Token to encode.
        bag: Vocabulary object. ``bag.len() + 1`` is used as the vector
            dimension and ``bag.inquiry(word)`` as the position of the 1.

    Returns:
        A 1-D ``torch`` tensor of zeros with a single 1 at the word's index.
        If ``bag.inquiry`` returns ``None`` for the word, the vector is left
        all zeros.
    """
    dimension = bag.len() + 1
    vec = torch.zeros(dimension)
    index = bag.inquiry(word)
    # Indexing a tensor with None adds a leading axis, so ``vec[None] = 1``
    # would overwrite *every* entry with 1; only write when an index exists.
    if index is not None:
        vec[index] = 1
    return vec


In [5]:
class Query:
    """A word paired with the vector that represents it."""

    def __init__(self, word, vec):
        """Store ``word`` together with its vector ``vec``."""
        self.__word, self.__vec = word, vec

    def __repr__(self):
        word, vec = self.__word, self.__vec
        return f"<Query> word: {word}; vec: {vec}"

    def word(self):
        """Return the stored word."""
        return self.__word

In [6]:
class Document:
    """A tokenized text with its term-frequency vector and a document vector.

    The document vector starts as all zeros and is meant to be set later via
    ``assign_vector``.
    """

    def __init__(self, text, bag):
        """Wrap every non-empty token in a ``Query`` and compute term frequencies.

        Args:
            text: Iterable of tokens; empty strings are skipped.
            bag: Vocabulary object providing ``len()`` and ``inquiry(word)``.
        """
        self.__text = text
        self.__bag = bag
        self.__queries = [
            Query(word, word2vec(word, bag)) for word in text if len(word) > 0
        ]
        self.__init_tf()
        self.__vector = torch.zeros(self.__bag.len() + 1)

    def __repr__(self):
        return f"<Document> text: {self.__text}; queries: {self.__queries}"

    def __init_tf(self):
        """Compute the term-frequency vector over the bag's vocabulary.

        Each entry is the number of occurrences of that vocabulary index among
        the document's tokens, divided by the total number of tokens.
        """
        token_count = len(self.__queries)
        counts = torch.zeros(self.__bag.len() + 1)
        for query in self.__queries:
            key = self.__bag.inquiry(query.word())
            # A None key would index the whole tensor and bump every counter,
            # so tokens the bag does not know are left uncounted.
            if key is not None:
                counts[key] += 1
        # A text without tokens would otherwise produce 0/0 = NaN entries that
        # spread into the document vector; keep the all-zero counts instead.
        self.__tf = counts / token_count if token_count else counts

    def tf(self):
        """Return the term-frequency tensor."""
        return self.__tf

    def assign_vector(self, value):
        """Replace the document vector with ``value``."""
        self.__vector = value

    def vector(self):
        """Return the current document vector."""
        return self.__vector

In [12]:
class Documents:
    """A corpus whose documents are turned into tf * idf vectors.

    The vocabulary is built from all texts at construction time; documents are
    only created and vectorized when ``init_doc`` is called.
    """

    def __init__(self, texts):
        """Store ``texts`` and build the shared vocabulary from all of them.

        Args:
            texts: Sequence of strings.
        """
        self.__texts = texts
        self.__bag = Bag(texts)
        self.__value = []

    def init_doc(self, a, b):
        """Append documents for ``texts[a:b]`` and recompute idf and vectors.

        Each text is lower-cased and split on runs of non-word characters.
        Documents accumulate across calls; idf and all document vectors are
        recomputed over every document held so far.
        """
        self.__value.extend(
            Document(re.split(r'\W+', text.lower()), self.__bag)
            for text in self.__texts[a:b]
        )
        self.__init_idf()
        self.__init_vector()

    def __str__(self):
        return f"<Documents>\nbag: {str(self.__bag)}; value: {str(self.__value)}"

    def __getitem__(self, index):
        return self.__value[index]

    def __init_idf(self):
        """Compute ``log(N / (df + 1))`` for every vocabulary index.

        ``N`` is the number of documents held and ``df`` the number of those
        documents whose term frequency for the index is positive.
        """
        doc_count = len(self.__value)
        doc_frequency = torch.zeros(self.__bag.len() + 1)
        for doc in self.__value:
            # One vectorized comparison per document replaces a Python-level
            # loop over every (word, document) pair.
            doc_frequency += (doc.tf() > 0).to(doc_frequency.dtype)
        self.__idf = torch.log(doc_count / (doc_frequency + 1))

    def idf(self):
        """Return the idf tensor; it only exists after ``init_doc`` has run."""
        return self.__idf

    def __init_vector(self):
        """Assign every document the element-wise product of idf and its tf."""
        for doc in self.__value:
            doc.assign_vector(torch.mul(self.__idf, doc.tf()))

    def value(self):
        """Return the list of documents created so far."""
        return self.__value

In [13]:
# The vocabulary is built from every entry of ``texts``, but only the slice
# texts[0:5] is turned into documents and vectorized.
documents = Documents(texts)
documents.init_doc(0, 5)

In [14]:
# Print the vector assigned to each document by init_doc (idf * tf).
for doc in documents.value():
    print(doc.vector())

tensor([0.0509, 0.0509, 0.0124,  ..., 0.0000, 0.0000, 0.0000])
tensor([0.0000, 0.0000, 0.0194,  ..., 0.0000, 0.0000, 0.0000])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0.0000, 0.0000, 0.0117,  ..., 0.0000, 0.0000, 0.0000])
tensor([0., 0., 0.,  ..., 0., 0., 0.])


In [412]:
# Pair every document with its successor and print both vectors followed by
# their cosine similarity.
for (a, b) in zip(documents.value(), documents.value()[1:]):
    print(a.vector(), b.vector())
    print(torch.cosine_similarity(a.vector(), b.vector(),dim=0))

tensor([0.1352, 0.1352, 0.1352, 0.0000, 0.0000]) tensor([0.0000, 0.0000, 0.1352, 0.1352, 0.3662])
tensor(0.1889)
tensor([0.0000, 0.0000, 0.1352, 0.1352, 0.3662]) tensor([0.1352, 0.1352, 0.0000, 0.1352, 0.0000])
tensor(0.1889)


In [426]:
# Allocate a square, all-zero matrix with one row and one column per document.
length = len(documents.value())
similarity = torch.zeros(length, length)
similarity

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])

In [440]:
# Fill similarity[i, j] with the cosine similarity of documents i and j.
for i in range(length):
    veci = documents.value()[i].vector()
    for j in range(length):
        vecj = documents.value()[j].vector()
        similarity[i, j] = torch.cosine_similarity(veci, vecj, dim=0)
print(similarity)

tensor([[1.0000, 0.1889, 0.6667],
        [0.1889, 1.0000, 0.1889],
        [0.6667, 0.1889, 1.0000]])
