In [None]:
import math

def norm(vec):
    '''Return the Euclidean norm of a sparse vector stored as a dictionary.

    Only the dictionary's values contribute to the result; the keys are
    ignored.  An empty dictionary has norm 0.0.
    '''
    total = 0.0
    for component in vec.values():
        total += component * component
    return math.sqrt(total)

def cosine_similarity(vec1, vec2):
    '''Return the cosine similarity of two sparse vectors stored as
    dictionaries.

    The dot product is taken over the keys the two dictionaries share and
    is divided by the product of the two norms.  Raises ZeroDivisionError
    when either vector has norm zero (for example an empty dictionary).
    '''
    norm_product = norm(vec1) * norm(vec2)

    dot_product = 0
    for key, value in vec1.items():
        # Keys missing from vec2 contribute nothing to the dot product.
        if key in vec2:
            dot_product += value * vec2[key]

    return dot_product / norm_product

def build_semantic_descriptors(sentences):
    '''Build co-occurrence descriptors from a list of sentences.

    sentences -- a list of sentences, each a list of word strings.

    Returns a dictionary mapping every word to its descriptor: another
    dictionary mapping each *other* word to the number of sentences in
    which the two words appear together.  A sentence contributes at most
    once to any pair, no matter how often a word is repeated inside it.
    A word never appears in its own descriptor.
    '''
    descriptors = {}
    for sentence in sentences:
        # De-duplicate while keeping first-appearance order, so a word that
        # is repeated within one sentence is processed (and counted) once.
        unique_words = list(dict.fromkeys(sentence))

        for word in unique_words:
            counts = descriptors.setdefault(word, {})
            for other in unique_words:
                if other != word:
                    counts[other] = counts.get(other, 0) + 1

    return descriptors

def build_semantic_descriptors_from_files(filenames):
    '''Build semantic descriptors from the text of several files.

    Each file is read as latin1, lower-cased and cut into sentences at
    ".", "!" and "?".  Inside a sentence the punctuation characters listed
    in word_separators (and the em dash) act as word breaks, and words are
    split on any run of whitespace.  The sentences of each file are passed
    to build_semantic_descriptors and the per-file counts are added up.

    filenames -- an iterable of paths to text files.

    Returns a dictionary mapping each word to a dictionary of
    co-occurrence counts summed over all the files.
    '''
    # Single characters that separate words inside a sentence.
    word_separators = ",-:;\"'*()/"
    # A UTF-8 em dash is the bytes E2 80 94; decoded as latin1 those bytes
    # become these three characters, so they must be removed as a unit
    # (a per-character comparison can never match a multi-character string).
    em_dash_as_latin1 = "\xe2\x80\x94"

    # One translation pass: word separators become spaces and every
    # sentence terminator becomes ".", so a single split(".") finds sentences.
    replacements = {separator: " " for separator in word_separators}
    replacements["!"] = replacements["?"] = "."
    table = str.maketrans(replacements)

    combined_descriptors = {}
    for filename in filenames:
        # The with-block guarantees the file is closed even if reading fails.
        with open(filename, "r", encoding="latin1") as corpus:
            text = corpus.read()

        text = text.replace(em_dash_as_latin1, " ").translate(table).lower()

        sentences = []
        for chunk in text.split("."):
            # split() with no argument breaks on newlines and tabs as well
            # as spaces and never yields empty strings.
            words = chunk.split()
            if words:
                sentences.append(words)

        descriptors = build_semantic_descriptors(sentences)

        # Add this file's counts into the running totals.
        for word, counts in descriptors.items():
            merged = combined_descriptors.setdefault(word, {})
            for other, count in counts.items():
                merged[other] = merged.get(other, 0) + count

    return combined_descriptors

def most_similar_word(word, choices, semantic_descriptors, similarity_fn):
    '''Return the element of choices most similar to word.

    word                 -- the word to compare against.
    choices              -- a sequence of candidate words.
    semantic_descriptors -- dictionary mapping words to descriptor
                            dictionaries.
    similarity_fn        -- function taking two descriptors and returning
                            a number; larger means more similar.

    A choice scores -1 when either it or word has no descriptor, or when
    similarity_fn raises ZeroDivisionError (as a cosine similarity does
    for an all-zero or empty descriptor).  On a tie the choice that comes
    first wins.  Returns "" when choices is empty.
    '''
    word_known = word in semantic_descriptors

    best_choice = ""
    # Start below the -1 fallback so the first choice always replaces it.
    best_score = -2
    for choice in choices:
        score = -1
        if word_known and choice in semantic_descriptors:
            try:
                score = similarity_fn(semantic_descriptors[word],
                                      semantic_descriptors[choice])
            except ZeroDivisionError:
                # The similarity is undefined for this pair; rank it with
                # the unknown words instead of aborting the whole query.
                score = -1

        # Strict comparison keeps the earliest choice on ties.
        if score > best_score:
            best_score = score
            best_choice = choice

    return best_choice

def run_similarity_test(filename, semantic_descriptors, similarity_fn):
    '''Return the percentage of questions in a test file answered correctly.

    Each usable line of the file holds whitespace-separated words: the
    query word, the correct answer, and then the candidate choices.  Lines
    with fewer than two words (blank lines, a trailing newline) are
    ignored.

    filename             -- path of the test file.
    semantic_descriptors -- dictionary mapping words to descriptor
                            dictionaries.
    similarity_fn        -- similarity function handed on to
                            most_similar_word.

    Returns a float between 0 and 100, or 0.0 when the file contains no
    usable question.
    '''
    # Decode the same way build_semantic_descriptors_from_files reads its
    # input, so non-ASCII words compare equal; the with-block closes the file.
    with open(filename, encoding="latin1") as test_file:
        questions = [line.split() for line in test_file]

    # A question needs at least the query word and the expected answer.
    questions = [question for question in questions if len(question) >= 2]
    if not questions:
        return 0.0

    correct = 0
    for question in questions:
        guess = most_similar_word(question[0], question[2:],
                                  semantic_descriptors, similarity_fn)
        if guess == question[1]:
            correct += 1

    return correct * 100 / len(questions)

if __name__ == '__main__':
    # Manual smoke checks; the text files named below must be present in
    # the working directory.
    print(cosine_similarity({'a':1, 'b':2, 'c':3}, {'b':4, 'c':5, 'd':6}))
    dictionary = build_semantic_descriptors([["i", "am", "a", "sick", "man"],
                                            ["i", "am", "a", "spiteful", "man"],
                                            ["i", "am", "an", "unattractive", "man"],
                                            ["i", "believe", "my", "liver", "is", "diseased"],
                                            ["however", "i", "know", "nothing", "at", "all", "about", "my",
                                            "disease", "and", "do", "not", "know", "for", "certain", "what", "ails", "me"]])
    print(dictionary['man'])
    print(dictionary['liver'])

    # Parse the two sample files once and reuse the result for both lookups.
    sample_descriptors = build_semantic_descriptors_from_files(["test.txt", "test2.txt"])
    print(sample_descriptors["man"])
    print(sample_descriptors["liver"])
    print(most_similar_word("sick", ["spiteful", "unattractive", "liver", "diseased", "my"], build_semantic_descriptors_from_files(["test.txt"]), cosine_similarity))

    dictionary = build_semantic_descriptors_from_files(["war_and_peace.txt", "swanns_way.txt"])
    print(run_similarity_test("trial.txt", dictionary, cosine_similarity))
    print(build_semantic_descriptors_from_files(["test3.txt"])["file"])

0.7005166394541485
{'i': 1, 'am': 1, 'a': 1, 'sick': 1, 'man': 1}
{'i': 2, 'am': 1, 'a': 1, 'sick': 1, 'man': 1, 'believe': 1, 'my': 1, 'liver': 1, 'is': 1, 'diseased': 1}
