In [1]:
import csv
import pandas as pd
import os
import re
import math
from zipfile import ZipFile
import itertools


# --- Input locations --------------------------------------------------------
# Everything is resolved relative to the current working directory.
directory = os.getcwd()
file_name = "2015_labeled_dataset.zip"
csv_file_directory = os.path.join(directory, "articles_2015_translated.csv")

# --- Article metadata -------------------------------------------------------
# Parse the CSV once and slice the columns out of the resulting frame, instead
# of re-reading and re-parsing the whole file for every single column.
articles_frame = pd.read_csv(
    csv_file_directory,
    usecols=[
        "article_id",
        "paragraphs",
        "ressort",
        "title",
        "characters",
        "authors",
        "subtitle_en",
    ],
)

# Each ``all_*`` list holds one single-element list per CSV row
# (e.g. ``[["FALTER_..."], ...]``); all lists share the same row index.
all_articles = articles_frame[["article_id"]].values.tolist()
all_paragraphs = articles_frame[["paragraphs"]].values.tolist()
all_themes = articles_frame[["ressort"]].values.tolist()
all_titles = articles_frame[["title"]].values.tolist()
all_characters = articles_frame[["characters"]].values.tolist()
all_authors = articles_frame[["authors"]].values.tolist()
all_subtitles = articles_frame[["subtitle_en"]].values.tolist()

# --- Labeled dataset --------------------------------------------------------
# Names of all members of the labeled-dataset archive. The handle is not
# called ``zip`` so that the builtin of that name stays usable afterwards.
with ZipFile(file_name, 'r') as archive:
    all_files = archive.namelist()

# Recommendation list for the query article: one row per recommended article,
# read without a header row.
query_file = "2015_labeled_dataset/FALTER_20150204B39E03880F.csv"
rec_files = pd.read_csv(query_file, header=None).values.tolist()

# From here on ``query_file`` is the bare article ID: the file name without
# its directory and without the ".csv" extension.
query_file = os.path.splitext(os.path.basename(query_file))[0]


def get_ids(query_file, rec_files, articles=None):
    """Resolve the query article and its recommended articles to row indices.

    Args:
        query_file: Article ID of the query article.
        rec_files: Iterable of recommendation rows. Each row is either a
            string or a list/tuple whose first element is a string; the
            article ID is the part before the first tab character (anything
            after the tab is ignored).
        articles: Optional list of article-ID rows to search. Defaults to the
            module-level ``all_articles``.

    Returns:
        A tuple ``(article_ids, query_index)``. ``article_ids`` maps the row
        index of every recommended article that was found to its article ID,
        in the order of ``rec_files``; the query article itself is never
        included. ``query_index`` is the row index of the query article.

    Raises:
        ValueError: If ``query_file`` does not occur in ``articles``.
    """
    if articles is None:
        articles = all_articles

    # Build the ID -> row-index lookup once instead of rescanning the whole
    # article list for every recommendation. ``setdefault`` keeps the first
    # row when an ID occurs more than once.
    index_by_id = {}
    for row_index, row in enumerate(articles):
        index_by_id.setdefault("".join(row), row_index)

    # Look the query up directly, so the result does not depend on where the
    # query row sits relative to the recommended rows.
    if query_file not in index_by_id:
        raise ValueError(f"query article {query_file!r} not found in article list")
    query_index = index_by_id[query_file]

    article_ids = {}
    for rec in rec_files:
        first_cell = rec[0] if isinstance(rec, (list, tuple)) else rec
        # Split on a real tab character; the ID is the first field.
        rec_id = str(first_cell).split("\t")[0]
        if rec_id == query_file:
            # The query article is not a recommendation for itself.
            continue
        row_index = index_by_id.get(rec_id)
        if row_index is not None:
            article_ids[row_index] = rec_id
    return article_ids, query_index


def remove_special_chars(paragraph):
    """Split *paragraph* into words made up of ASCII and German letters only.

    Letters ``a-z``, ``A-Z`` and ``ä ö ü Ä Ö Ü ß`` are kept. Any whitespace
    character (space, tab, newline, ...) acts as a word separator. Every
    other character (digits, punctuation, symbols) is dropped without
    splitting the word it occurs in, so ``"it's"`` becomes ``"its"``.

    Args:
        paragraph: The text to tokenize.

    Returns:
        The list of words in order of appearance. Never contains empty
        strings; empty or letter-free input yields ``[]``.
    """
    allowed_letters = set(
        'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäüöÄÜÖß'
    )
    kept = []
    for char in paragraph:
        if char in allowed_letters:
            kept.append(char)
        elif char.isspace():
            # Normalise every kind of whitespace to a plain separator so that
            # words on adjacent lines are not glued together.
            kept.append(" ")
    # ``split()`` without arguments collapses runs of separators and ignores
    # leading/trailing ones, so no empty tokens are produced.
    return "".join(kept).split()


def count_words(paragraph, unique_words_for_all_texts):
    """Count lower-cased word occurrences and update the cross-text tally.

    Args:
        paragraph: Iterable of word strings.
        unique_words_for_all_texts: Dict mapping a word to the number of
            calls in which it has appeared so far. It is updated in place:
            each distinct word of ``paragraph`` adds exactly 1 to its entry,
            no matter how often the word occurs in ``paragraph``.

    Returns:
        A tuple ``(word_freq, unique_words_for_all_texts)`` where
        ``word_freq`` maps every lower-cased word of ``paragraph`` to its
        number of occurrences and the second element is the same dict object
        that was passed in.
    """
    word_freq = {}
    for token in paragraph:
        key = token.lower()
        seen_so_far = word_freq.get(key)
        if seen_so_far is not None:
            # Repeated word within this text: only the local count grows.
            word_freq[key] = seen_so_far + 1
            continue
        # First occurrence within this text: start the local count and
        # register the text in the cross-text tally.
        word_freq[key] = 1
        unique_words_for_all_texts[key] = unique_words_for_all_texts.get(key, 0) + 1
    return word_freq, unique_words_for_all_texts


def normalize_tf(unique_words):
    """Replace every raw count in *unique_words* with ``log10(1 + count)``.

    The dict is modified in place and the same object is returned.

    Args:
        unique_words: Dict mapping a word to its occurrence count.

    Returns:
        ``unique_words`` with log-scaled term frequencies as values.
    """
    # Iterate over a snapshot of the keys; only the values are rewritten.
    for term in list(unique_words):
        unique_words[term] = math.log10(1 + unique_words[term])
    return unique_words


def compute_tf(ID, unique_words_for_all_texts):
    """Build the log-scaled term-frequency dict for one article.

    The text is taken from ``all_paragraphs[ID]`` (its elements are joined
    into one string), tokenized with ``remove_special_chars``, counted with
    ``count_words`` and scaled with ``normalize_tf``.

    Args:
        ID: Row index into the module-level ``all_paragraphs`` list.
        unique_words_for_all_texts: Cross-text word tally that is handed to
            ``count_words`` and updated by it.

    Returns:
        A tuple ``(tf_dictionary, unique_words_for_all_texts)``.
    """
    article_text = "".join(all_paragraphs[ID])
    tokens = remove_special_chars(article_text)
    raw_counts, unique_words_for_all_texts = count_words(
        tokens, unique_words_for_all_texts
    )
    return normalize_tf(raw_counts), unique_words_for_all_texts


def multiple_tfs(article_ids):
    """Compute term-frequency dicts for every article in *article_ids*.

    Args:
        article_ids: Dict whose keys are row indices of the articles to
            process; the values are not used here.

    Returns:
        A tuple ``(all_tf_dicts, unique_words_for_all_texts)``: the first
        maps each row index to the dict returned by ``compute_tf``, the
        second is the word tally accumulated across all processed articles.
    """
    document_tally = {}
    tf_by_article = {}
    for row_index in article_ids:
        # The tally is threaded through every call so it accumulates.
        tf_by_article[row_index], document_tally = compute_tf(row_index, document_tally)
    return tf_by_article, document_tally


def compute_idf(unique_words_for_all_texts, num_documents=None):
    """Compute ``log10(N / df)`` for every word of the corpus.

    Args:
        unique_words_for_all_texts: Dict mapping a word to the number of
            documents it occurs in (its document frequency).
        num_documents: Corpus size ``N``. When omitted, the number of entries
            in the module-level ``article_ids`` dict is used, which keeps
            existing one-argument calls working unchanged.

    Returns:
        Dict mapping each word to its inverse document frequency. A word that
        occurs in all ``N`` documents gets 0.0.
    """
    if num_documents is None:
        # Backward-compatible default: fall back to the script-level corpus.
        num_documents = len(article_ids)
    return {
        word: math.log10(num_documents / doc_freq)
        for word, doc_freq in unique_words_for_all_texts.items()
    }


def compute_tfidf(tf_dict, idf_dict):
    """Multiply term frequencies by inverse document frequencies.

    Args:
        tf_dict: Dict mapping a word to its term-frequency score.
        idf_dict: Dict mapping a word to its inverse document frequency.

    Returns:
        Dict mapping each word of ``tf_dict`` that also has an entry in
        ``idf_dict`` to ``tf * idf``. Words without an IDF entry are left out.
    """
    return {
        term: weight * idf_dict[term]
        for term, weight in tf_dict.items()
        if term in idf_dict
    }


def multiple_tfidfs(all_tf_dicts, idf_dict):
    """Apply ``compute_tfidf`` to every term-frequency dict.

    Args:
        all_tf_dicts: Dict mapping an article key to its term-frequency dict.
        idf_dict: Dict mapping a word to its inverse document frequency.

    Returns:
        Dict with the same keys as ``all_tf_dicts``, each mapped to the
        TF-IDF dict computed for that article.
    """
    return {
        article_key: compute_tfidf(term_freqs, idf_dict)
        for article_key, term_freqs in all_tf_dicts.items()
    }


def compute_cos_sim(query_tfidf, tfidf_dict):
    """Compute the cosine similarity between the query and every document.

    Args:
        query_tfidf: Dict mapping a word to its TF-IDF score in the query.
        tfidf_dict: Dict mapping a document key to that document's
            word -> TF-IDF dict.

    Returns:
        Dict mapping each document key to the cosine similarity between the
        query vector and the document vector. The similarity is 0.0 when
        either vector is empty or consists only of zeros, since the cosine is
        undefined there and such a pair shares no weighted terms.
    """
    # The query norm does not depend on the document, so compute it once.
    query_norm = math.sqrt(sum(score ** 2 for score in query_tfidf.values()))

    cosine_sim_dict = {}
    for doc_key, doc_tfidf in tfidf_dict.items():
        doc_norm = math.sqrt(sum(score ** 2 for score in doc_tfidf.values()))
        denominator = query_norm * doc_norm
        if denominator == 0:
            # Avoid dividing by zero for empty / all-zero vectors.
            cosine_sim_dict[doc_key] = 0.0
            continue
        # Only words present in both vectors contribute to the dot product;
        # a dict lookup replaces scanning the whole document per query word.
        dot_product = sum(
            query_score * doc_tfidf[word]
            for word, query_score in query_tfidf.items()
            if word in doc_tfidf
        )
        cosine_sim_dict[doc_key] = dot_product / denominator
    return cosine_sim_dict


def biggest_tfidf_score(tfidf_dict, query_terms=None):
    """List a document's words that also occur in the query, best score first.

    Args:
        tfidf_dict: Dict mapping a word to its TF-IDF score in one document.
        query_terms: Container of the query's words (anything supporting
            ``in``). When omitted, the module-level ``query_tf`` dict is
            used, which keeps existing one-argument calls working unchanged.

    Returns:
        The words of ``tfidf_dict`` that are contained in ``query_terms``,
        ordered by descending TF-IDF score. Words with equal scores appear in
        the reverse of their order in ``tfidf_dict``.
    """
    if query_terms is None:
        # Backward-compatible default: fall back to the script-level query.
        query_terms = query_tf

    # Sort ascending and then reverse (rather than ``reverse=True``) so that
    # ties keep the reversed-insertion order callers have always received.
    ranked = sorted(tfidf_dict.items(), key=lambda item: item[1])
    ranked.reverse()
    return [word for word, _score in ranked if word in query_terms]


# --- TF-IDF pipeline --------------------------------------------------------
# Resolve the query article and its recommended articles to row indices.
article_ids, queryfile_index = get_ids(query_file, rec_files)
# Term frequencies per recommended article, plus the per-word tally of how
# many of those articles contain each word.
all_tf_dicts, unique_words_for_all_texts = multiple_tfs(article_ids)
# NOTE: compute_idf also reads the module-level ``article_ids`` assigned above.
idf_dict = compute_idf(unique_words_for_all_texts)
tfidf_dict = multiple_tfidfs(all_tf_dicts, idf_dict)
# The query gets a throwaway tally ({}), so it does not add to the document
# frequencies of the recommended articles; the returned tally ``x`` is unused.
query_tf, x = compute_tf(queryfile_index, {})
# Query words without an entry in ``idf_dict`` are dropped by compute_tfidf.
query_tfidf = compute_tfidf(query_tf, idf_dict)
# Similarity of the query to each recommended article, keyed by row index.
cosine_sim = compute_cos_sim(query_tfidf, tfidf_dict)


def _metadata_lines(index):
    """Return the theme/title/subtitle/authors/chars report lines for a row."""
    return [
        f"Theme: {all_themes[index]}",
        f"Title: {all_titles[index]}",
        f"Subtitle: {all_subtitles[index]}",
        f"Authors: {all_authors[index]}",
        # The trailing newline leaves a blank line after each article.
        f"Chars: {all_characters[index]}\n",
    ]


# Report header: the query article itself.
print(f"Query file Article ID: {all_articles[queryfile_index]}")
print(*_metadata_lines(queryfile_index), sep="\n")


# One entry per recommended article, in the order stored in ``cosine_sim``.
for ID, value in cosine_sim.items():
    print(f"Article ID: {all_articles[ID]}")
    print(f"Cosine_similarity to query file: {value}")
    print(*_metadata_lines(ID), sep="\n")
    # Optional debugging aid, disabled:
    #print(f"highest scoring words: {biggest_tfidf_score(tfidf_dict[ID])}\n")


# TODO: check for symmetry (see whether the recommended documents recommend the query file back)

Query file Article ID: ['FALTER_20150204B39E03880F']
Theme: ['Feuilleton']
Title: ['Like mich am Arsch!']
Subtitle: ['Under the glaring-crazy surface, Deichkind is one of the smartest pop bands in Germany. The new album proves this once more']
Authors: ["['Begegnung: Gerhard Stöger']"]
Chars: [8905]

Article ID: ['FALTER_2015091699890360D5']
Cosine_similarity to query file: 0.10467868115743142
Theme: ['Feuilleton']
Title: ['Spaß, Schweiß und Zigarettenrauch']
Subtitle: ['For almost a year, Wanda has been considered the hottest shit in German-speaking pop. The waltz wanted to know how horny that feels and accompanied the band, who released their second album at the beginning of October, on Tourbus']
Authors: ["['Embedded Journalist: Klaus Nüchtern']"]
Chars: [16453]

Article ID: ['FALTER_20150819C082867E45']
Cosine_similarity to query file: 0.07023585843732422
Theme: ['Feuilleton']
Title: ['Der Bub, der zwei Mädchen ist']
Subtitle: ['The German-Swiss duo Boy also puts on harmonious pop 