In [None]:
import os
import csv
from collections import defaultdict
from tqdm import tqdm
from math import log

In [None]:
# Output directory: the idf table and the per-document tf-idf CSVs produced
# further down are written beneath this path.
directory = "../pre/tf-idf"

In [None]:
# Delete everything currently inside the output directory (shell command, so
# this only works where `rm` is available).
# NOTE(review): if `directory` were ever an empty string this would expand to
# `rm -rf /*` -- confirm the value before re-running after editing the path.
!rm -rf {directory}/*

In [None]:
# Create the output directory (and any missing parents); exist_ok=True makes
# re-running this cell a no-op when it is already there.
os.makedirs(directory, exist_ok=True)

In [None]:
keywords_filepath = "../pre/zipf/keywords-0.01-0.50.csv"

# The csv module expects its file to be opened with newline='' so that it can
# handle line endings (and newlines embedded in quoted fields) itself.
with open(keywords_filepath, newline='') as f:
    reader = csv.reader(f)

    # The keyword is the first column of each row. Blank lines are yielded by
    # csv.reader as empty lists, so they are skipped instead of letting
    # row[0] raise IndexError.
    # NOTE(review): no header row is skipped here -- confirm the keywords file
    # really has none.
    keywords = [row[0] for row in reader if row]

In [None]:
# Preview the first ten keywords as the cell output.
keywords[:10]

In [None]:
counts_filepath = "../pre/counts/"

# os.listdir returns every directory entry in arbitrary order, including
# hidden files and sub-directories (e.g. `.ipynb_checkpoints`, `.DS_Store`).
# Those would crash the open() calls below and inflate the document count N
# used by idf, so keep only regular, non-hidden files. Sorting makes the
# processing order reproducible between runs.
counts_csvs = sorted(
    entry
    for entry in os.listdir(counts_filepath)
    if not entry.startswith('.')
    and os.path.isfile(os.path.join(counts_filepath, entry))
)

def process_keyword_doc_frequency(keywords):
    """Count in how many count CSVs each keyword appears, and cache the result.

    Every file named in the module-level ``counts_csvs`` is read from
    ``counts_filepath``; the first column of each row after the header is
    taken as a word of that document. A keyword's count is increased by one
    for each file whose words contain it. The counts are then written to
    ``../pre/global/keyword_doc_frequecy.csv`` and returned.

    Args:
        keywords: iterable of keyword strings to look for.

    Returns:
        defaultdict(int) mapping keyword -> number of files containing it.
        Keywords found in no file are not stored (and read back as 0).
    """
    keyword_doc_frequency = defaultdict(int)

    for counts_csv in tqdm(counts_csvs, total=len(counts_csvs), desc="Processing CSVs"):

        with open(os.path.join(counts_filepath, counts_csv), newline='') as f:
            reader = csv.reader(f)

            # Skip header (the default keeps an empty file from raising
            # StopIteration).
            next(reader, None)

            # A set makes each membership test below O(1); with a list every
            # keyword lookup rescanned the whole document vocabulary.
            # Blank rows are skipped so row[0] cannot raise IndexError.
            words = {row[0] for row in reader if row}

        for keyword in keywords:
            if keyword in words:
                keyword_doc_frequency[keyword] += 1

    output_path = '../pre/global/keyword_doc_frequecy.csv'

    # The cache directory is not created anywhere else in this notebook, so
    # make sure it exists before writing into it.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # newline='' lets csv.writer control line endings (no \r\r\n on Windows).
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)

        writer.writerow(['keyword', 'doc_frequency'])
        writer.writerows(keyword_doc_frequency.items())

    return keyword_doc_frequency

In [None]:
# Document frequencies are expensive to compute, so reuse the cached CSV when
# it exists and only recompute (which also rewrites the cache) when it does not.
doc_frequency_cache = '../pre/global/keyword_doc_frequecy.csv'
keyword_doc_frequency = defaultdict(int)

if not os.path.exists(doc_frequency_cache):
    keyword_doc_frequency = process_keyword_doc_frequency(keywords)
else:
    with open(doc_frequency_cache) as cache_file:
        cached_rows = csv.reader(cache_file)

        # First row is the column header, not data.
        next(cached_rows)

        for cached_row in cached_rows:
            # Column 0 is the keyword, column 1 its document count.
            keyword_doc_frequency[cached_row[0]] = int(cached_row[1])

In [None]:
def idf(N, D):
    """Return the smoothed inverse document frequency ``log((N + 1) / (D + 1))``.

    Callers in this notebook pass the number of count files as ``N`` and the
    number of those files containing the keyword as ``D``. Adding one to both
    keeps the ratio defined when ``D`` is 0.
    """
    smoothed_total = N + 1
    smoothed_matches = D + 1
    return log(smoothed_total / smoothed_matches)

In [None]:
# idf of every keyword; the number of count files serves as the corpus size.
total_documents = len(counts_csvs)

keyword_idf = defaultdict(float)
keyword_idf.update(
    (term, idf(total_documents, keyword_doc_frequency[term]))
    for term in keywords
)

In [None]:
# Persist the idf table as <directory>/keyword_idf.csv.
# newline='' lets csv.writer control line endings; without it every row is
# followed by an extra blank line on platforms that write \r\n.
with open(os.path.join(directory, 'keyword_idf.csv'), 'w', newline='') as f:
    writer = csv.writer(f)

    writer.writerow(['keyword', 'idf'])
    writer.writerows(keyword_idf.items())

In [None]:
def tf_idf(frequency, N, D):
    """Return ``frequency`` multiplied by ``idf(N, D)``.

    ``frequency`` is the term's count in one document; ``N`` and ``D`` are
    forwarded unchanged to ``idf``.
    """
    inverse_document_frequency = idf(N, D)
    return frequency * inverse_document_frequency

In [None]:
docs = []

docs_directory = os.path.join(directory, 'docs')
os.makedirs(docs_directory, exist_ok=True)

# Set membership is O(1); testing `word in keywords` against the list rescanned
# every keyword for every word of every document.
keyword_lookup = set(keywords)
total_documents = len(counts_csvs)

for doc in tqdm(counts_csvs, total=total_documents, desc="Processing TD-IDF..."):
    # word -> tf-idf weight for this document (keywords only).
    doc_tdidf = defaultdict(float)

    with open(os.path.join(counts_filepath, doc), 'r', newline='') as f:
        reader = csv.reader(f)

        # Skip header (the default keeps an empty file from raising
        # StopIteration).
        next(reader, None)

        for row in reader:
            # Blank lines come back as empty rows; ignore them.
            if not row:
                continue

            word, frequency = row[0], int(row[1])

            if word in keyword_lookup:
                doc_tdidf[word] += tf_idf(frequency, total_documents, keyword_doc_frequency[word])

    # One output file per input file, named tfidf-<input name>.
    # newline='' lets csv.writer control line endings (no \r\r\n on Windows).
    with open(os.path.join(docs_directory, f'tfidf-{doc}'), 'w', newline='') as f:
        writer = csv.writer(f)

        writer.writerow(['word', 'weight'])
        writer.writerows(doc_tdidf.items())