In [54]:
import os
import string

def get_filtered_data(rootDir):
    '''Load, filter and normalise the text files found below ``rootDir``.

    Every directory underneath ``rootDir`` is visited; ``rootDir`` itself is
    not, so files lying directly in it are skipped. Each regular file is read
    as UTF-8 with undecodable bytes replaced. Files that are empty or consist
    solely of form-feed characters are dropped. The remaining texts are
    lower-cased and stripped of ASCII punctuation (``string.punctuation``).

    Args:
        rootDir: Path of the folder whose sub-directories are scanned.

    Returns:
        Tuple ``(dataFiltered, filesFiltered)``: the normalised text of every
        kept file and, at the same index, the path that text was read from.
    '''
    translator = str.maketrans('', '', string.punctuation)

    # Bottom-up walk that records sub-directories only (never rootDir itself).
    allDirs = [
        os.path.join(root, name)
        for root, dirs, _files in os.walk(rootDir, topdown=False)
        for name in dirs
    ]

    # Regular files of each recorded directory, in os.listdir order.
    allFiles = []
    for dirPath in allDirs:
        for entry in os.listdir(dirPath):
            candidate = os.path.join(dirPath, entry)
            if os.path.isfile(candidate):
                allFiles.append(candidate)

    dataFiltered = []
    filesFiltered = []
    for filePath in allFiles:
        with open(filePath, 'r', encoding="utf-8", errors='replace') as newFile:
            text = newFile.read()
        # Nothing left after removing form feeds means the file has no usable
        # content (this also covers completely empty files).
        if not text.strip('\x0c'):
            continue
        dataFiltered.append(text.lower().translate(translator))
        filesFiltered.append(filePath)

    return (dataFiltered, filesFiltered)


def document_to_words(doc):
    '''Split a document string into its words.

    The text is cut into lines at newline characters and lines of at most
    one character are discarded. The surviving lines are split at single
    spaces, and only pieces longer than one character are returned, in
    their original order.
    '''
    kept_lines = [line for line in doc.split('\n') if len(line) > 1]

    pieces = []
    for line in kept_lines:
        pieces.extend(line.split(' '))

    return [piece for piece in pieces if len(piece) > 1]

def remove_stopwords(tokens):
    '''Remove stop words from an array of tokens.

    A fixed, hand-picked collection of English/German function words and
    bullet symbols is filtered out. The comparison is exact and
    case-sensitive, so tokens are expected to be lower-cased already.

    Args:
        tokens: Iterable of (hashable) tokens, typically strings.

    Returns:
        New list with the tokens that are not stop words, order preserved.
    '''
    # The NLTK English stop-word corpus is not used here, so this helper has
    # no nltk dependency. A frozenset gives constant-time membership tests.
    stopWords = frozenset([
        'the', 'to', '-', 'pr', 'der', 'is', 'of', 'die', 'in', 'and', 'und',
        '–', '•', '✔', '●', 'a',
    ])

    return [token for token in tokens if token not in stopWords]


def tokenize(data):
    '''Tokenize each text on its own and all texts together.

    Returns a tuple ``(tokens, globalTokens)``: ``tokens`` holds one token
    list per entry of ``data``; ``globalTokens`` is the single token list
    obtained from all entries concatenated, each preceded by one space.
    '''
    # Per-file token arrays, index-aligned with `data`.
    per_file_tokens = [nltk.word_tokenize(text) for text in data]

    # One big string: every text prefixed with a single blank.
    merged_text = ''.join(' ' + text for text in data)
    all_tokens = nltk.word_tokenize(merged_text)

    return (per_file_tokens, all_tokens)

def calc_document_tf(tokens):
    '''Compute the term frequencies of a single token array.

    Returns a dict ``tf_doc['word'] = tf`` where ``tf`` is the number of
    occurrences of the word divided by the total number of tokens.
    An empty token array yields an empty dict.
    '''
    occurrences = Counter(tokens)
    n_tokens = sum(occurrences.values())
    return {word: hits / n_tokens for word, hits in occurrences.items()}

def calc_idf(term, tokens_filtered):
    '''Compute the idf of ``term`` over a collection of token arrays.

    Returns a tuple ``(idf, conn)``. ``conn`` lists the indices of the
    entries of ``tokens_filtered`` that contain the term. ``idf`` is the
    natural log of (number of entries / number of entries containing the
    term), or ``None`` when no entry contains the term.
    '''
    # Indices of all documents in which the term occurs
    conn = [doc_ind for doc_ind, doc_tokens in enumerate(tokens_filtered)
            if term in doc_tokens]

    if not conn:
        return (None, conn)
    return (log(len(tokens_filtered) / len(conn)), conn)


In [55]:
import nltk
import string
import os

# Load the corpus, tokenize it and strip the stop words
rootDir = '/home/flo/Hackdays_Mannheim_2019/github/docs_txt'

print("Loading data...")
data, fileDirs = get_filtered_data(rootDir)

print("Tokenizing...")
tokens, globalTokens = tokenize(data)

print("Removing stopwords...")
# Per-file token arrays without stop words (index-aligned with `tokens`)
tokens_filtered = [remove_stopwords(file_tokens) for file_tokens in tokens]
# Single token array over all files without stop words
globalTokens_filtered = remove_stopwords(globalTokens)


Loading data...
Tokenizing...
Removing stopwords...


In [30]:
from collections import Counter
from math import log

print("Calculate tf's")
# One tf dict per file, index-aligned with tokens_filtered.
TF = [calc_document_tf(file_tokens) for file_tokens in tokens_filtered]

print("Calculate idf's", flush=True)
print("----------|")
# The idf of a term does not depend on where it occurs, so each distinct term
# is processed once. dict.fromkeys de-duplicates and keeps first-occurrence
# order, so IDF ends up with the same keys in the same order as a pass over
# every single token occurrence would give.
unique_terms = list(dict.fromkeys(globalTokens_filtered))
# calc_idf only tests `term in <entry>`; sets make that test constant-time
# instead of a scan through each file's full token list.
file_term_sets = [set(file_tokens) for file_tokens in tokens_filtered]
# At least 1, so the modulo below cannot divide by zero on tiny corpora.
progress_step = max(1, len(unique_terms) // 10)

IDF = {}
for ind, t_glob in enumerate(unique_terms):
    if ind % progress_step == 0:
        print("#", flush=True, end='')
    IDF[t_glob] = calc_idf(t_glob, file_term_sets)

# word -> (indices of files containing it, tf in each of those files, idf)
word_conn = {}
for word, (idf, conn) in IDF.items():
    tfs = [TF[c][word] for c in conn]
    word_conn[word] = (conn, tfs, idf)


print("\nDONE!")


Calculate tf's
Calculate idf's
----------|
#

KeyboardInterrupt: 

In [None]:
export_path = '/home/flo/Hackdays_Mannheim_2019/github/data_structure/data.json'

import json

# Serialise before opening the target, so a failure here cannot leave a
# truncated file behind.
jsonFile = json.dumps(word_conn)

# The context manager closes the handle even when the write raises.
with open(export_path, "w") as f:
    f.write(jsonFile)

In [56]:
from collections import defaultdict
# token -> set of indices (into tokens_filtered) of the files containing it
token2Files = defaultdict(set)
# (file index, token) -> number of occurrences of the token in that file
filentoken2occ = defaultdict(int)
# NOTE(review): token2occ is created but never filled in this cell; presumably
# it was meant to hold corpus-wide counts. Confirm before relying on it.
token2occ = defaultdict(int)
# The loop variable is not called `tokens`, so the notebook-level `tokens`
# list produced by tokenize() is not overwritten by running this cell.
for i, file_tokens in enumerate(tokens_filtered):
    for t in file_tokens:
        token2Files[t].add(i)  # set.add is a no-op for an index already present
        filentoken2occ[i, t] += 1

    

In [70]:
# Distinct tokens over all files after stop-word removal
tokens_filtered_test = {
    token
    for file_tokens in tokens_filtered
    for token in file_tokens
}
len(tokens_filtered_test)

67009

In [58]:
# Number of tokens (repeats included) in the all-files token array after stop-word removal
len(globalTokens_filtered)

910128

In [53]:
# Length of the all-files token array after stop-word removal (repeats included)
len(globalTokens_filtered)

910118