# TF-Value generation

This notebook is used to generate the tf-word counts. 
It takes a lot of time to run.

In [30]:
import os
import pandas as pd
import dask.dataframe as dd
from dask.multiprocessing import get
from utils.input_utils.text_extraction import extract_html, extract_raw
from utils.evaluation_utils.evaluation import get_all_files
import uuid
import spacy
import pickle
from collections import Counter

In [40]:

raw_files = os.path.abspath("../data/no_problem/")
processed_files = os.path.abspath("../data/processed")
counter_files = os.path.abspath("../data/counters")
print(raw_files)

/Users/bockstaller/code/vocabulary-extraction/src/data/no_problem


In [41]:
def extract_and_store_text(path, outpath):
    txt = extract_html(path)
    filename = outpath + "/" + str(uuid.uuid4()) + ".txt"
    with open(filename, "w") as file:
        file.write(txt)
    print(filename)
    return filename

def count_words(path, outpath=counter_files):
    print(path)
    nlp = spacy.load("en_core_web_lg")
    txt = extract_raw(path)
    doc = nlp(txt)
    words = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    c = Counter(words) 

    filename = outpath + "/" + str(uuid.uuid4()) + '.pickle'
    
    with open(filename, 'wb+') as outputfile:
        pickle.dump(c, outputfile)
    
    print(filename)
    
    return filename
    
count_words("/Users/bockstaller/code/vocabulary-extraction/src/data/processed/7427b7eb-4161-4a2f-801c-07cb2f8cede8.txt")

/Users/bockstaller/code/vocabulary-extraction/src/data/processed/7427b7eb-4161-4a2f-801c-07cb2f8cede8.txt
/Users/bockstaller/code/vocabulary-extraction/src/data/counters/d4b62129-d85b-4d27-8c16-a85ac977f598.pickle


'/Users/bockstaller/code/vocabulary-extraction/src/data/counters/d4b62129-d85b-4d27-8c16-a85ac977f598.pickle'

In [47]:
files = get_all_files(raw_files)
data = pd.DataFrame(files[0:10], columns=["raw_files"])
print(data)


                                           raw_files
0  /Users/bockstaller/code/vocabulary-extraction/...
1  /Users/bockstaller/code/vocabulary-extraction/...
2  /Users/bockstaller/code/vocabulary-extraction/...
3  /Users/bockstaller/code/vocabulary-extraction/...
4  /Users/bockstaller/code/vocabulary-extraction/...
5  /Users/bockstaller/code/vocabulary-extraction/...
6  /Users/bockstaller/code/vocabulary-extraction/...
7  /Users/bockstaller/code/vocabulary-extraction/...
8  /Users/bockstaller/code/vocabulary-extraction/...
9  /Users/bockstaller/code/vocabulary-extraction/...


In [48]:
ddata = dd.from_pandas(data, npartitions=4)

ddata["preprocessed_files"] = ddata.map_partitions(
    lambda df: df.apply(
        (lambda row: extract_and_store_text(*row, outpath=processed_files)), axis=1
    ),
    meta=("df", str),
).compute(scheduler="processes")

print(ddata["preprocessed_files"])

Dask Series Structure:
npartitions=3
0    object
3       ...
6       ...
9       ...
Name: preprocessed_files, dtype: object
Dask Name: getitem, 22 tasks


In [49]:
ddata['counters']  = ddata.map_partitions(
    lambda df: df.apply(
        (lambda row: count_words(row['preprocessed_files'])), axis=1
    ),
    meta=("df", str),
).compute(scheduler="processes")

ddata

Unnamed: 0_level_0,raw_files,preprocessed_files,counters
npartitions=3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,object,object,object
3,...,...,...
6,...,...,...
9,...,...,...


In [59]:
c = Counter()

for index, row in ddata.iterrows():
    with open(row['counters'], 'rb') as inputfile:
        c1 = pickle.load(inputfile)
        c += c1
        
print(c.most_common(10))
        


[(' ', 1427), ('\xa0', 857), ('  ', 573), ('al', 543), ('et', 541), ('     ', 395), ('    ', 378), ('stars', 350), ('\u2009', 340), ('1999', 335)]
