In [None]:
# Root folder of the parsed corpus; a later cell appends '/merged/<author>.txt'
# to it to locate the merged per-author file.
main_path = "../../../data/NAIL_DATAFIELD_txt/parsed_v3"

In [None]:
import codecs
import json
from collections import Counter
import operator

def read_json_file(file_name):
    """Load a UTF-8 encoded JSON file and return the decoded Python object.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        Whatever the JSON document decodes to (dict, list, ...).
    """
    with codecs.open(file_name, encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed
    
# def write_dict_to_file(file_name,dict_):
#     with codecs.open(file_name,'w', encoding='utf-8') as fout:
#         fout.write(json.dumps(dict_, indent=2, ensure_ascii=False))

def log_JSON_Analysis(log_json):
    """Print summary statistics for the documents described in a JSON log file.

    The log is loaded with ``read_json_file``; its ``'file_descriptors'`` entry
    must map ids to records carrying ``"author_name"`` and ``"file_size"``.
    Records whose author is the placeholder ``"No Author"`` are counted as
    documents but excluded from every per-author statistic.

    Printed, in order: the number of documents, the number of distinct authors,
    the authors with more than 10 documents, and the 20 authors with the
    largest accumulated file size.

    Args:
        log_json: Path of the JSON log file.

    Returns:
        None; all results are written to stdout.
    """
    col = read_json_file(log_json)['file_descriptors']
    doc_list = list(col.values())
    print("Number of Documentes\n",len(col))
    print()

    author_counter = Counter(doc["author_name"] for doc in doc_list)
    # Counter.__delitem__ ignores a missing key, so this is safe even when the
    # log contains no placeholder author at all.
    del author_counter["No Author"]

    print("Number of authors\n", len(author_counter))
    print()

    most_common = {name: count for name, count in author_counter.items() if count > 10}
    print("Authors with more then 10 documents (", len(most_common), ')')

    print(Counter(most_common))
    print()

    # Single pass over the documents instead of rescanning the whole list once
    # per author. Dict insertion order also makes the order of equal totals
    # deterministic (first appearance in the log).
    authors_accum_file_size = {}
    for doc in doc_list:
        author = doc["author_name"]
        if author == "No Author":
            continue
        authors_accum_file_size[author] = authors_accum_file_size.get(author, 0) + doc["file_size"]

    sorted_size = sorted(authors_accum_file_size.items(), key=operator.itemgetter(1), reverse=True)

    print("Top 20 Authors with accumulated file sizes")
    print(sorted_size[:20])
        

# Print the corpus summary for the log file; the path is relative to the
# notebook's working directory.
log_JSON_Analysis('log-final.json')

In [None]:
import codecs
import json
import os

def read_json_file(file_name):
    """Read ``file_name`` as UTF-8 text and return the parsed JSON content."""
    with codecs.open(file_name, encoding='utf-8') as json_stream:
        content = json.load(json_stream)
    return content

def merge_docs(author_name,log_file, overwrite= False):
    """Concatenate all documents of one author into a single UTF-8 text file.

    The log is loaded with ``read_json_file``. Its ``"folder_path"`` entry is
    the base folder and its ``"file_descriptors"`` entry maps ids to document
    records. Records whose ``"author_name"`` equals ``author_name`` are merged,
    in log order, into ``<folder_path>/merged/<author_name>.txt``; commas in the
    author name are replaced by underscores in the file name.

    Args:
        author_name: Author name exactly as stored in the log.
        log_file: Path of the JSON log file.
        overwrite: When False (default) an existing destination file is kept
            and nothing is merged.

    Returns:
        None. Progress and problems (missing base folder, existing output,
        missing source files) are reported with ``print``.
    """
    log = read_json_file(log_file)
    author_docs = [doc for doc in log['file_descriptors'].values() if doc['author_name'] == author_name]

    base_folder = log["folder_path"]
    if not os.path.isdir(base_folder):
        print('Value: "%s" is not set correctly. Cannot find folder:\n%s\nBYE' %("folder_path", base_folder))
        return

    # os.path.join works whether or not folder_path ends with a separator.
    dest_folder = os.path.join(base_folder, "merged")
    # Sanitise only the file name: replacing commas in the whole path would
    # point at a different directory whenever the base folder contains one.
    dest_file = os.path.join(dest_folder, author_name.replace(",", "_") + '.txt')

    print("Creating:",dest_file)

    os.makedirs(dest_folder, exist_ok=True)

    if os.path.isfile(dest_file) and not overwrite:
        print('File %s exists already & overwrite is not set. ending here...' %(dest_file))
        return

    # Source paths keep the plain concatenation: the separator convention of
    # "rel_path" is not visible here, so it is not second-guessed.
    file_names = [base_folder + doc["rel_path"] + doc["file_name"] for doc in author_docs]
    print('Merging %s files' % len(file_names))

    # Explicit UTF-8 on both sides: the merged file is read back as UTF-8 by
    # the train/test split, so it must not depend on the platform locale.
    with open(dest_file, 'w', encoding='utf-8') as outf:
        for fname in file_names:
            if not os.path.exists(fname):
                print("%s\ndoes not exist. skipping" % fname)
                continue
            with open(fname, encoding='utf-8') as inf:
                for line in inf:
                    outf.write(line)

    print("DONE!\nResulting file is a size of %s bytes" % os.stat(dest_file).st_size)


# Merge every document attributed to 'Chomsky,Noam' in the log, replacing any
# previously merged file (overwrite=True).
merge_docs('Chomsky,Noam','log-final.json',overwrite=True)

In [None]:
import codecs
from random import random
import os

def split_for_train_test(file_path, destination_folder = None, test_ratio = 0.05, overwrite=False, seed=None):
    """Randomly distribute the lines of a UTF-8 text file into train and test files.

    Every line of ``file_path`` is written to ``test.txt`` with probability
    ``test_ratio`` and to ``train.txt`` otherwise; both files are created in
    ``destination_folder``.

    Args:
        file_path: Path of the UTF-8 text file to split.
        destination_folder: Output folder, created when missing. Defaults to
            ``file_path`` without its extension plus ``'_SPLIT/'``.
        test_ratio: Probability that a single line goes to the test file.
        overwrite: When False (default) nothing is written if ``train.txt`` or
            ``test.txt`` already exists in the destination folder.
        seed: Optional seed for a private random generator so the split can be
            reproduced. With None (default) the module-level ``random()`` is
            used, exactly as before.

    Returns:
        None. Progress is reported with ``print``.
    """
    if not destination_folder:
        # splitext only looks at the last path component, so dots in directory
        # names (e.g. '..') or a file without extension are handled correctly.
        destination_folder = os.path.splitext(file_path)[0] + '_SPLIT/'
    print("Splitting %s" % file_path)
    print("Destination folder:",destination_folder)

    os.makedirs(destination_folder, exist_ok=True)

    # os.path.join accepts a destination folder with or without trailing separator.
    fout_train = os.path.join(destination_folder, 'train.txt')
    fout_test = os.path.join(destination_folder, 'test.txt')

    if os.path.exists(fout_train) and not overwrite:
        print("%s exists and overwrite is not set.\nBye" % fout_train)
        return

    if os.path.exists(fout_test) and not overwrite:
        print("%s exists and overwrite is not set. Bye" % fout_test)
        return

    if seed is None:
        draw = random
    else:
        from random import Random
        draw = Random(seed).random

    # Context managers guarantee all three files are flushed and closed, even
    # if reading or writing fails half-way.
    with codecs.open(file_path, 'r', 'UTF-8') as f_in, \
            codecs.open(fout_train, 'w', 'UTF-8') as f_out_train, \
            codecs.open(fout_test, 'w', 'UTF-8') as f_out_test:
        for line in f_in:
            if draw() < test_ratio:
                f_out_test.write(line)
            else:
                f_out_train.write(line)
    print("DONE")

# Split the merged Chomsky file into train/test sets (each line goes to the
# test set with the default probability of 0.05), overwriting earlier output.
split_for_train_test(main_path + '/merged/Chomsky_Noam.txt',overwrite=True)