In [None]:
%run ../../v4/misc/data_access.py

In [None]:
import codecs
import json
from collections import Counter
import operator
import pandas as pd
from IPython.core.display import display, HTML

def read_json_file(file_name):
    """Parse the UTF-8 encoded JSON file at ``file_name``.

    Args:
        file_name: path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.loads`` produces for the
        file's content).
    """
    with codecs.open(file_name, encoding='utf-8') as json_stream:
        raw_text = json_stream.read()
    return json.loads(raw_text)
    
# def write_dict_to_file(file_name,dict_):
#     with codecs.open(file_name,'w', encoding='utf-8') as fout:
#         fout.write(json.dumps(dict_, indent=2, ensure_ascii=False))


def get_accum_file_sizes(doc_list, get_sorted = True):
    """Sum the ``file_size`` of all documents per author.

    Documents whose ``author_name`` is ``"No Author"`` are left out, which
    mirrors the filtering done by ``get_author_set``.

    Args:
        doc_list: iterable of document dicts; each needs an ``"author_name"``
            key and (unless the author is ``"No Author"``) a ``"file_size"``
            key.
        get_sorted: when True (default) the result is a list of
            ``(author, accumulated_size)`` tuples ordered by size, largest
            first. When False the raw ``{author: accumulated_size}`` dict is
            returned.

    Returns:
        A sorted list of tuples or a dict, depending on ``get_sorted``.
    """
    authors_accum_file_size = {}
    # One pass over the documents instead of rescanning the whole list for
    # every author.
    for doc in doc_list:
        author = doc["author_name"]
        if author == "No Author":
            continue
        authors_accum_file_size[author] = authors_accum_file_size.get(author, 0) + doc["file_size"]
    if not get_sorted:
        return authors_accum_file_size
    # sorted() is stable, so authors with equal totals keep first-seen order.
    return sorted(authors_accum_file_size.items(), key=operator.itemgetter(1), reverse=True)
    
def get_author_counter(doc_list):
    """Count how many entries of ``doc_list`` carry each ``author_name``.

    Args:
        doc_list: iterable of document dicts with an ``"author_name"`` key.

    Returns:
        A ``Counter`` mapping author name to number of documents.
    """
    docs_per_author = Counter()
    for entry in doc_list:
        docs_per_author[entry["author_name"]] += 1
    return docs_per_author

def get_author_set(doc_list):
    """Return the distinct author names in ``doc_list`` as a list.

    The placeholder name ``"No Author"`` is dropped. Despite the function
    name, the return value is a ``list``; its order is whatever iterating the
    intermediate ``set`` yields.
    """
    counts = get_author_counter(doc_list)
    # Dropping a missing key is a no-op, exactly like ``del`` on a Counter.
    counts.pop("No Author", None)
    unique_names = set(counts)
    return list(unique_names)
    
def log_JSON_Analysis(log_json, n = 30):
    """Print summary statistics for a document log and show the top authors.

    Reads the ``'file_descriptors'`` mapping of the JSON log and reports:
    the number of documents, the number of distinct authors (as returned by
    ``get_author_set``), the authors with more than 10 documents, and a table
    of the ``n`` authors with the largest accumulated file size.

    Args:
        log_json: path of the JSON log file.
        n: number of authors to include in the displayed table.
    """
    descriptors = read_json_file(log_json)['file_descriptors']
    documents = list(descriptors.values())
    print("Number of Documentes\n", len(descriptors))
    print()

    docs_per_author = get_author_counter(documents)
    print("Number of authors\n", len(get_author_set(documents)))
    print()

    # Built from the raw counter, so a "No Author" entry can show up here
    # even though it is excluded from the author count above.
    prolific = Counter({name: count for name, count in docs_per_author.items() if count > 10})
    print("Authors with more then 10 documents (", len(prolific), ')')

    print(prolific)
    print()

    print("Top %s Authors with accumulated file sizes:" % n)
    print()

    records = [
        {'name': name, 'size': size, 'docs': docs_per_author[name]}
        for name, size in get_accum_file_sizes(documents)[:n]
    ]
    # The column list fixes the display order: name, docs, size.
    display(pd.DataFrame(records, columns=['name', 'docs', 'size']))
    

# Summarise 'log-final.json' and display the 50 authors with the largest
# accumulated file size.
log_JSON_Analysis('log-final.json', 50)

In [None]:
import codecs
import json
import os

def read_json_file(file_name):
    """Load a JSON document from a UTF-8 text file.

    Same helper as the one defined in the analysis cell earlier in this
    notebook; it is repeated here so the cell can run on its own.

    Args:
        file_name: path of the JSON file to read.

    Returns:
        The decoded JSON value.
    """
    with codecs.open(file_name, encoding='utf-8') as source:
        return json.load(source)

def merge_docs(author_name,log_file, dest_folder = None,overwrite= False):
    """Concatenate all documents of one author into a single text file.

    The JSON log is read with ``read_json_file``. Every entry of its
    ``"file_descriptors"`` whose ``"author_name"`` equals ``author_name`` is
    appended to the destination file, in the order the log lists them.
    Source files that do not exist are reported and skipped.

    Args:
        author_name: author whose documents are merged. Commas in the name
            are replaced by underscores to build the output file name.
        log_file: path of the JSON log. Its ``"folder_path"`` value is
            appended to ``get_data_folder()`` to locate the documents.
        dest_folder: folder for the merged file; defaults to
            ``<base folder> + "merged/"``. It is used by plain string
            concatenation, so it must end with a path separator.
        overwrite: when False, an already existing merged file is kept and
            its path is returned without rewriting it.

    Returns:
        The path of the merged file, or ``None`` when the base folder derived
        from the log does not exist.
    """
    log = read_json_file(log_file)
    author_docs = [doc for doc in log['file_descriptors'].values()
                   if doc['author_name'] == author_name]

    base_folder = get_data_folder() + log["folder_path"]
    print(base_folder)
    if not os.path.isdir(base_folder):
        print('''Value: "%s" is not set correctly. Cannot find folder:\n%s
        The folder is relative to the project data folder\nBYE''' %("folder_path", base_folder))
        return
    if not dest_folder:
        # NOTE(review): plain concatenation - this assumes base_folder ends
        # with a path separator; confirm against "folder_path" in the log.
        dest_folder = base_folder + "merged/"
    # Sanitise only the author part. Rewriting commas in the folder part
    # would point at a directory other than the one created below.
    dest_file = dest_folder + author_name.replace(",", "_") + '.txt'

    print("Creating:",dest_file)

    os.makedirs(dest_folder, exist_ok=True)

    if os.path.isfile(dest_file) and not overwrite:
        print('File %s exists already & overwrite is not set. ending here...' %(dest_file))
        return dest_file

    file_names = [base_folder + doc["rel_path"] + doc["file_name"] for doc in author_docs]
    print('Merging %s files' % len(file_names))

    # Explicit UTF-8 on both sides: the merged file is later read as UTF-8 by
    # the train/test split, so it must not depend on the platform's locale.
    # NOTE(review): assumes the source documents are UTF-8 encoded - confirm.
    with open(dest_file, 'w', encoding='utf-8') as outf:
        for fname in file_names:
            if not os.path.exists(fname):
                print("%s\ndoes not exist. skipping" % fname)
                continue
            with open(fname, encoding='utf-8') as inf:
                for line in inf:
                    outf.write(line)

    print("DONE!\nResulting file is a size of %s bytes" % os.stat(dest_file).st_size)
    return dest_file

# Build the merged text file for a single author, replacing any existing one.
merge_docs('Chomsky,Noam','log-final.json',overwrite=True)

In [None]:
import codecs
from random import random
import os

def split_for_train_test(file_path, destination_folder = None, test_ratio = 0.05, overwrite=False, seed=None):
    """Randomly distribute the lines of a UTF-8 text file over train/test files.

    Each line of ``file_path`` is written to ``test.txt`` with probability
    ``test_ratio`` and to ``train.txt`` otherwise. Both files are created in
    ``destination_folder``.

    Args:
        file_path: UTF-8 text file to split.
        destination_folder: output folder; defaults to ``file_path`` without
            its extension plus ``'_SPLIT/'``. Created if missing.
        test_ratio: probability for a line to go to the test file.
        overwrite: when False and ``train.txt`` or ``test.txt`` already
            exists, a message is printed and nothing is written.
        seed: optional seed for a private random generator, making the split
            reproducible. With the default ``None`` the module-level
            ``random()`` is used, as before.

    Returns:
        None.
    """
    if not destination_folder:
        # splitext only strips an extension of the file name itself, so a
        # path without one no longer raises.
        destination_folder = os.path.splitext(file_path)[0] + '_SPLIT/'
    print("Splitting %s" % file_path)
    print("Destination folder:",destination_folder)

    os.makedirs(destination_folder, exist_ok=True)

    # join() also works when the folder is given without a trailing separator.
    fout_train = os.path.join(destination_folder, 'train.txt')
    fout_test = os.path.join(destination_folder, 'test.txt')

    if os.path.exists(fout_train) and not overwrite:
        print("%s exists and overwrite is not set.\nBye" % fout_train)
        return

    if os.path.exists(fout_test) and not overwrite:
        print("%s exists and overwrite is not set. Bye" % fout_test)
        return

    if seed is None:
        draw = random
    else:
        from random import Random
        draw = Random(seed).random

    # Context managers guarantee all three files are flushed and closed
    # before the function returns, even if an error occurs mid-way.
    with codecs.open(file_path, 'r', 'UTF-8') as f_in, \
            codecs.open(fout_train, 'w', 'UTF-8') as f_out_train, \
            codecs.open(fout_test, 'w', 'UTF-8') as f_out_test:
        for line in f_in:
            if draw() < test_ratio:
                f_out_test.write(line)
            else:
                f_out_train.write(line)
    print("DONE")

# Split the merged Chomsky text into train/test files, overwriting old splits.
# Uses read_json_file and get_data_folder, which this cell does not define.
# NOTE(review): the path is rebuilt by hand with '/merged/', while merge_docs
# appends "merged/" without a leading slash - confirm both resolve to the same
# file for the "folder_path" stored in the log.
log_file = 'log-final.json'
file_path = get_data_folder() + read_json_file(log_file)["folder_path"] + '/merged/Chomsky_Noam.txt'
split_for_train_test(file_path ,overwrite=True)

In [None]:
### Get the top 30 authors, create their merged texts, and split them into train/test
import codecs 
import json

# Number of authors, ranked by accumulated file size, handled in this cell.
TOP_N = 30
def read_json_file(file_name):
    """Return the decoded content of the UTF-8 JSON file ``file_name``.

    This helper is also defined in earlier cells of the notebook; the
    definition here is identical in behaviour.
    """
    with codecs.open(file_name, encoding='utf-8') as handle:
        content = handle.read()
    parsed = json.loads(content)
    return parsed

# Load the document descriptors from the log and rank authors by the summed
# size of their files (get_accum_file_sizes comes from an earlier cell).
log_file = 'log-final.json'
col = read_json_file(log_file)['file_descriptors']
doc_list = list(col.values())
    
sorted_sizes = get_accum_file_sizes(doc_list)
# Keep only the names of the TOP_N largest authors.
top_n_names = [author_size_tuple[0] for author_size_tuple in sorted_sizes[:TOP_N]]
for author in top_n_names:
    print(author)
    # The merge step is currently disabled, so this loop only lists the names.
#     merge_docs(author,log_file)
print("ALL DONE")
    

In [None]:
%run -n ../../v4/pytorch_RVAE/train_word_embeddings.py
%run -n ../../v4/pytorch_RVAE/train.py

In [None]:
# from pytorch_RVAE import train_word_embeddings 
import shutil
import os

def author_folder_name(author_name):
    """Turn an author name into a folder name by swapping commas for underscores.

    Example: ``'Chomsky,Noam'`` becomes ``'Chomsky_Noam'``.
    """
    name_parts = author_name.split(",")
    return "_".join(name_parts)

def prepare_rvae_train(author_name,log_file,rvae_data_folder):
    """Merge an author's documents and split them into train/test files.

    Calls ``merge_docs`` (an existing merged file is reused, because
    ``overwrite`` is not passed) and then ``split_for_train_test`` with
    ``rvae_data_folder`` as destination, always overwriting earlier splits.

    Args:
        author_name: author whose documents are prepared.
        log_file: path of the JSON log passed on to ``merge_docs``.
        rvae_data_folder: folder that receives ``train.txt`` / ``test.txt``.

    Raises:
        RuntimeError: when ``merge_docs`` produced no file.
    """
    merged_file = merge_docs(author_name,log_file)
    if merged_file is None:
        # merge_docs returns nothing when the base folder named in the log is
        # missing. Fail here with a clear message rather than letting the
        # split step choke on a None path.
        raise RuntimeError("merge_docs produced no file for %s (log: %s)" % (author_name, log_file))
    split_for_train_test(merged_file,rvae_data_folder,overwrite=True)

def run_rvae():
    """Call ``train_word_embeddings.run()`` followed by ``train.run()``.

    NOTE(review): both names must already exist in the notebook namespace.
    The import at the top of this cell is commented out, so presumably they
    come from the scripts loaded with ``%run -n`` - confirm.
    """
    train_word_embeddings.run()
    train.run()

def move_rvae_model_files(rvae_data_folder, model_folder):
    """Move every entry found in ``rvae_data_folder`` into ``model_folder``.

    All directory entries are moved, not only model files. ``model_folder``
    is created when it does not exist yet.

    Args:
        rvae_data_folder: folder whose content is moved away.
        model_folder: destination folder.
    """
    # Without this, shutil.move fails for a destination folder that has not
    # been created yet.
    os.makedirs(model_folder, exist_ok=True)
    for entry in os.listdir(rvae_data_folder):
        # join() works with or without a trailing separator on the folders.
        shutil.move(os.path.join(rvae_data_folder, entry),
                    os.path.join(model_folder, entry))
    
# Single-author run: prepare the data, train, then move the results away.
log_file = 'log-final.json'
# Destination of the train/test split; presumably where the RVAE scripts look
# for their input - TODO confirm.
rvae_data_folder = get_project_folder() + 'src/v4/pytorch_RVAE/data/'
# Per-author folder (commas replaced by underscores) inside the data folder.
author_model_folder = get_data_folder() + author_folder_name('Chomsky,Noam') + '/'
        
prepare_rvae_train('Chomsky,Noam',log_file,rvae_data_folder)
run_rvae()
# Moves everything found in rvae_data_folder, including the split files
# written by prepare_rvae_train.
move_rvae_model_files(rvae_data_folder,author_model_folder)