In [None]:
# Execute the shared data-access script inside this kernel's namespace.
# NOTE(review): presumably this is what defines get_model_folder() and
# get_project_folder(), which later cells call without defining — confirm
# against data_access.py.
%run ../../v4/misc/data_access.py

In [None]:
# 1 load w2v model

import gensim
from nltk.corpus import stopwords
import numpy as np


# Load the saved word2vec model; to_vector() below reads `model` as a global.
w2v_model_path = get_model_folder() + 'word2vec_models/wiki_plus_v3_valid_combined.txt_numpy.w2vmodel'
model = gensim.models.Word2Vec.load(w2v_model_path)


In [None]:
#1.1 some static functions

# sentence to vector via w2v
def to_vector(sentence):
    """Average the word2vec vectors of the content words in ``sentence``.

    The sentence is lower-cased and split on whitespace. English stop words
    and words that are not in ``model.vocab`` are dropped; the vectors of the
    remaining words are averaged element-wise.

    Parameters
    ----------
    sentence : str
        Raw text to embed.

    Returns
    -------
    numpy.ndarray or numpy.float64
        The mean vector. When no word survives the filtering, a scalar NaN is
        returned -- the same value ``np.mean([], axis=0)`` produced before,
        but without the "Mean of empty slice" RuntimeWarning. Callers can
        detect that case with ``np.ndim(result) == 0``.
    """
    stop_words = _english_stopwords()
    vectors = [
        model[word]
        for word in sentence.lower().split()
        if word not in stop_words and word in model.vocab
    ]

    if not vectors:
        # Nothing to average: keep the historical scalar-NaN result explicit.
        return np.float64('nan')

    return np.mean(vectors, axis=0)


# Filled on first use so the NLTK corpus is read once, not once per word.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE['english']


# converts a v4 text file into feather file with w2v vec saved
# v4_txt_to_feather('/mnt/drive1/data/eco/NAIL_DATAFIELD_txt/parsed_v4/own_collection_valid.txt', 'out.feather')
def v4_txt_to_feather(path, feather_name):
    """Embed every sentence of a v4 text file and store the result as feather.

    Each line of ``path`` is cut at the first ';' and right-stripped; that
    text is embedded with ``to_vector``. One row is written per embedded
    sentence, with the vector components in columns 'p0', 'p1', ... and the
    text in column 'sentence'.

    Lines for which ``to_vector`` returns a scalar instead of a vector (no
    usable word in the sentence) are skipped and counted.

    Parameters
    ----------
    path : str
        Text file to read.
    feather_name : str
        Destination passed to ``DataFrame.to_feather``.
    """
    row_list = []
    skipped = 0

    # The context manager closes the handle that a bare open() left dangling.
    with open(path) as source:
        for line in source:
            sentence = line.split(';')[0].rstrip()
            vec = to_vector(sentence)

            # Explicit check instead of a bare `except: pass`, which also
            # swallowed KeyboardInterrupt and any unrelated error.
            if np.ndim(vec) == 0:
                skipped += 1
                continue

            new_row = {'p' + str(index): value for index, value in enumerate(vec)}
            new_row['sentence'] = sentence
            row_list.append(new_row)

    if skipped:
        print('skipped ' + str(skipped) + ' line(s) without a usable word vector')

    df = pd.DataFrame(row_list)
    df.to_feather(feather_name)


def v3_txt_to_df(path, author):
    """Embed every line of a v3 text file and return the rows as a DataFrame.

    Each line is embedded with ``to_vector``. A row holds the vector
    components in columns 'p0', 'p1', ..., the unmodified line (including its
    line ending) in 'sentence', the given ``author`` and the base name of
    ``path`` in 'filename'.

    Lines for which ``to_vector`` returns a scalar instead of a vector (no
    usable word in the line) are skipped and counted.

    Parameters
    ----------
    path : str
        Text file to read.
    author : str
        Value stored in the 'author' column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per embedded line; empty when nothing could be embedded.
    """
    # basename() is the second element of os.path.split() without rebinding `path`.
    filename = os.path.basename(path)
    row_list = []
    skipped = 0

    # The context manager closes the handle that a bare open() left dangling.
    with open(path) as source:
        for line in source:
            vec = to_vector(line)

            # Explicit check instead of a bare `except: pass`, which also
            # swallowed KeyboardInterrupt and any unrelated error.
            if np.ndim(vec) == 0:
                skipped += 1
                continue

            new_row = {'p' + str(index): value for index, value in enumerate(vec)}
            new_row['sentence'] = line
            new_row['author'] = author
            new_row['filename'] = filename
            row_list.append(new_row)

    if skipped:
        print('skipped ' + str(skipped) + ' line(s) without a usable word vector')

    return pd.DataFrame(row_list)

In [None]:
def get_similar(df, sentence, dist=0.05, min_matches=5, max_dims=300):
    """Pick a random row of ``df`` whose vector lies close to ``sentence``.

    The sentence is embedded with ``to_vector``. The candidate set starts as
    the whole frame and is narrowed one dimension at a time: only rows whose
    'p<i>' value lies strictly within ``dist`` of the sentence's i-th
    component are kept. A narrowing step that leaves no rows is discarded.
    Narrowing stops once a step yields fewer than ``min_matches`` rows, or
    when the dimensions run out. The remaining candidates are displayed and
    one of them is chosen at random.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs columns 'p0', 'p1', ... plus 'sentence', 'author', 'filename'.
    sentence : str
        Query text.
    dist : float, optional
        Half-width of the accepted interval per dimension (default 0.05).
    min_matches : int, optional
        Stop narrowing once a step yields fewer rows than this (default 5).
    max_dims : int, optional
        Upper bound on the number of dimensions examined (default 300). The
        actual vector length is used when it is shorter.

    Returns
    -------
    pandas.Series
        The randomly selected row.

    Raises
    ------
    ValueError
        If the sentence has no usable word vector, or ``df`` has no rows.
    """
    sentence_vec = to_vector(sentence)

    # to_vector() yields a scalar when no word of the sentence is usable;
    # indexing it below would otherwise fail with an unhelpful error.
    if np.ndim(sentence_vec) == 0:
        raise ValueError('no usable word vector for sentence: ' + repr(sentence))
    if len(df.index) == 0:
        raise ValueError('cannot select a similar sentence from an empty dataframe')

    ldf = df
    # Never index past the end of the vector, whatever its dimensionality.
    for i in range(min(max_dims, len(sentence_vec))):
        column = ldf['p' + str(i)]
        centre = sentence_vec[i]
        tdf = ldf[(column > centre - dist) & (column < centre + dist)]

        num_sentences = len(tdf.index)
        if num_sentences > 0:
            ldf = tdf
        if num_sentences < min_matches:
            print('stopped at index ' + str(i))
            break

    display(ldf[['sentence','author', 'filename']])
    index = random.randint(0, len(ldf) - 1)
    return ldf.iloc[index]

In [None]:
# training
import json
import codecs
import os
import pandas as pd

def read_json_file(file_name):
    """Read ``file_name`` as UTF-8 text and return the decoded JSON value."""
    with codecs.open(file_name, encoding='utf-8') as handle:
        text = handle.read()
    return json.loads(text)

# Keep the parsed log under its own name. Binding it to `json` replaced the
# json module in this namespace, so running the cell a second time made
# read_json_file() fail on `json.loads`.
log_data = read_json_file(get_project_folder() + 'src/python/notebooks/log-final.json')
base_path = log_data['folder_path']

# author name -> list of absolute paths of that author's files
authors = {}

for key, values in log_data['file_descriptors'].items():
    # Every descriptor is indexed. To restrict the corpus, filter on rel_path
    # here, e.g. 'arts_arthistory_aesthetics/' or 'own_mixed_collection/'.
    rel_path = values['rel_path']
    file_name = values['file_name']
    author_name = values['author_name']
    # base_path and rel_path are concatenated as plain strings, as before.
    abs_path = os.path.join(base_path + rel_path, file_name)
    authors.setdefault(author_name, []).append(abs_path)
        

# Rebuilding the dataframes takes a long time -- leave this switched off
# unless the feather file really has to be regenerated.
generate_dataframes = False
if generate_dataframes:
    dataframes = []
    for key, book_paths in authors.items():
        # Only authors with more than two files are kept; change the
        # threshold here to include more or fewer authors.
        if len(book_paths) <= 2:
            continue
        for path in book_paths:
            print(path)
            df = v3_txt_to_df(path, key)
            dataframes.append(df)
            # running count of dataframes built so far
            print(len(dataframes))

    print(len(dataframes))
    # Stack all per-file frames into one and persist it.
    bigdata = pd.concat(dataframes, ignore_index=True)
    bigdata.to_feather(get_model_folder() +  'parsed_v3_log-final_arts_arthistory_aesthetics_bigger_than_2.feather')

In [None]:
# 3 using it
import random
import pandas as pd
from IPython.core.display import display, HTML
import time
import feather

# Widen the column display limit to 200 so long sentences are not cut short.
pd.set_option('max_colwidth',200)

feather_path = get_model_folder() + 'parsed_v3_log-final_arts_arthistory_aesthetics_bigger_than_2.feather'
df = feather.read_dataframe(feather_path)

# Time a single lookup, from embedding the query to printing the pick.
# Another query to try: 'power war fight pain kill'
start_time = time.time()
query = 'touring machine algorithm'
selection = get_similar(df, query)
print('selection: ' + selection['author'])
# Despite its name, end_time holds the elapsed seconds, not a timestamp.
end_time = time.time() - start_time
print(str(end_time) + ' seconds.' )