# Ideas

Can we find evidence for a property in the contexts of its positive examples by comparing them to the contexts of its negative examples?

Similar concepts should be especially useful here: their contexts should overlap heavily, so the aspects that distinguish positive from negative examples should be all the more salient.



In [1]:
import json
import pandas as pd
from statistics import stdev
import os

from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
import seaborn as sn


In [7]:
def load_data(prop):
    """Load the aggregated data for one property.

    Reads ``../data/aggregated/<prop>.json`` (relative to the current working
    directory) and returns the parsed JSON content.
    """
    with open(f'../data/aggregated/{prop}.json') as handle:
        return json.load(handle)

def combine_contexts(prop, label, target_words, model):
    """Concatenate the context files of several target words into one file.

    For every word in ``target_words`` whose context file
    ``../contexts/<model>/vocab/<word>.txt`` exists, a single space followed
    by that file's content is written to
    ``../contexts/<model>/<prop>/<label>/ALL.txt``. Words without a context
    file are skipped. The output directory is created if necessary and an
    existing ``ALL.txt`` is overwritten.

    Args:
        prop: Property name; used as a directory component of the output path.
        label: Label name (e.g. ``'pos'``); directory component of the output.
        target_words: Iterable of words whose context files are combined.
        model: Name of the context collection under ``../contexts``.
    """
    path_dir = f'../contexts/{model}/{prop}/{label}'
    os.makedirs(path_dir, exist_ok=True)
    path_all = f'{path_dir}/ALL.txt'
    # Read and write explicitly as UTF-8: scikit-learn's vectorizers decode
    # input files as UTF-8 by default, so relying on the locale's default
    # encoding here could produce a file that fails to decode later.
    with open(path_all, 'w', encoding='utf-8') as outfile:
        for target in target_words:
            path = f'../contexts/{model}/vocab/{target}.txt'
            if not os.path.isfile(path):
                continue
            # Stream each file straight into the output instead of building
            # one huge string in memory first.
            with open(path, encoding='utf-8') as infile:
                outfile.write(f' {infile.read()}')
        
def get_tfidf(path_pos, paths_neg):
    """Compute TF-IDF weights for one positive document against negatives.

    The vectorizer is fit on the file at ``path_pos`` together with all files
    in ``paths_neg``; every file is treated as one document.

    Args:
        path_pos: Path of the positive document.
        paths_neg: Iterable of paths of the negative documents (at least one).

    Returns:
        A DataFrame indexed by vocabulary term with the columns ``'pos'``
        (weights in the positive document), ``'mean_neg'`` (mean weight over
        all negative documents) and one column per negative path, sorted by
        ``'pos'`` in descending order.

    Raises:
        ValueError: If ``paths_neg`` is empty, since the negative mean would
            be undefined.
    """
    # First path is the positive document, the rest are negatives.
    paths = [path_pos]
    paths.extend(paths_neg)
    if len(paths) < 2:
        raise ValueError('get_tfidf needs at least one negative path')

    vectorizer = tfidf(input='filename')
    x = vectorizer.fit_transform(paths).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocab = vectorizer.get_feature_names_out()
    else:
        vocab = vectorizer.get_feature_names()

    # Row 0 is the positive document; rows 1.. are the negatives.
    vec_dict = {
        'pos': x[0],
        'mean_neg': x[1:].mean(axis=0),
    }
    for path, vec in zip(paths[1:], x[1:]):
        vec_dict[path] = vec

    df = pd.DataFrame(vec_dict, index=vocab)
    return df.sort_values('pos', ascending=False)


def tfidf_all_pos_vs_neg(prop, model):
    """Run TF-IDF with all positive contexts pooled into one document.

    Targets of ``prop`` labelled ``'all'``, ``'all-some'`` or ``'few-some'``
    count as positive; targets labelled ``'few'`` count as negative. The
    contexts of all positive targets are combined into a single file, which
    is compared against the individual context file of every negative target
    that has one. The resulting table is written to
    ``../results/<model>/tfidf/all_pos_vs_neg/<prop>.csv``.

    Args:
        prop: Property name; selects the data file and names the output.
        model: Name of the context collection under ``../contexts``.
    """
    condition = 'all_pos_vs_neg'
    # Load once (the data used to be loaded twice, discarding the first copy).
    prop_dict = load_data(prop)

    path_results_dir = f'../results/{model}/tfidf/{condition}'
    os.makedirs(path_results_dir, exist_ok=True)

    pos_labels = ('all', 'all-some', 'few-some')
    target_pos = [k for k, d in prop_dict.items() if d['ml_label'] in pos_labels]
    target_neg = [k for k, d in prop_dict.items() if d['ml_label'] == 'few']

    combine_contexts(prop, 'pos', target_pos, model)
    path_pos = f'../contexts/{model}/{prop}/pos/ALL.txt'

    # Keep only the negative targets that actually have a context file.
    candidates = (f'../contexts/{model}/vocab/{target}.txt' for target in target_neg)
    paths_neg = [path for path in candidates if os.path.isfile(path)]

    df = get_tfidf(path_pos, paths_neg)
    df.to_csv(f'{path_results_dir}/{prop}.csv')
    
    
def tfidf_each_pos_vs_neg(prop, model):
    """Run TF-IDF separately for every positive target against all negatives.

    Targets of ``prop`` labelled ``'all'``, ``'all-some'`` or ``'few-some'``
    count as positive; targets labelled ``'few'`` count as negative. For each
    positive target that has a context file, that file is compared against
    the context files of all negative targets that have one, and the table is
    written to
    ``../results/<model>/tfidf/each_pos_vs_neg/<prop>/<target>.csv``.

    Args:
        prop: Property name; selects the data file and the output directory.
        model: Name of the context collection under ``../contexts``.
    """
    condition = 'each_pos_vs_neg'
    # Load once (the data used to be loaded twice, discarding the first copy).
    prop_dict = load_data(prop)

    path_results_dir = f'../results/{model}/tfidf/{condition}/{prop}'
    os.makedirs(path_results_dir, exist_ok=True)

    pos_labels = ('all', 'all-some', 'few-some')
    target_pos = [k for k, d in prop_dict.items() if d['ml_label'] in pos_labels]
    target_neg = [k for k, d in prop_dict.items() if d['ml_label'] == 'few']

    # Keep only the negative targets that actually have a context file.
    candidates = (f'../contexts/{model}/vocab/{target}.txt' for target in target_neg)
    paths_neg = [path for path in candidates if os.path.isfile(path)]

    for target in target_pos:
        path_pos = f'../contexts/{model}/vocab/{target}.txt'
        if not os.path.isfile(path_pos):
            continue
        df = get_tfidf(path_pos, paths_neg)
        df.to_csv(f'{path_results_dir}/{target}.csv')
       

In [8]:
# Run the pooled-positives condition for each selected property.
model = 'wiki'
properties = ['fly']
for prop in properties:
    tfidf_all_pos_vs_neg(prop, model)


In [5]:
# Run the per-positive-target condition for each selected property.
# The original cell also listed 'blue' and 'fly' as commented-out alternatives.
model = 'wiki'
properties = ['swim']
for prop in properties:
    tfidf_each_pos_vs_neg(prop, model)

In [55]:
# Scratch arithmetic kept from the interactive session.
# NOTE(review): presumably a duration estimate (298 units at 1.44 each,
# divided by 60) — confirm what it measures or remove the cell.
(298*1.44)/60

7.152

In [56]:
# Scratch cell: append the marker string 'hey' to hi.txt in the working
# directory (the file is created if it does not exist yet).
with open('hi.txt', mode='a') as scratch_file:
    scratch_file.write('hey')

In [9]:
import itertools


In [40]:



def get_batches(n_lines, n_batches):
    """Split ``n_lines`` into consecutive ``(start, end)`` batches.

    The batch size is ``round(n_lines / n_batches)``. ``n_batches`` batches of
    that size are produced starting at 0, each beginning where the previous
    one ended. If they stop short of ``n_lines``, one extra batch
    ``(last_end, n_lines + 1)`` is appended, so the result holds either
    ``n_batches`` or ``n_batches + 1`` tuples.

    NOTE(review): when the batch size rounds up, the regular batches can
    extend past ``n_lines``; and the remainder batch ends at ``n_lines + 1``,
    not ``n_lines``. Whether that extra 1 is intended depends on how callers
    index lines — confirm against the consumer of these bounds.
    """
    size = round(n_lines / n_batches)
    batches = [(i * size, (i + 1) * size) for i in range(n_batches)]
    # Add a final batch for whatever the evenly sized batches did not cover.
    covered = batches[-1][1]
    if covered < n_lines:
        batches.append((covered, n_lines + 1))
    return batches


# Sanity check: split 1,530,697,448 lines into 1000 batches and print the
# start, end and size of a few selected batches.
n_lines = 1530697448
n_batches = 1000
batches = get_batches(n_lines, n_batches)
for batch_n in (0, 1, 45, 48):
    start, end = batches[batch_n]
    print(start, end, end-start)

# 880150775 881681472

0 1530697 1530697
1530697 3061394 1530697
68881365 70412062 1530697
73473456 75004153 1530697
