In [1]:
from database import get_local_database, get_reference_flows
# Handle to the project's local database; get_references() below queries its
# `materialized_reference_popularity` collection through this global.
db = get_local_database()
# Reference flows keyed by focal. calculate_vectors() iterates the keys and
# treats each value as an ordered, sliceable sequence of reference ids.
reference_flows = get_reference_flows(db)

In [5]:
from sklearn.feature_extraction import DictVectorizer
from scipy.spatial import distance
from itertools import combinations

def get_references():
    """Return references whose popularity is at most half of the maximum.

    Reads the module-level ``db`` handle. The peak popularity is taken from
    the first document of a descending sort over
    ``materialized_reference_popularity``; the result is whatever
    ``find(...).sort(...)`` returns for the documents at or below half that
    peak (truncated to an int), ordered by popularity descending and then by
    ``_id`` ascending.

    NOTE(review): presumably a lazily evaluated pymongo cursor -- confirm
    against ``database.get_local_database``.
    """
    collection = db.materialized_reference_popularity

    # The head of a popularity-descending sort carries the peak value.
    by_popularity_desc = collection.find().sort([('popularity', -1)])
    peak_popularity = next(by_popularity_desc)['popularity']

    popularity_cap = int(peak_popularity / 2)
    query = {'popularity': {'$lte': popularity_cap}}
    # `_id` is the tie-breaker so equally popular references keep a stable order.
    ordering = [('popularity', -1), ('_id', 1)]
    return collection.find(query).sort(ordering)

def calculate_vectors(reference_id):
    """Build one feature vector per focal from its reference history.

    For every focal in the module-level ``reference_flows``, the history is
    cut off just before the *last* occurrence of ``reference_id`` (the whole
    history is kept when the reference never occurs). Any remaining
    occurrences of ``reference_id`` are dropped, and each surviving reference
    becomes a feature with intensity 1.

    Args:
        reference_id: The reference currently under investigation.

    Returns:
        A dict mapping each focal to the result of
        ``DictVectorizer.transform`` on that focal's feature dict, using a
        vectorizer fitted on the feature dicts of all focals.
    """
    bags_by_focal = {}
    for focal in reference_flows:
        flow = reference_flows[focal]
        try:
            # Searching the reversed flow finds the last occurrence first.
            offset_from_end = flow[::-1].index(reference_id)
        except ValueError:
            # The reference never appears: use the full history.
            history = flow
        else:
            history = flow[:len(flow) - offset_from_end - 1]

        # Feature selection: key is the reference, value is its intensity.
        # The investigated reference itself is excluded.
        bags_by_focal[focal] = {
            reference: 1 for reference in history if reference != reference_id
        }

    vectorizer = DictVectorizer()
    vectorizer.fit(bags_by_focal.values())
    return {
        focal: vectorizer.transform(bag)
        for focal, bag in bags_by_focal.items()
    }

def benchmark(vectors, reference_focals):
    """Compare mean Euclidean distances within and between two focal groups.

    The focals are split into those listed in ``reference_focals`` and all
    remaining keys of ``vectors``. Three averages are computed: over all
    unordered pairs inside the reference group, over all unordered pairs
    inside the remaining group, and over all cross-group pairs.

    Args:
        vectors: Mapping from focal to a sparse row vector exposing
            ``.toarray()`` (as produced by ``calculate_vectors``).
        reference_focals: Iterable of focals that use the reference; every
            entry must be a key of ``vectors``.

    Returns:
        A dict with the keys ``'within_reference'``,
        ``'within_non_reference'`` and ``'between_and_non_reference'``.
        An average over zero pairs (a group with fewer than two members, or
        an empty group for the cross-group average) is reported as NaN
        instead of raising ``ZeroDivisionError``.

    Raises:
        KeyError: If a focal in ``reference_focals`` is missing from
            ``vectors``.
    """
    # Materialise once so a one-shot iterable can be reused below.
    reference_focals = list(reference_focals)
    # Set membership keeps the group split linear in the number of focals.
    reference_set = set(reference_focals)

    def focal_distance(focal_a, focal_b):
        # `.toarray()` yields a 2-D (1, n_features) array, but SciPy's vector
        # distances expect 1-D input (newer releases reject anything else),
        # so flatten before measuring.
        return distance.euclidean(
            vectors[focal_a].toarray().ravel(),
            vectors[focal_b].toarray().ravel(),
        )

    def mean_distance(pairs):
        total = 0.0
        count = 0
        for a, b in pairs:
            total += focal_distance(a, b)
            count += 1
        # No pairs means the average is undefined; NaN makes that explicit.
        return total / count if count else float('nan')

    def average_distance(focals):
        return mean_distance(combinations(focals, 2))

    def average_distance_between(focals_a, focals_b):
        return mean_distance((a, b) for a in focals_a for b in focals_b)

    other_focals = [focal for focal in vectors if focal not in reference_set]
    return {
        'within_reference': average_distance(reference_focals),
        'within_non_reference': average_distance(other_focals),
        'between_and_non_reference': average_distance_between(reference_focals, other_focals)
    }

import pandas as pd

# Benchmark up to the first 500 references returned by get_references() and
# report, after each one, how many results so far support the hypothesis that
# focals sharing a reference are closer to each other than the other focals.
references = get_references()
bs = []
# zip() stops at whichever of the two runs out first, so having fewer than
# 500 references ends the loop cleanly instead of raising StopIteration from
# an explicit next() call.
for i, reference in zip(range(500), references):
    print(i)
    vectors = calculate_vectors(reference['_id'])
    b = benchmark(vectors, reference['focals'])
    bs.append({
        **b,
        'reference': reference['_id'],
        'supports_hypothesis': b['within_reference'] < b['within_non_reference']
    })
    # Rebuilt every iteration so the running tally is visible while the
    # (slow) benchmark is still in progress.
    result = pd.DataFrame(bs)
    print(result['supports_hypothesis'].value_counts())

0
True    1
Name: supports_hypothesis, dtype: int64
1
True     1
False    1
Name: supports_hypothesis, dtype: int64
2
True     2
False    1
Name: supports_hypothesis, dtype: int64
3
True     3
False    1
Name: supports_hypothesis, dtype: int64
4
True     3
False    2
Name: supports_hypothesis, dtype: int64
5


KeyboardInterrupt: 