In [1]:
from database import get_local_database
# Shared database handle used by every later cell.
# NOTE(review): presumably a pymongo-style database object -- later cells access
# collections as attributes (e.g. `db.materialized_information_flow`) and call
# `.find().sort(...)` on them; confirm against the `database` module.
db = get_local_database()

In [2]:
# Map each focal to the list of references it appears with, in the order the
# rows are returned (ascending 'date').
reference_flows = {}
information_flow = db.materialized_information_flow.find().sort('date', 1)
for row in information_flow:
    focal, reference = row['focal'], row['reference']
    # setdefault creates the per-focal list the first time a focal is seen.
    reference_flows.setdefault(focal, []).append(reference)

In [3]:
# Highest popularity value present in the popularity collection.
popularity_collection = db.materialized_reference_popularity
most_popular_reference = next(
    popularity_collection.find().sort([('popularity', -1)])
)
max_popularity = most_popular_reference['popularity']

# The "global reference" is the first document whose popularity is at most
# half of the maximum, ordered by popularity descending and then `_id`
# ascending.
half_of_max_popularity = int(max_popularity / 2)
reference_popularity = (
    popularity_collection
    .find({'popularity': {'$lte': half_of_max_popularity}})
    .sort([('popularity', -1), ('_id', 1)])
)
global_reference = next(reference_popularity)
global_reference_focals = global_reference['focals']

In [4]:
from sklearn.feature_extraction import DictVectorizer

# Build one feature dict per focal, then vectorize them all with a shared
# vocabulary.
vectorizer = DictVectorizer()
features = {}
for focal, reference_flow in reference_flows.items():
    # Scan the history backwards to locate the last occurrence of the global
    # reference; only the references strictly before it are kept.
    for position in range(len(reference_flow) - 1, -1, -1):
        if reference_flow[position] == global_reference['_id']:
            last_index_of_global_reference = position
            limited_reference_flow = reference_flow[:position]
            break
    else:
        # Use full history when there is no global reference in there
        limited_reference_flow = reference_flow

    # Feature selection algorithm (key is the feature and value is the intensity)
    features[focal] = {reference: 1 for reference in limited_reference_flow}

vectorizer.fit(features.values())
vectors = {
    key: vectorizer.transform(feature)
    for key, feature in features.items()
}

In [5]:
from scipy.spatial import distance
from itertools import combinations

def focal_distance(focal_a, focal_b):
    """Return the Euclidean distance between the feature vectors of two focals.

    Parameters:
        focal_a, focal_b: keys into the module-level `vectors` mapping.

    Returns:
        The Euclidean distance between the two vectors.

    Raises:
        KeyError: if either focal has no entry in `vectors`.
    """
    # `toarray()` on a single sparse row yields a 2-D array of shape (1, n),
    # while `distance.euclidean` is defined for 1-D vectors (newer SciPy
    # releases raise ValueError for 2-D input), so flatten explicitly.
    vector_a = vectors[focal_a].toarray().ravel()
    vector_b = vectors[focal_b].toarray().ravel()
    return distance.euclidean(vector_a, vector_b)

def average_distance(focals):
    """Return the mean `focal_distance` over all unordered pairs in `focals`.

    Parameters:
        focals: iterable of focal keys; every 2-combination is measured once.

    Returns:
        The arithmetic mean of the pairwise distances, or NaN when `focals`
        has fewer than two elements (there is no pair to average, and the
        plain division would raise ZeroDivisionError).
    """
    total = 0.0
    count = 0
    for a, b in combinations(focals, 2):
        total += focal_distance(a, b)
        count += 1
    if count == 0:
        # No pairs at all: the average is undefined.
        return float('nan')
    return total / count

def average_distance_between(focals_a, focals_b):
    """Return the mean `focal_distance` over every (a, b) cross pair.

    Parameters:
        focals_a: iterable of focal keys forming the first group.
        focals_b: iterable of focal keys forming the second group; it is
            iterated once per element of `focals_a`.

    Returns:
        The arithmetic mean of the distances between each element of
        `focals_a` and each element of `focals_b`, or NaN when either group
        is empty (there is no pair to average, and the plain division would
        raise ZeroDivisionError).
    """
    total = 0.0
    count = 0
    for a in focals_a:
        for b in focals_b:
            total += focal_distance(a, b)
            count += 1
    if count == 0:
        # At least one group is empty: the average is undefined.
        return float('nan')
    return total / count


# Every vectorized focal that is not listed among the global reference's focals.
non_global_reference_focals = [
    candidate for candidate in vectors
    if candidate not in global_reference_focals
]

# Mean pairwise distance inside each group and across the two groups.
average_distances = dict(
    within_global_reference=average_distance(global_reference_focals),
    within_non_global_reference=average_distance(non_global_reference_focals),
    between_global_and_non_global_reference=average_distance_between(
        global_reference_focals, non_global_reference_focals
    ),
)