In [1]:
from sqlalchemy import Table, MetaData, create_engine

In [None]:
engine = create_engine("postgresql://postgres:sydney@localhost/beehive-data_development")

## Fetch beneficiaries from database

In [7]:
bens = {}
with engine.connect() as conn:
    res = conn.execute("""SELECT g.id AS grant_id, 
            b.id AS ben_id, 
            b.label AS ben_label, 
            b."group" AS ben_group, 
            b.sort AS ben_label, 
            g.description AS grant_description
        FROM grants g
            INNER JOIN stakeholders s ON g.id = s.grant_id
            INNER JOIN beneficiaries b ON s.beneficiary_id = b.id
        GROUP BY b.id, g.id
        ORDER BY b.id""")
    for i in res:
        if i[2] not in bens:
            bens[i[2]] = []
        bens[i[2]].append(i[5])

## Feature detection and distinctiveness

From <https://de.dariah.eu/tatom/feature_selection.html>

In [9]:
import os
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [65]:
import pandas as pd

In [13]:
# turn the beneficiaries dict into some lists in the correct format
raw_texts = []
categories = []
for i in bens:
    categories.append(i)
    raw_texts.append( " ".join(bens[i]) )

In [19]:
vectorizer = CountVectorizer(input='content')
dtm = vectorizer.fit_transform(raw_texts)
vocab = np.array(vectorizer.get_feature_names())
# fit_transform returns a sparse matrix (which uses less memory)
# but we want to work with a normal numpy array.
dtm = dtm.toarray()
# normalize counts to rates per 1000 words
rates = 1000 * dtm / np.sum(dtm, axis=1, keepdims=True)

In [27]:
# calculate averages
averages = []
for i in rates:
    averages.append(np.mean(i, axis=0))

In [95]:
pd.DataFrame([(b, len(bens[b])) for b in bens])

Unnamed: 0,0,1
0,With family/relationship challenges,7
1,This organisation,40
2,With housing/shelter challenges,4
3,With mental diseases or disorders,8
4,"At risk of sexual exploitation, trafficking, f...",5
5,With physical diseases or disorders,4
6,Other organisations,11
7,Facing income poverty,6
8,With disabilities,9
9,Who are unemployed,11


In [99]:
# check one category against all the others
ben = u'This organisation'

print "Looking at category", ben
print "Number of source documents:", len(bens[ben])

ben_indices, other_indices = [], []
for index, fn in enumerate(categories):
    if ben==fn:
        ben_indices.append(index)
    else:
        other_indices.append(index)

ben_rates = rates[ben_indices, :]
other_rates = rates[other_indices, :]

ben_rates_avg = np.mean(ben_rates, axis=0)
other_rates_avg = np.mean(other_rates, axis=0)

distinctive_indices = (ben_rates_avg * other_rates_avg) == 0

ranking = np.argsort(ben_rates_avg[distinctive_indices] + other_rates_avg[distinctive_indices])[::-1]

keyness = np.abs(ben_rates_avg - other_rates_avg)
ranking = np.argsort(keyness)[::-1]

rates_avg = np.mean(rates, axis=0)
keyness = np.abs(ben_rates_avg - other_rates_avg) / rates_avg
ranking = np.argsort(keyness)[::-1]
vocab[ranking][0:10]

Looking at category This organisation
Number of source documents: 40


array([u'heart', u'post', u'plans', u'placing', u'oversee', u'develops',
       u'sustaining', u'outreach', u'oska', u'operation'], 
      dtype='<U16')

In [100]:
pd.DataFrame([
        ben_rates_avg,
        other_rates_avg,
        ranking,
        keyness
    ], index=[ben, "Others", "ranking", "keyness"], columns=vocab).transpose().sort_values("keyness", ascending=False)

Unnamed: 0,This organisation,Others,ranking,keyness
contractual,1.0,0.000000,509.0,19.000000
economic,1.0,0.000000,382.0,19.000000
heart,1.0,0.000000,146.0,19.000000
offices,1.0,0.000000,431.0,19.000000
robust,1.0,0.000000,309.0,19.000000
develops,2.0,0.000000,126.0,19.000000
horticulture,1.0,0.000000,70.0,19.000000
school,1.0,0.000000,106.0,19.000000
scope,1.0,0.000000,496.0,19.000000
seeks,1.0,0.000000,334.0,19.000000
