In [5]:
import sqlite3
import pandas as pd
from scipy.stats import chi2_contingency

# Paths of the two SQLite databases this notebook reads from.
LDA_DB_PATH = "1_31_LDA.db"
SENTIMENT_DB_PATH = "2016-01_sentiments_annotated.db"

# One connection per database; both stay open for the cells below.
# Note: sqlite3.connect() creates an empty database file when the path
# does not exist, so a wrong path only surfaces at the first query.
cnx_lda = sqlite3.connect(LDA_DB_PATH)
cnx_sentiment = sqlite3.connect(SENTIMENT_DB_PATH)

In [6]:
# get topic distribution over stories
# The raw table gets a real name instead of `_`: in IPython `_` is the
# "last output" variable, and assigning to it shadows that feature.
lda_raw = pd.read_sql("SELECT * FROM [1_31_LDA]", cnx_lda)

# Topic columns are labelled "0" .. "99".
topics = [str(i) for i in range(100)]

# The last row of the table is split off into `topics_lemmas` and excluded
# from the per-story frame (presumably it holds each topic's lemmas rather
# than a story -- confirm against the database schema).
topics_lemmas = lda_raw.iloc[-1][topics]

# One row per story, indexed by story id, one column per topic.
df_lda = lda_raw.set_index('story_id')[topics].iloc[:-1]

In [7]:
# get emotion vectors
# The raw table gets a real name instead of `_`: in IPython `_` is the
# "last output" variable, and assigning to it shadows that feature.
sentiments_raw = pd.read_sql("SELECT * FROM [2016-01_sentiments_annotated.db]", cnx_sentiment)

# One row per story, indexed by story id. The column order matters:
# controversy() reads these vectors positionally (0 = negative, 2 = positive).
df_emotions = sentiments_raw.set_index('story_id')[['negative', 'ambiguous', 'positive']]


In [8]:
def controversy(topic, cutoff_topic=.1, df_emotions=df_emotions, df_lda=df_lda, mode="custom"):
    """Score how evenly negative and positive reactions are split for a topic.

    Stories whose weight for ``topic`` in ``df_lda`` is strictly greater than
    ``cutoff_topic`` are selected. The emotion vectors of those stories (rows
    of ``df_emotions`` whose values sum to more than 0) are then reduced to a
    single score. A one-line summary is printed for every call.

    Parameters
    ----------
    topic : str
        Column label of the topic in ``df_lda`` (also used in the printed
        message, so it must be a string).
    cutoff_topic : float
        Minimum (exclusive) topic weight for a story to count as belonging
        to the topic.
    df_emotions : pandas.DataFrame
        Emotion vectors indexed by story id. In "custom" mode the columns are
        read positionally: column 0 as negative, column 2 as positive.
    df_lda : pandas.DataFrame
        Topic weights indexed by story id, one column per topic. Cells may be
        ``None`` (skipped) or anything ``float()`` accepts.
    mode : {"custom", "chi2"}
        "custom": ratio of the smaller to the larger of the summed negative
        and positive values, or 0 when either sum is below 5.
        "chi2": ``1 - p`` where ``p`` is the p-value returned by
        ``scipy.stats.chi2_contingency`` on the emotion vectors.

    Returns
    -------
    tuple
        ``(score, story_ids)``. ``story_ids`` is the set of ``df_lda`` index
        labels selected for the topic. ``score`` is 0 when two or fewer
        usable emotion vectors were found.

    Raises
    ------
    ValueError
        If ``mode`` is not "custom" or "chi2". (Previously an unknown mode
        failed with an unbound ``score`` only for topics that had enough
        emotion vectors.)

    Notes
    -----
    The default ``df_emotions`` / ``df_lda`` are captured when this ``def``
    is executed; re-run this cell after reloading either frame.
    """
    if mode not in ("custom", "chi2"):
        raise ValueError('unknown mode %r; expected "custom" or "chi2"' % (mode,))

    # retrieve all relevant story ids for given topic
    # (iterating the single topic column avoids building a Series per row)
    story_ids = set()
    for story_id, weight in df_lda[topic].items():
        if weight is not None and float(weight) > cutoff_topic:
            story_ids.add(story_id)

    # Ids are compared as strings on both sides so that a differing id dtype
    # between the two frames cannot silently produce zero matches; the
    # returned set keeps the original labels.
    story_keys = set(str(story_id) for story_id in story_ids)

    # retrieve all emotions vectors for relevant stories
    emotion_vectors = []
    for story_id, vector in zip(df_emotions.index, df_emotions.values):
        if str(story_id) in story_keys and vector.sum() > 0:
            emotion_vectors.append(list(vector))

    # calculate divergence
    if len(emotion_vectors) <= 2:
        print("topic " + topic + ": not enough stories with emotions vectors in that topic")
        return 0, story_ids

    if mode == "chi2":
        _chi2, p_value, _dof, _expected = chi2_contingency(emotion_vectors)
        score = 1 - p_value
    else:  # mode == "custom"
        neg = sum(vector[0] for vector in emotion_vectors)
        pos = sum(vector[2] for vector in emotion_vectors)
        if neg >= 5 and pos >= 5:
            # smaller total over larger total: 1 means a perfectly even split
            score = min(neg, pos) / max(neg, pos)
        else:
            score = 0

    print("topic " + topic + ": controversy score: " + str(score))
    return score, story_ids


# evaluate for each topic
# Score every topic once, in topic order, then split the resulting
# (score, story_ids) pairs into two parallel lists.
topic_results = [controversy(topic) for topic in topics]
controversy_scores = [score for score, _ids in topic_results]
stories = [ids for _score, ids in topic_results]




topic 0: controversy score: 0.919527896996
topic 1: controversy score: 0.606060606061
topic 2: controversy score: 0.797833935018
topic 3: controversy score: 0.70243902439
topic 4: controversy score: 0.950310559006
topic 5: controversy score: 0.527777777778
topic 6: controversy score: 0.647777777778
topic 7: controversy score: 0.55
topic 8: controversy score: 0.75
topic 9: controversy score: 0.653250773994
topic 10: controversy score: 0.560468140442
topic 11: controversy score: 0.511879049676
topic 12: controversy score: 0.613526570048
topic 13: controversy score: 0.841121495327
topic 14: controversy score: 0.628272251309
topic 15: controversy score: 0.513196480938
topic 16: controversy score: 0.627214170692
topic 17: controversy score: 0.692434324397
topic 18: controversy score: 0.811926605505
topic 19: controversy score: 0.382394366197
topic 20: controversy score: 0.501992031873
topic 21: controversy score: 0.86974789916
topic 22: controversy score: 0.907185628743
topic 23: controvers

In [None]:
# Assemble one row per topic: its controversy score, the matching entry of
# `topics_lemmas` (aligned on the topic label) and the set of story ids
# selected for it. Then write the table to disk.
df_topic_controversy = pd.DataFrame(index=topics).assign(
    controversy=controversy_scores,
    lemmas=topics_lemmas,
    story_ids=stories,
)
df_topic_controversy.to_csv("January_controversy_scores.csv")