In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import seaborn as sns
import pandas as pd
import gensim
import nltk
import re
import string
from curses.ascii import isprint
from collections import defaultdict
import json

In [3]:
# Load the stop-word list (one word per line) into a set so clean_text can do
# fast membership tests. The file is read inside a `with` block so the handle
# is closed promptly. Blank lines are skipped explicitly rather than dropping
# the last split element, so a file without a trailing newline keeps its final
# word, and stray whitespace / carriage returns around a word are removed.
with open('stop_words') as stop_words_file:
    stop_words = set(line.strip() for line in stop_words_file if line.strip())

In [4]:
# Every ASCII punctuation character (apostrophes included) and every digit.
# A regex is used instead of string.maketrans, which exists only on Python 2
# and whose byte table cannot be applied to unicode text.
_PUNCT_OR_DIGIT = re.compile("[%s0-9]" % re.escape(string.punctuation))


def clean_text(text, stopwords=None):
    """Tokenize raw text into a list of lowercase word tokens.

    Each ASCII punctuation character (this includes apostrophes, so
    contractions such as "it's" are split into "it" and "s") and each digit
    is replaced by a space. The text is then lower-cased and split on
    whitespace, and tokens that are stop words or a single character long
    are discarded.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    stopwords : collection of str, optional
        Tokens to discard. When omitted, the notebook-level ``stop_words``
        set is used.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    if stopwords is None:
        stopwords = stop_words

    # No explicit strip() is needed: split() below already ignores leading,
    # trailing and repeated whitespace (including newlines).
    text = _PUNCT_OR_DIGIT.sub(" ", text).lower()

    return [token for token in text.split()
            if token not in stopwords and len(token) > 1]

def get_name_from_id(person_id):
    """Return the first ``Alias`` in ``alias_table`` whose ``PersonId`` equals ``person_id``.

    ``.iloc[0]`` raises IndexError when no row matches.
    """
    is_person = alias_table["PersonId"] == person_id
    aliases = alias_table[is_person]["Alias"]
    return aliases.iloc[0]

def get_receivers(email_id):
    """Return a list of the ``PersonId`` values of all ``email_rec_table`` rows for ``email_id``.

    The list is empty when no row has that ``EmailId``.
    """
    is_email = email_rec_table["EmailId"] == email_id
    receiver_ids = email_rec_table[is_email]["PersonId"]
    return list(receiver_ids)

In [5]:
# Read the four CSV tables, in the same order as before, and discard every
# row that contains at least one missing value.
emails_table, person_table, alias_table, email_rec_table = (
    pd.read_csv(csv_path).dropna()
    for csv_path in (
        "data/emails/Emails.csv",
        "data/emails/Persons.csv",
        "data/emails/Aliases.csv",
        "data/emails/EmailReceivers.csv",
    )
)

In [6]:
# Add three derived columns to emails_table:
#   ReceiverId - list of receiver person ids looked up per email Id
#   doc        - subject and body text joined by a single space
#   token_list - cleaned tokens of `doc`
emails_table["ReceiverId"] = emails_table["Id"].apply(get_receivers)
emails_table["doc"] = (
    emails_table["ExtractedSubject"] + " " + emails_table["ExtractedBodyText"]
)
emails_table["token_list"] = emails_table["doc"].apply(clean_text)

In [7]:
# Map each person id (as a string) to the positional numbers of the emails
# they received or sent. Person id 80 is deliberately left out.
# NOTE(review): 80 is presumably the mailbox owner, who would otherwise be
# attached to nearly every document - confirm against Persons.csv.
personId_to_docs = defaultdict(list)
doc_number = 0
for row_label, email in emails_table.iterrows():
    # Receivers first, then the sender, matching the original append order.
    participants = list(email["ReceiverId"]) + [int(email["SenderPersonId"])]
    for participant_id in participants:
        if participant_id == 80:
            continue
        personId_to_docs[str(participant_id)].append(doc_number)

    doc_number += 1
    

In [8]:
# Build the gensim token dictionary from every email's token list and save it
# to disk.
dictionary = gensim.corpora.Dictionary(
    documents=list(emails_table["token_list"])
)
dictionary.save("models/dictionary.dict")

In [9]:
# Convert each token list to its bag-of-words form (kept in memory as a list,
# because later cells index it by document number), then write it to disk in
# Matrix Market format.
corpus = list(map(dictionary.doc2bow, emails_table["token_list"]))
gensim.corpora.MmCorpus.serialize("models/corpus.mm", corpus)

In [10]:
# Train a 20-topic LDA model on the bag-of-words corpus and save it.
# LDA training is stochastic; a fixed random_state makes the topics (and every
# graph exported from them further down) reproducible across
# Restart-and-Run-All.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    random_state=42,
)
lda.save('models/emails_lda.model')

In [13]:
# Train an HDP model (top-level truncation T=50) on the same corpus and save it.
# A fixed random_state keeps the result reproducible across runs.
hdp = gensim.models.hdpmodel.HdpModel(corpus, dictionary, T=50, random_state=42)
hdp.save('models/emails_hdp.model')

In [77]:
# For every person, average the LDA topic distribution over the documents
# assigned to them and express it as a percentage: entry `t` of the list is
# the mean probability of topic `t` times 100.
# The alias lookup that used to run here was dropped: its result was never
# used, it scanned the alias table once per person, and it could raise for a
# person without an alias.
# NOTE(review): lda[bow] may leave out very low-probability topics, in which
# case a person's percentages sum to slightly less than 100 - confirm that
# this is acceptable for the visualisation.
personId_to_topic_prob = defaultdict(list)

for person_id, doc_list in personId_to_docs.items():
    topic_percentages = [0.0] * lda.num_topics
    # float() keeps the division below a true division on Python 2 as well.
    num_docs_assigned = float(len(doc_list))
    for doc_num in doc_list:
        for topic, probability in lda[corpus[doc_num]]:
            topic_percentages[topic] += (probability / num_docs_assigned) * 100.0
    personId_to_topic_prob[person_id] = topic_percentages
          

In [83]:
# Export the topic <-> person bipartite graph as JSON.
# Nodes: one per LDA topic first (value 1), then one per person (value 0).
# Edges: topic node index -> person node index, weighted by that person's
# topic percentage; zero-weight pairs are omitted.
bipartite_graph = {
    "nodes": [
        {"name": str(topic_num), "value": 1}
        for topic_num in range(lda.num_topics)
    ],
    "edges": [],
}

# Person nodes are appended after the topic nodes, so their indices start at
# lda.num_topics.
for target, person_id in enumerate(personId_to_docs, lda.num_topics):
    bipartite_graph["nodes"].append(
        {"name": get_name_from_id(int(person_id)), "value": 0}
    )
    bipartite_graph["edges"].extend(
        {"source": source, "target": target, "weight": topic_prob}
        for source, topic_prob in enumerate(personId_to_topic_prob[person_id])
        if topic_prob > 0.0
    )


with open('bipartite_graph/lda_data.json', 'w') as fp:
    json.dump(bipartite_graph, fp)

In [88]:
# Export the same topic <-> person graph, restricted to a single person.
# All topic nodes are kept; only the chosen person's node and edges are added.
bipartite_graph = {
    "nodes": [
        {"name": str(topic_num), "value": 1}
        for topic_num in range(lda.num_topics)
    ],
    "edges": [],
}

specific_person = 10

# Keys of personId_to_docs are strings, so compare on their integer value.
selected_ids = [
    person_id for person_id in personId_to_docs
    if int(person_id) == specific_person
]

# The person node follows the topic nodes, so its index starts at lda.num_topics.
for target, person_id in enumerate(selected_ids, lda.num_topics):
    bipartite_graph["nodes"].append(
        {"name": get_name_from_id(int(person_id)), "value": 0}
    )
    bipartite_graph["edges"].extend(
        {"source": source, "target": target, "weight": topic_prob}
        for source, topic_prob in enumerate(personId_to_topic_prob[person_id])
        if topic_prob > 0.0
    )


with open('bipartite_graph/lda_specific_person.json', 'w') as fp:
    json.dump(bipartite_graph, fp)