In [25]:
import gensim
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
import nltk
import os
import re
import json
import math
import numpy as np

In [26]:
# Directory holding the raw news articles; each file is read as one document.
data_folder = "data/NewsArticles/"
# os.listdir returns the entry names in arbitrary order.
files = os.listdir(data_folder)

In [27]:
# Fix a stable ordering of the article files, then give each one a
# positional identifier ("doc0", "doc1", ...) and persist the mapping.
files = sorted(files)
doc_id = {'doc' + str(position): filename for position, filename in enumerate(files)}

with open('data/doc_ids.json', 'w') as id_file:
    json.dump(doc_id, id_file, indent=True)

# Preprocessing

In [28]:
# Load the project stop-word list: the file is split on newlines and the
# resulting entries are de-duplicated into a set.
with open("STOPWORDS", 'r') as stopword_file:
    stopword_entries = stopword_file.read().split("\n")
STOPWORDS = set(stopword_entries)

In [29]:
# Combined stop-word set: the project list plus NLTK's English list.
s = STOPWORDS.union(nltk.corpus.stopwords.words('english'))
# The empty string is added too, so empty tokens count as stop words.
s.add('')

In [30]:
# Display the combined stop-word set for inspection.
s

{'',
 "'ll",
 'a',
 'able',
 'about',
 'above',
 'abst',
 'accordance',
 'according',
 'accordingly',
 'across',
 'act',
 'actually',
 'added',
 'adj',
 'affected',
 'affecting',
 'affects',
 'after',
 'afterwards',
 'again',
 'against',
 'ah',
 'ain',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'announce',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apparently',
 'approximately',
 'are',
 'aren',
 "aren't",
 'arent',
 'arise',
 'around',
 'as',
 'aside',
 'ask',
 'asking',
 'at',
 'auth',
 'available',
 'away',
 'awfully',
 'b',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'begin',
 'beginning',
 'beginnings',
 'begins',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'biol',
 'both',
 'brief',
 'briefly',
 'but',
 'by',
 '

In [31]:
# Tokenise every article.
#   texts     -- list of token lists, in the same order as `files`
#   text_dict -- the same token lists keyed by file name
# A token is kept when it is purely alphabetic, at least three characters
# long and (lower-cased) not in the stop-word set `s`.

# Raw string: the previous non-raw '[, \.\n]' relied on the invalid escape
# sequence '\.', which newer Python versions warn about. Inside a character
# class the dot needs no escaping, so the pattern matches the same characters.
token_split_re = re.compile(r'[, .\n]')

texts = []
text_dict = {}
for file in files:
    with open(data_folder + file, encoding="ISO-8859-1") as f:
        raw_tokens = token_split_re.split(f.read())
    # Lower-case each token once and reuse it for both the stop-word test
    # and the stored value.
    text = [
        lowered
        for word, lowered in ((token, token.lower()) for token in raw_tokens)
        if lowered not in s and word.isalpha() and len(word) >= 3
    ]
    texts.append(text)
    text_dict[file] = text

In [32]:
# Build the gensim Dictionary (token <-> integer id mapping) from the tokenised articles.
common_dictionary = Dictionary(texts)

In [33]:
# Bag-of-words encoding of every tokenised article, in the same order as `texts`.
common_corpus = list(map(common_dictionary.doc2bow, texts))

# Training

In [41]:
# Fit a 30-topic LDA model on the bag-of-words corpus.
# LDA training is stochastic: without a fixed seed the topic numbering and
# weights change on every run, which would invalidate the topic ids baked
# into the files exported by the later cells. Pin the seed so a
# "Restart & Run All" reproduces the same model.
lda = gensim.models.LdaModel(
    corpus=common_corpus,
    id2word=common_dictionary,
    num_topics=30,
    random_state=42,
)

In [42]:
# Sanity check: topic distribution the model assigns to the second article.
sample_bow = common_dictionary.doc2bow(texts[1])
lda.get_document_topics(sample_bow)

[(20, 0.9876068)]

# Document Topics

In [12]:
# For every document id, build a 30-slot list with one entry per topic:
# the topic probability (as a string) where the model reports one, and the
# integer 0 elsewhere. The result is written to data/doc_topic.json.
doc_topic_json = {}
for doc_key, filename in doc_id.items():
    bow = common_dictionary.doc2bow(text_dict[filename])
    topic_array = [0] * 30
    for topic_index, probability in lda.get_document_topics(bow):
        topic_array[topic_index] = str(probability)
    doc_topic_json[doc_key] = topic_array

with open("data/doc_topic.json", "w") as out_file:
    json.dump(doc_topic_json, out_file)

# Text v/s Topic

In [13]:
# Dense document x topic matrix of probability strings, one row per entry
# in `texts`. Topics the model does not report for a document stay "0".

### Increase for more Radius
factor = 1
###

# Row width follows the trained model. Rows were previously allocated with
# 150 columns although the model only has 30 topics, so every exported CSV
# row carried 120 surplus "0" fields that did not line up with the header.
num_topics = lda.num_topics

matrix = []
for tokens in texts:
    row = [str(0)] * num_topics
    for topic_id, probability in lda.get_document_topics(common_dictionary.doc2bow(tokens)):
        row[topic_id] = str(probability * factor)
    matrix.append(row)

In [14]:
# Serialise the document/topic matrix as CSV: a header row with an empty
# first column followed by Topic0..Topic29, then one "doc<i>" row per text.
csv_lines = [',' + ','.join("Topic" + str(topic) for topic in range(30))]
csv_lines.extend(
    'doc' + str(row_index) + ',' + ','.join(matrix[row_index])
    for row_index in range(len(texts))
)
string = '\n'.join(csv_lines)

with open("data/text_topic_matrix.csv", "w") as csv_file:
    csv_file.write(string)

# Word v/s Topic

In [15]:
# Export the 100 highest-weighted terms of topic 1 as a "word,value" CSV
# written to topic1.csv in the current working directory.
topic1_lines = ['word,value']
for term_id, weight in lda.get_topic_terms(1, topn=100):
    topic1_lines.append(lda.id2word[term_id] + ',' + str(weight))
csv_string = '\n'.join(topic1_lines)

with open('topic1.csv', 'w') as csv_file:
    csv_file.write(csv_string)

In [16]:
# Invert the model's id -> word mapping into word -> id, save it as JSON,
# and show the vocabulary size as the cell output.
words = {token: token_id for token_id, token in lda.id2word.items()}

with open("data/word_ids.json", "w") as out_file:
    json.dump(words, out_file)

len(words)

1589

In [48]:
# Inspect the per-topic scores stored for word id '6' next to the word -> id map.
# NOTE(review): word_dict is assigned in a later cell of this notebook, so on a
# fresh kernel this cell raises NameError unless that cell has been run first.
word_dict['6'], words

({2: '0.0034798672', 16: '0.001532287'},
 {'actions': 0,
  'administration': 1,
  'appointment': 2,
  'asked': 3,
  'attorney': 4,
  'authority': 5,
  'bring': 6,
  'bringing': 7,
  'charges': 8,
  'connected': 9,
  'conservative': 10,
  'consider': 11,
  'contended': 12,
  'counsel': 13,
  'criminal': 14,
  'decisions': 15,
  'department': 16,
  'designating': 17,
  'dojs': 18,
  'door': 19,
  'email': 20,
  'fast': 21,
  'floated': 22,
  'furious': 23,
  'general': 24,
  'generally': 25,
  'groups': 26,
  'gun': 27,
  'hewitt': 28,
  'hillary': 29,
  'host': 30,
  'hugh': 31,
  'idea': 32,
  'include': 33,
  'internal': 34,
  'interview': 35,
  'ire': 36,
  'jeff': 37,
  'justice': 38,
  'left': 39,
  'noncommittal': 40,
  'obama': 41,
  'open': 42,
  'openness': 43,
  'operated': 44,
  'practices': 45,
  'provoked': 46,
  'radio': 47,
  'republican': 48,
  'revenue': 49,
  'review': 50,
  'scandal': 51,
  'second': 52,
  'sessions': 53,
  'special': 54,
  'suggested': 55,
  'suggest

In [17]:
# word_dict maps str(word id) -> {topic number: score string}, filled from
# the 100 highest-weighted terms of each of the 30 topics. A word appears
# only under the topics whose top-100 list contains it.
word_dict = {}
for topic in range(30):
    for word_id, word_score in lda.get_topic_terms(topic, topn=100):
        word_dict.setdefault(str(word_id), {})[topic] = str(word_score)

with open("data/word_topic.json", "w") as out_file:
    json.dump(word_dict, out_file)

# DOC TEXT

In [18]:
# Write an annotated copy of every article to data/doc<ix>.txt. Each word
# that has an entry in `word_dict` is wrapped in
#   <span class="t<topic> ... hoverable">word</span>
# with one t<topic> class per topic recorded for that word.
# Only occurrences delimited by spaces/newlines on both sides are wrapped;
# a word directly followed by a comma or period is left as-is.
#
# This cell deliberately does NOT rebind `texts`: it used to reset it to an
# empty list, which wiped the tokenised corpus that the matrix/CSV cells
# read and made re-running them silently produce empty output.

# Raw string instead of '[, \.\n]', whose '\.' is an invalid escape sequence.
token_split_re = re.compile(r'[, .\n]')

for ix, file in enumerate(files):
    print(ix)
    with open(data_folder + file, encoding="ISO-8859-1") as f:
        # Leading newline lets the first word match the "\n<word>" patterns.
        text = "\n" + f.read()

    html_string = text
    for word in set(token_split_re.split(text)):
        lowered = word.lower()
        if lowered not in words:
            continue
        topic_scores = word_dict.get(str(words[lowered]))
        if topic_scores is None:
            continue
        classes = ''.join('t' + str(topic) + " " for topic in topic_scores)
        span = "<span class=\"" + classes + "hoverable\">" + word + "</span>"
        # Same four delimiter combinations, in the same order, as before.
        for before, after in (("\n", " "), (" ", "\n"), (" ", " "), ("\n", "\n")):
            html_string = html_string.replace(before + word + after, before + span + after)

    with open("data/doc" + str(ix) + ".txt", "w", encoding="ISO-8859-1") as f1:
        f1.write(html_string)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


# TOPIC V/S WORDS

In [19]:
# One CSV per topic (data/topic<N>.csv) listing its 100 highest-weighted
# terms as "word,value" rows.
for topic in range(30):
    csv_lines = ["word,value"]
    for term_id, weight in lda.get_topic_terms(topic, topn=100):
        csv_lines.append(str(lda.id2word[term_id]) + "," + str(weight))
    with open("data/topic" + str(topic) + ".csv", "w") as csv_file:
        csv_file.write("\n".join(csv_lines))

In [22]:
# For every article, write data/doc<ix>_linegraph.json with one series per
# topic. The article is cut into consecutive "lines" of 10 split tokens;
# for each line, the score of a topic is the sum of the `word_dict` scores
# of the words in that line (or 0 when none of them carries that topic).
#
# This cell deliberately does NOT rebind `texts`: resetting it to an empty
# list wiped the tokenised corpus that earlier cells depend on.

# Raw string instead of '[, \.\n]', whose '\.' is an invalid escape sequence.
token_split_re = re.compile(r'[, .\n]')
words_per_line = 10
n_topics = 30

for ix, file in enumerate(files):
    print(ix)
    with open(data_folder + file, encoding="ISO-8859-1") as f:
        text = "\n" + f.read()

    line_word_list = token_split_re.split(text)
    # Round up so the trailing partial chunk is scored too; floor division
    # dropped up to 9 tokens per article and produced no data points at all
    # for articles shorter than one full chunk.
    no_lines = math.ceil(len(line_word_list) / words_per_line)

    line_graph_data = [
        {"name": "Topic" + str(topic), "id": "t" + str(topic), "values": []}
        for topic in range(n_topics)
    ]

    for line_index in range(no_lines):
        start = words_per_line * line_index
        line_scores = {}
        for word in line_word_list[start:start + words_per_line]:
            lowered = word.lower()
            if lowered not in words:
                continue
            t_scores = word_dict.get(str(words[lowered]), {})
            for t_ix, score in t_scores.items():
                # int() keeps the lookup below working whether the topic
                # keys are ints or their string form.
                topic_key = int(t_ix)
                line_scores[topic_key] = line_scores.get(topic_key, 0) + float(score)

        for topic in range(n_topics):
            line_graph_data[topic]["values"].append({
                "line": str(line_index + 1),
                "frequency": str(line_scores.get(topic, 0)),
            })

    with open("data/doc" + str(ix) + "_linegraph.json", "w") as f:
        json.dump({"data": line_graph_data}, f, indent=True)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
