In [1]:
import json
import csv

In [2]:
file_path = "../data/DAC_Entire_DataBase_2016.json"

# Parsed JSON database. None of the cells visible in this notebook read
# `database` again, but the name is kept in case other code depends on it.
with open(file_path, "r") as f:
    database= json.load(f)

# Build the lookup tables used by the labelling cells from the keyword list.
# Each CSV row is read as: column 0 = keyword, columns 1-3 = up to three
# nTopic names, columns 4-6 = up to three bTopic names (empty cells skipped).
#   w_dic      : keyword text -> keyword id (1-based, in order of first appearance)
#   nTopic_dic : nTopic name  -> nTopic id  (1-based, in order of first appearance)
#   bTopic_dic : bTopic name  -> bTopic id  (1-based, in order of first appearance)
#   w_nTopic   : keyword id   -> list of nTopic ids
#   w_bTopic   : keyword id   -> list of bTopic ids
with open('r2_keyword_list_post.csv') as r2_keyword_list:
    reader = csv.reader(r2_keyword_list, delimiter=',')
    w_id = 0
    nTopic_id = 0
    bTopic_id = 0
    w_dic = {}
    nTopic_dic = {}
    bTopic_dic = {}
    w_nTopic = {}
    w_bTopic = {}
    for row in reader:
        # csv.reader yields an empty list for a blank line; there is no
        # keyword to index in that case.
        if not row:
            continue
        # Pad short rows so the fixed column positions below always exist.
        row = list(row) + [''] * (7 - len(row))
        keyword = row[0]
        if keyword not in w_dic:
            w_id += 1
            w_dic[keyword] = w_id
            w_nTopic[w_id] = []
            w_bTopic[w_id] = []
        # Look the id up instead of reusing the counter: when a keyword is
        # repeated on a later row the counter still points at the most
        # recently *added* keyword, which would receive this row's topics.
        keyword_id = w_dic[keyword]
        for nTopic in row[1:4]:
            if nTopic:
                if nTopic not in nTopic_dic:
                    nTopic_id += 1
                    nTopic_dic[nTopic] = nTopic_id
                # Repeated rows must not list the same topic twice.
                if nTopic_dic[nTopic] not in w_nTopic[keyword_id]:
                    w_nTopic[keyword_id].append(nTopic_dic[nTopic])
        for bTopic in row[4:7]:
            if bTopic:
                if bTopic not in bTopic_dic:
                    bTopic_id += 1
                    bTopic_dic[bTopic] = bTopic_id
                if bTopic_dic[bTopic] not in w_bTopic[keyword_id]:
                    w_bTopic[keyword_id].append(bTopic_dic[bTopic])

In [3]:
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ|VBG|VBD|VBN|RB>* <NN.*>+ <IN>)? <JJ|VBG|VBD|VBN|RB>* <NN.*>+}'):
    """Return the lower-cased candidate phrases that `grammar` chunks out of `text`.

    The text is split into sentences, tokenized and POS-tagged with NLTK, then
    chunked with an ``nltk`` ``RegexpParser`` built from `grammar`. Runs of
    consecutive tokens whose chunk tag is not ``'O'`` are joined with single
    spaces into one phrase; because only "inside a chunk or not" is compared,
    two chunks with no ``'O'`` token between them end up in the same phrase.

    Parameters
    ----------
    text : str
        Raw text to analyse.
    grammar : str
        Chunk grammar handed to ``nltk.chunk.regexp.RegexpParser``.

    Returns
    -------
    list of str
        Candidate phrases in order of appearance (duplicates kept), excluding
        phrases that are an English stop word or consist only of punctuation.
    """
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    # (index access instead of a tuple-unpacking lambda parameter, which is
    # a syntax error on Python 3; each item is a (word, pos, chunk) triple)
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda tagged: tagged[2] != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]

In [29]:
#check if extracted chunk should be counted towards key_chunk
def measure_similarity(key_chunk, cur_chunk):
    """Return True when every stemmed word of `key_chunk` occurs in `cur_chunk`.

    Both phrases are tokenized with ``nltk.word_tokenize``; each token is
    lower-cased and then Porter-stemmed. Word order and repetition are
    ignored: the test is whether the set of key stems is contained in the set
    of stems of `cur_chunk`.

    Parameters
    ----------
    key_chunk : str
        Reference keyword phrase.
    cur_chunk : str
        Phrase extracted from a document.

    Returns
    -------
    bool
        True if all stems of `key_chunk` are present in `cur_chunk`. A
        `key_chunk` that yields no tokens matches every `cur_chunk`.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having already put `nltk` / `word_tokenize` into the global namespace.
    import nltk

    stemmer = nltk.stem.porter.PorterStemmer()

    def _stems(chunk):
        # Lower-case before stemming so the result does not depend on the
        # capitalisation of the input; `stem` is used rather than `stem_word`
        # because it is the stemmer's public entry point.
        return [stemmer.stem(word.lower()) for word in nltk.word_tokenize(chunk)]

    # Build the lookup set once instead of once per key word.
    cur_stems = set(_stems(cur_chunk))
    return all(key_part in cur_stems for key_part in _stems(key_chunk))

In [36]:
import nltk
import sys
from nltk import word_tokenize
years = range(2002, 2016)
year_file_name = '{}_with_id.csv'
year_record_name = '{}_record.txt'
# One entry per year, appended in the order of `years`. Each entry is a dict
# keyed by the paper's 'id' column.
labels14 = []
w_labels14 = []
nTopic_labels14 = []
bTopic_labels14 = []

for year in years:
    file_name = year_file_name.format(year)

    texts = []
    ind_id_dic = {}       # position in `texts` -> paper id
    labels = {}           # paper id -> [keyword_scores, nTopic_scores, bTopic_scores]
    w_labels = {}         # paper id -> keyword_scores
    nTopic_labels = {}    # paper id -> nTopic_scores
    bTopic_labels = {}    # paper id -> bTopic_scores
    whole_record = []     # per paper: list of (extracted chunk, matched keyword)
    with open(file_name) as paper:
        reader = csv.DictReader(paper)
        for index, row in enumerate(reader):
            # Title and abstract are analysed together as one text.
            texts.append(". ".join([row['title'], row['abstract']]))
            ind_id_dic[index] = row['id']

    for i, text in enumerate(texts):
        paper_id = ind_id_dic[i]
        nTopic_scores = {}    # nTopic id  -> number of chunks matching one of its keywords
        bTopic_scores = {}    # bTopic id  -> number of chunks matching one of its keywords
        keyword_scores = {}   # keyword id -> number of chunks matching that keyword
        records = []
        try:
            extracted_chunks = extract_candidate_chunks(text)
        except UnicodeDecodeError:
            # Retry with every non-ASCII byte dropped from the text.
            text = text.decode('ascii', 'ignore').encode('ascii')
            extracted_chunks = extract_candidate_chunks(text)
        for extracted_chunk in extracted_chunks:
            # Topics are gathered in sets so that a chunk matching several
            # keywords of one topic still counts once for that topic.
            nTopic_set = set()
            bTopic_set = set()
            for phrase in w_dic.keys():
                matched_phrase = phrase
                try:
                    same = measure_similarity(phrase, extracted_chunk)
                except UnicodeDecodeError:
                    extracted_chunk = extracted_chunk.decode('ascii', 'ignore').encode('ascii')
                    # Compare with an ASCII-only copy, but keep `phrase`
                    # itself intact: it is the key into `w_dic`, and the
                    # stripped text is not guaranteed to be one.
                    matched_phrase = phrase.decode('ascii', 'ignore').encode('ascii')
                    same = measure_similarity(matched_phrase, extracted_chunk)
                if same:
                    records.append((extracted_chunk, matched_phrase))
                    w_id = w_dic[phrase]
                    keyword_scores[w_id] = keyword_scores.get(w_id, 0) + 1
                    nTopic_set.update(w_nTopic[w_id])
                    bTopic_set.update(w_bTopic[w_id])
            for nTopic in nTopic_set:
                nTopic_scores[nTopic] = nTopic_scores.get(nTopic, 0) + 1
            for bTopic in bTopic_set:
                bTopic_scores[bTopic] = bTopic_scores.get(bTopic, 0) + 1
        labels[paper_id] = [keyword_scores, nTopic_scores, bTopic_scores]
        w_labels[paper_id] = keyword_scores
        nTopic_labels[paper_id] = nTopic_scores
        bTopic_labels[paper_id] = bTopic_scores
        whole_record.append(records)

    # Write the match log once per year, after all papers are processed,
    # rather than rewriting the whole file after every paper. The file is
    # only created when the year had at least one paper.
    if whole_record:
        record_name = year_record_name.format(year)
        with open(record_name, 'w') as outfile:
            writer = csv.writer(outfile)
            # A row holding the 1-based paper position, then one row per match.
            for record_index, records in enumerate(whole_record, 1):
                writer.writerow([record_index])
                for record in records:
                    writer.writerow([record])

    labels14.append(labels)
    w_labels14.append(w_labels)
    nTopic_labels14.append(nTopic_labels)
    bTopic_labels14.append(bTopic_labels)

In [32]:
# print w_dic.keys()

In [33]:
# nt_ids = []
# for paper_id, paper_ntopic in nTopic_labels.items():
#     for ntopic_id, freq in paper_ntopic.items():
#         if ntopic_id not in nt_ids:
#             nt_ids.append(ntopic_id)

In [34]:
# max(nt_ids)

In [35]:
# nTopic_labels

In [38]:
import pandas as pd
num_nTopic = 50
# Own template name: reusing `year_file_name` would overwrite the input-file
# template that the labelling cell reads when it is re-run.
nTopic_file_name = '{}_nTopic_Freq.xlsx'
# Column 0 holds the paper id, column k (1..num_nTopic) the frequency of nTopic id k.
nTopic_columns = list(range(num_nTopic + 1))
# Pair every per-year dict with its year instead of counting up from a
# hard-coded start, so file names follow `years`.
for year, year_nTopic_labels in zip(years, nTopic_labels14):
    list_for_df = []
    for paper_id, nTopic_label_dic in year_nTopic_labels.items():
        ini_nTopic_label_list = [0] * (num_nTopic + 1)
        ini_nTopic_label_list[0] = paper_id
        # Raises IndexError if a topic id exceeds num_nTopic.
        for nTopic, freq in nTopic_label_dic.items():
            ini_nTopic_label_list[nTopic] = freq
        list_for_df.append(ini_nTopic_label_list)
    # Explicit column labels keep column 0 present (and sortable) even when
    # a year has no papers.
    table = pd.DataFrame(list_for_df, columns=nTopic_columns)
    table = table.sort_values(by=[0], ascending = True)
    # Passing the path lets pandas open, save and close the workbook itself.
    table.to_excel(nTopic_file_name.format(year))

In [16]:
# Collect the distinct topic ids that occur in `bTopic_labels`, in the order
# they are first met.
# NOTE(review): `bTopic_labels` is whatever an earlier cell last bound to that
# name -- confirm this is the intended input.
bt_ids = []
for paper_btopic in bTopic_labels.values():
    for btopic_id in paper_btopic:
        if btopic_id in bt_ids:
            continue
        bt_ids.append(btopic_id)

In [21]:
# nt_ids

In [39]:
import pandas as pd
num_bTopic = 10
# Own template name: reusing `year_file_name` would overwrite the input-file
# template that the labelling cell reads when it is re-run.
bTopic_file_name = '{}_bTopic_Freq.xlsx'
# Column 0 holds the paper id, column k (1..num_bTopic) the frequency of bTopic id k.
bTopic_columns = list(range(num_bTopic + 1))
# Pair every per-year dict with its year instead of counting up from a
# hard-coded start, so file names follow `years`.
for year, year_bTopic_labels in zip(years, bTopic_labels14):
    list_for_df = []
    for paper_id, bTopic_label_dic in year_bTopic_labels.items():
        ini_bTopic_label_list = [0] * (num_bTopic + 1)
        ini_bTopic_label_list[0] = paper_id
        # Raises IndexError if a topic id exceeds num_bTopic.
        for bTopic, freq in bTopic_label_dic.items():
            ini_bTopic_label_list[bTopic] = freq
        list_for_df.append(ini_bTopic_label_list)
    # Explicit column labels keep column 0 present (and sortable) even when
    # a year has no papers.
    table = pd.DataFrame(list_for_df, columns=bTopic_columns)
    table = table.sort_values(by=[0], ascending = True)
    # Passing the path lets pandas open, save and close the workbook itself.
    table.to_excel(bTopic_file_name.format(year))

In [42]:
# Write the nTopic id -> name table: one row per id slot, each row holding the
# (id, name) pair for that slot, or an empty string when the slot is unused.
with open('50_nTopics.txt', 'w') as outfile:
    writer = csv.writer(outfile)
    nTopic_header = [""] * num_nTopic
    for topic_name in nTopic_dic:
        topic_id = nTopic_dic[topic_name]
        # ids start at 1, list positions at 0
        nTopic_header[topic_id - 1] = (topic_id, topic_name)
    writer.writerows([entry] for entry in nTopic_header)

In [41]:
# Write the broad-topic id -> name table to a text file: one row per id slot,
# each row holding the (id, name) pair for that slot, or an empty string when
# the slot is unused.
with open('10_bTopics.txt', 'w') as outfile:
    writer = csv.writer(outfile)
    bTopic_header = [""] * num_bTopic
    for topic_name in bTopic_dic:
        topic_id = bTopic_dic[topic_name]
        # ids start at 1, list positions at 0
        bTopic_header[topic_id - 1] = (topic_id, topic_name)
    writer.writerows([entry] for entry in bTopic_header)