In [1]:
import pandas as pd
import numpy as np
import nltk
# nltk.download()
import xml.etree.ElementTree as ET
from nltk.tokenize import WhitespaceTokenizer
import json
import glob
import re
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
from nltk.tokenize import TweetTokenizer

In [2]:
df = pd.read_csv("./data/Lista codificare Topics in baza de date Reuters.txt", sep="\t")
df.head()

Unnamed: 0,Topic Description
1POL,CURRENT NEWS - POLITICS
2ECO,CURRENT NEWS - ECONOMICS
3SPO,CURRENT NEWS - SPORT
4GEN,CURRENT NEWS - GENERAL
6INS,CURRENT NEWS - INSURANCE


In [3]:
Tokenizer=TweetTokenizer()
punctuation_tokenizer = nltk.RegexpTokenizer(r"\w+")

def save_json_data(filename, data):
    with open(filename, 'w',) as f:
        json.dump(data, f, indent=4)

def process_sentece(data):
    #make all text lowerCase
    data = data.lower()
    data = Tokenizer.tokenize(data)
    stop = stopwords.words('english')
    #remove stopwords
    data = [item for item in data if item not in stop]
    #do the lemmatization
    
    #do the stemming
    data = [stemmer.stem(y) for y in data]
    #remove punctuation:
    data = punctuation_tokenizer.tokenize(' '.join(data))
    
    return data

def get_text_from_title(file):
    tree = ET.parse(file)
    return tree.find("title").text

def count_words_in_text(found_text):
     wordfreq = {}
     for word in process_sentece(found_text):
            if word not in wordfreq:
                wordfreq[word] = 0 
            wordfreq[word] += 1
     return wordfreq

def process_title(filename):
    wordfreq = {}
    extracted_text = get_text_from_title(filename)
    wordfreq = count_words_in_text(extracted_text)
    return {"word_freq": wordfreq, "extracted_text": extracted_text}

def topic_to_description(topic):
    return df.loc[topic].values

def topic_extractor(file):
    topics = []
    tree = ET.parse(file)
    for element in tree.find("//codes[@class='bip:topics:1.0']"):
        topics.append(element.attrib['code'])
    return topics

def process_body(file,tag):
    data = {}
    tree = ET.parse(file)
    root = tree.getroot()
    for child in root.findall(tag):
        found_text = ""
        for element in child:
            found_text += element.text + "\n "
            # print(element.text)
        #count words after doing the sentence tokenizing    
        data['word_freq'] = count_words_in_text(found_text)
        data['extracted_text'] = found_text
        return data

In [4]:
def process_file (path, name):
    file_data = {}
    file_data['text'] = process_body(path,"text")
    file_data['total_word_count'] = len(process_body(path,"text")['extracted_text'])
    file_data['title'] = process_title(path)
    file_data['topics'] = {}
    topics = topic_extractor(path)
    for item in topics:
        # add topic and its meaning
        topic = topic_to_description(item)[0]
        file_data['topics'][item] = topic
    
    #save data
    save_json_data(f"./data/processed/34/{name}.json",file_data)

In [5]:
#get an array of file path
files = glob.glob("./data/Reuters_34/Training/*")
#process each file
for path in files:
    name = path.split("\\")[1][:-4]
    process_file(path,name)



In [6]:
def merge_dict(global_text_data,local_text_data):
    for key in local_text_data.keys():
        if key in global_text_data.keys():
            global_text_data[key]+= local_text_data[key]
        else:
            global_text_data[key]= local_text_data[key]
    return global_text_data

def merge_dict_text(global_text_data,local_text_data):
    for key in local_text_data.keys():
        if key not in global_text_data.keys():
            global_text_data[key]= local_text_data[key]
      
    return global_text_data
#COMPILE GLOBAL DICTIONARY
def mk_global_dict():
    files = glob.glob("./data/processed/34/*")
    print(files)
    global_json = None
    for file in files:
        f = open(file)
        data = json.load(f)
        # print(data)
        if global_json is None:
            global_json = data
            # print(global_json)
        else:
            #add the word freq from text 
            global_text_data = global_json['text']['word_freq']
            local_text_data = data['text']['word_freq']
            global_text_data = merge_dict(global_text_data,local_text_data)
            
            #add title word counts
            global_title_data = global_json['title']['word_freq']
            local_title_data = data['title']['word_freq']
            global_title_data = merge_dict(global_title_data,local_title_data)
            #add topics
            global_topics_data = global_json['topics']
            local_topics_data = data['topics']
            global_topics_data = merge_dict_text(global_topics_data,local_topics_data)
            #update in ditionary
            global_json['text']['word_freq'] = global_text_data
            global_json['title']['word_freq'] = global_title_data
            global_json['topics'] = global_topics_data
    count = 0
    for key in global_json['text']['word_freq'].keys():
        count += global_json['text']['word_freq'][key]
    global_json['unique_words'] = len(global_json['text']['word_freq'].keys())
    global_json['unique_words_count'] = count
    global_json['text']['word_freq'] = merge_dict(global_json['text']['word_freq'],global_json['title']['word_freq'])
    del global_json['title']
    del global_json['total_word_count']
    del global_json['text']['extracted_text']
    save_json_data("./data/processed/34/global.json",global_json)
        
mk_global_dict()             
#TO DO ADD TITLE TO STEMMING

['./data/processed/34\\2504NEWS.json', './data/processed/34\\2538NEWS.json', './data/processed/34\\2775NEWS.json', './data/processed/34\\2792NEWS.json', './data/processed/34\\2822NEWS.json', './data/processed/34\\2836NEWS.json', './data/processed/34\\2848NEWS.json', './data/processed/34\\2917NEWS.json', './data/processed/34\\2955NEWS.json', './data/processed/34\\2978NEWS.json', './data/processed/34\\2982NEWS.json', './data/processed/34\\2984NEWS.json', './data/processed/34\\2988NEWS.json', './data/processed/34\\3665NEWS.json', './data/processed/34\\3785NEWS.json', './data/processed/34\\3813NEWS.json', './data/processed/34\\3902NEWS.json', './data/processed/34\\4206NEWS.json', './data/processed/34\\4263NEWS.json', './data/processed/34\\4289NEWS.json', './data/processed/34\\4294NEWS.json', './data/processed/34\\5104NEWS.json', './data/processed/34\\5216NEWS.json', './data/processed/34\\5220NEWS.json', './data/processed/34\\5229NEWS.json', './data/processed/34\\5520NEWS.json', './data/pro