In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus.reader import PlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.stem import rslp
from six import string_types
import numpy as np
import os
import string
import nltk


In [None]:
# Treat every file under the corpus directory as one plain-text document.
my_corpus = PlaintextCorpusReader('data/Raw/quinhentasCorpus/',r'.*')
#my_corpus.fileids() # <- I expect values from column ID
#my_corpus.words() # <- I expect values from column TITLE and BODY
#my_corpus.sents() # <- I expect values from column TITLE and BODY

# Materialise the stop-word list once as a set: calling stopwords.words()
# inside the comprehension re-evaluated it for every word in the corpus.
portuguese_stopwords = set(stopwords.words('portuguese'))

# Lower-cased alphabetic vocabulary without stop words. The stop-word test is
# applied to the lower-cased form (the same form that is stored), so a
# capitalised stop word at the start of a sentence is filtered out as well.
raw_dict = {
    word.lower()
    for word in my_corpus.words()
    if word.isalpha() and word.lower() not in portuguese_stopwords
}
print(raw_dict)

In [22]:
# Reduce the raw vocabulary to the set of its RSLP stems.
stemmer = rslp.RSLPStemmer()
stemmed_dict = set(map(stemmer.stem, raw_dict))

In [26]:
def tokenize(text):
    """Split ``text`` into tokens and return the RSLP stem of each one.

    Parameters
    ----------
    text : str
        Raw text to tokenize with ``nltk.word_tokenize``.

    Returns
    -------
    list of str
        One stem per token, in the order the tokens appear in ``text``.
    """
    # Build the stemmer once per call; constructing a new RSLPStemmer for
    # every token repeated its initialisation without changing the result.
    stemmer = rslp.RSLPStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]

In [35]:
# Maps each file's path (relative to `path`) to its lower-cased,
# punctuation-free text.
token_dict={}
path = 'data/Raw/quinhentasCorpus/'
# Build the translation table once instead of once per file.
punctuation_table = str.maketrans('', '', string.punctuation)

for dirpath, dirs, files in os.walk(path):
    # Sorting makes the traversal order (and therefore the row order of `tfs`)
    # the same on every run and every file system.
    dirs.sort()
    for f in sorted(files):
        fname = os.path.join(dirpath, f)
        # Explicit encoding: the platform default is not reliable for
        # Portuguese text. NOTE(review): utf-8 is assumed because it is the
        # default encoding of PlaintextCorpusReader, which reads the same
        # files - confirm against the corpus.
        with open(fname, encoding='utf-8') as pearl:
            text = pearl.read()
        # Key by relative path rather than bare file name, so files with the
        # same name in different sub-directories do not overwrite each other.
        # For a flat directory the key is still just the file name.
        token_dict[os.path.relpath(fname, path)] = text.lower().translate(punctuation_table)

# TfidfVectorizer removes stop words *after* calling the tokenizer, and
# `tokenize` returns stems. The stop words therefore have to go through the
# same stemming, otherwise any stop word whose stem differs from its surface
# form is never filtered.
stemmed_stop_words = sorted(set(tokenize(' '.join(stopwords.words('portuguese')))))

tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stemmed_stop_words)
tfs = tfidf.fit_transform(token_dict.values())

In [52]:
# Collect every distinct stem produced by `tokenize` over the whole corpus.
vocabulary = set()
for file_id in my_corpus.fileids():
    vocabulary.update(tokenize(my_corpus.raw(file_id)))

# Sort before indexing: iterating a set of strings yields a different order on
# each kernel start (string hashing is randomised per process), which made the
# stem -> index assignment change from run to run.
vocabulary = sorted(vocabulary)
word_index = {w: idx for idx, w in enumerate(vocabulary)}

VOCABULARY_SIZE = len(vocabulary)
DOCUMENTS_COUNT = len(my_corpus.fileids())

In [54]:
# Pass 1: for every stem, count the number of documents that contain it.
# `word_idf` temporarily holds these document frequencies.
word_idf = np.zeros(VOCABULARY_SIZE)
for fid in my_corpus.fileids():
    # A set ensures each stem is counted at most once per document.
    doc_stems = set(tokenize(my_corpus.raw(fid)))
    word_idf[[word_index[stem] for stem in doc_stems]] += 1.0

# Pass 2: convert the counts to idf = log(N / (1 + df)).
smoothed_df = (1 + word_idf).astype(float)
word_idf = np.log(DOCUMENTS_COUNT / smoothed_df)
print(word_idf[word_index['vac']])

1.3823023398503531


In [62]:
def word_tf(word, document):
    """Return the term frequency of ``word`` in ``document``.

    Parameters
    ----------
    word : str
        Token to count. It is compared verbatim against the document tokens.
    document : str or list of str
        Either raw text, which is passed through ``tokenize`` first, or an
        already tokenized document.

    Returns
    -------
    float
        Occurrences of ``word`` divided by the number of tokens in the
        document; ``0.0`` when the document has no tokens.
    """
    # The notebook is Python 3 only (it uses str.maketrans), where
    # six.string_types is just (str,).
    if isinstance(document, str):
        document = tokenize(document)

    # An empty document used to raise ZeroDivisionError.
    if not document:
        return 0.0

    return float(document.count(word))/len(document)


In [60]:
def tf_idf(word, document):
    """Return the tf-idf weight of ``word`` in ``document``.

    Parameters
    ----------
    word : str
        Term to score. It is looked up verbatim in ``word_index``, whose keys
        are the stems produced by ``tokenize``; pass the stemmed form.
    document : str or list of str
        Either raw text, which is passed through ``tokenize`` first, or an
        already tokenized document.

    Returns
    -------
    float
        ``word_tf(word, document) * word_idf[word_index[word]]``, or ``0.0``
        when ``word`` is not in ``word_index`` or the document has no tokens.
    """
    # Cheap vocabulary check first, so an unknown word does not pay for
    # tokenizing and stemming the whole document.
    if word not in word_index:
        return .0

    # The notebook is Python 3 only (it uses str.maketrans), where
    # six.string_types is just (str,).
    if isinstance(document, str):
        document = tokenize(document)

    # Guard here as well, so an empty document never reaches the division in
    # word_tf.
    if not document:
        return .0

    return word_tf(word, document)*word_idf[word_index[word]]

In [95]:
# Score the term 'bezerros' against the second file of the corpus.
# NOTE(review): word_index is keyed by the stems that tokenize() produces, so
# an unstemmed query such as 'bezerros' is presumably absent from it and
# tf_idf returns 0.0 regardless of the document - confirm whether the stemmed
# form was intended here.
tf_idf('bezerros',my_corpus.raw(my_corpus.fileids()[1]))

0.0

In [97]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

# Stem-tokenize each corpus file, in fileids() order.
documents = list(map(tokenize, map(my_corpus.raw, my_corpus.fileids())))

# gensim dictionary: assigns an integer id to every distinct stem.
dictionary = Dictionary(documents)

# Fit the tf-idf model on the bag-of-words form of every document.
tfidf_model = TfidfModel(
    list(map(dictionary.doc2bow, documents)),
    id2word = dictionary,
)
# Unfinished experiment, kept as written: `my_corpus.raw` is passed without
# being called, so the line would need a file id before it could run.
#tfidf_values = dict(tfidf_model[dictionary.doc2bow(tokenize(my_corpus.raw))])