
Merge branch 'removetfidf' into develop

2 parents c0e2b73 + 19a4dee commit 48be6e1948a6d79ee74f238ae99c1c013b9a46e5 @piskvorky committed Mar 16, 2011
Showing with 1 addition and 237 deletions.
  1. +0 −1 src/gensim/parsing/__init__.py
  2. +0 −175 src/gensim/parsing/tfidf.py
  3. +1 −61 src/gensim/test/test_parsing.py
@@ -4,5 +4,4 @@
# bring model classes directly into package namespace, to save some typing
from porter import PorterStemmer
-from tfidf import tfidf
from preprocessing import *
@@ -1,175 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from math import log
-
-import numpy as np
-import scipy.sparse as sp
-
-def tokenize(text):
- return text.split()
-
-def tc(dataset, tokenizer=tokenize):
- """ term counts. this function creates the vocab (type to id dict)
- And also docs, this is the basic global freq dict.
- Note that this is not a bag of words (bow) format,
- here the bag contains the actual word, not the id.
- The ids are calculated a posteriori.
-
- input: a dataset (array of texts)
-
- If we wanted compatibility with gensim, this function
- could easyly output the bow..
- """
- vocab = {}
- docs = []
-
- for doc in dataset:
- d = {} # token => count
-
- for term in tokenizer(doc):
- vocab[term] = 1
- d[term] = d.get(term, 0) + 1
-
- docs.append(d)
-
- sorted_terms = sorted(vocab.keys())
- vocab = dict([(t, i) for i, t in enumerate(sorted_terms)])
-
- return docs, vocab
-
-
-
-def bow_from_tc(term_counts, vocab):
- """
- returns: a bow (bag of words). A bow is list of a tuples (corpusid, localfq)
- """
- bow = []
- for word, count in term_counts.iteritems():
- bow.append(( vocab[word], count))
- return bow
-
-def tf_from_tc(term_counts):
- """ Normalized local FQ (dividing by sum, not vector norms)
- """
- docs = []
-
- for doc in term_counts:
- d = {}
- length = sum(doc.values())
- for term, count in doc.items():
- d[term] = float(count) / length
- docs.append(d)
-
- return docs
-
-
-def idc_from_tc(term_counts):
- """ contextual diversity (n of docs the term appears in, in the corpus
- """
- t = {}
- for doc in term_counts:
- for term in doc:
- t[term] = t.get(term, 0) + 1
- return t
-
-def idf_from_tc(term_counts):
- """ inverse document frequency (related to contextual diversity)
- denominator of the tfidf formula
- """
- n_docs = len(term_counts)
- idf = {}
- idc = idc_from_tc(term_counts)
- for term in idc:
- idf[term] = log(n_docs * 1.0 / idc[term], 10)  # NOTE: changed to log(x, 10)
- return idf
-
-def tf_mul_idf(tf, idf):
- docs = []
-
- for doc in tf:
- d = {}
- for term in doc:
- d[term] = doc[term] * idf[term]
- docs.append(d)
-
- return docs
-
-def to_vector(idf_dict, vocab):
- ret = np.zeros(len(idf_dict))
- for term, idx in vocab.items():
- ret[idx] = idf_dict[term]
- return ret
-
-def to_sparse_matrix(tfidf_dict, vocab):
- tfm = sp.lil_matrix((len(vocab), len(tfidf_dict)), dtype=np.double)
-
- for j, doc in enumerate(tfidf_dict):
- for term in doc:
- try:
- i = vocab[term]
- tfm[i,j] = doc[term]
- except KeyError:
- pass
-
- return tfm
-
-def inverse_vocab(vocab):
- """
- Converts a vocab dictionary term => index to index => term
- """
- return dict((i,t) for t,i in vocab.items())
-
-def vocab_array(vocab):
- """
- Converts vocab dictionary to vocab array
- """
- return np.char.array(sorted(vocab.keys(), lambda a,b:
- cmp(vocab[a],vocab[b])))
-
-def vocab_dict(vocab):
- """
- Converts vocab array to vocab dictionary
- """
- return dict((term,i) for i,term in enumerate(vocab))
-
-def replace_vocab(td, oldvocab, newvocab):
- """
- td: V x X term-document matrix
- oldvocab: dictionary
- newvocab: dictionary
- """
- newtd = np.zeros((len(newvocab),td.shape[1]))
- for term in newvocab:
- try:
- newtd[newvocab[term]] = td[oldvocab[term]]
- except KeyError:
- newtd[newvocab[term]] = 0
- return newtd
-
-class tfidf(object):
- """
- dataset is an list of strings
- """
- def __init__(self, dataset, tokenizer=tokenize):
- self._dataset = dataset
- self._tokenizer = tokenizer
-
- def as_dict(self):
- term_counts, vocab = tc(self._dataset, self._tokenizer)
- tf = term_counts # NOTE: was tf_from_tc(term_counts), but tf_from_tc
- # normalizes, and that does not replicate the miislita counts
- idf = idf_from_tc(term_counts)
- return tf_mul_idf(tf, idf), vocab
-
- def as_sparse_matrix(self):
- tfidf_dict, vocab = self.as_dict()
- return to_sparse_matrix(tfidf_dict, vocab), vocab
-
- def as_array(self):
- tfm, vocab = self.as_sparse_matrix()
- return tfm.toarray(), vocab
-
- def get_idf(self):
- term_counts, vocab = tc(self._dataset, self._tokenizer)
- idf = idf_from_tc(term_counts)
- return idf, vocab
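
With the standalone module gone, the same pipeline is covered by gensim's own corpora and models packages. Below is a minimal sketch of the replacement path, assuming the public Dictionary and TfidfModel classes; the toy corpus is hypothetical, and note that TfidfModel's default idf weighting and normalization differ from the removed module's unnormalized, log10-based variant:

    from gensim.corpora import Dictionary
    from gensim.models import TfidfModel

    # Hypothetical toy corpus: a list of tokenized documents.
    docs = [["la", "maison", "bleue"], ["la", "nuit", "la", "la"]]

    dictionary = Dictionary(docs)                     # term => id mapping (the "vocab")
    bows = [dictionary.doc2bow(doc) for doc in docs]  # (id, count) tuples per document
    model = TfidfModel(bows)                          # fits idf from document counts
    vectors = [model[bow] for bow in bows]            # (id, tf-idf weight) per document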
@@ -9,12 +9,11 @@
import unittest
import numpy as np
-from gensim.parsing.tfidf import *
from gensim.parsing.preprocessing import *
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
- level=logging.WARNING)
+ level=logging.WARNING)
# several documents
@@ -48,65 +47,6 @@
classes = np.array([[1, 0], [1, 0], [0, 1], [0, 1]])
-class TestTfidf(unittest.TestCase):
-
- def testTokenize(self):
- self.assertEquals(tokenize("salut les amis"),
- ["salut", "les", "amis"])
-
- self.assertEquals(tokenize("salut les amis "),
- ["salut", "les", "amis"])
-
- self.assertEquals(tokenize("Salut LES amis !"),
- ["Salut", "LES", "amis", "!"])
-
- def testTermCounts(self):
- term_counts, vocab = tc(dataset)
- self.assertEquals(len(term_counts), len(dataset))
- for i in range(len(dataset)):
- # len of the documents should be equal to the sum of word counts
- self.assertEquals(len(tokenize(dataset[i])),
- sum(term_counts[i].values()))
-
- self.assertEquals(term_counts[0]["la"], 1)
- self.assertEquals(term_counts[1]["la"], 3)
- self.assertRaises(KeyError, term_counts[2].__getitem__, "la")
- self.assertEquals(term_counts[3]["la"], 1)
-
- def testTermFrequencies(self):
- term_counts, vocab = tc(dataset)
- term_frequencies = tf_from_tc(term_counts)
- for doc in term_frequencies:
- self.assertAlmostEquals(sum(doc.values()), 1.0)
-
- self.assertTrue(term_frequencies[0]["la"] > 0)
- self.assertTrue(term_frequencies[1]["la"] > 0)
- self.assertRaises(KeyError, term_frequencies[2].__getitem__, "la")
- self.assertTrue(term_frequencies[3]["la"] > 0)
-
- def testInvertDocumentCounts(self):
- term_counts, vocab = tc(dataset)
- inv_doc_counts = idc_from_tc(term_counts)
- self.assertEquals(len(vocab), len(inv_doc_counts))
- self.assertEquals(inv_doc_counts["la"], 3)
-
- def testInvertDocumentFrequencies(self):
- term_counts, vocab = tc(dataset)
- inv_doc_freq = idf_from_tc(term_counts)
- self.assertEquals(len(vocab), len(inv_doc_freq))
- self.assertTrue(inv_doc_freq["la"] > 0)
- to_vector(inv_doc_freq, vocab)
-
- def testTFIDFDict(self):
- td, v = tfidf(dataset).as_dict()
- self.assertTrue(td[0]["la"] > 0)
- self.assertTrue(td[1]["la"] > 0)
- self.assertRaises(KeyError, td[2].__getitem__, "la")
- self.assertTrue(td[3]["la"] > 0)
-
- def testTFIDFArray(self):
- td, v = tfidf(dataset).as_array()
-
class TestPreprocessing(unittest.TestCase):
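
As a quick sanity check on the removed idf formula: per the deleted tests, "la" occurs in three of the four test documents (idc["la"] == 3), so idf_from_tc yields log10(4/3) ≈ 0.1249, positive as testInvertDocumentFrequencies asserted. A minimal reproduction of that arithmetic, mirroring the removed code:

    from math import log

    n_docs, idc_la = 4, 3                    # four documents; "la" appears in three
    idf_la = log(n_docs * 1.0 / idc_la, 10)  # same log-base-10 idf as idf_from_tc
    print(idf_la)                            # ~0.1249 (> 0, as the removed test asserted)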
