/
tfidf.py
175 lines (139 loc) · 4.39 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# -*- coding: utf-8 -*-
from math import log
import numpy as np
import scipy.sparse as sp
def tokenize(text):
    """Default tokenizer: split *text* on runs of whitespace."""
    return text.split()
def tc(dataset, tokenizer=tokenize):
    """Term counts for a dataset (a list of text strings).

    Builds the vocab (type -> id dict) and ``docs``, a per-document
    raw frequency dict.  Note this is not bag-of-words (bow) format:
    each per-document dict is keyed by the actual word, not its id —
    ids are assigned a posteriori from the sorted type set.

    If gensim compatibility were wanted, this function could easily
    emit the bow instead.

    returns: (docs, vocab)
    """
    types = set()
    per_doc = []
    for text in dataset:
        counts = {}  # token => raw count within this document
        for token in tokenizer(text):
            types.add(token)
            counts[token] = counts.get(token, 0) + 1
        per_doc.append(counts)
    # ids are assigned in lexicographic order of the full type set
    vocab = {term: idx for idx, term in enumerate(sorted(types))}
    return per_doc, vocab
def bow_from_tc(term_counts, vocab):
    """Convert one document's term-count dict to bag-of-words format.

    term_counts: dict mapping word -> local frequency
    vocab: dict mapping word -> corpus id
    returns: a bow (bag of words), i.e. a list of (corpusid, localfq) tuples.
    """
    # .items() instead of the Python-2-only .iteritems(): the rest of the
    # module (tf_from_tc, inverse_vocab, to_vector) already uses .items(),
    # and .iteritems() is an AttributeError on Python 3 dicts.
    return [(vocab[word], count) for word, count in term_counts.items()]
def tf_from_tc(term_counts):
    """Normalized local frequency for each document.

    Normalizes by the sum of counts (document length), not by a
    vector norm.

    term_counts: list of per-document word -> count dicts
    returns: list of per-document word -> relative-frequency dicts
    """
    normalized = []
    for counts in term_counts:
        total = sum(counts.values())
        normalized.append({term: float(n) / total for term, n in counts.items()})
    return normalized
def idc_from_tc(term_counts):
    """Contextual diversity: number of corpus documents each term appears in.

    term_counts: list of per-document word -> count dicts
    returns: dict mapping term -> document frequency
    """
    diversity = {}
    for doc in term_counts:
        # presence in the doc is what matters, not the local count
        for term in doc:
            if term in diversity:
                diversity[term] += 1
            else:
                diversity[term] = 1
    return diversity
def idf_from_tc(term_counts):
    """Inverse document frequency (related to contextual diversity).

    This is the denominator side of the tf-idf formula:
    idf(t) = log10(n_docs / df(t)).

    term_counts: list of per-document word -> count dicts
    returns: dict mapping term -> idf weight
    """
    n_docs = len(term_counts)
    # document frequency per term (contextual diversity, inlined)
    df = {}
    for doc in term_counts:
        for term in doc:
            df[term] = df.get(term, 0) + 1
    # NOTE: base-10 logarithm (log(x, 10)), as in the original formulation
    return {term: log(n_docs * 1.0 / count, 10) for term, count in df.items()}
def tf_mul_idf(tf, idf):
    """Multiply each document's local tf weight by the global idf weight.

    tf: list of per-document term -> tf dicts
    idf: dict mapping term -> idf weight
    returns: list of per-document term -> tf*idf dicts
    """
    return [
        {term: weight * idf[term] for term, weight in doc.items()}
        for doc in tf
    ]
def to_vector(idf_dict, vocab):
    """Densify a term -> weight dict into a vector indexed by vocab ids.

    idf_dict: dict mapping term -> weight (e.g. from idf_from_tc)
    vocab: dict mapping term -> index
    returns: 1-D numpy array of length len(vocab), ret[vocab[t]] = idf_dict[t]
    raises: KeyError if a vocab term is missing from idf_dict
    """
    # Size by the vocab, not by idf_dict: the vector is indexed by vocab ids.
    # The old len(idf_dict) sizing raised IndexError when vocab had more
    # entries than idf_dict, and produced an over-long zero-padded vector
    # when idf_dict had extra terms.
    ret = np.zeros(len(vocab))
    for term, idx in vocab.items():
        ret[idx] = idf_dict[term]
    return ret
def to_sparse_matrix(tfidf_dict, vocab):
    """Build a V x D sparse (lil) term-document matrix.

    tfidf_dict: list of per-document term -> weight dicts (columns)
    vocab: dict mapping term -> row index
    returns: scipy.sparse.lil_matrix of shape (len(vocab), len(tfidf_dict));
             terms absent from vocab are silently skipped.
    """
    matrix = sp.lil_matrix((len(vocab), len(tfidf_dict)), dtype=np.double)
    for col, doc in enumerate(tfidf_dict):
        for term, weight in doc.items():
            row = vocab.get(term)
            # unknown terms are ignored, as in the original try/except
            if row is not None:
                matrix[row, col] = weight
    return matrix
def inverse_vocab(vocab):
    """Invert a vocab dictionary: term => index becomes index => term."""
    return {index: term for term, index in vocab.items()}
def vocab_array(vocab):
    """Convert a vocab dictionary (term -> index) to a vocab char array,
    with terms ordered by their index.
    """
    # sorted(keys, lambda a, b: cmp(...)) was Python-2-only: cmp() no longer
    # exists and sorted() no longer accepts a comparison function as its
    # second positional argument.  Sorting by the index value via key= is
    # the exact equivalent.
    return np.char.array(sorted(vocab, key=vocab.get))
def vocab_dict(vocab):
    """Convert a vocab array (or any ordered term sequence) to a
    vocab dictionary mapping term -> index.
    """
    return {term: index for index, term in enumerate(vocab)}
def replace_vocab(td, oldvocab, newvocab):
    """Re-index the rows of a term-document matrix onto a new vocabulary.

    td: V x D term-document matrix (rows indexed by oldvocab)
    oldvocab: dict mapping term -> row index in td
    newvocab: dict mapping term -> row index in the result
    returns: len(newvocab) x D matrix; rows for terms absent from
             oldvocab are zero.
    """
    result = np.zeros((len(newvocab), td.shape[1]))
    for term, new_row in newvocab.items():
        old_row = oldvocab.get(term)
        if old_row is not None:
            result[new_row] = td[old_row]
        # terms unknown to oldvocab keep the zero row from np.zeros
    return result
class tfidf(object):
    """tf-idf weights for a dataset (a list of strings).

    Thin orchestration layer over the module-level helpers: tc,
    idf_from_tc, tf_mul_idf, to_sparse_matrix.
    """

    def __init__(self, dataset, tokenizer=tokenize):
        self._dataset = dataset
        self._tokenizer = tokenizer

    def as_dict(self):
        """Return (weights, vocab): a list of per-doc term -> tf*idf dicts."""
        counts, vocab = tc(self._dataset, self._tokenizer)
        # Raw counts are used as tf on purpose: tf_from_tc would normalize,
        # which does not replicate the miislita reference counts.
        weights = tf_mul_idf(counts, idf_from_tc(counts))
        return weights, vocab

    def as_sparse_matrix(self):
        """Return (V x D sparse tf-idf matrix, vocab)."""
        weights, vocab = self.as_dict()
        return to_sparse_matrix(weights, vocab), vocab

    def as_array(self):
        """Return (dense V x D tf-idf array, vocab)."""
        sparse, vocab = self.as_sparse_matrix()
        return sparse.toarray(), vocab

    def get_idf(self):
        """Return (idf dict, vocab) for the dataset."""
        counts, vocab = tc(self._dataset, self._tokenizer)
        return idf_from_tc(counts), vocab