In [1]:
from bs4 import BeautifulSoup

In [4]:
def cleanHtml(html):
    """Strip markup from an HTML fragment and return its plain text.

    The fragment is parsed with the ``html5lib`` parser and the text content
    of the resulting tree is returned; character references such as
    ``&amp;`` or ``&#169;`` come back as the literal characters.

    Args:
        html: HTML markup to clean. ``None`` and the empty string are both
            treated as "nothing to parse".

    Returns:
        The text content of ``html``, or ``""`` when there is no input.
    """
    # Guard every empty input (None as well as ""), so the parser is never
    # handed a value it cannot process.
    if not html:
        return ""
    return BeautifulSoup(html, 'html5lib').get_text()

# Alternative sample that mixes a character reference with inline markup:
#txt = "Don&#39;t forget about HTML entities and <strong>markup</strong> when mining text!<br/>"

# Sample input containing character references (&#169; and &amp;) for cleanHtml to decode.
txt = "&#169; 2022 Texas A&amp;M University - Central Texas | A member of the Texas A&amp;M University System"

In [5]:
print(cleanHtml(txt))

© 2022 Texas A&M University - Central Texas | A member of the Texas A&M University System


In [6]:
# Toy corpus: three short documents, each keyed by a one-letter id.
corpus = {
    'a': "Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow.",
    'b': "Professor Plum has a green plant in his study.",
    'c': "Miss Scarlett watered Professor Plum's green plant while he was away from his office last week."
}

In [7]:
corpus

{'a': 'Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow.',
 'b': 'Professor Plum has a green plant in his study.',
 'c': "Miss Scarlett watered Professor Plum's green plant while he was away from his office last week."}

In [8]:
# Lower-cased, whitespace-split tokens of every document, keyed by document id.
terms = {
    doc_id: [token.lower() for token in text.split()]
    for doc_id, text in corpus.items()
}

In [9]:
corpus['a']

'Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow.'

In [10]:
terms['a']

['mr.',
 'green',
 'killed',
 'colonel',
 'mustard',
 'in',
 'the',
 'study',
 'with',
 'the',
 'candlestick.',
 'mr.',
 'green',
 'is',
 'not',
 'a',
 'very',
 'nice',
 'fellow.']

In [11]:
from math import log

In [12]:
# Terms of the example query; each one is scored against every document in the corpus.
QUERY_TERMS = ['mr.', 'green']

In [13]:
def tf(term, doc, normalize=True):
    """Return the term frequency of ``term`` in ``doc``.

    Both the term and the document are lower-cased and the document is split
    on whitespace, so punctuation stays attached to its token (``"mr."`` and
    ``"mr"`` are different terms).

    Args:
        term: Token to count.
        doc: Document text as a single string.
        normalize: When true, divide the count by the number of tokens in the
            document; when false, return the raw count.

    Returns:
        A float: the relative frequency if ``normalize`` is true, otherwise
        the raw count. A document without any tokens yields ``0.0``.
    """
    tokens = doc.lower().split()
    occurrences = tokens.count(term.lower())
    if not normalize:
        return float(occurrences)
    # An empty or whitespace-only document has no tokens; report a frequency
    # of zero instead of dividing by zero.
    if not tokens:
        return 0.0
    return occurrences / float(len(tokens))

In [20]:
# Shorthand for document 'a', used by the spot checks in the following cells.
a=corpus['a']

In [25]:
tf("mr.",a)

0.10526315789473684

In [26]:
# Hand check of tf("mr.", a): "mr." occurs 2 times among the 19 tokens of document 'a'.
2/19

0.10526315789473684

In [27]:
def idf(term, corpus):
    """Return the smoothed inverse document frequency of ``term``.

    Computes ``1 + ln(N / n)``, where ``N`` is the number of documents and
    ``n`` the number of documents whose lower-cased, whitespace-split tokens
    include the lower-cased ``term``.

    Args:
        term: Token to look up.
        corpus: Iterable of document strings (a list, a generator, or
            ``some_dict.values()``). A single string is treated as a corpus
            of one document. Note that iterating a dict yields its keys, so
            pass ``.values()`` when the documents are stored in a dict.

    Returns:
        The IDF weight as a float, or ``1.0`` when no document contains the
        term (which includes the empty corpus).
    """
    needle = term.lower()
    # A bare string is one document, not an iterable of one-character
    # documents. Everything else is materialised once so that one-shot
    # iterables can be both scanned and counted.
    texts = [corpus] if isinstance(corpus, str) else list(corpus)
    num_texts_with_term = sum(1 for text in texts if needle in text.lower().split())
    if num_texts_with_term == 0:
        # Neutral weight instead of a division by zero. This is the only
        # failure handled here; any other error is left to propagate.
        return 1.0
    return 1.0 + log(float(len(texts)) / num_texts_with_term)

In [31]:
# idf expects the collection of document texts, not a single document string.
idf("mr.", corpus.values())

2.09861228866811

In [32]:
def tf_idf(term, doc, corpus):
    """Score ``term`` for ``doc`` as term frequency times inverse document frequency.

    Args:
        term: Token to score.
        doc: Text of the document being scored.
        corpus: Collection of document texts, forwarded unchanged to ``idf``.

    Returns:
        The product ``tf(term, doc) * idf(term, corpus)``.
    """
    term_frequency = tf(term, doc)
    inverse_document_frequency = idf(term, corpus)
    return term_frequency * inverse_document_frequency

In [33]:
# Pass the document texts: iterating the dict itself would yield its keys.
tf_idf("mr.", a, corpus.values())

0.22090655670190631

In [35]:
# List every document next to its id, in id order, followed by a blank line.
for doc_id in sorted(corpus):
    print(doc_id, ':', corpus[doc_id])
print()

a : Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow.
b : Professor Plum has a green plant in his study.
c : Miss Scarlett watered Professor Plum's green plant while he was away from his office last week.



In [36]:
# Running TF-IDF total per document, keyed by the corpus's own ids so the
# two cannot drift apart when documents are added or removed.
query_scores = {doc_id: 0 for doc_id in corpus}

In [37]:
# Start from zero on every run: the loop accumulates with +=, so totals left
# over from an earlier execution of this cell would otherwise be double-counted.
query_scores = {doc_id: 0 for doc_id in corpus}
doc_ids = sorted(corpus)
texts = list(corpus.values())

for term in (t.lower() for t in QUERY_TERMS):
    # Per-document term frequency, then the corpus-wide IDF for this term.
    for doc in doc_ids:
        print('TF({0}): {1}'.format(doc,term), tf(term,corpus[doc]))
    print('IDF: {0}'.format(term), idf(term,texts))
    print()

    # Combined TF-IDF per document, added to that document's query total.
    for doc in doc_ids:
        score = tf_idf(term, corpus[doc], texts)
        print('TF-IDF({0}): {1}'.format(doc,term),score)
        query_scores[doc] += score
    print()

TF(a): mr. 0.10526315789473684
TF(b): mr. 0.0
TF(c): mr. 0.0
IDF: mr. 2.09861228866811

TF-IDF(a): mr. 0.22090655670190631
TF-IDF(b): mr. 0.0
TF-IDF(c): mr. 0.0

TF(a): green 0.10526315789473684
TF(b): green 0.1111111111111111
TF(c): green 0.0625
IDF: green 1.0

TF-IDF(a): green 0.10526315789473684
TF-IDF(b): green 0.1111111111111111
TF-IDF(c): green 0.0625



In [40]:
# Report each document's accumulated score for the whole query, in id order.
query_text = ' '.join(QUERY_TERMS)
print("Overall TF-IDF scores for query '{0}'".format(query_text))
for doc in sorted(query_scores):
    score = query_scores[doc]
    print(doc,score)

Overall TF-IDF scores for query 'mr. green'
a 0.3261697145966431
b 0.1111111111111111
c 0.0625
