<a href="https://colab.research.google.com/github/ShikaiKevinLiu/Study/blob/main/tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing and Tokenizing

In [31]:
import pandas as pd
import re
import string
import pandas as pd
from functools import reduce
from math import log


In [28]:
# Toy corpus: every line of the literal below is one document.
raw_lines = """
Simple example with Cats and Mouse
Another simple example with dogs and cats
Another simple example with mouse and cheese
""".split("\n")
# The newlines right after the opening quotes and right before the closing
# quotes leave an empty string at each end of the list; drop both.
corpus = raw_lines[1:-1]
print(corpus)

['Simple example with Cats and Mouse', 'Another simple example with dogs and cats', 'Another simple example with mouse and cheese']


## Bag of Words

In [29]:
# Cleaning and tokenizing: lowercase each document and split on whitespace.
l_A = corpus[0].lower().split()
l_B = corpus[1].lower().split()
l_C = corpus[2].lower().split()

# Bag of words: the unique words across all documents.
word_set = set(l_A) | set(l_B) | set(l_C)
# Iteration order of a set of strings changes between kernel restarts (string
# hashing is randomized), so sort the vocabulary to keep the column order of
# the tables below reproducible.
vocabulary = sorted(word_set)

# One count dict per document, each starting at zero for every vocabulary word.
word_dict_A = dict.fromkeys(vocabulary, 0)
word_dict_B = dict.fromkeys(vocabulary, 0)
word_dict_C = dict.fromkeys(vocabulary, 0)

# Count the occurrences of each token in its own document.
for tokens, counts in ((l_A, word_dict_A), (l_B, word_dict_B), (l_C, word_dict_C)):
    for word in tokens:
        counts[word] += 1

pd.DataFrame([word_dict_A, word_dict_B, word_dict_C])

Unnamed: 0,cats,example,dogs,with,another,and,mouse,cheese,simple
0,1,1,0,1,0,1,1,0,1
1,1,1,1,1,1,1,0,0,1
2,0,1,0,1,1,1,1,1,1


# TF

In [26]:
def compute_tf(word_dict, l):
    """Compute the term frequency of every word for a single document.

    Args:
        word_dict: Mapping of word -> raw count of that word in the document.
        l: The document's list of tokens; its length is the normalising total.

    Returns:
        A dict mapping every word in ``word_dict`` to ``count / len(l)``.
        When ``l`` is empty every frequency is ``0.0``.
    """
    sum_nk = len(l)
    if sum_nk == 0:
        # An empty document contains no terms; return zeros instead of
        # raising ZeroDivisionError.
        return {word: 0.0 for word in word_dict}
    return {word: count / sum_nk for word, count in word_dict.items()}

In [30]:
# Term frequencies of each document, shown one row per document.
tf_A, tf_B, tf_C = [
    compute_tf(doc_counts, doc_tokens)
    for doc_counts, doc_tokens in (
        (word_dict_A, l_A),
        (word_dict_B, l_B),
        (word_dict_C, l_C),
    )
]
pd.DataFrame([tf_A, tf_B, tf_C])

Unnamed: 0,cats,example,dogs,with,another,and,mouse,cheese,simple
0,0.166667,0.166667,0.0,0.166667,0.0,0.166667,0.166667,0.0,0.166667
1,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857,0.0,0.0,0.142857
2,0.0,0.142857,0.0,0.142857,0.142857,0.142857,0.142857,0.142857,0.142857


# IDF

In [32]:
def compute_idf(strings_list):
    """Compute the inverse document frequency of every word.

    Args:
        strings_list: List of per-document dicts mapping word -> raw count.

    Returns:
        A dict mapping each word seen in any of the dicts to
        ``log(n / df)``, where ``n`` is the number of documents and ``df``
        is the number of documents whose count for that word is positive.
        A word that occurs in no document gets ``0.0``. An empty
        ``strings_list`` yields an empty dict.
    """
    n = len(strings_list)

    # Document frequency: in how many documents does each word occur?
    # Keys are collected from every dict (not only the first) so that
    # documents with differing vocabularies do not raise KeyError.
    doc_freq = {}
    for word_counts in strings_list:
        for word, count in word_counts.items():
            doc_freq.setdefault(word, 0)
            if count > 0:
                doc_freq[word] += 1

    # A document frequency of zero would divide by zero; such a word carries
    # no information about any document, so its weight is set to 0.0.
    return {
        word: log(n / float(df)) if df > 0 else 0.0
        for word, df in doc_freq.items()
    }

In [33]:
# Inverse document frequency computed over the three per-document count dicts.
document_counts = [word_dict_A, word_dict_B, word_dict_C]
idf = compute_idf(document_counts)

# TF-IDF
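The TF-IDF weight of a term $t$ in a document $d$ is the product of its term frequency and its inverse document frequency:

$$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$$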


In [44]:
def compute_tf_idf(tf, idf):
    """Combine term frequencies with inverse document frequencies.

    Args:
        tf: Mapping of word -> term frequency for one document.
        idf: Mapping of word -> inverse document frequency; it must contain
            every word present in ``tf``.

    Returns:
        A dict with the same keys as ``tf`` whose values are
        ``tf[word] * idf[word]``.
    """
    return {term: frequency * idf[term] for term, frequency in tf.items()}

In [45]:
# TF-IDF weights of each document, shown one row per document.
tf_idf_A, tf_idf_B, tf_idf_C = [
    compute_tf_idf(doc_tf, idf) for doc_tf in (tf_A, tf_B, tf_C)
]
pd.DataFrame([tf_idf_A, tf_idf_B, tf_idf_C])


Unnamed: 0,cats,example,dogs,with,another,and,mouse,cheese,simple
0,0.067578,0.0,0.0,0.0,0.0,0.0,0.067578,0.0,0.0
1,0.057924,0.0,0.156945,0.0,0.057924,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.057924,0.0,0.057924,0.156945,0.0


# TF-IDF from Sklearn API

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [49]:
# Sample text split on newlines: every line of the literal becomes one list
# element. The [1:-1] slice drops the empty strings produced by the newline
# right after the opening quotes and the one right before the closing quotes.
all_text = """
Google and Facebook are strangling the free press to death. Democracy is the loser
Your 60-second guide to security stuff Google touted today at Next '18
A Guide to Using Android Without Selling Your Soul to Google
Review: Lenovo’s Google Smart Display is pretty and intelligent
Google Maps user spots mysterious object submerged off the coast of Greece - and no-one knows what it is
Android is better than IOS
In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency
is a numerical statistic that is intended to reflect
how important a word is to a document in a collection or corpus.
It is often used as a weighting factor in searches of information retrieval
text mining, and user modeling. The tf-idf value increases proportionally
to the number of times a word appears in the document
and is offset by the frequency of the word in the corpus
""".split("\n")[1:-1]


In [51]:
def preprocessing(line):
    """Lowercase a line and replace every ASCII punctuation character with a space.

    Args:
        line: The raw text of one document.

    Returns:
        The lowercased text in which each character of ``string.punctuation``
        has been replaced by a single space.
    """
    line = line.lower()
    # string.punctuation contains regex metacharacters (backslash, ']', '^', '-').
    # Without escaping, the sequence "\]" inside the character class is read
    # as an escaped ']' and the backslash itself is never matched, so escape
    # the whole set before building the class.
    line = re.sub(r"[{}]".format(re.escape(string.punctuation)), " ", line)
    return line

In [69]:
# Fit a TF-IDF model on the sample lines, keeping at most 50 features and
# cleaning every line with the custom preprocessor.
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing, max_features=50)
tfidf = tfidf_vectorizer.fit_transform(all_text)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
pd.DataFrame(tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())



Unnamed: 0,and,android,corpus,document,frequency,google,guide,idf,in,information,...,touted,used,user,using,value,weighting,what,without,word,your
0,0.282442,0.0,0.0,0.0,0.0,0.282442,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.233519,0.321141,0.0,0.0,0.0,...,0.372396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321141
2,0.0,0.305934,0.0,0.0,0.0,0.222462,0.305934,0.0,0.0,0.0,...,0.0,0.0,0.0,0.354763,0.0,0.0,0.0,0.354763,0.0,0.305934
3,0.312523,0.0,0.0,0.0,0.0,0.312523,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.248237,0.0,0.0,0.0,0.0,0.248237,0.0,0.0,0.0,0.0,...,0.0,0.0,0.34138,0.0,0.0,0.0,0.395866,0.0,0.0,0.0
5,0.0,0.612358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.245901,0.554605,0.0,0.0,0.277303,0.201642,0.277303,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.453248,0.401922,0.0,0.0,0.0,0.0,0.329582,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.401922,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.249073,0.34253,...,0.0,0.397199,0.0,0.0,0.0,0.397199,0.0,0.0,0.0,0.0


In [65]:
# KMeans starts from random centroids; fix the seed so the cluster labels are
# the same on every run, and set n_init explicitly because its default value
# differs between scikit-learn versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42).fit(tfidf)
lines_for_predicting = ["tf and idf is awesome!", "some androids is there"]
# New lines must go through the fitted vectorizer before prediction so they
# share the feature space the model was trained on.
kmeans.predict(tfidf_vectorizer.transform(lines_for_predicting))