# TF-IDF

In [39]:
#  Import dependencies
import numpy as np
import pandas as pd


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Toy corpus: four short documents used throughout the notebook to
# illustrate TF, IDF, TF-IDF and cosine similarity.
corpus = [
    'data science is one of the most important fields of science',
    'this is one of the best data science courses',
    'data scientists analyze data',
    'I woke up this morning',
]

In [22]:
def clear_list(lst):
    """Remove every standalone 'I' token from a collection of words.

    The collection is modified in place and the same object is returned, so
    both ``clear_list(x)`` and ``x = clear_list(x)`` work. Lists and sets are
    supported (anything offering ``in`` and ``.remove``).

    Parameters
    ----------
    lst : list or set of str
        Tokens of a document, or a vocabulary set.

    Returns
    -------
    list or set of str
        The same object that was passed in, without any 'I' entries.
    """
    # list.remove drops only the first match, so keep going until no 'I' is
    # left; for a set the loop body runs at most once.
    while 'I' in lst:
        lst.remove('I')
    return lst

In [23]:
# Build the vocabulary: every distinct space-separated token in the corpus.
words_set = set()

for doc in corpus:
    words_set.update(doc.split(' '))

# Drop the standalone token 'I' from the vocabulary (same cleaning that is
# applied to each document's tokens further down).
words_set = clear_list(words_set)

print(words_set)
print(len(words_set))

{'woke', 'the', 'analyze', 'morning', 'up', 'data', 'is', 'of', 'important', 'most', 'fields', 'one', 'this', 'scientists', 'best', 'courses', 'science'}
17


### Term Frequency (TF)

In [24]:
# Dimensions of the term-frequency table: rows = documents, columns = words.
n_docs, n_words_set = len(corpus), len(words_set)

Unnamed: 0,woke,the,analyze,morning,up,data,is,of,important,most,fields,one,this,scientists,best,courses,science
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Creating a DataFrame to log the TF of each word per document:
# one row per document, one column per vocabulary word, initialised to zero.
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=list(words_set))


# Calculate TF: every occurrence of a word adds 1 / (number of tokens in the
# document), so a cell ends up as count(word in doc) / len(doc).
for i_doc in range(n_docs):
    words = clear_list(corpus[i_doc].split(' '))

    # A document with no tokens left after cleaning keeps an all-zero row.
    if not words:
        continue

    for word in words:
        # Use a single .loc[row, column] access rather than the chained
        # df_tf[word][i_doc] form: chained assignment writes to a temporary
        # object and does not update df_tf under pandas Copy-on-Write.
        df_tf.loc[i_doc, word] += 1 / len(words)

df_tf

Unnamed: 0,woke,the,analyze,morning,up,data,is,of,important,most,fields,one,this,scientists,best,courses,science
0,0.0,0.090909,0.0,0.0,0.0,0.090909,0.090909,0.181818,0.090909,0.090909,0.090909,0.090909,0.0,0.0,0.0,0.0,0.181818
1,0.0,0.111111,0.0,0.0,0.0,0.111111,0.111111,0.111111,0.0,0.0,0.0,0.111111,0.111111,0.0,0.111111,0.111111,0.111111
2,0.0,0.0,0.25,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0
3,0.25,0.0,0.0,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0


### Inverse Document Frequency (IDF)

In [32]:
idf = {}

# Tokenise and clean each document once, up front, instead of re-splitting
# every document for every vocabulary word. Sets make the membership test cheap.
doc_tokens = [set(clear_list(doc.split(' '))) for doc in corpus]

for word in words_set:
    # Document frequency: number of documents that contain the word at least
    # once (not the total number of occurrences across the corpus).
    doc_freq = sum(word in tokens for tokens in doc_tokens)

    # IDF = log10(N / df); words that appear in fewer documents score higher.
    idf[word] = np.log10(n_docs / doc_freq)

idf

{'woke': 0.6020599913279624,
 'the': 0.3010299956639812,
 'analyze': 0.6020599913279624,
 'morning': 0.6020599913279624,
 'up': 0.6020599913279624,
 'data': 0.12493873660829993,
 'is': 0.3010299956639812,
 'of': 0.3010299956639812,
 'important': 0.6020599913279624,
 'most': 0.6020599913279624,
 'fields': 0.6020599913279624,
 'one': 0.3010299956639812,
 'this': 0.3010299956639812,
 'scientists': 0.6020599913279624,
 'best': 0.6020599913279624,
 'courses': 0.6020599913279624,
 'science': 0.3010299956639812}

### TF-IDF

In [33]:
# TF-IDF = TF * IDF, computed column-wise in a single vectorised operation.
# The previous element-wise loop assigned through chained indexing
# (df_tf_idf[word][doc] = ...), which does not update the frame under pandas
# Copy-on-Write.
# Reindexing the IDF weights to df_tf's columns keeps the column order of
# df_tf in the result.
idf_weights = pd.Series(idf).reindex(df_tf.columns)
df_tf_idf = df_tf.mul(idf_weights, axis='columns')

df_tf_idf

Unnamed: 0,woke,the,analyze,morning,up,data,is,of,important,most,fields,one,this,scientists,best,courses,science
0,0.0,0.027366,0.0,0.0,0.0,0.011358,0.027366,0.054733,0.054733,0.054733,0.054733,0.027366,0.0,0.0,0.0,0.0,0.054733
1,0.0,0.033448,0.0,0.0,0.0,0.013882,0.033448,0.033448,0.0,0.0,0.0,0.033448,0.033448,0.0,0.066896,0.066896,0.033448
2,0.0,0.0,0.150515,0.0,0.0,0.062469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150515,0.0,0.0,0.0
3,0.150515,0.0,0.0,0.150515,0.150515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.075257,0.0,0.0,0.0,0.0


## TF-IDF with scikit-learn

[Cosine Similarity and TFIDF](https://medium.com/web-mining-is688-spring-2021/cosine-similarity-and-tfidf-c2a7079e13fa)

In [36]:
# Learn the vocabulary and IDF weights from the corpus, then encode the same
# documents as a TF-IDF matrix (one row per document).
# scikit-learn applies its own tokenisation and weighting, so these values are
# not expected to match the hand-computed table above.
vectorizer = TfidfVectorizer().fit(corpus)

X = vectorizer.transform(corpus)
column_names = vectorizer.get_feature_names_out()

In [37]:
# Densify the TF-IDF matrix and label its columns with the learned vocabulary
# so it can be displayed as a table.
df_tf_idf_skl = pd.DataFrame(
    data=X.toarray(),
    columns=column_names,
)
df_tf_idf_skl

Unnamed: 0,analyze,best,courses,data,fields,important,is,morning,most,of,one,science,scientists,the,this,up,woke
0,0.0,0.0,0.0,0.199417,0.312425,0.312425,0.246319,0.0,0.312425,0.492639,0.246319,0.492639,0.0,0.246319,0.0,0.0,0.0
1,0.0,0.403667,0.403667,0.257655,0.0,0.0,0.318256,0.0,0.0,0.318256,0.318256,0.318256,0.0,0.318256,0.318256,0.0,0.0
2,0.52489,0.0,0.0,0.670061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.52489,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.525473,0.0,0.0,0.0,0.0,0.0,0.0,0.414289,0.525473,0.525473


### Cosine Similarities

In [38]:
# Pairwise cosine similarity between documents. Note that this is computed on
# the hand-computed TF-IDF table (df_tf_idf), not on scikit-learn's
# df_tf_idf_skl. Entry (i, j) compares document i with document j.
cos_sim_matrix = cosine_similarity(df_tf_idf, dense_output=True)
df_cos_sim = pd.DataFrame(cos_sim_matrix)
df_cos_sim

Unnamed: 0,0,1,2,3
0,1.0,0.395778,0.024279,0.0
1,0.395778,1.0,0.031046,0.073673
2,0.024279,0.031046,1.0,0.0
3,0.0,0.073673,0.0,1.0
