# Filter Artifacts

In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

The tf–idf is the product of two statistics, term frequency and inverse document frequency. There are various ways of determining the exact values of both statistics.

The score built in this part aims to quantify the importance of a keyword or phrase within the database.

## Import packages

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pymongo
import argparse
from configparser import ConfigParser
import os
import json

## Connect to MongoDB 

In [9]:
# Read the secrets file; its relative path is resolved against the current
# working directory.
pardir = os.getcwd()
secrets_path = os.path.join(pardir, '../../resources/secrets.ini')
config = ConfigParser()
config.read(secrets_path)

['/Users/A.Z.Ran/Downloads/CKIDS/CKIDS/filterby_keywords/filter_using_TFIDF/../../resources/secrets.ini']

In [11]:
from urllib.parse import quote_plus

# Connection settings come from the secrets file parsed into `config`.
DB_USER = config['MONGODB']['CKIDS_USER']
DB_PASS = config['MONGODB']['CKIDS_PASS']
DB_NAME = config['MONGODB']['CKIDS_DB_NAME']
HOST = config['AWS']['HOST_IP']
PORT = config['AWS']['HOST_PORT']

# Reserved characters (':', '/', '@', '%', ...) in the user name or password
# would corrupt the connection URI, so both are percent-escaped before being
# interpolated. Values without such characters are left unchanged.
client = pymongo.MongoClient("mongodb://{DB_USER}:{DB_PASS}@{HOST}:{PORT}/{DB_NAME}".format(
    DB_USER=quote_plus(DB_USER), DB_PASS=quote_plus(DB_PASS),
    HOST=HOST, PORT=PORT, DB_NAME=DB_NAME))
db = client[DB_NAME]
collection = db["raw_artifacts"]
# Unfiltered query: iterating `result` yields the documents of the collection.
result = collection.find()

## Get Data from DB

In [21]:
# Build one searchable text per artifact, "<title> <description> <keywords>",
# keyed by the document's _id.
objID_data = {}
for obj in result:
    description = obj['description']
    # 'keywords' is optional; a missing field contributes an empty string.
    keywords = ' '.join(obj.get('keywords', []))
    title = obj['title']
    objID_data[obj['_id']] = ' '.join((title, description, keywords))

## Calculate TFIDF scores

### Reprocess keyword list

In [51]:
# Load the keyword table; the first CSV column is used as the index.
kw_csv = pd.read_csv('final_kw_list.csv', index_col=0)

In [52]:
# Split every "Other_word_to_match" cell on ', ' so that term_list[i] is the
# list of alternative strings to match for keyword i.
# NOTE(review): a blank cell would produce NaN instead of a list — confirm the
# CSV has no empty values in this column.
term_list = list(kw_csv["Other_word_to_match"].str.split(', '))

In [53]:
def tf(term_list, documents):
    """
    Count how often each keyword occurs in each document.

    Matching is case-insensitive substring counting (``str.count``), so an
    option is also found inside longer words and overlapping hits are counted
    once.

    input:
    ------
    term_list: sequence of length T; term_list[i] is an iterable of the
               alternative strings to match for keyword i
    documents: sequence of N document strings (a single string is treated as
               one document)

    output:
    -------
    numpy array, shape=(N, T); entry [j, i] is the summed number of
    occurrences of all options of keyword i in document j.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(documents, str):
        documents = [documents]

    # Lower-case the options once instead of once per document. Empty options
    # are dropped because ``s.count('')`` returns ``len(s) + 1``, which would
    # inflate the frequency in every document.
    lowered_terms = [[w.lower() for w in word_options if w]
                     for word_options in term_list]

    TF = np.zeros((len(documents), len(lowered_terms)))
    for j, document in enumerate(documents):
        d = document.lower()
        for i, word_options in enumerate(lowered_terms):
            TF[j, i] = sum(d.count(w) for w in word_options)
    return TF

In [54]:
def idf(TF):
    """
    Calculate inverse document frequency for every term wrt all documents.

    Uses idf_t = log((N - df_t) / (1 + df_t)), where N is the number of
    documents (rows of TF) and df_t is the number of documents in which term t
    occurs at least once.

    input:
    ------
    TF: term frequency, numpy array (#documents, #terms)

    output:
    -------
    numpy array (#terms), numerical values of idfs. The value is negative when
    df_t > (N - 1) / 2 and -inf when the term occurs in every document.
    """
    # The corpus size is taken from TF itself instead of a hard-coded constant,
    # so the result is correct for any number of documents.
    N = TF.shape[0]
    doc_freq = np.count_nonzero(TF, axis=0)
    return np.log((N - doc_freq) / (1 + doc_freq))
# np.log(N/1+np.count_nonzero(TF, axis=0))

In [55]:
# Keyword counts per document, per-keyword IDF weights, and their
# element-wise (broadcast) product.
documents = list(objID_data.values())
TF = tf(term_list, documents)
IDF = idf(TF)
TFIDF = np.multiply(TF, IDF)

In [56]:
# Per-keyword totals over all documents, saved for the relevance-score step.
keyword_names = [t[0].strip(' ') for t in term_list]
tfidf_totals = TFIDF.sum(axis=0)
keyword_scores = pd.DataFrame({
    'Keyword': keyword_names,
    'Term_frequency': TF.sum(axis=0),
    'TFIDF_score': tfidf_totals,
    'Log_TFIDF_score': np.log(tfidf_totals + 1),
})
keyword_scores.to_csv('final_kw_TFIDF_Score.csv')

## Calculate Relevance Scores

In [57]:
# Reload the per-keyword score table; the first CSV column is used as the index.
weight = pd.read_csv('final_kw_TFIDF_Score.csv', index_col=0)

In [58]:
# Relevance of a document = log(1 + sum over keywords of
# (keyword count in the document) * (the keyword's 'TFIDF_score' weight)).
log_doc_scores = np.log(TF@(weight['TFIDF_score'].to_numpy())+1)

In [59]:
# Pair every document id (as a string) with its relevance score.
doc_scores_result = {str(k): score for k, score in zip(objID_data, log_doc_scores)}

In [60]:
import matplotlib.pyplot as plt

# Empirical CDF of the relevance scores, approximated with a histogram.
num_bins = 20  # histogram resolution

# Bin the scores, then turn the bin counts into cumulative fractions.
counts, bin_edges = np.histogram(log_doc_scores, bins=num_bins)
cdf = counts.cumsum() / len(log_doc_scores)

# Each cumulative value is drawn at the right edge of its bin.
plt.plot(bin_edges[1:], cdf)
plt.title("CDF of Relevance Score with {0} bins".format(num_bins))
plt.xlabel('Relevance Score')
plt.ylabel('cdf')
plt.savefig('rlv_score_cdf.png')
plt.show()

<Figure size 640x480 with 1 Axes>

In [61]:
# Persist the {document id: relevance score} mapping as JSON.
with open('final_filter_TFIDF_result.json', 'w') as out_file:
    json.dump(doc_scores_result, out_file)

## Generate Samples

In [62]:
import json
from bson.objectid import ObjectId

In [63]:
def get(post_id):
    """Look up one document in ``collection`` by the string form of its ``_id``.

    ``post_id`` is converted to an ``ObjectId`` before querying; the return
    value is whatever ``collection.find_one`` gives back for that id.
    """
    return collection.find_one({'_id': ObjectId(post_id)})

In [64]:
def match_words(l, j, term_list):
    """
    Build one sample row for the document with id ``l``.

    input:
    ------
    l: document id as a string (converted with ``ObjectId``)
    j: score of the document
    term_list: list of keyword option lists, as accepted by ``tf``

    output:
    -------
    a one-element list ``[[doi_url, {keyword: count}, np.log(j)]]``; the dict
    holds only the keywords that occur in the document, keyed by the first
    option of each keyword.
    """
    # tf() expects a sequence of documents, so the single text is wrapped in a
    # list; a bare string would be indexed character by character.
    terms = tf(term_list, [objID_data[ObjectId(l)]]).sum(axis=0)
    # Key by the first option: term_list[i] itself is a list and therefore
    # cannot be used as a dictionary key.
    stf = {term_list[i][0]: int(count)
           for i, count in enumerate(terms) if count > 0}
    # NOTE(review): the scores saved in doc_score_data are already log-scaled,
    # so applying np.log again here may be unintended — confirm with callers.
    return [['https://doi.org/' + get(l)['doi'], stf, np.log(j)]]

In [65]:
# Read the saved {document id: relevance score} mapping back from disk.
with open('final_filter_TFIDF_result.json') as in_file:
    doc_score_data = json.load(in_file)

In [66]:
# Number of documents that received a score.
len(doc_score_data)

277810

In [108]:
# ct0:  number of documents whose score is at most 0.1
# ct10: number of documents whose score is at most 13
# NOTE(review): every score <= 0.1 is also <= 13, so ct10 can never be smaller
# than ct0; the recorded output "169799 15650" must come from an earlier
# version of these thresholds.
scores = doc_score_data.values()
ct0 = sum(1 for g in scores if g <= 0.1)
ct10 = sum(1 for g in scores if g <= 13)
print(ct0, ct10)

169799 15650


In [79]:
# Rank all documents by score once and cut the four sample groups from that
# single ordering. Sorting is deterministic, so this yields the same slices as
# re-sorting for every group, at a quarter of the cost.
ranked = sorted(doc_score_data.items(), key=lambda x: x[1])
low0 = ranked[:10]                 # the ten lowest-scoring documents
low_n0 = ranked[ct0+11:ct0+21]     # ten documents starting 11 places past rank ct0
middle = ranked[ct10+1:ct10+11]    # ten documents starting 1 place past rank ct10
high = ranked[-10:]                # the ten highest-scoring documents

In [87]:
# Build one row per sampled document: DOI link, the matched keywords with
# their counts, and the score taken from doc_score_data.
sample_data = []
sdata = low0 + low_n0 + middle + high

for l, j in sdata:
    # The document text is the same for every keyword, so fetch and
    # lower-case it once per document rather than once per keyword.
    d = objID_data[ObjectId(l)].lower()
    stf = {}
    # Iterating the option lists directly avoids reusing the outer loop's
    # index name for the inner loop.
    for word_options in term_list:
        fq = sum(d.count(w.lower()) for w in word_options)
        if fq >= 1:
            # Report the keyword under its first option.
            stf[word_options[0]] = fq
    sample_data.append(['https://doi.org/' + get(l)['doi'], stf, j])

In [81]:
# Tabulate the sample rows: DOI link, keyword counts, and score.
sd = pd.DataFrame(sample_data, columns = ['url', 'term:frequency', 'log score'])

In [82]:
# Label the rows in blocks of ten: 'zero', 'low', 'middle', 'high'.
# NOTE(review): assumes sd has exactly 40 rows, ten per sampled group; a
# shorter slice would make the lengths mismatch — confirm.
sd['score category'] = ['zero']*10+['low']*10+['middle']*10+['high']*10

In [83]:
# Display the sample table as the cell output.
sd

Unnamed: 0,url,term:frequency,log score,score category
0,https://doi.org/10.5281/zenodo.3545811,{},0.0,zero
1,https://doi.org/10.5281/zenodo.3515458,{},0.0,zero
2,https://doi.org/10.5281/zenodo.3597391,{},0.0,zero
3,https://doi.org/10.5281/zenodo.344492,{},0.0,zero
4,https://doi.org/10.5281/zenodo.1133037,{},0.0,zero
5,https://doi.org/10.5281/zenodo.1314827,{},0.0,zero
6,https://doi.org/10.5281/zenodo.376969,{},0.0,zero
7,https://doi.org/10.5281/zenodo.177215,{},0.0,zero
8,https://doi.org/10.5281/zenodo.1099416,{},0.0,zero
9,https://doi.org/10.5281/zenodo.16414,{},0.0,zero


In [84]:
# Save the labelled sample table to CSV.
sd.to_csv('final_samples_w_scores.csv')