This notebook is a walk-through for loading the text documents and TF-IDF table from a SQLite database and run the text retrieval applications

In [1]:
# Copy the dataset to the VM file system.
!wget https://storage.googleapis.com/pet-detect-239118/text_retrieval/textDoc.db textDoc.db

--2021-10-29 02:31:21--  https://storage.googleapis.com/pet-detect-239118/text_retrieval/textDoc.db
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.197.128, 64.233.191.128, 173.194.74.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.197.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8753152 (8.3M) [application/octet-stream]
Saving to: ‘textDoc.db’


2021-10-29 02:31:22 (101 MB/s) - ‘textDoc.db’ saved [8753152/8753152]

--2021-10-29 02:31:22--  http://textdoc.db/
Resolving textdoc.db (textdoc.db)... failed: Name or service not known.
wget: unable to resolve host address ‘textdoc.db’
FINISHED --2021-10-29 02:31:22--
Total wall clock time: 0.3s
Downloaded: 1 files, 8.3M in 0.08s (101 MB/s)


In [2]:
import sqlite3
import os
import nltk
nltk.download('popular');
from nltk.corpus import stopwords
# from nltk import word_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import numpy as np
import re

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

Load the data from the SQL database

In [3]:
conn = sqlite3.connect('textDoc.db') 
cur = conn.cursor()
print("Opened database successfully")

Opened database successfully


In [5]:
doc_dict ={}
for row in cur.execute('SELECT doc_name, raw_text FROM raw_data'):
  doc_dict[row[0]] = row[1] 

In [14]:
tf_idf_dict = {}

for row in cur.execute('SELECT DISTINCT doc_name FROM tf_idf'):
  tf_idf_dict[row[0]] = {}

for row in cur.execute('SELECT doc_name, term, score from tf_idf'):
  for doc_id in tf_idf_dict.keys():
    if doc_id==row[0]:
      tf_idf_dict[doc_id][row[1]] = row[2]
  

In [11]:
def vectorSpaceModel(query, doc_dict,tfidf_dict):
  query_vocab = []
  query = query.lower()
  query = re.sub(r"\s+", " ", query)
  stopwords_english = stopwords.words('english')

  for word in query.split():
    if (word not in string.punctuation and word not in stopwords_english):
        query_vocab.append(word)

  query_wc = {}
  for word in query_vocab:
    query_wc[word] = query.split().count(word)

  relevance_scores = {}
  for doc_id in doc_dict.keys():
    score = 0
    for word in query_vocab:
      score += query_wc[word] * tf_idf_dict[doc_id][word]
    relevance_scores[doc_id] = round(score,4)

  # sort the relevance score and get the top-k ranking
  # sort the keys of the relevance score by value
  sort_keys = sorted(relevance_scores, key=relevance_scores.get , reverse = True)
  top_keys = sort_keys[:5]
  top_5 = {}
  for key in top_keys:
    top_5[key] = relevance_scores[key]

  return top_5


In [15]:
query1 = "Natural Language"
result1 = vectorSpaceModel(query1, doc_dict,tf_idf_dict)
print(result1)

{'A00-1001.pdf.txt': 2.294, 'A00-1007.pdf.txt': 1.849, 'A00-1005.pdf.txt': 1.2414, 'A00-1016.pdf.txt': 0.951, 'A00-1009.pdf.txt': 0.9114}


In [16]:
query2 = "Data mining"
result2 = vectorSpaceModel(query2, doc_dict,tf_idf_dict)

query3 = "I like text retrieval"
result3 = vectorSpaceModel(query3, doc_dict,tf_idf_dict)

query4 = "probability model and language model"
result4 = vectorSpaceModel(query4, doc_dict,tf_idf_dict)

print(result2)
print()
print(result3)
print()
print(result4)

{'A00-1004.pdf.txt': 21.748, 'A00-1020.pdf.txt': 6.334, 'A00-1017.pdf.txt': 5.199, 'A00-1005.pdf.txt': 1.5597, 'A00-1009.pdf.txt': 0.8665}

{'A00-1003.pdf.txt': 60.3656, 'A00-1012.pdf.txt': 12.483, 'A00-1004.pdf.txt': 10.4346, 'A00-1018.pdf.txt': 7.1472, 'A00-1020.pdf.txt': 4.8524}

{'A00-1004.pdf.txt': 90.612, 'A00-1019.pdf.txt': 63.0706, 'A00-1007.pdf.txt': 13.7778, 'A00-1012.pdf.txt': 12.0552, 'A00-1017.pdf.txt': 11.8802}
