In [None]:
pip install whoosh

Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl.metadata (3.1 kB)
Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/468.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/468.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.8/468.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: whoosh
Successfully installed whoosh-2.7.4


# Preparing the data

In [None]:
# Download the StackSample dataset archive (stacksample.zip) with the Kaggle CLI.
# NOTE(review): presumably needs Kaggle API credentials to be configured in the
# runtime beforehand — confirm.
!kaggle datasets download -d stackoverflow/stacksample


Dataset URL: https://www.kaggle.com/datasets/stackoverflow/stacksample
License(s): other
Downloading stacksample.zip to /content
 99% 1.09G/1.11G [00:13<00:00, 158MB/s]
100% 1.11G/1.11G [00:13<00:00, 87.2MB/s]


In [None]:
 !unzip stacksample.zip

Archive:  stacksample.zip
  inflating: Answers.csv             
  inflating: Questions.csv           
  inflating: Tags.csv                


In [None]:
 import pandas as pd
 # Read only the first 20,000 rows of Questions.csv into a DataFrame, then
 # leave the frame as the last expression so the notebook renders it.
 questions = pd.read_csv(filepath_or_buffer="Questions.csv", nrows=20000)
 questions

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
...,...,...,...,...,...,...,...
19995,1114470,82266.0,2009-07-11T19:37:06Z,,0,"Trim all chars off file name after first ""_""",<p>I'd like to trim these purchase order file ...
19996,1114540,2288585.0,2009-07-11T20:16:06Z,,7,Xcode question: Quickly jump to a particular s...,<p>What is the quickest way to jump to a parti...
19997,1114550,131128.0,2009-07-11T20:20:11Z,,3,Serializing a generic collection with XMLSeria...,<p>Why won't XMLSerializer process my generic ...
19998,1114580,87271.0,2009-07-11T20:35:46Z,,1,Using Yahoo Fire Eagle on Grails / Java,<p>Has anyone implemented the Yahoo Fire Eagle...


# The Index and Schema objects

 Getting started with Whoosh involves creating an index object. When creating an
 index for the first time, defining its schema is necessary. The schema outlines the
 fields within the index, representing different pieces of information for each document.
 These fields can include the document’s title or its textual content. Each field can be
 indexed for searchability and/or stored, ensuring that the indexed value is returned
 with the search results, which is particularly useful for fields like the title.
 Let’s start by designing the schema for our index.

In [None]:
 from whoosh.fields import Schema, TEXT, ID
 # Index schema: one ID field and two TEXT fields, mirroring the Id, Title and
 # Body columns of the questions frame. Every field is stored so its value can
 # be read back from a search hit.
 schema = Schema(
     Id=ID(stored=True),
     Title=TEXT(stored=True),
     Body=TEXT(stored=True),
 )

 Here we choose three columns from the questions dataframe to be used in our index:
 “Id”, “Title” and “Body” for keyword-based search.
 Now let’s create our index. To index documents, we need to define a folder in which to
 save the index files.

In [None]:
 import os.path
 # Folder that will hold the Whoosh index files.
 index_dir = "indexdir"
 # exist_ok=True makes the cell safe to re-run and removes the separate
 # "does it exist?" check that the original performed before creating the folder.
 os.makedirs(index_dir, exist_ok=True)

Then we can simply create an index and add documents to be indexed

In [None]:
 from whoosh.index import create_in
 from whoosh.index import open_dir
 # Create the index in index_dir using the schema defined earlier.
 ix = create_in(index_dir, schema)
 # Use the writer as a context manager: it commits when the block finishes and
 # cancels (releasing the index write lock) if adding a document raises.
 with ix.writer() as writer:
     # Index the Id, Title and Body columns of every question. Id is converted
     # to str because the dataframe holds it as a number.
     for row in questions[["Id", "Title", "Body"]].itertuples(index=False):
         writer.add_document(Id=str(row.Id), Title=row.Title, Body=row.Body)

# 1.2.4 How to search
 Once you’ve created an index and added documents to it, you can search for those
 documents. The Searcher object is the main high-level interface for reading the
 index. It has lots of useful methods for getting information about the index; however,
 the most important method on the Searcher object is search(), which takes a
 whoosh.query.Query object and returns a Results object.
 Normally the list of result documents is sorted by score. The whoosh.scoring module
 contains implementations of various scoring algorithms. You can set the scoring
 object to use when you create the searcher using the weighting keyword argument.
 The following code searches the index we created and ranks the results based on the
 Term Frequency-Inverse Document Frequency (TF-IDF) score.

In [None]:
from whoosh.qparser import QueryParser
from whoosh.scoring import TF_IDF
from whoosh import scoring

# Parse the free-text question into a query against the Title field.
query_sentence = "How to install"
qp = QueryParser("Title", schema=schema)
query = qp.parse(query_sentence)

# Rank matches with TF-IDF and keep the three best hits. The searcher is left
# open on purpose: results_tfidf is read again by later cells.
searcher_tfidf = ix.searcher(weighting=scoring.TF_IDF())
results_tfidf = searcher_tfidf.search(query, limit=3, scored=True)

# Print the Id and Title of each hit, followed by a divider line.
for match in results_tfidf:
    for piece in (match["Id"], '\n', match["Title"], '\n', '------------------\n'):
        print(piece)

102850


How can I install CPAN modules locally without root access (DynaLoader.pm line 229 error)?


------------------

145900


How can I determine that Windows Installer is performing an upgrade rather than a first time install?


------------------

351640


How to install Hibernate Tools in Eclipse?


------------------



Task 1: Test the previous search code with different queries. For each one check how
 many matched results are returned.

In [None]:
from whoosh.qparser import QueryParser
from whoosh import scoring

# Task 1 uses its own variable names so that it does not overwrite
# `results_tfidf` / `searcher_tfidf` from the "How to install" search; the
# query-expansion cells further down still read those objects.
task1_queries = ["ASP.NET", "Yahoo Fire", "How to share"]

# The parser does not depend on the query text, so build it once.
task1_parser = QueryParser("Title", schema=schema)

# The context manager closes the searcher once every query has been run.
with ix.searcher(weighting=scoring.TF_IDF()) as task1_searcher:
    for task1_sentence in task1_queries:
        print(f"Query: {task1_sentence}")

        task1_results = task1_searcher.search(
            task1_parser.parse(task1_sentence), limit=10, scored=True
        )

        # len() of a Results object is the total number of matching documents,
        # not the number of hits kept by `limit`.
        print(f"Number of matched results: {len(task1_results)}")
        print("-" * 30)


Query: ASP.NET
Number of matched results: 751
------------------------------
Query: Yahoo Fire
Number of matched results: 1
------------------------------
Query: How to share
Number of matched results: 13
------------------------------


 Task 2: Repeat the previous search using the BM25F scoring algorithm, which is used
 in the probabilistic retrieval model. Do you see any difference in the returned results?

In [None]:
from whoosh.qparser import QueryParser
from whoosh.scoring import BM25F
from whoosh import scoring

# Same Title-field query as the TF-IDF example.
query_sentence = "How to install"
qp = QueryParser("Title", schema=schema)
query = qp.parse(query_sentence)

# This time rank the matches with BM25F and keep the three best hits.
searcher_bm25f = ix.searcher(weighting=scoring.BM25F())
results_bm25f = searcher_bm25f.search(query, limit=3, scored=True)

# Print the Id and Title of each hit, followed by a divider line.
for match in results_bm25f:
    for piece in (match["Id"], '\n', match["Title"], '\n', '------------------\n'):
        print(piece)

921780


How to install ImageMagick on MAMP?


------------------

998260


How do you install JDK?


------------------

351640


How to install Hibernate Tools in Eclipse?


------------------



# 1.2.5 Query expansion
 Query expansion involves evaluating a user’s input (what words were typed into the
 search query area, and sometimes other types of data) and expanding the search
 query to match additional documents. Query expansion involves techniques such as:
 1. Linguistic query expansion, such as finding synonyms of words and searching
 for the synonyms as well.
 2. Corpus-based query expansion, by searching a single query term at a time and
 counting the most common words in the top returned documents. Repeating
 this process for all query terms will give you a list of terms that co-occur
 frequently with your query terms. Thus, they can be used to expand the query
 in the domain of that corpus.
 3. Pseudo-relevance feedback, by expanding the original query with the most
 frequent terms of the top retrieved documents.
 Whoosh provides methods for computing the “key terms” of a set of documents.
 For these methods, “key terms” basically means terms that are frequent in the given
 documents, but relatively infrequent in the indexed collection as a whole. These
 methods can be useful for query expansion.
 For example, the following code retrieves more results similar to the first returned
 item in the previous example:

In [None]:
# Ask Whoosh for documents whose Title resembles that of the top-ranked hit.
more_results = results_tfidf[0].more_like_this("Title")

# Print the Id and Title of each similar document, followed by a divider line.
for similar in more_results:
    for piece in (similar["Id"], '\n', similar["Title"], '\n', '------------------\n'):
        print(piece)

459590


What is the difference betwen including modules and embedding modules?


------------------

423330


Why can't DynaLoader.pm load SSleay.dll for Net::SSLeay and Crypt::SSLeay?


------------------

540640


How can I install a CPAN module into a local directory?


------------------

172040


How do you develop against OpenID locally


------------------

566290


Silverlight Development - Service URL while developing locally


------------------

766830


How can I locally manage C manuals?


------------------

799860


Using Mercurial locally, only with Subversion server


------------------

852280


Ubuntu: "Could not find rails locally or in a repository"


------------------

78900


How to check for memory leaks in Guile extension modules?


------------------

199180


Is there any way to get python omnicomplete to work with non-system modules in vim?


------------------



 We can also extract keywords for the top N documents in a whoosh.searching.Results
 object. This requires that the field is either vectored or stored. For example, to
 extract five key terms from the Title field of the top ten documents of the results object
 in the previous example:

In [None]:
# key_terms() yields (term, score) pairs; keep only the term of each pair.
keywords = [
    pair[0]
    for pair in results_tfidf.key_terms("Title", docs=10, numterms=5)
]
keywords

['install', '229', 'cpan', 'dynaloader.pm', 'locally']

# 1.2.6 Evaluating IR systems
 There are several measures for evaluating IR systems, such as precision, recall and
 mean average precision (mAP). While precision and recall do not take the rank
 of the retrieved documents into consideration, mAP considers the order in which
 documents are ranked. Sometimes precision and recall are computed at a cut-off value
 k of retrieved documents. In this case, they are called precision@k and recall@k, which
 means precision and recall computed when considering the first k retrieved documents
 only.
 In this part we will evaluate our IR system on a toy dataset. This dataset contains
 a set of documents, a set of queries, and a list of relevant documents for each query.

In [None]:
 # Toy evaluation set: query id -> query text.
 queries = {
 'q1': "machine learning",
 'q2':"AI algorithms"
 }
 # Ground truth: query id -> ids of the documents judged relevant for that query.
 relevance = {
 'q1' : ["doc1", "doc2", "doc3"],
 'q2' : ["doc1", "doc2", "doc3", "doc4", "doc5"]
 }
 # Document collection: document id -> full text to be indexed.
 documents = {'doc1': '''Artificial Intelligence (AI) is transforming various industries through automation and advanced algorithms. Machine
 learning, a subset of AI, enables computers to learn from data and
 make predictions. Algorithms are at the core of AI systems, guiding
 decision-making and problem-solving processes. AI-powered systems
 are increasingly used in healthcare for diagnosis and treatment
 planning. The ethical implications of AI algorithms, such as bias
 and fairness, are important considerations in their development.''',
 'doc2': '''Deep learning, a branch of machine learning, uses neural
 networks to process complex data. AI algorithms are capable of
 analyzing large datasets to extract meaningful insights. Natural
 Language Processing (NLP) algorithms enable computers to understand
 and generate human language. AI-driven recommendation algorithms
 personalize user experiences in e-commerce and content platforms.
 Ensuring the transparency and accountability of AI algorithms is
 essential for building trust in AI technologies.''',
 'doc3': '''Reinforcement learning algorithms enable AI agents to learn
 through trial and error interactions with their environment. AI
 algorithms are used in financial markets for high-frequency trading
 and risk management. Computer vision algorithms enable machines to
 interpret and analyze visual information. AI algorithms can enhance
 cybersecurity by detecting and mitigating cyber threats in
 real-time. Continuous research and development are essential for
 advancing AI algorithms and overcoming their limitations.''',
 'doc4': '''Evolutionary algorithms, inspired by natural selection, are
 used to optimize complex systems and processes. AI algorithms play
 a crucial role in autonomous vehicles for navigation and
 decision-making. Quantum computing algorithms have the potential to
 revolutionize AI by solving complex problems exponentially faster.
 AI algorithms are employed in predictive maintenance to anticipate
 equipment failures and reduce downtime. Ethical guidelines and
 regulations are needed to govern the development and deployment of
 AI algorithms.''',
 'doc5': '''Genetic algorithms are used to evolve solutions to
 optimization and search problems inspired by natural selection. AI
 algorithms enable personalized content recommendations in streaming
 services and social media platforms. Swarm intelligence algorithms
 mimic the collective behavior of social insects to solve
 optimization problems. AI algorithms are used in drug discovery to
 accelerate the identification of potential treatments.
 Collaborative efforts are essential for advancing AI algorithms and
 harnessing their full potential for societal benefit.'''
 }

In [None]:
 from whoosh.fields import Schema, TEXT, ID
 from whoosh.index import create_in
 from whoosh.index import open_dir
 import os.path
 # Schema for the toy collection: a stored ID field and a stored TEXT field.
 # NOTE: from here on `schema` and `ix` refer to the toy index, replacing the
 # objects that described the StackSample index.
 schema = Schema(Id=ID(stored=True), Body=TEXT(stored=True))
 index_dir = "indexdir_toy"
 # exist_ok=True makes the cell safe to re-run without a separate existence check.
 os.makedirs(index_dir, exist_ok=True)
 # Creating the index
 ix = create_in(index_dir, schema)
 # Use the writer as a context manager: it commits when the block finishes and
 # cancels (releasing the index write lock) if adding a document raises.
 with ix.writer() as writer:
     for doc_id, text in documents.items():
         writer.add_document(Id=doc_id, Body=text)

In [None]:
 from whoosh.qparser import QueryParser
 from whoosh.scoring import TF_IDF
 from whoosh import scoring
 # Parse query q2 against the Body field of the toy index.
 qp = QueryParser("Body", schema=schema)
 query_sentence = queries['q2']
 query = qp.parse(query_sentence)
 # Rank with TF-IDF and keep the three best hits. The searcher stays open
 # because results_tfidf is read again when precision and recall are computed.
 searcher_tfidf = ix.searcher(weighting=scoring.TF_IDF())
 results_tfidf = searcher_tfidf.search(query, limit=3, scored=True)
 # Print each hit's Id and Body. The divider is printed inside the loop so that
 # every hit is followed by its own separator, as in the earlier search cells.
 for hit in results_tfidf:
     print(hit["Id"])
     print('\n')
     print(hit["Body"])
     print('\n')
     print('------------------\n')

doc3


Reinforcement learning algorithms enable AI agents to learn
through trial and error interactions with their environment. AI
algorithms are used in financial markets for high-frequency trading
and risk management. Computer vision algorithms enable machines to
interpret and analyze visual information. AI algorithms can enhance
cybersecurity by detecting and mitigating cyber threats in
real-time. Continuous research and development are essential for
advancing AI algorithms and overcoming their limitations.


doc4


Evolutionary algorithms, inspired by natural selection, are
used to optimize complex systems and processes. AI algorithms play
a crucial role in autonomous vehicles for navigation and
decision-making. Quantum computing algorithms have the potential to
revolutionize AI by solving complex problems exponentially faster.
AI algorithms are employed in predictive maintenance to anticipate
equipment failures and reduce downtime. Ethical guidelines and
regulations are needed to 

Task 3: Compute the precision and recall for the retrieved documents in the previous
 example.

In [None]:
# Ground-truth relevant documents for q2 and the Ids the search returned.
relevant_docs = relevance['q2']
retrieved_docs = [hit["Id"] for hit in results_tfidf]

# Both metrics share one numerator: the documents that are relevant and retrieved.
true_positives = set(relevant_docs).intersection(retrieved_docs)

# Precision divides by what was retrieved, recall by what should have been.
precision = len(true_positives) / len(retrieved_docs)
recall = len(true_positives) / len(relevant_docs)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Precision: 1.00
Recall: 0.60


Task 4: Modify the last code to test all queries and then report the precision and
 recall.

In [None]:
def calculate_precision_recall(query, relevant_docs, limit=3):
    """Search the index for `query` with TF-IDF ranking and score the hits.

    Args:
        query: Raw query string; it is parsed against the "Body" field.
        relevant_docs: Ids of the documents judged relevant for this query.
        limit: Maximum number of top-ranked hits to evaluate. Defaults to 3,
            the cut-off the notebook used originally.

    Returns:
        A ``(precision, recall)`` tuple of floats. Precision is 0.0 when the
        search retrieves nothing and recall is 0.0 when `relevant_docs` is
        empty, instead of raising ZeroDivisionError.
    """
    qp = QueryParser("Body", schema=schema)
    query_parsed = qp.parse(query)

    # Open the searcher in a context manager so it is closed on every call;
    # the Ids are copied out before the searcher goes away.
    with ix.searcher(weighting=TF_IDF()) as searcher:
        hits = searcher.search(query_parsed, limit=limit, scored=True)
        retrieved_docs = [hit["Id"] for hit in hits]

    # Documents that are both relevant and retrieved.
    true_positives = len(set(relevant_docs) & set(retrieved_docs))

    precision = true_positives / len(retrieved_docs) if retrieved_docs else 0.0
    recall = true_positives / len(relevant_docs) if relevant_docs else 0.0

    return precision, recall

In [None]:
# Evaluate every query in the toy set and collect (precision, recall) per query id.
results = {}
for query_key, query in queries.items():
    relevant_docs = relevance[query_key]
    precision, recall = calculate_precision_recall(query, relevant_docs)
    results[query_key] = (precision, recall)


# Report one line per query.
for query_key, (precision, recall) in results.items():
    print(f"Query '{query_key}': Precision = {precision:.2f}, Recall = {recall:.2f}")

Query 'q1': Precision = 1.00, Recall = 0.67
Query 'q2': Precision = 1.00, Recall = 0.60
