## **Importing needed Libraries**

In [1]:
#Pyterrier framework
!pip install python-terrier
!pip install nltk

Collecting python-terrier
  Downloading python-terrier-0.10.1.tar.gz (110 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/110.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wget (from python-terrier)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting matchpy (from python-terrier)
  Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.6/69.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting 

In [2]:
import pyterrier as pt
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

terrier-assemblies 5.9 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done
terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8



In [3]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import os
pd.set_option('display.max_colwidth', 150)

In [4]:
# Need to install additional terrier package for PRF. It will take around 1 min
!git clone https://github.com/terrierteam/terrier-prf/
!apt-get install maven   #used for Java projects to manage project dependencies and build processes
%cd /content/terrier-prf/
!mvn install
!pwd
%cd ..

Cloning into 'terrier-prf'...
remote: Enumerating objects: 196, done.[K
remote: Counting objects: 100% (196/196), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 196 (delta 52), reused 173 (delta 36), pack-reused 0[K
Receiving objects: 100% (196/196), 28.00 KiB | 7.00 MiB/s, done.
Resolving deltas: 100% (52/52), done.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libaopalliance-java libapache-pom-java libatinject-jsr330-api-java libcdi-api-java
  libcommons-cli-java libcommons-io-java libcommons-lang3-java libcommons-parent-java
  libgeronimo-annotation-1.3-spec-java libgeronimo-interceptor-3.0-spec-java libguava-java
  libguice-java libhawtjni-runtime-java libjansi-java libjansi-native-java libjsr305-java
  libmaven-parent-java libmaven-resolver-java libmaven-shared-utils-java libmaven3-core-java
  libplexus-cipher-java libplexus-classworlds-java libpl

## **First Step : Data Collection**

---


In [5]:
import zipfile
zip_file_name = 'cisi.zip'
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall('cisi_dataset')
!ls cisi_dataset

CISI.ALL  CISI.QRY  CISI.REL


In [6]:
def load_cisi_dataset(data_dir):
    documents_path = os.path.join(data_dir, 'CISI.ALL')
    queries_path = os.path.join(data_dir, 'CISI.QRY')
    qrels_path = os.path.join(data_dir, 'CISI.REL')

    documents_df = read_documents(documents_path)
    queries_df = read_queries(queries_path)
    qrels_df = read_qrels(qrels_path)
    return documents_df, queries_df, qrels_df

# Read documents from CISI.ALL file
def read_documents(documents_path):
    with open(documents_path, 'r') as file:
        lines = file.readlines()
    documents = []
    current_document = None
    for line in lines:
        if line.startswith('.I'):
            if current_document is not None:
                current_document['Text'] = current_document['Text'].split('\t')[0].strip()  # Remove anything after the first tab
                documents.append(current_document)
            current_document = {'ID': line.strip().split()[1], 'Text': ''}
        elif line.startswith('.T'):
            continue
        elif line.startswith('.A') or line.startswith('.B') or line.startswith('.W') or line.startswith('.X'):
            continue
        else:
            current_document['Text'] += line.strip() + ' '

    # Append the last document
    if current_document is not None:
        current_document['Text'] = current_document['Text'].split('\t')[0].strip()  # Remove anything after the first tab
        documents.append(current_document)
    documents_df = pd.DataFrame(documents)
    return documents_df

# Read queries from CISI.QRY file
def read_queries(queries_path):
    with open(queries_path, 'r') as file:
        lines = file.readlines()
    query_texts = []
    query_ids = []
    current_query_id = None
    current_query_text = []
    for line in lines:
        if line.startswith('.I'):
            if current_query_id is not None:
                query_texts.append(' '.join(current_query_text))
                current_query_text = []
            current_query_id = line.strip().split()[1]
            query_ids.append(current_query_id)
        elif line.startswith('.W'):
            continue
        elif line.startswith('.X'):
            break
        else:
            current_query_text.append(line.strip())
    # Append the last query
    query_texts.append(' '.join(current_query_text))
    queries_df = pd.DataFrame({
        'qid': query_ids,
        'raw_query': query_texts})
    return queries_df

# Read qrels from CISI.REL file
def read_qrels(qrels_path):
    qrels_df = pd.read_csv(qrels_path, sep='\s+', names=['qid','Q0','docno','label'])
    return qrels_df

In [7]:
data_dir = '/content/cisi_dataset'
documents_df, queries_df, qrels_df = load_cisi_dataset(data_dir)
documents_df['Text'][0]

"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dewey that briefly describe his system, but this is the first attempt to provide a detailed history of the work that more than any other has spurred the growth of librarianship in this country and abroad. 1"

In [8]:
documents_df

Unnamed: 0,ID,Text
0,1,"18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification. The first edi..."
1,2,"Use Made of Technical Libraries Slater, M. This report is an analysis of 6300 acts of use in 104 technical libraries in the United Kingdom. Librar..."
2,3,"Two Kinds of Power An Essay on Bibliographic Control Wilson, P. The relationships between the organization and control of writings and the organiz..."
3,4,"Systems Analysis of a University Library; final report and research project Buckland, M.K. The establishment of nine new universities in the 1960'..."
4,5,"A Library Management Game: a report on a research project Brophy, P. Although the use of games in professional education has become widespread onl..."
...,...,...
1455,1456,"World Dynamics Forrester, J.W. Over the last several decades interest in economic development, population growth, and the world environment has ex..."
1456,1457,"World Trends in Library Education Bramley, G. One of the most significant aspects of the evolution of librarianship in the twentieth century has b..."
1457,1458,"Legal Restrictions on Exploitation of the Patent Monopoly: An Economic Analysis Baxter, W.A. The patent laws confer on a patentee power to exclude..."
1458,1459,"Language and Thought Poluskin, V.A. This book considers the basic aspects of this complex problem - the historical and social essence of language ..."


In [9]:
qrels_df

Unnamed: 0,qid,Q0,docno,label
0,1,28,0,0.0
1,1,35,0,0.0
2,1,38,0,0.0
3,1,42,0,0.0
4,1,43,0,0.0
...,...,...,...,...
3109,111,422,0,0.0
3110,111,448,0,0.0
3111,111,485,0,0.0
3112,111,503,0,0.0


In [10]:
queries_df

Unnamed: 0,qid,raw_query
0,1,What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from app...
1,2,"How can actually pertinent data, as opposed to references or entire articles themselves, be retrieved automatically in response to information req..."
2,3,What is information science? Give definitions where possible.
3,4,Image recognition and any other methods of automatically transforming printed text into computer-ready form.
4,5,What special training will ordinary researchers and businessmen need for proper information management and unobstructed use of information retriev...
...,...,...
107,108,".T A Program for Machine-Mediated Searching .A Toliver, D. A technique of online instruction and assistance to bibliographic data base searchers c..."
108,109,".T Author Cocitation: A Literature Measure of Intellectual Structure .A White, H.D. Griffith, B.C. It is shown that the mapping of a particular a..."
109,110,".T Progress in Documentation. Word Processing: An Introduction and Appraisal .A Whitehead, J. The ""Office of the Future,"" ""Office Technology,"" ""W..."
110,111,".T Document Clustering Using an Inverted File Approach .A Willett, P. An automated document clustering procedure is described which does not requi..."


In [11]:
queries_df["qid"]=queries_df["qid"].astype(str)
documents_df["docno"]=documents_df["ID"].astype(str)

## **Second Step : Preprocessing**

---



In [12]:
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [13]:
def allPreprocessing(text):
    # stem text
    def stem_text(text):
        tokens = word_tokenize(text)
        stemmed_tokens = [stemmer.stem(word) for word in tokens]
        return ' '.join(stemmed_tokens)

    # cleaning
    def clean(text):
        text = re.sub(r"[\.\,\#_\|\:\?\?\/\=\@]", " ", text)  # remove special characters
        text = re.sub(r'\t', ' ', text)  # remove tabs
        text = re.sub(r'\n', ' ', text)  # remove line jump
        text = re.sub(r"\s+", " ", text)  # remove extra white space
        text = text.strip()
        return text

    # remove stopwords
    def remove_stopwords(text):
        tokens = word_tokenize(text)
        filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
        return ' '.join(filtered_tokens)

    # caLL all the previous functions
    def preprocess(sentence):
        sentence = clean(sentence)
        sentence = remove_stopwords(sentence)
        sentence = stem_text(sentence)
        return sentence

    return preprocess(text)


In [14]:
documents_df['processed_text'] = documents_df['Text'].apply(allPreprocessing)

In [15]:

queries_df["query"]=queries_df["raw_query"].apply(allPreprocessing)

## **Step Three : Indexing**

---


In [16]:
indexer = pt.DFIndexer("./MyProjectIndex", overwrite=True)

index_ref = indexer.index(documents_df["processed_text"], documents_df["docno"])

In [17]:
index = pt.IndexFactory.of(index_ref)

## **Step Four : Query Processing**

---



In [18]:
tfidf = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"})

In [19]:
# preprocess the query and ranking the doc:
query="food"
query = allPreprocessing(query)
res = tfidf.search(query)
if not res.empty:
  print(res)
else:
  print("No results found for the given query.")


  qid  docid docno  rank     score query
0   1    383   384     0  9.403942  food


## **Step 5 :Query expansion:**

In [20]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

tokenized_documents = [word_tokenize(doc) for doc in documents_df['processed_text']]

# a CBOW model
cbow_model = Word2Vec(sentences=tokenized_documents,
                      sg=0,
                      vector_size=100,
                      window=5,
                      min_count=1,
                      workers=4,
                      epochs=20)

word_embeddings = cbow_model.wv


In [21]:
print(word_embeddings.key_to_index)

{'librari': 0, 'inform': 1, 'system': 2, 'use': 3, ')': 4, '``': 5, '(': 6, 'index': 7, 'research': 8, 'retriev': 9, 'data': 10, 'studi': 11, 'scienc': 12, 'document': 13, 'develop': 14, 'book': 15, ';': 16, 'servic': 17, 'search': 18, 'problem': 19, 'one': 20, 'gener': 21, 'user': 22, "'s": 23, 'paper': 24, 'scientif': 25, 'base': 26, 'journal': 27, 'method': 28, 'present': 29, 'term': 30, 'subject': 31, 'need': 32, 'result': 33, 'literatur': 34, 'work': 35, 'comput': 36, 'analysi': 37, 'new': 38, 'process': 39, 'discuss': 40, 'public': 41, 'catalog': 42, 'commun': 43, 'program': 44, 'describ': 45, 'provid': 46, 'oper': 47, 'number': 48, '-': 49, 'relat': 50, 'univers': 51, 'structur': 52, 'year': 53, 'cost': 54, 'titl': 55, 'field': 56, 'two': 57, 'j': 58, 'classif': 59, 'evalu': 60, 'design': 61, 'collect': 62, 'time': 63, 'languag': 64, 'organ': 65, 'select': 66, 'refer': 67, 'measur': 68, 'e': 69, 'differ': 70, 'theori': 71, 'may': 72, 'abstract': 73, 'effect': 74, 'report': 75, '

In [22]:
print(word_embeddings.similarity('system', 'research'))
print(word_embeddings.most_similar('data'))

0.25279775
[('upon', 0.8870627880096436), ('127', 0.8594834804534912), ('soundli', 0.8270571231842041), ('n-ari', 0.8023766279220581), ('on-the-premis', 0.7902129292488098), ('special-purpos', 0.7812813520431519), ('ful', 0.7737325429916382), ('600', 0.7557498216629028), ("'marshal", 0.7361974716186523), ('barcer', 0.728572428226471)]


In [23]:
query="on the premis"
query = allPreprocessing(query)


In [24]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25",num_results=10)

results = bm25.search(query)
results

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,1258,1259,0,9.921656,premi
1,1,639,640,1,9.813091,premi
2,1,480,481,2,8.632372,premi
3,1,1112,1113,3,8.416337,premi
4,1,30,31,4,7.480322,premi
5,1,589,590,5,7.277969,premi


In [25]:
rm3_expander = pt.rewrite.RM3(index,fb_terms=10, fb_docs=100)

#output of the BM25 will be fed into the RM3 expander for query expansion.
rm3_qe = bm25 >> rm3_expander
expanded_query = rm3_qe.search(query).iloc[0]["query"]


print(expanded_query)

for s in expanded_query.split()[1:]:
  print(s)

print("\n" + query)


applypipeline:off affect^0.025536682 theori^0.060778100 flexibl^0.028919188 activ^0.036936983 storag^0.030431943 person^0.032221626 scientist^0.044533957 premi^0.674184561 relationship^0.029519919 creativ^0.036936983
affect^0.025536682
theori^0.060778100
flexibl^0.028919188
activ^0.036936983
storag^0.030431943
person^0.032221626
scientist^0.044533957
premi^0.674184561
relationship^0.029519919
creativ^0.036936983

premi


In [26]:
expanded_query_formatted = ' '.join(expanded_query.split()[1:])

results_wqe = bm25.search(expanded_query_formatted)

print("   Before Expansion    After Expansion")
print(pd.concat([results[['docid','score']][0:5].add_suffix('_1'),
            results_wqe[['docid','score']][0:5].add_suffix('_2')], axis=1).fillna(''))


   Before Expansion    After Expansion
   docid_1   score_1  docid_2    score_2
0     1258  9.921656     1112  10.568625
1      639  9.813091     1258  10.305098
2      480  8.632372      639  10.237941
3     1112  8.416337      480   9.420991
4       30  7.480322      589   8.389621


In [27]:
query2=" I am a Scientist"
query2 = allPreprocessing(query2)

results2 = bm25.search(query2)
results2

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,104,105,0,6.488601,scientist
1,1,601,602,1,6.431215,scientist
2,1,532,533,2,6.4086,scientist
3,1,107,108,3,5.855813,scientist
4,1,685,686,4,5.852888,scientist
5,1,1345,1346,5,5.784767,scientist
6,1,240,241,6,5.607753,scientist
7,1,759,760,7,5.598374,scientist
8,1,543,544,8,5.566454,scientist
9,1,1112,1113,9,5.552384,scientist


## **Step 7 :Evaluation**

---



> Evaluate the performance of the search engine:
Test with various queries to assess retrieval accuracy and speed.



In [28]:
# test_qu = documents_df['processed_text'].tolist()
# results = tfidf.transform(test_qu)

# eval_metrics = pt.Utils.evaluate(results, qrels_df)

# print("Evaluation Metrics:")
# print(eval_metrics)

  results = tfidf.transform(test_qu)


JavaException: JVM exception occurred: Failed to process qid 1 '18 edit dewey decim classif comaromi j p present studi histori dewey decim classif first edit ddc publish 1876 eighteenth edit 1971 futur edit continu appear need spite ddc 's long healthi life howev full stori never told biographi dewey briefli describ system first attempt provid detail histori work spur growth librarianship countri abroad 1' -- Lexical error at line 1, column 174.  Encountered: "\'" (39), after : "" org.terrier.querying.parser.QueryParserException

ELMO

In [None]:
# import tensorflow as tf
# import tensorflow_hub as hub

# elmo = hub.load("https://tfhub.dev/google/elmo/3")

In [None]:
# def elmo_embedding(query):
#     embedding = elmo.signatures["default"](tf.constant([query]))["elmo"]
#     return embedding.numpy()

# ex_queries = queries_df["raw_query"].sample(n=5, random_state=42).tolist()

# for query in ex_queries:
#     query_embedding = elmo_embedding(query)
#     print("Query:", query)
#     print("ELMo Embedding:", query_embedding)



Mono Bert

In [None]:
# #install the transformer library
# !pip install transformers

In [None]:
# from transformers import AutoTokenizer, AutoModel

# model_name = "bert-base-uncased"

# bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
# bert_model = AutoModel.from_pretrained(model_name)

In [None]:
# sep_token =bert_tokenizer.sep_token

# # print sep token of the tokenizer
# print("Sep token : ", sep_token)

# # print the token id of sep token
# print('Token ID of sep token : ',  bert_tokenizer.convert_tokens_to_ids(sep_token))

In [None]:
# text =
# encoding= bert_tokenizer.encode_plus(                  text,                      # Sentence to encode.
#                   add_special_tokens = True, # Add '[CLS]' and '[SEP]'
#                   truncation=True,
#                   max_length = 32,           # Pad & truncate all sentences.
#                   padding="max_length",
#                   return_attention_mask = True,   # Construct attention mask
#                   return_tensors = 'pt',     # Return pytorch tensors.
#               )


# # Print the input ids and attention mask of the encoded sentence
# print("Original text: ", text)
# print("Input ids: ", encoding["input_ids"].flatten(),)
# print("Attention mask: ", encoding["attention_mask"].flatten(),)
# # Note in the output of the next line that the cls, sep,and pad tokens were added automatically
# print("Tokenized text: ",bert_tokenizer.convert_ids_to_tokens(encoding["input_ids"].flatten()))

## **Step 6 :User Interface**

In [29]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.32.1-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.17.0 (from gradio)
  Downloading gradio_client-0.17.0-py3-none-any.whl (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.3/316.3 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [30]:
import gradio as gr

def search_documents(query):
    query = allPreprocessing(query)
    res = tfidf.search(query)
    if not res.empty:
        return res.to_string()
    else:
        return "No results found for the given query."

search_interface = gr.Interface(
    fn=search_documents,
    inputs="text",
    outputs="text",
    title="Document Search Engine",
    description="Enter your query to search documents from the CISI dataset."
)

search_interface.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://3ec8e572a2d0ad4f9d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




### **This is a draft of my first trials with getting the corpus using web scraping**
### **Step 1 :Data Collection**
---

>I extracted the corpus using **Web Scraping**







In [None]:
# !pip install requests
# !pip install bs4
# !pip install google-search-results
# from serpapi import GoogleSearch
# import requests
# import bs4
# from bs4 import BeautifulSoup
# import requests

In [None]:
# GoogleSearch.SERP_API_KEY = "d348865ead539de03e13a4096e618d4ed71451f80e3a30b3d4b264e018e9af85"  # my private api key

In [None]:
# from serpapi import GoogleSearch
# import requests

# # Function to fetch search results using pagination
# def fetch_search_results(query, num_pages=3):
#     api_key = "d348865ead539de03e13a4096e618d4ed71451f80e3a30b3d4b264e018e9af85"
#     search_results = []

#     for page in range(1, num_pages + 1):
#         params = {
#             "q": query,
#             "engine": "google",
#             "api_key": api_key,
#             "start": (page - 1) * 10,  # Calculate the start index for pagination
#             "output": "json"
#         }

#         response = requests.get("https://serpapi.com/search", params=params)
#         if response.status_code == 200:
#             data = response.json()
#             search_results.extend(data.get("organic_results", []))
#         else:
#             print(f"Failed to fetch search results for page {page}")

#     return search_results

# # Fetch search results for the query "easy food recipes" with pagination
# search_results = fetch_search_results("easy food recipes", num_pages=20)
# print("Number of search results:", len(search_results))
# print("Sample search results:")
# n = len(search_results)
# for result in (search_results[:n]):
#     print(result,result.get("title", "N/A"), "-", result.get("link", "N/A"))


In [None]:
# search_param = GoogleSearch({
#    "q": "easy food recipes",
#     "output": "json",
#     "engine": "google",
#     "start" : 20
#     "api_key":"d348865ead539de03e13a4096e618d4ed71451f80e3a30b3d4b264e018e9af85"
#     })
# #
# results1 = search_param.get_json()
# results1

In [None]:
# loc_res = results1["organic_results"]
# loc_res


In [None]:
# links = []
# for i in range(len(loc_res)):
#   links.append(loc_res[i]["link"])

In [None]:
# links

In [None]:
# from bs4 import BeautifulSoup
# import requests

# corpus = {}
# failed = []

# for url in links:
#     response = requests.get(url)
#     if response.status_code == 200:  # 200 indicates success
#         # using BeautifulSoup
#         soup = BeautifulSoup(response.text, 'html.parser')

#         main_content = soup.find_all('p')    # The <p> HTML tag contains most of the important elements
#         extracted_text = ''
#         for element in main_content:
#             extracted_text += element.get_text() + '\n'

#         # Clean empty lines/spaces
#         cleaned_text = ''
#         for line in extracted_text.splitlines():
#             if line.strip():
#                 cleaned_text += line.strip() + '\n'

#         # Add the link and its corresponding text snippet to the corpus dictionary
#         corpus[url] = cleaned_text
#     else:
#         print(f"Failed to fetch URL: {url}")
#         failed.append(url)

# corpus
# print(failed)




# corpus = ""  # I will concatenate all the text to this string to get the whole corpus
# failed = []

# for url in links:
#     response = requests.get(url)
#     if response.status_code == 200:  # 200 indicates success
#         # using BeautifulSoup
#         soup = BeautifulSoup(response.text, 'html.parser')

#         main_content = soup.find_all('p')    # The <p> HTML tag contains most of the important elements
#         extracted_text = ''
#         for element in main_content:
#             extracted_text += element.get_text() + '\n'

#         # clean empty lines/spaces
#         cleaned_text = ''
#         for line in extracted_text.splitlines():
#             if line.strip():
#                 cleaned_text += line.strip() + '\n'

#         corpus += cleaned_text + "\n"
#     else:
#         print(f"Failed to fetch URL: {url}")
#         failed.append(url)


In [None]:
# print(corpus)

In [None]:
# # Save the corpus to a file
# with open("corpus.txt", "w") as file:
#     file.write(corpus)

In [None]:
# print(failed)
# #  so i can delete these links from the list
# # del links[4]

In [None]:
# with open("corpus.txt", "r") as file:
#     lines = file.readlines()

In [None]:
# # just skipping here the first 2 lines (avoiding charac like ::)
# lines = lines[2:]
# data = []
# #  I read the file into lines & each line to a document to have the whole collection as df
# for i, line in enumerate(lines, start=1):
#     data.append([i, line.strip()])

# df = pd.DataFrame(data, columns=["docno", "raw_text"])

# df

In [None]:
# df['processed_text'] = df['raw_text'].apply(allPreprocessing)
# df.drop(columns='raw_text', inplace=True)
# #inplace to delete the original col
# print("data frame after preprocessing:")
# df