<a href="https://colab.research.google.com/github/NourhanDeifSayed/Search-Engine/blob/main/copy_of_final_project_data_minig_and_retrial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Unpack the CISI archive (looked up relative to the current working directory)
# into the ./cisi_dataset folder.
import zipfile
zip_file_name = 'cisi.zip'
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall('cisi_dataset')
# List what was extracted.
!ls cisi_dataset


In [None]:
import re
import os

In [None]:
def load_cisi_dataset(data_dir):
    """Load the three CISI files stored under ``data_dir``.

    Args:
        data_dir: Directory containing ``CISI.ALL``, ``CISI.QRY`` and ``CISI.REL``.

    Returns:
        A ``(documents_df, queries_df, qrels_df)`` tuple, produced by
        ``read_documents``, ``read_queries`` and ``read_qrels`` respectively.
    """
    parsed_documents = read_documents(os.path.join(data_dir, 'CISI.ALL'))
    parsed_queries = read_queries(os.path.join(data_dir, 'CISI.QRY'))
    parsed_qrels = read_qrels(os.path.join(data_dir, 'CISI.REL'))
    return parsed_documents, parsed_queries, parsed_qrels

# Read documents from CISI.ALL file
def read_documents(documents_path):
    """Parse a CISI.ALL-style file into a DataFrame of documents.

    A record starts at a ``.I <id>`` line. Lines beginning with ``.T``, ``.A``,
    ``.B``, ``.W`` or ``.X`` are section markers and never become part of the text.
    The content of the ``.T``/``.A``/``.B``/``.W`` sections is concatenated
    into ``Text``; the content of the ``.X`` section is skipped entirely.

    This replaces the earlier "cut everything after the first tab" clean-up,
    which left the first cross-reference number glued to the text and would
    have truncated any document whose body contained a tab.

    Args:
        documents_path: Path to the documents file.

    Returns:
        pandas.DataFrame with string columns ``ID`` and ``Text``, one row per
        ``.I`` record in file order (empty, with both columns, if the file
        holds no records).

    Raises:
        IndexError: if a ``.I`` line carries no identifier.
    """
    section_markers = ('.T', '.A', '.B', '.W', '.X')
    records = []           # (doc_id, [text fragments]) in file order
    skip_section = False   # True while inside a .X section
    with open(documents_path, 'r') as file:
        for line in file:  # stream the file instead of loading every line up front
            if line.startswith('.I'):
                records.append((line.strip().split()[1], []))
                skip_section = False
            elif line.startswith(section_markers):
                skip_section = line.startswith('.X')
            elif records and not skip_section:
                # `records` guard: ignore stray content before the first .I
                # (previously a TypeError on None).
                # split/join collapses tabs and repeated blanks to single spaces.
                fragment = ' '.join(line.split())
                if fragment:
                    records[-1][1].append(fragment)

    documents = [{'ID': doc_id, 'Text': ' '.join(parts)} for doc_id, parts in records]
    documents_df = pd.DataFrame(documents, columns=['ID', 'Text'])
    return documents_df

# Read queries from CISI.QRY file
def read_queries(queries_path):
    """Parse a CISI.QRY-style file into a DataFrame of queries.

    A record starts at a ``.I <id>`` line. Lines beginning with ``.T``, ``.A``,
    ``.B``, ``.W`` or ``.X`` are section markers and are never copied into the
    query text. Content of every section except ``.X`` is joined with single
    spaces; ``.X`` content is skipped, and parsing then continues with the
    next ``.I`` record (the previous version stopped reading the whole file
    at the first ``.X`` and leaked ``.T``/``.A``/``.B`` marker lines into the
    text).

    Args:
        queries_path: Path to the queries file.

    Returns:
        pandas.DataFrame with columns ``qid`` (string id) and ``raw_query``,
        one row per ``.I`` record in file order; empty if the file has none.

    Raises:
        IndexError: if a ``.I`` line carries no identifier.
    """
    section_markers = ('.T', '.A', '.B', '.W', '.X')
    records = []           # (qid, [text fragments]) in file order
    skip_section = False   # True while inside a .X section
    with open(queries_path, 'r') as file:
        for line in file:
            if line.startswith('.I'):
                records.append((line.strip().split()[1], []))
                skip_section = False
            elif line.startswith(section_markers):
                skip_section = line.startswith('.X')
            elif records and not skip_section:
                fragment = line.strip()
                if fragment:
                    records[-1][1].append(fragment)

    # Both columns are derived from the same list, so their lengths always match
    # (an empty file used to yield 0 ids vs. 1 text and raise ValueError).
    queries_df = pd.DataFrame({
        'qid': [qid for qid, _ in records],
        'raw_query': [' '.join(parts) for _, parts in records]})
    return queries_df

# Read qrels from CISI.REL file
def read_qrels(qrels_path):
    """Read whitespace-separated relevance judgements into a DataFrame.

    Args:
        qrels_path: Path to the judgements file (four columns per line,
            separated by any run of spaces/tabs, no header row).

    Returns:
        pandas.DataFrame whose columns are named, in file order,
        ``qid``, ``Q0``, ``docno`` and ``label``; dtypes are inferred by pandas.

    NOTE(review): these names follow the TREC qrels layout (qid, iteration,
    docno, label). Confirm that CISI.REL really uses this column order — if its
    second column is the document id, ``docno``/``label`` are mislabelled and
    any evaluation against this frame is meaningless.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    qrels_df = pd.read_csv(qrels_path, sep=r'\s+', names=['qid', 'Q0', 'docno', 'label'])
    return qrels_df

In [None]:
import pandas as pd

In [None]:
# Same relative folder the archive was extracted into above, so the notebook
# also runs when the working directory is not Colab's /content.
data_dir = 'cisi_dataset'
documents_df, queries_df, qrels_df = load_cisi_dataset(data_dir)
# Peek at the text of the first parsed document (positional, no chained label lookup).
documents_df['Text'].iloc[0]

In [None]:
# Display the parsed documents frame.
documents_df

In [None]:
# Display the relevance judgements frame.
qrels_df

In [None]:
# Add a string-typed copy of the document ID as 'docno'; this is the column
# handed to the indexer and compared with retrieval results further down.
documents_df["docno"]=documents_df["ID"].astype(str)
documents_df

In [None]:
# Make sure query IDs are stored as strings.
queries_df["qid"]=queries_df["qid"].astype(str)
queries_df

In [None]:
!pip install python-terrier
!pip install nltk

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pyterrier as pt
# Tokenizer data required by word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of 'punkt', so fetch both; downloading the one
# the installed version does not use is harmless.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Initialise PyTerrier once per kernel, requesting the terrier-prf package
# as a boot package.
# NOTE(review): the "-SNAPSHOT" version presumably resolves against the local
# Maven build done in a later cell (git clone + mvn install) — confirm this
# cell order works on a fresh kernel.
if not pt.started():

  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

In [None]:
# Fetch NLTK's stop-word lists and keep the English one as a set
# (used for membership tests in remove_stopwords).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Show the full English stop-word list for reference.
print(stopwords.words('english'))

In [None]:
# Shared Porter stemmer instance used by Steem_text.
stemmer = PorterStemmer()


In [None]:
def Steem_text(text):
    """Tokenize ``text`` and return its Porter-stemmed tokens joined by single spaces."""
    return ' '.join(map(stemmer.stem, word_tokenize(text)))


# Text cleaner applied to both documents and queries.
def clean(text):
    """Strip URLs, retweet markers, @handles and selected punctuation from ``text``.

    Each removed span is replaced by a space, then all whitespace runs
    (including tabs and newlines) are collapsed to one space and the result
    is trimmed.

    Args:
        text: Input string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = re.sub(r"http\S+", " ", text)  # URLs
    # Stand-alone "RT" token only; the previous pattern "RT " also matched
    # inside words (e.g. "SMART " became "SMA ").
    text = re.sub(r"\bRT\b", " ", text)
    text = re.sub(r"@\w*", " ", text)  # @handles
    text = re.sub(r"[.,#_|:?/=]", " ", text)  # selected punctuation / special characters
    text = re.sub(r"\s+", " ", text)  # tabs, newlines and repeated blanks
    return text.strip()



def remove_stopwords(text):
    """Lower-case the tokens of ``text`` and drop English stop words.

    Args:
        text: Input string; tokenized with ``word_tokenize``.

    Returns:
        The remaining lower-cased tokens joined by single spaces.
    """
    # The per-call debug print of the token list was removed: it ran once per
    # document and per web request, flooding the notebook output.
    lowered_tokens = (word.lower() for word in word_tokenize(text))
    return ' '.join(word for word in lowered_tokens if word not in stop_words)



# Queries go through exactly the same pipeline as the documents.
def preprocess(sentence):
  """Run stop-word removal, then cleaning, then stemming on ``sentence`` and return the result."""
  without_stopwords = remove_stopwords(sentence)
  cleaned = clean(without_stopwords)
  return Steem_text(cleaned)


In [None]:
# Run the preprocessing pipeline over every document's text.
documents_df['processed_text'] = documents_df['Text'].apply(preprocess)
documents_df

In [None]:
# Run the same preprocessing pipeline over every raw query.
queries_df["query"]=queries_df["raw_query"].apply(preprocess)
queries_df

In [None]:
# Index the preprocessed text under ./DatasetIndex, keyed by docno
# (overwrite=True is passed to the indexer).
indexer = pt.DFIndexer("./DatasetIndex", overwrite=True)
index_ref = indexer.index(documents_df["processed_text"], documents_df["docno"])
print(index_ref.toString())

In [None]:
# Open the index from its reference and print its collection statistics.
print(index_ref.toString())

index = pt.IndexFactory.of(index_ref)

print(index.getCollectionStatistics().toString())

In [None]:
# Print every lexicon entry as "<term> -> <entry statistics>".
for entry in index.getLexicon():
  print(f"{entry.getKey()} -> {entry.getValue().toString()} ")

In [None]:
# Term to look up in the index, passed through the same preprocessing as the documents.
que="problems"
que=preprocess(que)

In [None]:
# Look up the lexicon entry for the preprocessed term and print its postings.
# Only the lookup is guarded: errors raised while iterating the postings now
# surface instead of being reported as "term not found".
try:
    pointer = index.getLexicon()[que]
except Exception:
    # The exact exception raised for an unknown term depends on the PyTerrier
    # version, so any lookup failure is treated as "missing".
    pointer = None

if pointer is None:
    print("term not found")
else:
    for posting in index.getInvertedIndex().getPostings(pointer):
        print(posting.toString(), "doclen=%d" % posting.getDocumentLength())


In [None]:
# Ad-hoc free-text query, preprocessed like the indexed documents.
Query="I love reading reading books"
Query=preprocess(Query)

In [None]:
# Retriever over the index using the TF_IDF weighting model, with num_results=15.
tfidf_retr = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"},num_results=15)

In [None]:
# Run the ad-hoc query and display the returned results frame.
results=tfidf_retr.search(Query)
results

In [None]:
# Original text of the documents whose docno appears in rows labelled 0-4 of
# `results` (rows come back in documents_df order, not rank order).
documents_df.loc[documents_df['docno'].isin(results['docno'].loc[0:4].tolist()), ['Text']]

In [None]:

# Clone terrier-prf and build/install it with Maven, then return to the parent directory.
# NOTE(review): pt.init in an earlier cell already requests the terrier-prf
# SNAPSHOT package — presumably this build has to exist before that call on a
# fresh kernel; confirm the intended cell order.
!git clone https://github.com/terrierteam/terrier-prf/
!apt-get install maven
%cd /content/terrier-prf/
!mvn install
!pwd
%cd ..

In [None]:

# RM3 query rewriter configured with fb_terms=10 and fb_docs=100.
rm3_expander = pt.rewrite.RM3(index,fb_terms=10, fb_docs=100)

# Pipeline: TF-IDF retrieval whose output is fed into the RM3 rewriter.
rm3_qe = tfidf_retr>> rm3_expander
# The rewritten query is read from the 'query' column of the first output row.
expanded_query = rm3_qe.search(Query).iloc[0]["query"]

expanded_query

In [None]:
# Print each token of the expanded query except the first one
# (presumably a control prefix rather than a term — confirm), then the original query.
for s in expanded_query.split()[1:]:
  print(s)

print("\n" + Query)

In [None]:
# Drop the first whitespace-separated token of the expanded query (presumably a
# control prefix added by the rewriter — confirm) and retrieve again with the rest.
expanded_query_formatted = ' '.join(expanded_query.split()[1:])

results_wqe = tfidf_retr.search(expanded_query_formatted)

# Side-by-side docid/score of the first five rows before and after expansion.
print("   Before Expansion    After Expansion")
print(pd.concat([results[['docid','score']][0:5].add_suffix('_1'),
            results_wqe[['docid','score']][0:5].add_suffix('_2')], axis=1).fillna(''))

# Text of the top 5 documents retrieved with the expanded query.
# .head(5) replaces .loc[0:5]: label slicing is end-inclusive and selected six rows.
documents_df[['Text']][documents_df['docno'].isin(results_wqe['docno'].head(5).tolist())]

In [None]:
# One-column ('Text') frame for the documents listed in rows labelled 0 through 10
# of results_wqe (label slicing with .loc is end-inclusive).
Sentences=documents_df[['Text']][documents_df['docno'].isin(results_wqe['docno'].loc[0:10].tolist())]

In [None]:

# docnos from rows labelled 0 through 10 of the expanded-query results.
docnos = results_wqe['docno'].loc[0:10].tolist()


# Raw text of those documents as a plain Python list (documents_df order).
sentences_list = documents_df.loc[documents_df['docno'].isin(docnos), 'Text'].tolist()


print(sentences_list)

In [None]:
# Ask the Colab front end for the proxy address of port 5000 (the port the
# Flask app below is started on) and print it.
from google.colab.output import eval_js
print(eval_js('google.colab.kernel.proxyPort(5000)'))

In [None]:
from flask import Flask, request, send_from_directory
import os
import pyterrier as pt
from datetime import datetime
# Flask application object the routes below are registered on.
app = Flask(__name__)


# Rebuild and reopen the index (same statements as the earlier indexing cells);
# this rebinds the notebook-level `indexer`, `index_ref` and `index` names.
indexer = pt.DFIndexer("./DatasetIndex", overwrite=True)
index_ref = indexer.index(documents_df["processed_text"], documents_df["docno"])
index = pt.IndexFactory.of(index_ref)


@app.route("/")
def home():
    """Serve the single-page search UI.

    The page holds a text box and a button; its script POSTs the entered
    query to ``/search`` (on button click or Enter) and injects the returned
    HTML into the ``#result`` element.
    """
    html_search = """
    <!DOCTYPE html>
    <html>
    <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Search Engine</title>
    <link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css" rel="stylesheet" id="bootstrap-css">
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js"></script>
    <script>
    $(document).ready(function(){
        $('#search_button').click(function(){
            var query = $('#search_text').val();
            load_data(query);
        });
        $('#search_text').keyup(function(e){
            if(e.which == 13){
                var query = $(this).val();
                load_data(query);
            }
        });
        function load_data(query) {
            $.ajax({
                url: "/search",
                method: "POST",
                data: {query: query},
                success: function(data) {
                    $('#result').html(data);
                },
                error: function(xhr, status, error) {
                    console.error('Error:', error);
                }
            });
        }
    });
    </script>
    </head>
    <body>
    <div class="container search-table">
        <p><h2 align="center">Search for anything on your mind</h2></p>
        <div class="search-box">
            <div class="row">
                <div class="col-md-3">
                    <h5>Enter Your Queries</h5>
                </div>
                <div class="col-md-6">
                    <input type="text" name="search_text" id="search_text" class="form-control" placeholder="Search For Everything here">
                </div>
                <div class="col-md-3">
                    <button id="search_button" class="btn btn-primary" style="background-color: #007bff; border-color: #007bff;">Search</button>
                </div>
            </div>
        </div>
        <div id="result"></div>
    </div>
    <style>
    .search-table{
        padding: 10%;
        margin-top: -6%;
    }
    .search-box{
        background:  #add8e6;
        border: 1px solid #ababab;
        padding: 3%;
    }
    .search-box input:focus{
        box-shadow:none;
        border:2px solid #eeeeee;
    }
    .search-list{
        background: #add8e6;
        border: 1px solid #ababab;
        border-top: none;
    }
    .search-list h3{
        background: #add8e6;
        color: #000080;
        padding: 3%;
        margin-bottom: 0%;
    }
    </style>
    </body>
    </html>
    """
    return html_search
@app.route("/search", methods=['POST'])
def search():
    """Handle the AJAX search request and return an HTML fragment.

    Reads the ``query`` form field, preprocesses it with ``preprocess`` and
    runs it through ``tfidf_retr``. The fragment lists the docnos of the first
    six results, then the docno and text of those documents in the same rank
    order, then the retrieval time. If the query is empty after preprocessing
    or nothing is retrieved, a "No results found." fragment is returned; any
    exception is reported as an error paragraph instead of an HTTP 500.

    All dynamic values are HTML-escaped, because the page inserts this
    fragment with ``.html()``.
    """
    from html import escape  # local import: the cell's import list stays unchanged

    try:
        if 'query' not in request.form:
            raise ValueError("Query not found in form data.")

        query = preprocess(request.form['query'])
        if not query:
            # Nothing left after stop-word removal / cleaning: skip the retriever.
            return "<p>No results found.</p>"

        start_time = datetime.now()
        # .search() is the single-query entry point; .transform() expects a
        # DataFrame of queries.
        results = tfidf_retr.search(query)
        end_time = datetime.now()
        execution_time = (end_time - start_time).total_seconds()

        if results.empty:
            return "<p>No results found.</p>"

        results = results.head(6)
        # Merge on docno with the ranked frame on the left so texts come back in
        # rank order (an isin() filter returned them in corpus order).
        relevant_documents = results[['docno']].merge(
            documents_df[['docno', 'Text']], on='docno', how='inner')

        parts = ["<h3>Search Results:</h3>", "<ul>"]
        for docno in results['docno']:
            parts.append("<li>Document: {}</li>".format(escape(str(docno))))
        parts.append("</ul>")

        parts.append("<h3>Relevant Documents:</h3>")
        parts.append("<ul>")
        for _, doc in relevant_documents.iterrows():
            parts.append("<li>Document: {}, Text: {}</li>".format(
                escape(str(doc['docno'])), escape(str(doc['Text']))))
        parts.append("</ul>")

        parts.append("<p>Execution Time: {:.4f} seconds</p>".format(execution_time))
        output_html = "".join(parts)
    except Exception as e:
        # Escaped: exception messages may contain markup or echoed input.
        output_html = f"<p>An error occurred: {escape(str(e))}</p>"
    return output_html



@app.route('/favicon.ico')
def favicon():
  """Serve ``favicon.ico`` from the ``static`` folder under the application root."""
  static_dir = os.path.join(app.root_path, 'static')
  return send_from_directory(static_dir, 'favicon.ico', mimetype='image/vnd.microsoft.icon')

# Start Flask's built-in server on port 5000 (the port queried through the
# Colab proxy above). app.run() blocks, so this cell keeps running while the
# server is up.
if __name__ == '__main__':
    app.run(port=5000)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

# Load the ELMo (v3) module from TensorFlow Hub.
elmo = hub.load("https://tfhub.dev/google/elmo/3")

In [None]:
# Run the module's "default" signature on the retrieved document texts and keep its "elmo" output.
embeddings = elmo.signatures["default"](tf.constant(sentences_list))["elmo"]

In [None]:
# Pick three individual vectors out of the ELMo output for inspection.
# NOTE(review): indexing presumably means [document][token position]; the
# positions 7, 5 and 8 are hard-coded — confirm they are valid for every
# selected document and what they are meant to illustrate.
embedding_1 = embeddings.numpy()[0][7]
embedding_2= embeddings.numpy()[1][5]
featuresOf_3= embeddings.numpy()[2][8]


print("Embedding vector embedding_1:", embedding_1)
print("Embedding vector embedding_2:",embedding_2)
print("Embedding vector featuresOf_3:",featuresOf_3)

In [None]:

import gensim
from gensim.models import Word2Vec

In [None]:
# Word2Vec expects an iterable of token lists. Passing the raw strings made
# gensim iterate each document character by character, i.e. it learned
# embeddings for single characters. Tokenize (lower-cased) first.
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences_list]

# Skip-gram (sg=1) model with 100-dimensional vectors, a context window of 2,
# every token kept (min_count=1), trained for 20 epochs on 4 worker threads.
model = Word2Vec(sentences=tokenized_sentences,
                 sg=1,
                 vector_size=100,
                 window=2,
                 min_count=1,
                 workers=4,
                 epochs=20)

In [None]:
# Keyed vectors (token -> embedding) of the trained Word2Vec model.
word_embeddings = model.wv

In [None]:
# Per-query MAP / recall / precision of the expanded-query run against the
# relevance judgements. Stored as `evaluation` rather than `eval` so Python's
# built-in eval() is not shadowed for the rest of the kernel session.
# NOTE(review): results_wqe comes from an ad-hoc query, not from queries_df —
# confirm that its qid/docno values (and their dtypes) actually match entries
# in qrels_df before trusting these numbers.
evaluation = pt.Evaluate(results_wqe,qrels_df,metrics=["map","recall","P"], perquery=True)
evaluation