**Semantic search**

Using the latest insights from NLP research, it is possible to train a Language Model on a large corpus of documents. Afterwards, the model is able to represent documents based on their “semantic” content. In particular, this includes the possibility to search for documents with semantically similar content.

Semantic search means understanding the intent behind the query and representing the “knowledge” in a way suitable for meaningful retrieval.


In [None]:
from google.colab import drive
import pandas as pd
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install farasapy

Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14


**Keyword Search Vs Semantic Search**

At first, search engines were lexical: the search engine looked for literal matches of the query words, without understanding the query’s meaning, and only returned links that contained the exact query. With regular keyword search, a document either contains the given word or it does not, and there is no middle ground.

On the other hand, **“Semantic Search”** can simplify query building, because it is supported by automated natural language processing programs, i.e. using Latent Semantic Indexing — a concept that search engines use to discover how a keyword and content work together to mean the same thing.

In [None]:
import nltk
nltk.download('punkt')
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
import pandas as pd
data = pd.read_json('/content/drive/MyDrive/Freelancing/Project- 26/All_services_list_ar.json')
# data = data[['name','unique_id','transaction_id','description','eligibility']]
data.head(1)

Unnamed: 0,id,name,parent_id,transaction_id,unique_id,description,eligibility,required_documents,average_waiting_time,output,...,process,process_time,related_documents,faqs,apply_now_link,most_popular,service_classification,disclaimer,updated_at,channels
0,3704682,طلب تجديد تصريح تعلم القيادة,534a01e4-d350-453f-a035-decf9d67676b,786,70203,تُمكِّنك هذه الخدمة من تجديد تصريح تعلُّم قياد...,يحقُّ للأفراد المواطنين والمقيمين الحصول على ه...,<ol><li>الهوية الإماراتية الأصليّة.</li></ol>,<ul><li>10 دقائق كحدّ أقصى.</li></ul>,<p>تصريح تعلُّم قيادة المركبات المُجدَّد.</p>,...,<ul><li>يقدّم المتعامل الهوية الإماراتية الأصل...,<ul><li>زيارة واحدة إلى معهد التعلّم.</li></ul>,,<ul><li><strong>السؤال الأول:</strong><span> ف...,,False,ترخيص السائقين,,2021-10-22 06:20:31,[{'title': 'القنوات - الشركاء ومزودو الخدمات /...


In [None]:
# !pip install spacy
# !pip install requests --upgrade
# !pip install git+https://github.com/ozgur/python-firebase
import nltk
nltk.download('punkt')
!pip install farasapy

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import string
import os
import re
# import preprocessor as p
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag


class CleaningText:
    """Normalise English text for keyword matching.

    The pipeline lower-cases the input, strips stand-alone numbers, stop
    words, single-character tokens and punctuation, and finally lemmatises
    every remaining word.  Characters outside the ASCII set kept by the
    regex in ``clean_text`` (letters, digits and a few punctuation marks)
    are replaced by spaces, so non-Latin text is removed by this cleaner.
    """

    def __init__(self):
        self.punctuation = list(string.punctuation)
        self.stop = stopwords.words('english') + self.punctuation + ['rt', 'via', 'with', 'new', 'get', 'it', 'go', "you"]

        # Negations change the meaning of a sentence, so they are removed
        # from the stop-word list and therefore survive cleaning.
        self.not_stopwords = ["not", "no", "neither","don't","wouldn't","wouldn","did'nt",
                         "off","didn't","hadn't","mightn't","wasn't","isn't","couldn't","shouldn't", "won't","isn't",
                         "nor","weren't","doesn't","hasn't","haven't","shouldn","mustn't"]

        self.stop = set([word for word in self.stop if word not in self.not_stopwords])
        # Translation table that deletes every ASCII punctuation character.
        self.table = str.maketrans({key: None for key in string.punctuation})
        self.lemma = WordNetLemmatizer()

    def remove_numbers(self, text):
        """Drop integers/decimals that are followed by whitespace.

        Args:
            text: The string to process.

        Returns:
            ``text`` without the matched numbers.  Input that is not a
            string is returned unchanged.
        """
        try:
            return re.sub(r'\b\d+(?:\.\d+)?\s+', '', text)
        except TypeError:
            # re.sub rejects non-string input; hand it back untouched.
            return text

    def clean_text(self, x):
        """Return the cleaned, lemmatised form of ``x``.

        Args:
            x: Raw text (``str``).

        Returns:
            A single-space-separated string of lemmatised tokens.
        """
        x = self.remove_numbers(x.lower())
        x = " ".join(token for token in x.split() if token not in self.stop)
        # Strip leading non-word characters, then turn an isolated symbol
        # between two words into a comma separator.
        x = re.sub(r"^[\W]*", "", x)
        x = re.sub(r"\s[\W]\s", ", ", x)
        x = " ".join(token for token in x.split() if len(token) > 1)
        x = re.sub(r"[^A-Z/a-z0-9(),!?\'\`.]", " ", x)
        x = x.translate(self.table)

        # The lemmatised text is the result; previously it was computed
        # and then discarded in favour of the un-lemmatised string.
        return " ".join(self.lemma.lemmatize(word) for word in x.split())
    

In [None]:
clean_text_ = CleaningText()

In [None]:
clean_text_.clean_text("I am Noman khan playing")

'noman khan playing'

In [None]:
from gensim.similarities import MatrixSimilarity
from operator import itemgetter

import string
import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import string
import gensim
import operator
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from gensim import corpora
from gensim import corpora
import nltk


# Service catalogue, loaded once at module level. SearchTerm.search_similar_terms
# reads this global both to build the models and to look up the matched rows.
data = pd.read_json('/content/drive/MyDrive/Freelancing/Project- 26/All_services_list_ar.json')
# Stop-word file read by SearchTerm.__init__ with pd.read_csv(header=None);
# presumably one stop word per row in the first column — TODO confirm.
stop_word_path = "stop.txt"
class SearchTerm:
  """TF-IDF + LSI similarity search over the module-level ``data`` frame.

  Each row of ``data`` is turned into one document (name, output and
  eligibility text), the documents are projected into an LSI space, and a
  query is ranked against them by similarity.

  NOTE(review): ``tokenizer`` and ``get_models`` call
  ``self.clean.arabic_preprocessing`` and ``self.clean.cleanhtml``; confirm
  that the ``CleaningText`` in scope provides both methods.
  """

  def __init__(self):
    # Punctuation and stop words are loaded as attributes; the search
    # pipeline in this class does not read them.
    self.punctuations = string.punctuation
    self.stop_words = pd.read_csv(stop_word_path, header=None)
    self.stop_words = self.stop_words[0].unique().tolist()
    self.clean = CleaningText()

  @staticmethod
  def _native(value):
    """Convert a numpy scalar to the plain Python value (JSON-friendly)."""
    return value.item() if hasattr(value, 'item') else value

  def search_similar_terms(self, search_term, num_best=1):
    """Return the rows of ``data`` most similar to ``search_term``.

    Args:
      search_term: Free-text query (``str``).
      num_best: Positive number of hits to return.  Defaults to 1.

    Returns:
      A list of at most ``num_best`` dicts, best match first, with the keys
      ``Relevance_score`` (similarity as an integer percentage),
      ``unique_id``, ``id`` and ``name``.

    Raises:
      Exception: whatever ``get_models`` or the tokenizer raises.
    """
    # NOTE(review): the models are rebuilt from ``data`` on every call,
    # which repeats the whole training for each query.
    legal_tfidf_model, legal_lsi_model, legal_lsi_corpus, dictionary = self.get_models(data)

    legal_index = MatrixSimilarity(legal_lsi_corpus, num_features=legal_lsi_corpus.num_terms)
    legal_index.num_best = num_best

    query_bow = dictionary.doc2bow(self.tokenizer(search_term))
    query_lsi = legal_lsi_model[legal_tfidf_model[query_bow]]

    hits = sorted(legal_index[query_lsi], key=itemgetter(1), reverse=True)[:num_best]

    # A hit's first element is the document's position in the corpus, i.e.
    # the row position in ``data``, hence positional ``.iloc`` lookups.
    return [
      {
        'Relevance_score': int(round((score * 100), 2)),
        'unique_id': self._native(data['unique_id'].iloc[doc_pos]),
        'id': self._native(data['id'].iloc[doc_pos]),
        'name': self._native(data['name'].iloc[doc_pos]),
      }
      for doc_pos, score in hits
    ]

  def get_data(self, data):
    """Add a ``Content`` column combining name, output and eligibility.

    The three fields are joined with single spaces so the last word of one
    field is not fused with the first word of the next; missing values are
    treated as empty text.

    Args:
      data: DataFrame with ``name``, ``output`` and ``eligibility`` columns.
        It is modified in place.

    Returns:
      The same DataFrame, now carrying the ``Content`` column.

    Raises:
      KeyError: if one of the required columns is absent.
    """
    try:
      name, output, eligibility = (
        data[column].fillna('').astype(str)
        for column in ('name', 'output', 'eligibility')
      )
      data['Content'] = name + ' ' + output + ' ' + eligibility
      return data
    except Exception as e:
      print("There's something while getting data",e)
      # Re-raise: returning None here only moved the failure downstream.
      raise

  def tokenizer(self,sentence):
    """Clean ``sentence`` and split it into a list of word tokens."""
    sentence = self.clean.arabic_preprocessing(sentence)

    # remove distracting single quotes
    sentence = re.sub('\'','',sentence)

    # remove digits and words containing digits
    sentence = re.sub(r'\w*\d\w*','',sentence)

    # replace extra spaces with single space
    sentence = re.sub(' +',' ',sentence)

    # remove unwanted lines starting with special characters
    # NOTE(review): the two patterns containing '' cannot match because
    # single quotes were already removed above — confirm intended order.
    sentence = re.sub(r'\n: \'\'.*','',sentence)
    sentence = re.sub(r'\n!.*','',sentence)
    sentence = re.sub(r'^:\'\'.*','',sentence)

    # replace new line characters with spaces
    sentence = re.sub(r'\n',' ',sentence)

    # replace punctuation with spaces
    sentence = re.sub(r'[^\w\s]',' ',sentence)

    return nltk.word_tokenize(sentence)

  def get_models(self,data):
    """Build the dictionary, TF-IDF model and LSI model for ``data``.

    Side effects: ``data`` is modified in place (``output`` is passed
    through ``cleanhtml``; ``Content`` and ``content_tokenized`` columns are
    added) and the files ``legal_tfidf_model_mm`` and ``legal_lsi_model_mm``
    are written to the working directory.

    Args:
      data: DataFrame accepted by ``get_data``.

    Returns:
      Tuple ``(tfidf_model, lsi_model, lsi_corpus, dictionary)``.

    Raises:
      Exception: any failure is printed and then re-raised.
    """
    try:
      data['output'] = data['output'].apply(self.clean.cleanhtml)
      data = self.get_data(data)
      data['content_tokenized'] = data['Content'].map(self.tokenizer)
      legal_content = data['content_tokenized']
      dictionary = corpora.Dictionary(legal_content)

      # split() is required: set() over the bare string would yield single
      # characters instead of the stop words.
      stoplist = set('hello and if this can would should could tell ask stop come go'.split())
      stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
      dictionary.filter_tokens(stop_ids)

      corpus = [dictionary.doc2bow(desc) for desc in legal_content]
      legal_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
      legal_lsi_model = gensim.models.LsiModel(legal_tfidf_model[corpus], id2word=dictionary, num_topics=300)

      # The LSI corpus is written to disk and read back as an MmCorpus; the
      # caller reads ``num_terms`` from that object.
      gensim.corpora.MmCorpus.serialize('legal_tfidf_model_mm', legal_tfidf_model[corpus])
      gensim.corpora.MmCorpus.serialize('legal_lsi_model_mm',legal_lsi_model[legal_tfidf_model[corpus]])
      legal_lsi_corpus = gensim.corpora.MmCorpus('legal_lsi_model_mm')

      return legal_tfidf_model,legal_lsi_model,legal_lsi_corpus,dictionary
    except Exception as e:
      print("Can't able to execute the model",e)
      # Re-raise so the caller sees the real cause instead of failing later
      # while unpacking a None return value.
      raise






In [None]:
search = SearchTerm()




In [None]:
text = "تجديد تصريح سائق باص مدرسة"
result = search.search_similar_terms(text)
result

[{'Relevance_score': 71,
  'id': 3705033,
  'name': 'طلب استئجار باص دبي المائي',
  'unique_id': 70351}]

In [None]:
test = data[data['id']==3705041]
print(test['name'])
print(test['description'])
print(test['output'])
print(test['eligibility'])


240    طلب إصدار تصريح مزاولة مهنة سائق جديد للنقل ال...
Name: name, dtype: object
240    تُمكّنك هذه الخدمة من الحصول على تصريح مزاولة ...
Name: description, dtype: object
240    تصريح مزاولة مهنة سائق للنقل المدرسي.
Name: output, dtype: object
240    يحقُّ لسائقي المدارس والشركات العاملة في النقل...
Name: eligibility, dtype: object


In [None]:
test1 = data[data['id']==3705033]
print(test1['name'])
print(test1['description'])
print(test1['output'])
print(test1['eligibility'])


164    طلب استئجار باص دبي المائي
Name: name, dtype: object
164    تُمكّنك هذه الخدمة من استئجار الباص المائي. هن...
Name: description, dtype: object
164    استئجار باص دبي المائي.
Name: output, dtype: object
164    يحقُّ للأفراد والشركات الحصول على هذه الخدمة.
Name: eligibility, dtype: object


In [None]:
result

[{'Relevance_score': 71,
  'id': 3705033,
  'name': 'طلب استئجار باص دبي المائي',
  'unique_id': 70351},
 {'Relevance_score': 60,
  'id': 3705027,
  'name': 'التنقل بباص دبي المائي - العبرات المكيفة',
  'unique_id': 70350},
 {'Relevance_score': 36,
  'id': 3704704,
  'name': 'طلب تجديد تصريح سائق مهني',
  'unique_id': 70217}]

In [None]:
result[0]['Relevance_score']

71

In [None]:
type(result)

list

In [None]:
result[0]

{'Relevance_score': 71,
 'id': 3705033,
 'name': 'طلب استئجار باص دبي المائي',
 'unique_id': 70351}

In [None]:
int(result[0]['id'])

3705033

In [None]:
test = [result[0]]

In [None]:
test

[{'Relevance_score': 71,
  'id': 3705033,
  'name': 'طلب استئجار باص دبي المائي',
  'unique_id': 70351}]

In [None]:
# Sample records for experimenting with list-of-dict JSON payloads.
# Every record uses the same key set; the publication year is stored under
# 'published' in all of them.
books = [
    {'id': 0,
     'title': 'A Fire Upon the Deep',
     'author': 'Vernor Vinge',
     'first_sentence': 'The coldsleep itself was dreamless.',
     'published': '1992'},
    {'id': 1,
     'title': 'The Ones Who Walk Away From Omelas',
     'author': 'Ursula K. Le Guin',
     'first_sentence': 'With a clamor of bells that set the swallows soaring, the Festival of Summer came to the city Omelas, bright-towered by the sea.',
     'published': '1973'},
    {'id': 2,
     'title': 'Dhalgren',
     'author': 'Samuel R. Delany',
     'first_sentence': 'to wound the autumnal city.',
     'published': '1975'}
]

In [None]:
type(books)

list

In [None]:
print(type(result))
print(type(result[0]))

<class 'list'>
<class 'dict'>


In [None]:
# Wrap the search hits in a dict keyed result_1..result_3.
dict_obj = {}
# NOTE(review): all three keys are assigned result[0], so the dict holds the
# top hit three times — confirm whether result[1] / result[2] were intended.
dict_obj["result_1"] = result[0]
dict_obj["result_2"] = result[0]
dict_obj["result_3"] = result[0]
# Earlier loop-based attempt (disabled); it would overwrite one "color" key
# on every iteration:
# for i in range(len(result)):
#   print(i)
#   print(result[i])
#   dict_obj["color"] = result[i]

In [None]:
dict_obj

{'result_1': {'Relevance_score': 71,
  'id': 3705033,
  'name': 'طلب استئجار باص دبي المائي',
  'unique_id': 70351},
 'result_2': {'Relevance_score': 71,
  'id': 3705033,
  'name': 'طلب استئجار باص دبي المائي',
  'unique_id': 70351},
 'result_3': {'Relevance_score': 71,
  'id': 3705033,
  'name': 'طلب استئجار باص دبي المائي',
  'unique_id': 70351}}

In [None]:
from Search import SearchTerm
test = SearchTerm()

ModuleNotFoundError: ignored

In [None]:
result = test.search_similar_terms("طلب إصدار بدل فاقد أو تالف لتصريح تعلم القيادة")
result

In [None]:
%%writefile test.py
from flask import Flask

app = Flask(__name__)


@app.route('/')
def hello():
    """Answer requests to the root URL with a plain-text greeting."""
    greeting = 'Hello, World!'
    return greeting

In [None]:

import os
from flask import Flask, jsonify, request
import traceback
from flask import Flask, render_template , request 
import os

from Search import SearchTerm
test = SearchTerm()


app = Flask(__name__)

dict_obj = {}


@app.route('/predict',methods=['GET','POST'])
def classify_review():
  """Search the catalogue for the ``text`` query parameter.

  Returns:
    On success, a JSON object with the keys ``result_1``, ``result_2`` and
    ``result_3`` holding the hits in rank order; when the search returns
    fewer than three hits, the last available hit is repeated.
    On failure, a JSON object ``{"error": <message>}`` with status 400
    (missing ``text``), 404 (no hit) or 500 (search raised).
  """
  search_term = request.args.get('text', type=None)
  if not search_term:
    return jsonify({'error': "missing required query parameter 'text'"}), 400

  try:
    result = test.search_similar_terms(search_term)
    if not result:
      return jsonify({'error': 'no matching service found'}), 404

    # Build the payload per request; a module-level dict would be shared
    # between requests.
    last = len(result) - 1
    payload = {}
    for rank in range(3):
      payload["result_" + str(rank + 1)] = result[min(rank, last)]
    return jsonify(payload)
  except Exception as e:
    # An exception object is not JSON-serialisable; send its message.
    return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    # This is used when running locally only. When deploying to Google Cloud
    # Run, a webserver process such as Gunicorn will serve the app.

    app.run(debug=False, host="0.0.0.0", port=int(os.environ.get("PORT", 8080)))
 


In [None]:
from flask import Flask, jsonify, request
request.args.get()

In [None]:
# flask_ngrok_example.py
from flask import Flask
from flask_ngrok import run_with_ngrok

app = Flask(__name__)
run_with_ngrok(app)  # Start ngrok when app is run

@app.route("/")
def hello():
    """Answer requests to the root URL with a plain-text greeting."""
    greeting = "Hello World!"
    return greeting

if __name__ == '__main__':
    app.run()