#### Import required libraries

In [None]:
import json
from elasticsearch5 import Elasticsearch

#### Create elasticsearch connection

In [None]:
# Client for the Elasticsearch node at localhost:9200; every later cell
# talks to the cluster through `es`.
es = Elasticsearch(hosts="localhost:9200")

In [None]:
#### Delete the index left over from a previous run (uncomment when re-running the notebook)

# es.indices.delete(index="shakespeare", ignore=404)

####  Document mapping

In [None]:
# Initial document mapping: the field definitions applied to every document
# type of the index. Each field name is listed exactly once (the original
# literal repeated the "speaker" key, which Python silently collapses).

document_mappings = {
    "properties": {
        "line_id": {"type": "long"},
        "play_name": {"type": "text"},
        "line_number": {"type": "text"},
        "speaker": {"type": "text"},
        "speech_number": {"type": "text"},
        "text_entry": {"type": "text"},
    }
}

# Index-creation body. The three document types deliberately reference the
# same mapping object, so their field definitions always stay identical.
elastic_search_config = {
    "mappings": {
        "line": document_mappings,
        "scene": document_mappings,
        "act": document_mappings,
    }
}

#### Create index

In [None]:
# Create the index using the mapping configuration defined above.
# `shakespeare_index` is the index name reused by the later cells.
# ignore=400 tells the client not to raise on an HTTP 400 answer
# (presumably to tolerate an index that already exists -- confirm).

shakespeare_index = "shakespeare"
es.indices.create(index=shakespeare_index, body=elastic_search_config, ignore=400)

#### View current mapping

In [None]:
# View the mapping Elasticsearch currently holds for the index.

# Fetch the mapping, serialise it as JSON indented by 4 spaces and print it.
mapping = es.indices.get_mapping(index=shakespeare_index)
pprint_mapping = json.dumps(mapping, indent=4)
print(pprint_mapping)

#### Add document to index

In [None]:
"""
The dataset file uses a bulk-style layout: every odd line is a metadata line
holding the index name (shakespeare), the document type (one of three) and the
document id; the even line that follows holds the document itself, with the
fields line_id, play_name, speech_number, line_number, speaker and text_entry.
"""

file = "shakespeare.json"

# Number of raw file lines to consume. Lines come in pairs (a metadata line
# followed by its document line), so 30 lines index 15 documents.
max_lines = 30

# Open the file containing the works of Shakespeare. The encoding is given
# explicitly so decoding does not depend on the platform's locale default.
with open(file, encoding="utf-8") as works_of_shakespeare:
    # enumerate() numbers the lines from 1, replacing a hand-maintained counter
    for current_line, line in enumerate(works_of_shakespeare, start=1):
        if current_line % 2:
            # Odd line: metadata describing where the next document goes
            index_info = json.loads(line.strip())
        else:
            # Even line: the document that belongs to the metadata just read
            document = json.loads(line.strip())

            # Extract the index, document type and document ID from index_info
            index = index_info["index"]["_index"]
            doc_type = index_info["index"]["_type"]
            doc_id = index_info["index"]["_id"]

            # Index the document in Elasticsearch using the extracted information
            es.index(index=index, doc_type=doc_type, id=doc_id, body=document)

        # Stop once `max_lines` lines have been processed
        if current_line >= max_lines:
            print("done")
            break

#### Count of documents in the index

In [None]:
# Refresh first so that documents indexed just before this cell are visible;
# without it the statistics can lag behind recent writes.
es.indices.refresh(index=shakespeare_index)

# Get the index stats for the index named by `shakespeare_index`
index_stats = es.indices.stats(index=shakespeare_index)

# Document count reported for the primary shards
doc_count = index_stats["_all"]["primaries"]["docs"]["count"]

# Print the count of documents
print(doc_count)

#### Tokenization and Case folding using custom analyzer

In [None]:
# Custom analyzer performing tokenization (standard tokenizer) and
# case folding (lowercase filter).

custom_analyzer = {
    "analysis": {
        "analyzer": {
            "my_custom_analyzer": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": ["lowercase"],
            }
        }
    }
}


# Same fields as the initial mapping, but `text_entry` is now analysed with
# the custom analyzer. Each field name is listed exactly once (the original
# literal repeated the "speaker" key, which Python silently collapses).
new_document_mappings = {
    "properties": {
        "line_id": {"type": "long"},
        "play_name": {"type": "text"},
        "line_number": {"type": "text"},
        "speaker": {"type": "text"},
        "speech_number": {"type": "text"},
        "text_entry": {
            "type": "text",
            "analyzer": "my_custom_analyzer",
        },
    }
}

# Index-creation body: analysis settings plus one shared mapping object for
# the three document types.
elastic_search_config = {
    "settings": custom_analyzer,
    "mappings": {
        "line": new_document_mappings,
        "scene": new_document_mappings,
        "act": new_document_mappings,
    },
}


#### Delete old index and create new index with new configuration

In [None]:
# Delete the old index so it can be recreated with the current
# `elastic_search_config`. ignore=404 keeps the cell re-runnable when the
# index is already gone instead of raising NotFoundError.
es.indices.delete(index=shakespeare_index, ignore=404)

# Create the new index.
# NOTE(review): ignore=400 is kept from the original; right after a delete a
# 400 answer would presumably mean an invalid settings/mapping body rather
# than an existing index -- check the returned response.
es.indices.create(
    index=shakespeare_index, body=elastic_search_config, ignore=400
)

### Test analyzer with tokenizations and casefolding

In [None]:
# Test the custom analyzer (tokenization and case folding) on a sample text.
# The request names "my_custom_analyzer" -- the analyzer this notebook
# configured on the index -- so the cell exercises the index's own analysis
# chain rather than the built-in "standard" analyzer.

analyzer_output = es.indices.analyze(
    index=shakespeare_index,
    body={
        "text": "Whereas few PEOPLE set out deliBratELY to defraud in THIS waY, theRe is a RISK of unintentional PlaGiarISm.",
        "analyzer": "my_custom_analyzer",
    },
)
tokens = analyzer_output["tokens"]

# Keep only the token text of each analysed token
t = [token["token"] for token in tokens]
print(t)

#### Stemming or Morphological Analysis

In [None]:

# Custom analyzer: tokenization and case folding as before, now followed by
# Porter stemming.
custom_analyzer = {
    "analysis": {
        "analyzer": {
            "my_custom_analyzer": dict(
                type="custom",
                tokenizer="standard",
                filter=["lowercase", "porter_stem"],
            )
        }
    }
}

# Swap the new analysis settings into the index configuration
elastic_search_config.update(settings=custom_analyzer)

# Display the resulting configuration
elastic_search_config

#### Create new index with stemming analyzer

In [None]:
# Delete the old index so it can be recreated with the current
# `elastic_search_config`. ignore=404 keeps the cell re-runnable when the
# index is already gone instead of raising NotFoundError.
es.indices.delete(index=shakespeare_index, ignore=404)


# Create the new index with the updated custom analyzer.
# NOTE(review): ignore=400 is kept from the original; right after a delete a
# 400 answer would presumably mean an invalid settings/mapping body -- check
# the returned response.
es.indices.create(index=shakespeare_index, body=elastic_search_config, ignore=400)

### Test analyzer for stemming

In [None]:
# Run a sample sentence through the custom analyzer (tokenization, case
# folding, stemming) and show the tokens it produces.

analyze_request = {
    "text": "Many of his paintings show the setting sun.",
    "analyzer": "my_custom_analyzer",
}
analyzer_output = es.indices.analyze(index=shakespeare_index, body=analyze_request)

# Reduce every analysed token to its text
tokens = analyzer_output["tokens"]
t = [entry["token"] for entry in tokens]
print(t)

#### Removing stopwords 

In [None]:
# Custom analyzer with stopword removal inserted between case folding and
# stemming.
stopword_analyzer_definition = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["lowercase", "stop", "porter_stem"],
}
custom_analyzer = {
    "analysis": {"analyzer": {"my_custom_analyzer": stopword_analyzer_definition}}
}

# Put the new analysis settings into the index configuration
elastic_search_config["settings"] = custom_analyzer

# Display the resulting configuration
elastic_search_config

#### Create new index with stopword analyzer

In [None]:
# Delete the old index so it can be recreated with the current
# `elastic_search_config`. ignore=404 keeps the cell re-runnable when the
# index is already gone instead of raising NotFoundError.
es.indices.delete(index=shakespeare_index, ignore=404)

# Create the new index with the stopword-removing custom analyzer.
# NOTE(review): ignore=400 is kept from the original; right after a delete a
# 400 answer would presumably mean an invalid settings/mapping body -- check
# the returned response.
es.indices.create(index=shakespeare_index, body=elastic_search_config, ignore=400)

#### Test Analyzer for stopword removal

In [None]:
# Test on a sample sentence for the filters of tokenization, case folding,
# stopword removal and stemming.

sample_text = "The national library has always carried huge symbolic weight as a measure of attitudes towards literature and learning 22-07-19!!@98."
analyzer_output = es.indices.analyze(
    index=shakespeare_index,
    body={"text": sample_text, "analyzer": "my_custom_analyzer"},
)

# Reduce every analysed token to its text
tokens = analyzer_output["tokens"]
t = [entry["token"] for entry in tokens]
print(t)

### N-Gram

In [None]:
# Add an n-gram stage to the custom analyzer.

# Token filter named "bigram": an `ngram` filter with min_gram == max_gram == 2.
# NOTE(review): the `ngram` token filter works on the characters of each
# token, so this presumably yields two-character grams; word-level bigrams
# would need a `shingle` filter -- confirm which one is intended.
bigram_filter = {"type": "ngram", "min_gram": 2, "max_gram": 2}

# Analyzer chain: lowercase -> stopword removal -> Porter stemming -> bigram
custom_analyzer = {
    "analysis": {
        "analyzer": {
            "my_custom_analyzer": dict(
                type="custom",
                tokenizer="standard",
                filter=["lowercase", "stop", "porter_stem", "bigram"],
            )
        },
        "filter": {"bigram": bigram_filter},
    }
}


# Put the new analysis settings into the index configuration
elastic_search_config.update(settings=custom_analyzer)

# Display the resulting configuration
elastic_search_config

#### Delete old index and creating new index with ngrams

In [None]:
# Delete the old index so it can be recreated with the current
# `elastic_search_config`. ignore=404 keeps the cell re-runnable when the
# index is already gone instead of raising NotFoundError.
es.indices.delete(index=shakespeare_index, ignore=404)

# Create the new index with the updated settings.
# NOTE(review): ignore=400 is kept from the original; right after a delete a
# 400 answer would presumably mean an invalid settings/mapping body -- check
# the returned response.
es.indices.create(index=shakespeare_index, body=elastic_search_config, ignore=400)

### Test analyzer for ngrams

In [None]:
# Test on a sample sentence for the filters of tokenization, case folding,
# stopword removal, stemming and bigrams.

analyze_request = {
    "text": "The national library has always carried huge symbolic weight as a measure of attitudes towards literature and learning 22-07-19!!@98.",
    "analyzer": "my_custom_analyzer",
}
analyzer_output = es.indices.analyze(index=shakespeare_index, body=analyze_request)

# Reduce every analysed token to its text
tokens = analyzer_output["tokens"]
t = [entry["token"] for entry in tokens]
print(t)

#### Adding TF-IDF similarity to the updated index with n-grams

In [None]:
# Attach the "classic" similarity to the `text_entry` field mapping held in
# the Elasticsearch configuration.

similarity_property = {"similarity": "classic"}

# The update is made through the "line" type's mapping. If the other types
# still share that mapping object (as configured where the mappings are
# built), they receive the change as well.
line_properties = elastic_search_config["mappings"]["line"]["properties"]
line_properties["text_entry"].update(similarity_property)

### Create new index

In [None]:
# Delete the old index so it can be recreated with the current
# `elastic_search_config`. ignore=404 keeps the cell re-runnable when the
# index is already gone instead of raising NotFoundError.
es.indices.delete(index=shakespeare_index, ignore=404)

# Create the new index with the updated settings.
# NOTE(review): ignore=400 is kept from the original; right after a delete a
# 400 answer would presumably mean an invalid settings/mapping body -- check
# the returned response.
es.indices.create(index=shakespeare_index, body=elastic_search_config, ignore=400)

#### Importing the dataset file to perform search queries 

In [None]:
# Number of raw file lines to consume. Lines come in pairs (a metadata line
# followed by its document line), so 7000 lines index 3500 documents.
max_lines = 7000

# Open the "shakespeare.json" file for reading. The encoding is given
# explicitly so decoding does not depend on the platform's locale default.
with open("shakespeare.json", encoding="utf-8") as works_of_shakespeare:
    # enumerate() numbers the lines from 1, replacing a hand-maintained counter
    for current_line, line in enumerate(works_of_shakespeare, start=1):
        if current_line % 2:
            # Odd line: metadata describing where the next document goes
            index_info = json.loads(line.strip())
        else:
            # Even line: the document that belongs to the metadata just read
            document = json.loads(line.strip())

            # Extract the index, document type and document ID from index_info
            index = index_info["index"]["_index"]
            doc_type = index_info["index"]["_type"]
            doc_id = index_info["index"]["_id"]

            # Index the document in Elasticsearch using the extracted information
            es.index(index=index, doc_type=doc_type, id=doc_id, body=document)

        # Stop once `max_lines` lines have been processed
        if current_line >= max_lines:
            print("done")
            break


#### Pagination: display a chosen number of documents instead of the default

In [None]:
# Pagination request: match every document, start at offset 12 ("from") and
# return 5 hits ("size").
pagination = {
    "query": {"match_all": {}},
    "from": 12,
    "size": 5,
}

# Run the paginated search against the "shakespeare" index
pagi = es.search(index="shakespeare", body=pagination)

# Display the raw response
pagi


#### Search query for Full text

In [None]:
# Full-text `match` query for the term "pagans" on the "text_entry" field
query = {
    "query": {
        "match": {
            "text_entry": {"query": "pagans"},
        }
    }
}

# Run the query against the "shakespeare" index
resp = es.search(index="shakespeare", body=query)

# Print every returned hit
found = resp["hits"]["hits"]
for hit in found:
    print(hit)


#### Search query to match exact phrases 

In [None]:
# `match_phrase` query for the phrase "thy love" on the "text_entry" field
match_phrase = {
    "query": {
        "match_phrase": {
            "text_entry": {"query": "thy love"},
        }
    }
}

# Run the query against the "shakespeare" index
resp = es.search(index="shakespeare", body=match_phrase)

# Print every returned hit
found = resp["hits"]["hits"]
for hit in found:
    print(hit)


#### Search query to match phrases on multiple fields using operator

In [None]:
# `multi_match` query for "ever valiant" over the "speaker" and "text_entry"
# fields. The "and" operator asks for every query term to match rather than
# any one of them; this is a term match, not an exact-phrase match.
multi_match_clause = {
    "query": "ever valiant",
    "operator": "and",
    "fields": ["speaker", "text_entry"],
}
match_phrase = {"query": {"multi_match": multi_match_clause}}

# Run the query against the "shakespeare" index
resp = es.search(index="shakespeare", body=match_phrase)

# Print every returned hit
found = resp["hits"]["hits"]
for hit in found:
    print(hit)


#### Search query to match part of phrase with multiple fields

In [None]:
# `multi_match` query of type "phrase_prefix": looks for text starting with
# "hol" in the "speaker" and "text_entry" fields.
prefix_clause = {
    "query": "hol",
    "fields": ["speaker", "text_entry"],
    "type": "phrase_prefix",
}
match_part_phrase = {"query": {"multi_match": prefix_clause}}

# Run the query against the "shakespeare" index
resp = es.search(index="shakespeare", body=match_part_phrase)

# Print every returned hit
found = resp["hits"]["hits"]
for hit in found:
    print(hit)


#### Sorting in descending order for field line_id

In [None]:
# Match every document and order the hits by "line_id", highest first
sorted_query = {
    "query": {"match_all": {}},
    "sort": {"line_id": {"order": "desc"}},
}

# Run the query against the "shakespeare" index
resp = es.search(index="shakespeare", body=sorted_query)

# Print every returned hit
found = resp["hits"]["hits"]
for hit in found:
    print(hit)


#### Filtering part of phrase on Speaker field

In [None]:
# Bool query: the "speaker" field must match "FALSTAFF" (scored `match`
# clause) and the hits are filtered by a `term` clause for "thy" on
# "text_entry".
# NOTE(review): `term` does not analyse its input, while "text_entry" is
# indexed through my_custom_analyzer (lowercase, stop, stemming, bigram).
# The literal token "thy" may therefore never be present in the index --
# confirm the filter returns hits, and consider a `match` clause instead.
speaker_clause = {"match": {"speaker": "FALSTAFF"}}
text_filter_clause = {"term": {"text_entry": "thy"}}
match_part_phrase_filter = {
    "query": {
        "bool": {
            "must": [speaker_clause],
            "filter": [text_filter_clause],
        }
    }
}

# Run the query against the "shakespeare" index
respFilter = es.search(index="shakespeare", body=match_part_phrase_filter)

# Print every returned hit
found = respFilter["hits"]["hits"]
for hit in found:
    print(hit)
