# Student Id: 2211876
## CE706 - Information Retrieval Assignment

#### Importing warnings to ignore warnings

In [1]:
import warnings
warnings.filterwarnings("ignore")

### Loading the Documents to Elastic search
* Import elasticsearch
* import json
* Insantiating Elasticsearch to es
* Reading the json file by lines since the first line has the index and the second line has the data
* Loading index line and document from a for loop
* Using bulk to load the document in index shakespeare

In [2]:
#Imports
from elasticsearch import Elasticsearch
import json

#Insantiating Elasticsearch to es
es = Elasticsearch(hosts=['http://localhost:9200'])

#Reading the json file by lines since the first line has the index and the second line has the data
with open('shakespeare.json', 'r') as file:
    data = file.readlines()

bulk_data = []
#Loading index line and document from a for loop
for i in range(0, len(data), 2):
    index_line = json.loads(data[i])
    document = json.loads(data[i+1])

    bulk_data.append({
        'index': {
            '_index': index_line['index']['_index'],
            '_id': index_line['index']['_id']
        }
    })
    bulk_data.append(document)
#Using bulk to load the document in index shakespeare
responses = es.bulk(index='shakespeare', body=bulk_data, refresh=True)

print(f"Successfully loaded {len(responses['items'])} documents")

Successfully loaded 111396 documents


### Printing the Indices

In [3]:
#Loading the indices
indices = es.indices.get(index='*')

# Printing the list of indices
print("Indices:")
for index in indices:
    print(index)


Indices:
shakespeare
sheikspeare


### Looking at the mappings of the index shakespeare

In [4]:
# Getting the mappings for the index
mappings = es.indices.get_mapping(index="shakespeare")

mappings_dict = mappings["shakespeare"]["mappings"]

# Print the mappings as formatted JSON
print("Mappings:")
print(json.dumps(mappings_dict, indent=4))

Mappings:
{
    "properties": {
        "line_id": {
            "type": "long"
        },
        "line_number": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
        "play_name": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
        "speaker": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
        "speech_number": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
 

### Defining Analyzers and similarity module for text processing
#### Defining analyzers
* Using a custom analyzer
* Added a standard tokenizer
* Added a lowercase filter to convert the text to lowercase
* Added a stop filter to remove stop words
* Added a custom shingle filter for better retrieval
* Added a porter stem filter for stemming
* Added a unique filter to remove duplicate words
* Mapped the analyzer to text_entry field
* Added a DFR similarity algorithm
* Added all the above features to a new index 'sheikspeare'

In [5]:
# Defining the analyzer settings
analyzer_settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "custom_analyzer": { #Using a custom analyzer
                    "type": "custom",
                    "tokenizer": "standard", #Added a standard tokenizer
                    "filter": ["lowercase", "stop", "custom_shingle_filter", "porter_stem", "unique"]  # Added the lowercase, custom_shingle_filter, porter_stem, and unique filters
                }
            },
            "filter": {
                "custom_shingle_filter": {
                    "type": "shingle",
                    "min_shingle_size": 2,  # keeping the minimum size of the generated shingles to the default value
                    "max_shingle_size": 3  # keeping the maximum size of the generated shingles to the default value
                },
                "porter_stem": {
                    "type": "porter_stem" #Added a porter stem
                },
                "unique": {
                    "type": "unique" #Added a unique filter
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "text_entry": {
                "type": "text",
                "analyzer": "custom_analyzer"  # Mapping the custom analyzer to the text_entry field
            }
        }
    }
}

similarity_settings = {
    "index": {
        "similarity": {
            "custom_similarity": {
                "type": "DFR", #Added a DFR similarity
                "basic_model": "g", #Gaussian approximation
                "after_effect": "l", #Laplace after-effect
                "normalization": "h2" #H2 normalization
            }
        }
    }
}

#Defining a new index name
new_index = "sheikspeare"

#Just a if condition to delete the index if it exists. Saved valuable time during testing
if es.indices.exists(index= new_index):
    print("Index already Exists. Deleting Old index and creating a new index")
    es.indices.delete(index=new_index, body=analyzer_settings)

#Creating a new index.
es.indices.create(index=new_index, body=analyzer_settings)

#Closing index before applying the similarity settings
es.indices.close(index=new_index)

#Updating the similarity settings for the index
es.indices.put_settings(index=new_index, body=similarity_settings)

#Opening the index again
es.indices.open(index=new_index)

#Reindexing the data from the existing index to the new index with the custom analyzer
old_index = "shakespeare"
body = {
    "source": {
        "index": old_index
    },
    "dest": {
        "index": new_index
    }
}
es.reindex(body=body, refresh=True)

#Checking if the reindexing was successful
if es.indices.exists(index=new_index):
    print("Reindexing completed successfully with the custom analyzer.")
else:
    print("Failed to reindex the data.")

Index already Exists. Deleting Old index and creating a new index
Reindexing completed successfully with the custom analyzer.


### Looking at the mappings of the new index sheikspeare to see if the analyzer has been applied to the text_entry field

In [6]:
#Getting the mappings for the index
mappings = es.indices.get_mapping(index="sheikspeare")

mappings_dict = mappings["sheikspeare"]["mappings"]

#Printing the mappings as formatted JSON
print("Mappings:")
print(json.dumps(mappings_dict, indent=4))

Mappings:
{
    "properties": {
        "line_id": {
            "type": "long"
        },
        "line_number": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
        "play_name": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
        "speaker": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
        "speech_number": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
 

### Testing out the analyzer with a custom text
#### From the output
* Stopwords are removed
* Words are stemmed
* tri-gram shingles
* Lowercase

In [7]:
#Index name
index_name = "sheikspeare"

#Custom text
text = "Sentencing wording testing stemming masses for the losses got on the busses"

#Using the defined custom analyzer on the text
tokens = es.indices.analyze(index=index_name, body={"text": text}, params={"analyzer": "custom_analyzer"})

#for loop to iterate over the tokens to print
for token in tokens["tokens"]:
    print(token["token"])

sentenc
sentencing word
sentencing wording test
word
wording test
wording testing stem
test
testing stem
testing stemming mass
stem
stemming mass
stemming masses _
mass
masses _
masses _ _
_ _ loss
_ loss
_ losses got
loss
losses got
losses got _
got
got _
got _ _
_ _ buss
_ buss
buss


In [8]:
#Index name
index_name = "sheikspeare"

#Custom text
text = "This is a custom TeXt with stop words and I'm still typing even-though I should stop"

#Using the defined custom analyzer on the text
tokens = es.indices.analyze(index=index_name, body={"text": text}, params={"analyzer": "custom_analyzer"})

#for loop to iterate over the tokens to print
for token in tokens["tokens"]:
    print(token["token"])

_ _ custom
_ custom
_ custom text
custom
custom text
custom text _
text
text _
text _ stop
_ stop
_ stop word
stop
stop word
stop words _
word
words _
words _ i'm
_ i'm
_ i'm stil
i'm
i'm stil
i'm still typ
still
still typ
still typing even
type
typing even
typing even though
even
even though
even though i
though
though i
though i should
i
i should
i should stop
should
should stop


## Search Queries

#### 1. A match phrase search query to search for a phrase

In [9]:
#Defining a search query
search_query = {
  "query": {
    "match_phrase": {
      "text_entry": "none or little"
    }
  }
}

#Executing the search query
response = es.search(index=index_name, body=search_query)

#Processing the search results
hits = response["hits"]["total"]["value"]

#Printing the results
print(f"Hits: {hits}")
for hit in response["hits"]["hits"]:
    score = hit["_score"]  #Get the score of the hit
    source = hit["_source"]  #Get the source document
    #Printing the necessary information from the hits in a json format
    print(f"Score: {score}, Document: {json.dumps(source, indent=4)}")

Hits: 2
Score: 61.69443, Document: {
    "line_id": 92154,
    "play_name": "The Tempest",
    "speech_number": 37,
    "line_number": "2.1.50",
    "speaker": "SEBASTIAN",
    "text_entry": "Of that theres none, or little."
}
Score: 59.887383, Document: {
    "line_id": 109342,
    "play_name": "A Winters Tale",
    "speech_number": 36,
    "line_number": "3.2.209",
    "speaker": "PAULINA",
    "text_entry": "To be or none or little; though a devil"
}


#### 2. A multi match query to search in multiple fields

In [10]:
#Defining a search query
search_query = {
  "query": {
    "multi_match": {
      "query": "Hamlet",
      "fields": ["text_entry", "title"]
    }
  }
}

#Executing the search query
response = es.search(index=index_name, body=search_query)

#Processing the search results
hits = response["hits"]["total"]["value"]

#Printing the results
print(f"Hits: {hits}")
for hit in response["hits"]["hits"]:
    score = hit["_score"]  #Get the score of the hit
    source = hit["_source"]  #Get the source document
    #Printing the necessary information from the hits in a json format
    print(f"Score: {score}, Document: {json.dumps(source, indent=4)}")

Hits: 105
Score: 11.304642, Document: {
    "line_id": 36184,
    "play_name": "Hamlet",
    "speech_number": 99,
    "line_number": "5.1.269",
    "speaker": "QUEEN GERTRUDE",
    "text_entry": "Hamlet, Hamlet!"
}
Score: 10.8655815, Document: {
    "line_id": 34229,
    "play_name": "Hamlet",
    "speech_number": 18,
    "line_number": "",
    "speaker": "LORD POLONIUS",
    "text_entry": "Enter HAMLET"
}
Score: 10.8655815, Document: {
    "line_id": 34870,
    "play_name": "Hamlet",
    "speech_number": 8,
    "line_number": "",
    "speaker": "KING CLAUDIUS",
    "text_entry": "Enter HAMLET"
}
Score: 10.8655815, Document: {
    "line_id": 34910,
    "play_name": "Hamlet",
    "speech_number": 3,
    "line_number": "",
    "speaker": "QUEEN GERTRUDE",
    "text_entry": "Enter HAMLET"
}
Score: 10.8655815, Document: {
    "line_id": 35195,
    "play_name": "Hamlet",
    "speech_number": 7,
    "line_number": "",
    "speaker": "KING CLAUDIUS",
    "text_entry": "Enter HAMLET"
}
Score: 

#### 3. A boolean should match query to match tragedy and comedy in the text field

In [11]:
# Defining a search query
search_query = {
  "query": {
      "bool": {
          "should": [
              {"match": {"text_entry": "tragedy"}},
              {"match": {"text_entry": "comedy"}}
          ]
      }
    }
  }

#Executing the search query
response = es.search(index=index_name, body=search_query)
print("Hits:")

#Processing the search results
hits = response["hits"]["total"]["value"]

#Printing the results
print(f"Hits: {hits}")
for hit in response["hits"]["hits"]:
    score = hit["_score"]  #Get the score of the hit
    source = hit["_source"]  #Get the source document
    #Printing the necessary information from the hits in a json format
    print(f"Score: {score}, Document: {json.dumps(source, indent=4)}")

Hits:
Hits: 21
Score: 13.424231, Document: {
    "line_id": 33943,
    "play_name": "Hamlet",
    "speech_number": 134,
    "line_number": "2.2.406",
    "speaker": "LORD POLONIUS",
    "text_entry": "comedy, history, pastoral, pastoral-comical,"
}
Score: 12.521685, Document: {
    "line_id": 49522,
    "play_name": "King Lear",
    "speech_number": 31,
    "line_number": "1.2.142",
    "speaker": "EDMUND",
    "text_entry": "comedy: my cue is villanous melancholy, with a"
}
Score: 12.521685, Document: {
    "line_id": 68831,
    "play_name": "A Midsummer nights dream",
    "speech_number": 14,
    "line_number": "4.2.41",
    "speaker": "BOTTOM",
    "text_entry": "comedy. No more words: away! go, away!"
}
Score: 12.398785, Document: {
    "line_id": 3748,
    "play_name": "Henry VI Part 1",
    "speech_number": 13,
    "line_number": "1.4.77",
    "speaker": "TALBOT",
    "text_entry": "That hath contrived this woful tragedy!"
}
Score: 12.398785, Document: {
    "line_id": 7606,
    

In [12]:
search_query = {
  "query": {
    "match": {
      "text_entry": "tragedy"
    }
  }
}
response = es.search(index=index_name, body=search_query)
print("Hits:")

#Processing the search results
hits = response["hits"]["total"]["value"]

#Printing the results
print(f"Hits: {hits}")
for hit in response["hits"]["hits"]:
    score = hit["_score"]  #Get the score of the hit
    source = hit["_source"]  #Get the source document
    #Printing the necessary information from the hits in a json format
    print(f"Score: {score}, Document: {json.dumps(source, indent=4)}")

Hits:
Hits: 11
Score: 12.398785, Document: {
    "line_id": 3748,
    "play_name": "Henry VI Part 1",
    "speech_number": 13,
    "line_number": "1.4.77",
    "speaker": "TALBOT",
    "text_entry": "That hath contrived this woful tragedy!"
}
Score: 12.398785, Document: {
    "line_id": 7606,
    "play_name": "Henry VI Part 2",
    "speech_number": 25,
    "line_number": "3.1.155",
    "speaker": "GLOUCESTER",
    "text_entry": "Will not conclude their plotted tragedy."
}
Score: 12.398785, Document: {
    "line_id": 8049,
    "play_name": "Henry VI Part 2",
    "speech_number": 40,
    "line_number": "3.2.195",
    "speaker": "WARWICK",
    "text_entry": "Even so suspicious is this tragedy."
}
Score: 12.398785, Document: {
    "line_id": 10577,
    "play_name": "Henry VI Part 3",
    "speech_number": 7,
    "line_number": "2.3.28",
    "speaker": "WARWICK",
    "text_entry": "And look upon, as if the tragedy"
}
Score: 12.398785, Document: {
    "line_id": 34529,
    "play_name": "Hamle