### Define Import

In [49]:

try:
    import json
    import os
    import uuid

    import pandas as pd
    import numpy as np

    import elasticsearch
    from elasticsearch import Elasticsearch
    from elasticsearch import helpers
    from sentence_transformers import SentenceTransformer, util
    from tqdm import tqdm
    import elastic_test
except Exception as e:
    print("Some Modules are Missing :{}".format(e))

### Creating helper class

In [50]:
# this code will laod in data from csv format to an DataFrame in chunks 3000
# as the data we push in the elasticsearch index takes a lot of time for such big
# dataset with over 59k datapoints.
class Reader(object):

    def __init__(self, file_name):
        self.file_name = file_name

    def run(self):

        df = pd.read_csv(self.file_name, chunksize=3000)
        df = next(df)
        df = df.fillna("")
        return df

### This class will help in tokenization of data

In [51]:
# this class takes in our data tokennize it with the help of sentence transformer
# and returns a numpy array vector embeddings and then flattens
# the vector and return a numpy array with length of 384
class Tokenizer(object):
    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def get_token(self, documents):
        sentences  = [documents]
        sentence_embeddings = self.model.encode(sentences)
        _ = list(sentence_embeddings.flatten())
        encod_np_array = np.array(_)
        encod_list = encod_np_array.tolist()
        return encod_list

### Setting up our Elasticsearch Client

In [52]:
 
client = Elasticsearch(
    cloud_id=elastic_test.CLOUD_ID,
    basic_auth=("elastic", elastic_test.ELASTIC_PASSWORD)
)

client.info()

ObjectApiResponse({'name': 'instance-0000000001', 'cluster_name': '7caa1d2d4f3a452d8ac72762a3987a0e', 'cluster_uuid': 'ikS0wj86SjOdoGrFzjtlIw', 'version': {'number': '8.8.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'c01029875a091076ed42cdb3a41c10b1a9a5a20f', 'build_date': '2023-05-23T17:16:07.179039820Z', 'build_snapshot': False, 'lucene_version': '9.6.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### Creating Elasticsearch index with custom mappings

In [54]:
# this will help in creating our index on our client and the our data
# will be stored in database according to this mapping only
config = {
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "embeddings": {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity":"cosine",
                    "index_options": {
                          "type": "hnsw",
                          "m": 32,
                          "ef_construction": 100
                    }
                
                }
            }
    },
    "settings": {
        "number_of_shards": 2,
        "number_of_replicas": 1
    }
}



client.indices.create(
        index="arkansascaselogs2",
        settings = config["settings"],
        mappings = config["mappings"]
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'arkansascaselogs2'})

### Loading in our csv with the help of Helper class

In [55]:
# this csv contains combined caselogs and vector embeddings of those caselogs

helper = Reader(file_name=r"C:\Users\Pratham\tokenized_caselogs.csv")
df = helper.run()

In [56]:
df.rename(columns = {'Unnamed: 0':'id'}, inplace = True)

In [57]:
df.shape

(3000, 3)

In [58]:
df.head(10)

Unnamed: 0,id,Combined_caselog,Caselog_vectors
0,0,Case ID: 11640036 Title of the case: fisher v....,"[-0.08333617448806763, 0.003288048319518566, -..."
1,1,Case ID: 11638634 Title of the case: fikes v. ...,"[-0.051469676196575165, 0.15750525891780853, -..."
2,2,Case ID: 11641817 Title of the case: fletcher ...,"[0.002264400478452444, 0.04330931976437569, -0..."
3,3,Case ID: 243503 Title of the case: michael nor...,"[-0.07217243313789368, 0.04474251717329025, -0..."
4,4,Case ID: 243557 Title of the case: roger allen...,"[-0.012044079601764679, 0.06460518389940262, -..."
5,5,Case ID: 243402 Title of the case: joe louis d...,"[-0.04602688178420067, 0.06391077488660812, -0..."
6,6,Case ID: 243536 Title of the case: david mcgre...,"[-0.05042765662074089, 0.06301756203174591, -0..."
7,7,Case ID: 243418 Title of the case: st. paul fi...,"[-0.04300766438245773, 0.10811009258031845, -0..."
8,8,Case ID: 243398 Title of the case: sylvester r...,"[-0.058369215577840805, 0.06306301802396774, -..."
9,9,Case ID: 243483 Title of the case: patricia os...,"[-0.04090781509876251, 0.09158284962177277, -0..."


#### Converting caselog vectors back to list type

In [59]:
for i in range(len(df)):
    
    df["Caselog_vectors"][i]=list(map(float,df["Caselog_vectors"][i][1:-1].split(", ")))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Caselog_vectors"][i]=list(map(float,df["Caselog_vectors"][i][1:-1].split(", ")))


In [60]:
type(df['Caselog_vectors'][0])

list

### Now we'll push our data to the index we made earlier

In [61]:
## converting df to dictionary
caselog_dict = df.to_dict("records")


In [62]:
type(caselog_dict[0]['Caselog_vectors'])

list

In [67]:
caselog_dict[0]

{'id': 0,
 'Combined_caselog': "Case ID: 11640036 Title of the case: fisher v. reider Docket Number: Case No. 4,822a Case URL: https://api.capapi.org/v1/cases/11640036/ Date of decision: 1829-11 Citation of the case: '9 F. Cas. 137' Name of the court: Superior Court of the Territory of Arkansas Court ID: 9132 Case Author: None Name of Judges: Before JOHNSON, ESKRIDGE, BATES, and TRIMBLE, Judges. Name of the Attorneys: ",
 'Caselog_vectors': [-0.08333617448806763,
  0.003288048319518566,
  -0.045671965926885605,
  -0.03435898944735527,
  -0.005166919901967049,
  -0.007628907915204763,
  -0.028674382716417313,
  0.018058855086565018,
  0.029897673055529594,
  0.05648696422576904,
  0.036302629858255386,
  -0.037146586924791336,
  -0.05684299394488335,
  -0.012270716018974781,
  -0.021484481170773506,
  0.030118469148874283,
  -0.003232513554394245,
  0.024529362097382545,
  0.030872253701090813,
  0.018711017444729805,
  -0.06671999394893646,
  0.1359708607196808,
  0.021005354821681976,

### Pushing our data on to the created index

In [68]:
for case in caselog_dict:
    doc = {
        "text" : case['Combined_caselog'],
        "embeddings" : case['Caselog_vectors']
    }
    client.index(index="arkansascaselogs2", id = case["id"] , body = doc)

  client.index(index="arkansascaselogs2", id = case["id"] , body = doc)


In [69]:
result = client.count(index="arkansascaselogs2")

#print the total number of documents in the index
result

ObjectApiResponse({'count': 3000, '_shards': {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0}})

### Creating our query for input

In [111]:
## this code will take in user query convert it in vector embeddings using Tokenizer class above
helper_token = Tokenizer()
INPUT = input("Enter the Input Query ")
token_vector = helper_token.get_token(INPUT)

# checking for integers in our input
all_int = False
for i in INPUT:
    if i in list(map(str,(list(range(10))))):
        all_int = True

# Creating our query dict
# if our input contains numerical values like ids then we go for normal math query
if all_int:
    print('loop')
    query = {
        "query": {
            "match": {
                "text": {
                "query": INPUT,

                }
            }
        }
    }
    search_result1 = client.search(index="arkansascaselogs2", body=query)
    print(search_result1,"search_result_1")

query_dict = {
        "field": "embeddings",
        "query_vector": token_vector,
        "k": 5,
        "num_candidates": 5,
    }


search_result2 = client.knn_search(index = "arkansascaselogs2", knn=query_dict)

## this will return out top 5 index no. of our search_results with the max_score of the top match

print(f"max_score: {search_result2['hits']['max_score']}")
#adding ids to the list to get all the information of those ids form the dataset
id_list=[]
for hit in search_result2["hits"]["hits"]:
        
        print(f"Document ID: {hit['_id']}")
        id_list.append(hit['_id'])
        


Enter the Input Query case id
max_score: 0.731353
Document ID: 2445
Document ID: 866
Document ID: 534
Document ID: 933
Document ID: 554


  search_result2 = client.knn_search(index = "arkansascaselogs2", knn=query_dict)


### Extracting the remaining information of the output IDs from bigger dataset

In [105]:
df_new = pd.read_csv(r"C:\Users\Pratham\arkansas_new.csv")

In [108]:
df_new["id"] = (list(range(59718)))

In [109]:
df_new.set_index("id", inplace = True)

In [115]:
df_new = df_new.drop(['Caselog_vectors'], axis="columns")

In [116]:
print(id_list)

['2445', '866', '534', '933', '554']


In [117]:
for i in id_list:
    print(df_new.iloc[int(i)])

Case_ID                                                          236513
url                             https://api.capapi.org/v1/cases/236513/
name                                                Ex parte John Smith
decision_date                                                   1832-07
docket_number                                                       NaN
Party A                                                             NaN
Party B                                                             NaN
Civil Cases                                              Ex parte Smith
Citation                                      ['1 Ark. Terr. Rep. 201']
Volume_url            https://api.capapi.org/v1/volumes/32044078438371/
Volume Number                                                         1
Reporter_full_name    Reports of Cases Argued and Determined in the ...
Reporter_id                                                         677
Court_name                  Superior Court of the Territory of A

In [119]:
df_new.to_csv("arkansas_whole_data.csv",index=False)