In [11]:
import gensim
from sklearn.datasets import fetch_20newsgroups
from elasticsearch import Elasticsearch
import numpy as np

# Load the Word2Vec model (you will need to download a suitable model)
model = gensim.models.KeyedVectors.load_word2vec_format('C:/Users/Dell/Downloads/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True)

# Load the dataset (e.g., 20 Newsgroups dataset)
newsgroups = fetch_20newsgroups(subset='all')

# Preprocess the documents
preprocessed_docs = []
for doc in newsgroups.data:
    # Tokenize the document
    tokens = gensim.utils.simple_preprocess(doc.lower())
    # Remove stop words and stem the tokens
    stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
    # Join the stemmed tokens back into a string
    preprocessed_doc = ' '.join(stemmed_tokens)
    preprocessed_docs.append(preprocessed_doc)

es = Elasticsearch("https://localhost:9200/", ca_certs="C:\elastic stack\elasticsearch-8.10.4-windows-x86_64\elasticsearch-8.10.4\config\certs\http_ca.crt", basic_auth=("elastic", "c-31JKWgK3cordv3k89n"))

# Delete the index if it already exists
index_name = 'vec'
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Create index with appropriate mappings
index_mappings = {
    'mappings': {
        'properties': {
            'text': {
                'type': 'text'
            },
            'vector': {
                'type': 'dense_vector',
                'dims': 300  # Adjust the dimension to match your model
            }
        }
    }
}
es.indices.create(index=index_name, body=index_mappings)

# Iterate over preprocessed documents and generate vectors
for i, doc in enumerate(preprocessed_docs):
    # Split the preprocessed document into tokens
    tokens = doc.split()
    # Initialize an array to store the word vectors
    word_vectors = [model[token] for token in tokens if token in model]
    if word_vectors:
        # Compute the average vector for the document
        doc_vector = np.mean(word_vectors, axis=0)
        # Store the document and its vector in the Elasticsearch index
        es.index(index=index_name, id=i, body={'text': doc, 'vector': doc_vector.tolist()})
    
    # Get a user query
user_query = "usa"

# Preprocess the user query
tokens = gensim.utils.simple_preprocess(user_query.lower())
stemmed_tokens = [gensim.parsing.porter.PorterStemmer().stem(token) for token in tokens if token not in gensim.parsing.preprocessing.STOPWORDS]
preprocessed_query = ' '.join(stemmed_tokens)

# Convert the query to a vector (similar to the document vectorization step)
# Initialize an array to store the word vectors
word_vectors = []

# Compute vectors for query tokens and aggregate them
for token in preprocessed_query.split():
    if token in model:  # Check if the token is in the Word2Vec model's vocabulary
        word_vector = model[token]  # Retrieve the vector for the token
        word_vectors.append(word_vector)

# Calculate the query vector by averaging the word vectors
if word_vectors:
    query_vector = sum(word_vectors) / len(word_vectors)
    print(f'Query vector: {query_vector}')
# Use Elasticsearch to retrieve similar documents
search_body = {
    'query': {
        'script_score': {
            'query': {
                'match_all': {}
            },
            'script': {
                'source': 'cosineSimilarity(params.query_vector, "vector") + 1.0',
                'params': {
                    'query_vector': query_vector.tolist()
                }
            }
        }
    },
    '_source': {
        'includes': ['text']
    }
}
search_results = es.search(index=index_name, body=search_body)['hits']['hits']

# Print the top relevant documents
for i, hit in enumerate(search_results[:10]):
    print(f'{i+1}. {hit["_source"]["text"]}')


Query vector: [-0.1796875  -0.15332031  0.03198242  0.13476562  0.07128906  0.01904297
  0.33789062 -0.08300781  0.12304688  0.16503906 -0.31445312 -0.46679688
 -0.1328125  -0.19921875  0.17871094  0.21191406  0.359375    0.46875
 -0.14160156  0.03320312 -0.05688477 -0.1328125   0.07714844 -0.04345703
 -0.32617188  0.23828125 -0.22363281  0.07275391 -0.03320312 -0.18652344
  0.17871094  0.15234375 -0.16992188 -0.12451172 -0.09521484  0.1484375
 -0.46875     0.09130859  0.34179688 -0.13476562 -0.24023438 -0.1328125
  0.38867188  0.1171875   0.24414062  0.07421875  0.1640625  -0.19238281
 -0.265625    0.16015625 -0.05444336  0.22363281  0.23828125  0.17285156
  0.07080078 -0.14941406 -0.24902344  0.01647949 -0.08789062 -0.25585938
 -0.22363281 -0.01544189 -0.11621094 -0.09667969 -0.02905273 -0.27539062
 -0.03393555 -0.171875   -0.16601562  0.00909424  0.00531006  0.39257812
 -0.0480957  -0.01831055 -0.18652344 -0.08740234  0.06640625  0.01263428
 -0.08154297  0.02124023 -0.30273438  0.02

In [32]:
import csv
from elasticsearch import Elasticsearch
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, ndcg_score
import numpy as np

# Initialize Elasticsearch client with URL
es = Elasticsearch("https://localhost:9200/", ca_certs="C:\elastic stack\elasticsearch-8.10.4-windows-x86_64\elasticsearch-8.10.4\config\certs\http_ca.crt", basic_auth=("elastic", "c-31JKWgK3cordv3k89n"))

# Define your Elasticsearch index name
index_name = 'vec'

# Define your queries and relevance judgments
queries = [
    'Deep learning in computer vision',
    'Recent advances in renewable energy',
    'History of ancient civilizations',
    'Space exploration and future missions',
    'Health benefits of meditation',
    'Best practices in software development',
    'computer marvels of modern era',
    'The rise of artificial intelligence',
    'The power of quantum computing',
    'nanotechnology',
    'genetic engineering',
    'artificial intelligence',
    'robotics',
    'space exploration',
    'neuroscience',
    'cyber security',
    'augmented reality',
    'virtual reality',
    'machine learning',
]
relevance_judgments = [
    [1, 2, 3, 4, 5],
    [2, 4, 6, 8, 10],
    [4, 6, 8, 10],
    [1, 2, 3],
    [4, 5, 6, 7],
    [8, 9, 10],
    [1, 2, 4, 6],
    [3, 5, 7, 9, 10],
    [1, 3, 5, 7],
    [2, 4, 6, 8, 10],
    [1, 2, 3, 4],
    [5, 6, 7, 8, 9],
    [10],
    [1, 3, 5, 7, 10],
    [2, 4, 6, 8],
    [1, 2, 4],
    [3, 5, 7, 9],
    [6, 8, 9, 10],
    [1, 2],
    [4, 5, 7, 8]
]

# Initialize lists to store evaluation results
precision_results = []
recall_results = []
f1_results = []
ap_results = []
ndcg_results = []

# Initialize a list to store data for CSV
csv_data = []

# Iterate through the queries
for i, query in enumerate(queries):
    # Preprocess the query as shown in your reference

    # Convert the query to a vector (similar to the document vectorization step)
    # Initialize an array to store the word vectors
    word_vectors = []

    # Compute vectors for query tokens and aggregate them
    for token in query.split():
        if token in model:  # Check if the token is in the Word2Vec model's vocabulary
            word_vector = model[token]  # Retrieve the vector for the token
            word_vectors.append(word_vector)

    # Calculate the query vector by averaging the word vectors
    if word_vectors:
        query_vector = np.mean(word_vectors, axis=0)

    # Search for similar documents using Elasticsearch
    search_body = {
        'query': {
            'script_score': {
                'query': {
                    'match_all': {}
                },
                'script': {
                    'source': 'cosineSimilarity(params.query_vector, "vector") + 1.0',
                    'params': {
                        'query_vector': query_vector.tolist()
                    }
                }
            }
        },
        '_source': {
            'includes': ['text']
        }
    }
    search_results = es.search(index=index_name, body=search_body)['hits']['hits']

    # Retrieve relevant documents using relevance judgments
    y_true = np.zeros(len(search_results))
    for relevance_index in relevance_judgments[i]:
        if relevance_index - 1 < len(y_true):
            y_true[relevance_index - 1] = 1

    # Calculate IR metrics
    y_pred = np.zeros(len(search_results))
    for j, hit in enumerate(search_results):
        if j in relevance_judgments[i]:
            y_pred[j] = 1
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    ap = average_precision_score(y_true, y_pred)
    ndcg = ndcg_score([y_true], [y_pred])

    # Append results to lists
    precision_results.append(precision)
    recall_results.append(recall)
    f1_results.append(f1)
    ap_results.append(ap)
    ndcg_results.append(ndcg)

    # Append the data to the CSV list
    csv_data.append([query, precision, recall, f1, ap, ndcg])

    # Print the results
    print(f'Query: {query}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1: {f1:.4f}')
    print(f'MAP: {ap:.4f}')
    print(f'NDCG: {ndcg:.4f}')
    print()

    # Print relevant documents
    print(f'Similar documents for query: {query}')
    for hit in search_results:
        print(f'Document id: {hit["_id"]}, Text: {hit["_source"]["text"]}')
    print()

# Define the file name for the CSV
csv_file = 'retrieval_results.csv'

# Define the header for the CSV
header = ['Query', 'Precision', 'Recall', 'F1', 'MAP', 'NDCG']

# Write the data to the CSV file
with open(csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(csv_data)

print(f'CSV file "{csv_file}" has been generated with retrieval results.')


Query: Deep learning in computer vision
Precision: 0.8000
Recall: 0.8000
F1: 0.8000
MAP: 0.7400
NDCG: 0.9082

Similar documents for query: Deep learning in computer vision
Document id: 2353, Text: krsear ulkyvx louisvil edu kendal opusii sear subject hacker ethic line nntp post host ulkyvx louisvil edu organ univers louisvil hacker ethic hacker comfort emploi support famili gener take radic lead look radic old on gone hacker ick profession notic associ progress hacker profession distast occur seri thing happen ego outgrow talent knowledg financi situat take preced chang prioriti esp famili possibl lead hacker attitud make shift fun work vocat burn awai creativ need hobbi biggest killer imo dream sadli shatter hard rock societi version realiti dream motiv di motiv effort useless set problem stem children rememb complet differ system children grow choic msdo mac amiga enjoi divers rememb great fallout earli eighti vividli forc stop skill develop system dead divers system allow wide diver

  _warn_prf(average, modifier, msg_start, len(result))
