In [None]:
import csv
import gensim
from elasticsearch import Elasticsearch
from gensim.models import FastText
# Use the public datasets module: `sklearn.conftest` is a pytest configuration
# module, not a supported import location for the dataset loaders.
from sklearn.datasets import fetch_20newsgroups

# Load the dataset
newsgroups = fetch_20newsgroups(subset='all')

# Preprocess the documents: tokenize, drop stop words, stem, and re-join into
# one space-separated string per document.
# The stemmer and the stop-word set do not change between tokens, so they are
# created once here instead of once per token inside the loop.
stemmer = gensim.parsing.porter.PorterStemmer()
stop_words = gensim.parsing.preprocessing.STOPWORDS

preprocessed_docs = []
for doc in newsgroups.data:
    # Tokenize the document
    tokens = gensim.utils.simple_preprocess(doc.lower())
    # Remove stop words and stem the remaining tokens
    stemmed_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    # Join the stemmed tokens back into a string
    preprocessed_docs.append(' '.join(stemmed_tokens))

# Train the FastText model.
# gensim expects every sentence to be a list of tokens. `preprocessed_docs`
# holds space-joined strings, and a string would be iterated character by
# character, so each document is split back into its tokens first.
tokenized_docs = [doc.split() for doc in preprocessed_docs]
model = FastText(tokenized_docs, vector_size=300, window=5, min_count=5, workers=4)

# Save the model to disk (gensim's own format; reload it with FastText.load)
model.save('model.bin')

# Connect to the Elasticsearch node at the given URL
es = Elasticsearch(['http://localhost:9200'])

# Start from a clean slate: drop the index when a previous run left one behind
index_name = 'my_index'
index_already_present = es.indices.exists(index=index_name)
if index_already_present:
    es.indices.delete(index=index_name)

# Describe the two stored fields, then create the index with that mapping
text_field = {'type': 'text'}
vector_field = {'type': 'dense_vector', 'dims': 300}
index_mappings = {
    'mappings': {
        'properties': {
            'text': text_field,
            'vector': vector_field
        }
    }
}
es.indices.create(index=index_name, body=index_mappings)

# Index every preprocessed document together with the mean of its token vectors
for doc_id, doc_text in enumerate(preprocessed_docs):
    # Look up the embedding of each token the model can resolve
    token_vectors = [model.wv[token] for token in doc_text.split() if token in model.wv]
    if not token_vectors:
        # Nothing to average, so this document is left out of the index
        continue
    # Document vector = element-wise mean of its token vectors
    vector = sum(token_vectors) / len(token_vectors)
    # Store the document and its vector under its position in the corpus
    es.index(index=index_name, id=doc_id, body={'text': doc_text, 'vector': vector.tolist()})


In [None]:
import csv
import gensim
import elasticsearch
from sklearn.datasets import fetch_20newsgroups
import numpy as np

# Define NDCG calculation function
def calculate_ndcg(ranked_relevance, k):
    """Return NDCG@k for relevance grades listed in ranked order.

    Each of the first ``k`` results contributes a gain of ``2**rel - 1``
    divided by ``log2(rank + 2)``, where ``rank`` is its 0-based position.
    The ideal ordering is the same list sorted in descending order, so the
    score is normalised against the grades that were passed in.

    Args:
        ranked_relevance: Relevance grades of the results, best-ranked first.
        k: Number of leading results to take into account.

    Returns:
        DCG divided by the ideal DCG, or 0 when the ideal DCG is not positive.
    """
    def discounted_gain(relevances):
        # Accumulate the discounted gain over the first k entries, in order
        total = 0
        for position, rel in enumerate(relevances[:k]):
            total += (2**rel - 1) / np.log2(position + 2)
        return total

    best_possible = discounted_gain(sorted(ranked_relevance, reverse=True))
    if best_possible > 0:
        return discounted_gain(ranked_relevance) / best_possible
    return 0

# Open the CSV file for writing
with open('precision_recall_ndcg_map.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Query', 'Precision', 'Recall', 'NDCG', 'MAP', 'Score'])

    # NOTE: this cell uses the Elasticsearch client (`es`) and the trained
    # FastText model (`model`); both must already exist in the kernel.

    # Load the newsgroups dataset that serves as ground truth
    newsgroups = fetch_20newsgroups(subset='all')
    # Lower-case every document once instead of once per query
    lowered_docs = [doc.lower() for doc in newsgroups.data]

    # Loop-invariant preprocessing helpers
    stemmer = gensim.parsing.porter.PorterStemmer()
    stop_words = gensim.parsing.preprocessing.STOPWORDS

    # NOTE(review): 'economices' looks like a typo for 'economics'; it is left
    # unchanged so the query set stays the same - confirm before editing.
    user_queries = ['nature', 'news', 'sports', 'weather', 'economices', 'stockmarket', 'geography', 'chemistry', 'football', 'basketball', 'book', 'play', 'report', 'knife', 'moon', 'murder', 'evening', 'state', 'county', 'bounty', 'pen', 'mail']

    # Loop over user queries
    for user_query in user_queries:
        # Preprocess the user query the same way the documents were preprocessed
        tokens = gensim.utils.simple_preprocess(user_query.lower())
        stemmed_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
        if not stemmed_tokens:
            # Without tokens there is no query vector to average; skip the
            # query instead of failing on an empty lookup.
            print(f'Skipping query "{user_query}": no tokens left after preprocessing')
            continue

        # Query vector = mean of the token vectors
        query_vector = model.wv[stemmed_tokens].mean(axis=0).tolist()

        # Rank all documents by cosine similarity to the query vector.
        # "+ 1.0" shifts the similarity into a non-negative range for scoring.
        # No `size` is given, so Elasticsearch returns its default number of hits.
        search_body = {
            'query': {
                'script_score': {
                    'query': {
                        'match_all': {}
                    },
                    'script': {
                        'source': 'cosineSimilarity(params.query_vector, "vector") + 1.0',
                        'params': {
                            'query_vector': query_vector
                        }
                    }
                }
            },
            '_source': {
                'includes': ['text']
            }
        }
        search_results = es.search(index='my_index', body=search_body)['hits']['hits']
        retrieved_ids = [int(hit['_id']) for hit in search_results]
        scores = [hit['_score'] for hit in search_results]

        # Calculate precision and recall.
        # A document counts as relevant when the raw query string occurs in
        # its lower-cased text; document ids are positions in newsgroups.data.
        relevant_docs = {i for i, doc in enumerate(lowered_docs) if user_query in doc}
        retrieved_docs = set(retrieved_ids)
        relevant_and_retrieved = relevant_docs.intersection(retrieved_docs)
        precision = len(relevant_and_retrieved) / len(retrieved_docs) if len(retrieved_docs) > 0 else 0
        recall = len(relevant_and_retrieved) / len(relevant_docs) if len(relevant_docs) > 0 else 0

        # Calculate NDCG (Normalized Discounted Cumulative Gain)
        ranked_relevance = [1 if doc_id in relevant_docs else 0 for doc_id in retrieved_ids]
        k = min(len(ranked_relevance), 10)  # Consider the top 10 results for NDCG
        ndcg = calculate_ndcg(ranked_relevance, k)

        # Average precision: precision at each rank holding a relevant hit,
        # summed and divided by the total number of relevant documents.
        precisions_at_hits = []
        hits_so_far = 0
        for rank, is_relevant in enumerate(ranked_relevance, start=1):
            if is_relevant:
                hits_so_far += 1
                precisions_at_hits.append(hits_so_far / rank)

        if precisions_at_hits:
            map_score = sum(precisions_at_hits) / len(relevant_docs)
        else:
            map_score = 0

        # Per-query summary row; the Score cell is left empty so the row has
        # the same number of columns as the header.
        writer.writerow([user_query, precision, recall, ndcg, map_score, ''])

        # One additional row per retrieved hit, carrying that hit's score
        for score in scores:
            writer.writerow([user_query, precision, recall, ndcg, map_score, score])

        # Print the top 10 most similar documents
        print(f'Top 10 most similar documents for query "{user_query}":')
        for i, hit in enumerate(search_results[:10]):
            print(f'{i+1}. {hit["_source"]["text"]}')

        # Print Precision, Recall, NDCG, and MAP
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'NDCG: {ndcg}')
        print(f'MAP:{map_score}')
