Importing

In [4]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sqlalchemy import create_engine

In [5]:
#Loading the dataframes
nepberta_df = pd.read_pickle("D:\\Python\\ML-Projects\\Intern-Synapse\\TextVectorization\\najirs_NepBERTa.pkl")
bm25_df = pd.read_pickle('D:\\Python\\ML-Projects\\Intern-Synapse\\TextVectorization\\najirs_BM25.pkl')

nepberta_df.head()

Unnamed: 0,cleaned_text,embeddings
0,्तव्य ज्यान प्रकरण नं भवितव्य ठहर्न ज्यान लिने...,"[[-0.023295283, 0.04019555, 0.22646093, 0.1248..."
1,्प्रेषण प्रकरण नं रिट निवेदकलाई श्रम कार्यालयल...,"[[-0.18046029, 0.052735027, 0.23040603, 0.1659..."
2,्प्रेषण परमादेशको आदेश जारी गरिपाउँ प्रनं धेरै...,"[[-0.2502869, -0.0183751, 0.117184184, 0.09710..."
3,्प्रेषण प्रकरण नं जिल्ला धनुषा जनकपुर नगर पंचा...,"[[-0.2658006, -0.10030958, 0.21205233, 0.08202..."
4,्दीप्रत्यक्षीकरण प्रकरण नं सार्वजनिक सुरक्षा ऐ...,"[[-0.11624412, -0.095075056, 0.28900784, 0.041..."


In [6]:
# Check initial shapes
print(nepberta_df.shape, bm25_df.shape)

# Preprocess: Convert embeddings into single flat lists
def flatten_embedding(embedding):
    # Flatten the nested list entirely if it exists
    if isinstance(embedding, list) and len(embedding) > 0 and isinstance(embedding[0], list):
        return np.array(embedding[0])
    return np.array(embedding)

# Apply flattening
nepberta_df["flattened_embeddings"] = nepberta_df["embeddings"].apply(flatten_embedding)

# Expand the flattened embeddings into separate columns
expanded_embeddings = np.vstack(nepberta_df["flattened_embeddings"].values)

# Create a new DataFrame with embeddings
embedding_df = pd.DataFrame(expanded_embeddings)

# Verify shape
print(embedding_df.shape)  # Should be (9355, 768)

(9355, 2) (9355, 68328)
(9355, 768)


In [7]:
nepberta_array = embedding_df.values.astype(np.float64)
bm25_vectors = bm25_df.values          

Connecting With The IDs

In [8]:
from configparser import ConfigParser

#Initialize config parser
config = ConfigParser()
config.read("D:\\Python\\ML-Projects\\Intern-Synapse\\TextVectorization\\config_vector.ini")

#Accessing database details
db_config = config['database']
username = db_config['username']
pwd = db_config['password']
hostname = db_config['hostname']
port_id = int(db_config['port_id'])
database = db_config['database']

In [9]:
#Creating SQLAlchemy engine
from sqlalchemy import create_engine, Column, String
engine = create_engine(f'postgresql://{username}:{pwd}@{hostname}:{port_id}/{database}')

from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base

#Creating a session
Session = sessionmaker(bind=engine)
session = Session()

#Defining ORM model
Base = declarative_base()

class Najirs(Base):
    __tablename__ = "najirs_v4"

    id = Column(String, primary_key=True)
    content = Column(String)
    date = Column(String)
    data = Column(String)

#Query data safely
results = session.query(Najirs.id, Najirs.content, Najirs.data, Najirs.date).all()
df = pd.DataFrame(results, columns= ["id", "content", "data", "date"])

df.head()

  Base = declarative_base()


Unnamed: 0,id,content,data,date
0,१००३८,"<div class=""col-md-8 para-sections"">\n<div id=...",{'body': ['न्या.डा.आनन्दमोहन भट्टराई : न्याय प...,२०७४-१२-१९
1,४६१५,"<div class=""col-md-8 para-sections"">\n<div id=...","{'body': ['न्या.गजेन्द्रकेशरी वास्तोला', '१. ...",२०४९-०९-०८
2,११२१,"<div class=""col-md-8 para-sections"">\n<div id=...",{'body': ['न्या. धनेन्द्रबहादुर सिंह : प्रस्तु...,२०३५-०१-१२
3,२४०६,"<div class=""col-md-8 para-sections"">\n<div id=...",{'body': ['न्या.बब्बरप्रसाद सिंहः नेपालको संवि...,२०४२-०५-१९
4,३१४४,"<div class=""col-md-8 para-sections"">\n<div id=...",{'body': ['न्या.पृथ्वी बहादुर सिंहः नेपालको सं...,२०४४-०२-२५


In [10]:
df.drop(columns=['content', 'date'], inplace=True)

def is_principles_empty(content):
    return 'principles' not in content or not content['principles']

#Removing rows with empty 'principles'
empty_principles_mask = df['data'].apply(is_principles_empty)
empty_principles_count = empty_principles_mask.sum()
df = df[~empty_principles_mask]
print(f"Removed {empty_principles_count} rows with empty 'principles' key.")

df.drop(columns=['data'], inplace=True)

Removed 572 rows with empty 'principles' key.


In [11]:
df['id'] = df['id'].astype(str)
print(f"Shape: {df.shape}")
print(df.head())

Shape: (9355, 1)
      id
0  १००३८
1   ४६१५
2   ११२१
3   २४०६
4   ३१४४


In [13]:
from scipy.sparse import csr_matrix

# Convert your bm25_vectors into a numpy array if it's not already
bm25_vectors_array = np.array(bm25_vectors, dtype=np.float64)

# Convert the numpy array to a sparse CSR matrix
bm25_vectors_sparse = csr_matrix(bm25_vectors_array)

In [27]:
from joblib import Parallel, delayed

# Create an empty DataFrame to store the results
similarity_df = pd.DataFrame(columns=["document_id", "similar_document_ids"])

def get_top_similar_documents(query_id, top_n=10, weight_nepberta=0.6, weight_bm25=0.4, base_threshold=0.4, min_threshold = 0.3):
    global queries_with_few_similar, all_similarity_scores
    try:
        query_index = df.index[df['id'] == query_id].tolist()[0] 
        print(f"Query ID: {query_id}, Corresponding Index: {query_index}")
        
        query_vector = np.array(nepberta_array[query_index])
        #Compute cosine similarity for NepBERTa
        nepberta_sim = cosine_similarity(query_vector.reshape(1, -1), nepberta_array).flatten()
        
        #Compute cosine similarity for BM25 using sparse vectors
        query_bm25_vector = bm25_vectors_sparse[query_index]
        bm25_sim = query_bm25_vector.dot(bm25_vectors_sparse.T).toarray().flatten()

        #Normalize BM25 values to [-1, 1]
        bm25_sim_normalized = 2 * (bm25_sim / np.max(bm25_sim)) - 1
        
        #Ensembling similarity (weighted average)
        ensemble_sim = (weight_nepberta * nepberta_sim) + (weight_bm25 * bm25_sim_normalized)
        
        # Apply dynamic threshold adjustment
        threshold = base_threshold
        adjustment = 0.02
        iteration_count = 0
        max_iterations = 10

        while len(ensemble_sim[ensemble_sim >= threshold]) < 5 and iteration_count < max_iterations:
            threshold -= adjustment
                
            #Ensure threshold doesn't go below the minimum threshold
            if threshold < min_threshold:
                threshold = min_threshold
                break
                
            iteration_count += 1
        
        #Filter based on the adjusted threshold
        valid_indices = np.where(ensemble_sim >= threshold)[0]
        
        #Exclude the query index itself
        valid_indices = valid_indices[valid_indices != query_index]
        
        #Sort by similarity score and get top_n results
        top_indices = valid_indices[np.argsort(-ensemble_sim[valid_indices])]
        top_indices = top_indices[:top_n] if len(top_indices) >= top_n else top_indices
        
        #Map indices to document IDs
        top_similar_ids = [df.iloc[idx]['id'] for idx in top_indices]
        
        #Return the top similar document IDs
        return top_similar_ids
    
    except Exception as e:
        print(f"Error processing query_id {query_id}: {e}")
        return []

def populate_similarity_dataframe():
    global similarity_df
    
    for i in range(0, 8821, 1000):

        end_idx = min(i + 1000, 8821)
        query_ids = df['id'][i:end_idx].values.tolist()
        
        #Process in parallel
        results = Parallel(n_jobs=-1)(
            delayed(get_top_similar_documents)(query_id) for query_id in query_ids
        )
        
        temp_df = pd.DataFrame({
            "document_id": query_ids,
            "similar_document_ids": results
        })
        similarity_df = pd.concat([similarity_df, temp_df], ignore_index=True)
        
        print(f"Processed and added records from {i} to {end_idx - 1}")

populate_similarity_dataframe()

Processed and added records from 0 to 999
Processed and added records from 1000 to 1999
Processed and added records from 2000 to 2999
Processed and added records from 3000 to 3999
Processed and added records from 4000 to 4999
Processed and added records from 5000 to 5999
Processed and added records from 6000 to 6999
Processed and added records from 7000 to 7999
Processed and added records from 8000 to 8820


In [28]:
def check_similarity_values_exclude_query_with_threshold(start_index=8821, end_index=9354, base_threshold=0.35, top_n=10, min_threshold=0.25):
    global similarity_df 
    results = {}

    for i in range(start_index, end_index + 1):
        try:
            query_vector = np.array(nepberta_array[i])

            #Compute NepBERTa similarity
            nepberta_sim = cosine_similarity(query_vector.reshape(1, -1), nepberta_array).flatten()
            
            #Compute BM25 similarity
            query_bm25_vector = bm25_vectors_sparse[i]
            bm25_sim = query_bm25_vector.dot(bm25_vectors_sparse.T).toarray().flatten()
            
            #Normalize BM25 to [-1, 1]
            bm25_sim_normalized = 2 * (bm25_sim / np.max(bm25_sim)) - 1
            
            #Ensemble similarity
            ensemble_sim = (0.6 * nepberta_sim) + (0.4 * bm25_sim_normalized)
            
            #Exclude the query index itself
            ensemble_sim[i] = -np.inf
        
            #Apply dynamic threshold adjustment
            threshold = base_threshold
            adjustment = 0.02
            iteration_count = 0
            max_iterations = 10

            while len(ensemble_sim[ensemble_sim >= threshold]) < 5 and iteration_count < max_iterations:
                threshold -= adjustment 
                
                #Ensure threshold doesn't go below the minimum threshold
                if threshold < min_threshold:
                    threshold = min_threshold
                    break
                
                iteration_count += 1
            
            #Filter valid indices based on the adjusted threshold
            valid_indices = np.where(ensemble_sim >= threshold)[0]

            top_indices = valid_indices[np.argsort(-ensemble_sim[valid_indices])]

            #Keep only the top N similar documents, or fewer if there aren't enough
            if len(top_indices) > top_n:
                top_indices = top_indices[:top_n]

            #Store the results for valid indices within the threshold
            similar_document_ids = [df.iloc[idx]["id"] for idx in top_indices if idx != i]  # Exclude the query index itself

            #Create a DataFrame with the results to append to similarity_df
            temp_df = pd.DataFrame({
                "document_id": df.iloc[i]["id"],  # Return the initial query's index as document_id
                "similar_document_ids": [similar_document_ids]
            })

            #Append the temporary DataFrame to the main similarity DataFrame
            similarity_df = pd.concat([similarity_df, temp_df], ignore_index=True)

            #Save the result for the current document
            results[i] = {
                "document_id": df.iloc[i]["id"],  # The query's document ID
                "similar_document_ids": similar_document_ids,
                "threshold": threshold,
                "similarities": ensemble_sim[top_indices]
            }

        except Exception as e:
            print(f"Error processing index {i}: {e}")
            results[i] = {"error": str(e)}

    return results

similarity_results_exclude_query = check_similarity_values_exclude_query_with_threshold(8820, 9354)

#Print results
for idx, res in similarity_results_exclude_query.items():
    if "error" in res:
        print(f"Index {idx}: {res['error']}")
    else:
        print(f"Document ID: {res['document_id']} - Most Similar Document IDs: {res['similar_document_ids']}")

Document ID: ९४९ - Most Similar Document IDs: ['४१४२', '३६२८', '४६७२']
Document ID: ४६३६ - Most Similar Document IDs: ['४१२४', '६०७८', '४२२१', '४०६८', '४१२३', '८६१०', '३९८९', '४७८४', '३११४', '४६१९']
Document ID: ६२८७ - Most Similar Document IDs: ['७२८२', '६४३७', '१०२२९', '६७९३', '३७७४']
Document ID: १००१३ - Most Similar Document IDs: ['९६३३', '९३४६', '१०३६१', '१०३७५', '१०८९५', '१०१४४', '१०४०८', '९५८०', '९८६८']
Document ID: २४२५ - Most Similar Document IDs: ['२५५२', '३७३४', '७७५९', '४५६१', '६७८९', '३४८४']
Document ID: १३५२ - Most Similar Document IDs: ['४४३', '१६६५', '४११', '९९१०', '५३६', '२३७']
Document ID: ८८९४ - Most Similar Document IDs: ['९८३४', '९७४५', '८९७७', '१०७१३', '७९१२', '९४८१', '९३०८', '९५००', '९५३४', '७०१५']
Document ID: २४३९ - Most Similar Document IDs: ['२४५४', '१८४०', '४८४३', '१४८६', '४६२९', '११२', '७६०७', '३९१९', '१७५९']
Document ID: १३४६ - Most Similar Document IDs: ['६५५९', '१४२१', '४३६१', '१३६३', '८६५६']
Document ID: ६५८६ - Most Similar Document IDs: ['७५५३', '६४०४'

In [25]:
# Calculate the length of each list in the 'similar_document_ids' column
similarity_df['list_length'] = similarity_df['similar_document_ids'].apply(lambda x: len(x) if x else 0)

# Calculate the average length of the lists
average_length = similarity_df['list_length'].mean()
print(f"The average length of the lists in the 'similar_document_ids' column is: {average_length:.2f}")

# Count the number of rows with empty lists in 'similar_document_ids'
empty_list_count = similarity_df['similar_document_ids'].apply(lambda x: isinstance(x, list) and len(x) == 0).sum()
print("Count of documents with no similar documents: ", empty_list_count)

# Get the indices of the rows with no similar documents
empty_list_indices = similarity_df[similarity_df['similar_document_ids'].apply(lambda x: isinstance(x, list) and len(x) == 0)].index.tolist()
print(f"Indices of documents with no similar documents: {empty_list_indices}")

The average length of the lists in the 'similar_document_ids' column is: 6.80
Count of documents with no similar documents:  20
Indices of documents with no similar documents: [11, 30, 42, 130, 139, 168, 176, 263, 291, 322, 354, 361, 376, 387, 399, 436, 482, 485, 516, 518]


In [29]:
from pathlib import Path
import json

#Your previous code
base_path = Path.cwd() 
config_path = base_path / "config.ini"

config = ConfigParser()
config.read(config_path)

#Access database details
db_config = config['database']
username = db_config['username']
pwd = db_config['password']
hostname = db_config['hostname']
port_id = int(db_config['port_id'])
database = db_config['database']

#Create engine
engine = create_engine(f'postgresql://{username}:{pwd}@{hostname}:{port_id}/{database}')

#Create session
Session = sessionmaker(bind=engine)
session = Session()

#Define ORM model
Base = declarative_base()

class Results(Base):
    __tablename__ = "document_similarities"

    document_id = Column(String, primary_key=True)
    similar_document_ids = Column(String)

#Assuming you have a DataFrame `similarity_df` loaded
similarity_df = similarity_df[["document_id", "similar_document_ids"]]

#Loop through rows and save to the database
for _, row in similarity_df.iterrows():
    similar_document_ids = row["similar_document_ids"]

    similar_document_ids = json.dumps([similar_document_ids])

    #Check if the document_id already exists
    existing_row = session.query(Results).filter_by(document_id=row["document_id"]).first()
    if existing_row:
        existing_row.similar_document_ids = similar_document_ids
    else:
        new_row = Results(
            document_id=row["document_id"],
            similar_document_ids=similar_document_ids
        )
        session.add(new_row)

#Commit the changes
session.commit()

print("Data added to the document_similarities table successfully!")

#Close the session
session.close()

  Base = declarative_base()


Data added to the document_similarities table successfully!
