In [None]:
!pip install torch
!pip install sentence-transformers
!pip install pandas
!pip install matplotlib
!pip install --upgrade transformers sentence-transformers
!pip install seaborn
!pip install tqdm
!pip install ipywidgets

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import torch
from tqdm import tqdm

In [3]:
# Load the LHS and RHS CSV files into data frames
lhs_csv_path = r"C:\Users\Yasvanth.Pamidi\OneDrive - ENCORA\Desktop\DataMap\LHS.csv"  # path to file1
rhs_csv_path = r"C:\Users\Yasvanth.Pamidi\OneDrive - ENCORA\Desktop\DataMap\RHS.csv"  # path to file2
df1 = pd.read_csv(lhs_csv_path)
df2 = pd.read_csv(rhs_csv_path)

In [4]:
# Define the columns to compare and convert them to plain strings.
# Missing values are replaced with "" first: a bare astype(str) would turn NaN into
# the literal text "nan", which would then be embedded like a real description.

column1 = 'Description' #column name in df1
df1[column1] = df1[column1].fillna("").astype(str)

column2 = 'Description' #column name in df2
df2[column2] = df2[column2].fillna("").astype(str)

In [None]:
# A cell renders only its last expression, so the shape is printed explicitly and the
# preview of df1 is left as the final expression (previously its output was discarded).
print("df2 shape:", df2.shape)
df1.head()

In [None]:

# Load the pre-trained sentence-embedding model by name
model_name = 'sentence-transformers/paraphrase-mpnet-base-v2'
model = SentenceTransformer(model_name)

In [None]:
# Sanity check of the tokenization on one sample description, using the tokenizer
# attached to the loaded model (generic strategies followed by SBERT: WordPiece, byte-pair encoding).
tokenizer= model.tokenizer
text = "products do not contain ingredients of meat, fish, fowl, animal by-products, eggs or egg products, milk or milk products, honey or honey bee products. Involve no animal testing of ingredients by supplier, producer, manufacturer or independent party."
# add_special_tokens=True asks the tokenizer to include its special tokens in the encoded sequence
tokens = tokenizer.encode(text, add_special_tokens = True)
print("Number of Tokens: ", len(tokens))
print("Tokens", tokens)

# Decode the encoded sequence again so the round trip can be inspected by eye
decoded_text = tokenizer.decode(tokens)
print(decoded_text)

In [None]:
#embeddings for selected rows

#function to encode sentences in batches
def batch_encode(column, batch_size, model):
    """Encode every entry of ``column`` with ``model`` and return one stacked tensor.

    Args:
        column: Object exposing ``tolist()`` (e.g. a pandas Series) that yields the sentences.
        batch_size: Number of sentences handed to ``model.encode`` per call; must be positive.
        model: Object whose ``encode`` method accepts ``batch_size``, ``convert_to_tensor``
            and ``show_progress_bar`` and returns a tensor for each batch.

    Returns:
        torch.Tensor: the per-batch results concatenated along dimension 0.

    Raises:
        ValueError: if ``batch_size`` is not positive or ``column`` holds no entries.
    """
    sentences = column.tolist()
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer.")
    if not sentences:
        # torch.cat on an empty list fails with an opaque error; fail early and clearly.
        raise ValueError("column is empty; there is nothing to encode.")

    embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(sentences), batch_size), desc = "Encoding Batches"):
            batch = sentences[start:start + batch_size]
            # Forward batch_size so the chunk is processed with the size the caller asked
            # for; without it encode() uses its own default and splits the chunk again.
            # The inner progress bar is switched off because the outer tqdm already
            # reports progress once per batch.
            batch_embeddings = model.encode(
                batch,
                batch_size=batch_size,
                convert_to_tensor=True,
                show_progress_bar=False,
            )
            embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)


# Encode both description columns; positional arguments are (column, batch_size, model)
embeddings1 = batch_encode(df1[column1], 128, model)
embeddings2 = batch_encode(df2[column2], 128, model)


#embeddings1 = model.encode(df1[column1].to_list(), show_progress_bar=True,convert_to_tensor=True)
#embeddings2 = model.encode(df2[column2].to_list(), show_progress_bar=True,convert_to_tensor=True)

In [None]:
# A cell renders only its last expression, so both shapes are returned together as a
# tuple (previously the first shape was silently discarded).
embeddings1.shape, embeddings2.shape


In [10]:
# Cosine similarity between the two embedding sets; the values range from -1 to 1

cosine_scores = util.cos_sim(embeddings1, embeddings2)
similarity_matrix = cosine_scores.cpu().numpy()

In [13]:
# Rescale the similarity scores so that they range from 0 to 1 instead of -1 to 1
normalized_similarity_matrix = (similarity_matrix + 1) * 0.5


In [None]:
# Shape of the rescaled similarity matrix
normalized_similarity_matrix.shape

In [None]:
# Print the rescaled similarity matrix for a visual check
print(normalized_similarity_matrix)

In [20]:
# Data-type compatibility table, needed because the two sides come from different databases.
# Each key is a source-side type; its list holds the target-side types that the
# matching code accepts for it (looked up with type_mapping.get(lhs_type, [])).
type_mapping = dict(
    CHAR=["VARCHAR"],
    NUMC=["INTEGER", "BIGINT", "NUMERIC"],
    DATS=["DATE"],
    TIMS=["TIME"],
    DEC=["NUMERIC", "FLOAT"],
    INT4=["INTEGER", "BIGINT"],
)


In [21]:
import os
import json

# Directory that load_mappings scans for JSON mapping files (absolute, machine-specific path)
mappings_dir = r"C:\Users\Yasvanth.Pamidi\OneDrive - ENCORA\Desktop\VSC\DataMapper\trails\mappings"

# Load mapping files from the mappings directory
def load_mappings(mappings_dir):
    """Load every ``*.json`` file found directly in ``mappings_dir``.

    Args:
        mappings_dir: Directory that contains the JSON mapping files.

    Returns:
        dict: file name without extension (used as the database name) mapped to the
        parsed JSON content, inserted in sorted file-name order.
    """
    mappings = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the resulting
    # dictionary order reproducible between runs and machines.
    for filename in sorted(os.listdir(mappings_dir)):
        if filename.endswith(".json"):
            database_name = os.path.splitext(filename)[0]  # Get database name from filename
            # "utf-8-sig" reads plain UTF-8 and also accepts a leading byte-order mark,
            # independent of the platform's default text encoding.
            with open(os.path.join(mappings_dir, filename), "r", encoding="utf-8-sig") as file:
                mappings[database_name] = json.load(file)
    return mappings


In [22]:
# Function to normalize data types using mappings
def normalize_data_type(data_type, database, mappings):
    """Translate ``data_type`` into the standard type name registered for ``database``.

    ``mappings[database]`` is read as ``{standard_type: container_of_database_types}``.
    The upper-cased ``data_type`` is tested for membership in each container in turn and
    the first standard type whose container holds it is returned. If the database
    has no entry or nothing matches, the upper-cased input is returned instead.
    """
    per_database = mappings.get(database, {})
    return next(
        (
            standard_type
            for standard_type, native_types in per_database.items()
            if data_type.upper() in native_types
        ),
        data_type.upper(),
    )

ONE TO MANY - The code below retrieves the top-N most similar RHS entries for a given LHS index and reports, for each match, whether its data type is compatible or not.

In [None]:
import json
import numpy as np
import pandas as pd

# Function to retrieve top-N most similar sentences with data type validation
def retrieve_top_similar_sentences_json(selected_index, top_n, similarity_matrix, df1, df2, column1, column2, type_mapping):
    """Return the ``top_n`` RHS rows most similar to one LHS row, each flagged for type compatibility.

    Args:
        selected_index: Positional row number of the LHS entry (row of ``similarity_matrix`` and ``df1``).
        top_n: Number of matches to return; values <= 0 yield no matches.
        similarity_matrix: Array whose row ``i`` holds the scores of LHS row ``i`` against every RHS row.
        df1: LHS frame with the columns ``'Field Name'``, ``'Data_Type'`` and ``column1``.
        df2: RHS frame with the columns ``'Attribute'``, ``'Data_Type'`` and ``column2``.
        column1: Name of the description column in ``df1``.
        column2: Name of the description column in ``df2``.
        type_mapping: Maps an LHS data type to the RHS data types regarded as compatible.

    Returns:
        dict: ``{"results": [result]}`` where ``result`` carries the LHS details and a
        ``"matches"`` list ordered by descending score.
    """
    similarities = similarity_matrix[selected_index]
    # Reverse first, then slice: `[-top_n:]` with top_n == 0 would select every entry instead of none.
    top_indices = np.argsort(similarities)[::-1][:max(top_n, 0)]

    # argsort yields positions, so rows are read positionally (.iloc). Label-based .loc
    # only coincides with positions while the frames keep their default RangeIndex.
    lhs_field = df1['Field Name'].iloc[selected_index]  # LHS field name
    lhs_desc = df1[column1].iloc[selected_index]        # LHS description
    lhs_type = df1['Data_Type'].iloc[selected_index]    # LHS data type
    compatible_types = type_mapping.get(lhs_type, [])   # looked up once, not once per match

    matches = []
    for rank, idx in enumerate(top_indices, start=1):
        rhs_type = df2['Data_Type'].iloc[idx]
        matches.append({
            "rank": rank,
            "rhs_field_score": float(similarities[idx]),
            "rhs_index": int(df2.index[idx]),
            "rhs_field_name": df2['Attribute'].iloc[idx],   # RHS attribute name
            "rhs_field_desc": df2[column2].iloc[idx],       # RHS description
            "rhs_data_type": rhs_type,                      # RHS data type
            "compatibility": "Compatible" if rhs_type in compatible_types else "Incompatible"
        })

    # Construct result dictionary
    result = {
        "lhs_field_index": selected_index,
        "lhs_field_name": lhs_field,
        "lhs_field_description": lhs_desc,
        "lhs_field_data_type": lhs_type,
        "matches": matches
    }

    # Return final JSON output
    return {"results": [result]}


# Ask for the LHS row and for N (an empty answer for N falls back to 3)
enter_index = int(input("Enter the index from LHS to process: "))
top_n = int(input("Enter the number of top similar sentences to retrieve (N): ") or 3)

# Run the lookup with explicitly named arguments and show the result
output_json = retrieve_top_similar_sentences_json(
    selected_index=enter_index,
    top_n=top_n,
    similarity_matrix=normalized_similarity_matrix,
    df1=df1,
    df2=df2,
    column1=column1,
    column2=column2,
    type_mapping=type_mapping,
)

print(json.dumps(output_json, indent=2))


ONE TO MANY - The code below retrieves the top-N most similar RHS entries for a given LHS index and keeps only the matches whose data types are compatible.

In [None]:
import json
import numpy as np
import pandas as pd

# Function to retrieve top-N most similar sentences with data type validation
def retrieve_top_similar_sentences_json(selected_index, top_n, similarity_matrix, df1, df2, column1, column2, type_mapping):
    """Return the type-compatible entries among the ``top_n`` RHS rows most similar to one LHS row.

    The ``top_n`` best-scoring RHS rows are selected first and the incompatible ones are
    then dropped, so fewer than ``top_n`` matches may come back; ``rank`` keeps the
    position each match had before filtering.

    Args:
        selected_index: Positional row number of the LHS entry (row of ``similarity_matrix`` and ``df1``).
        top_n: Number of best-scoring RHS rows to inspect; values <= 0 yield no matches.
        similarity_matrix: Array whose row ``i`` holds the scores of LHS row ``i`` against every RHS row.
        df1: LHS frame with the columns ``'Field Name'``, ``'Data_Type'`` and ``column1``.
        df2: RHS frame with the columns ``'Attribute'``, ``'Data_Type'`` and ``column2``.
        column1: Name of the description column in ``df1``.
        column2: Name of the description column in ``df2``.
        type_mapping: Maps an LHS data type to the RHS data types regarded as compatible.

    Returns:
        dict: ``{"results": [result]}`` where ``result`` carries the LHS details and the
        filtered ``"matches"`` list ordered by descending score.
    """
    similarities = similarity_matrix[selected_index]
    # Reverse first, then slice: `[-top_n:]` with top_n == 0 would select every entry instead of none.
    top_indices = np.argsort(similarities)[::-1][:max(top_n, 0)]

    # argsort yields positions, so rows are read positionally (.iloc). Label-based .loc
    # only coincides with positions while the frames keep their default RangeIndex.
    lhs_field = df1['Field Name'].iloc[selected_index]  # LHS field name
    lhs_desc = df1[column1].iloc[selected_index]        # LHS description
    lhs_type = df1['Data_Type'].iloc[selected_index]    # LHS data type
    compatible_types = type_mapping.get(lhs_type, [])   # looked up once, not once per match

    matches = []
    for rank, idx in enumerate(top_indices, start=1):
        rhs_type = df2['Data_Type'].iloc[idx]
        if rhs_type not in compatible_types:
            continue  # only compatible matches are reported by this variant
        matches.append({
            "rank": rank,
            "rhs_field_score": float(similarities[idx]),
            "rhs_index": int(df2.index[idx]),
            "rhs_field_name": df2['Attribute'].iloc[idx],   # RHS attribute name
            "rhs_field_desc": df2[column2].iloc[idx],       # RHS description
            "rhs_data_type": rhs_type,                      # RHS data type
            "compatibility": "Compatible"
        })

    # Construct result dictionary
    result = {
        "lhs_field_index": selected_index,
        "lhs_field_name": lhs_field,
        "lhs_field_description": lhs_desc,
        "lhs_field_data_type": lhs_type,
        "matches": matches
    }

    # Return final JSON output
    return {"results": [result]}


# Ask for the LHS row and for N (an empty answer for N falls back to 3)
enter_index = int(input("Enter the index from LHS to process: "))
top_n = int(input("Enter the number of top similar sentences to retrieve (N): ") or 3)

# Run the lookup with explicitly named arguments and show the result
output_json = retrieve_top_similar_sentences_json(
    selected_index=enter_index,
    top_n=top_n,
    similarity_matrix=normalized_similarity_matrix,
    df1=df1,
    df2=df2,
    column1=column1,
    column2=column2,
    type_mapping=type_mapping,
)

print(json.dumps(output_json, indent=2))


Refactored version: a single function with an optional data-type compatibility filter.

In [None]:
import json
import numpy as np
import pandas as pd

# Unified function to retrieve top-N most similar sentences
def retrieve_top_similar_sentences_json(selected_index, top_n, similarity_matrix, df1, df2, column1, column2, type_mapping, filter_compatible=True):
    """Return the ``top_n`` RHS rows most similar to one LHS row, optionally keeping only compatible ones.

    Args:
        selected_index: Positional row number of the LHS entry (row of ``similarity_matrix`` and ``df1``).
        top_n: Number of best-scoring RHS rows to inspect; values <= 0 yield no matches.
        similarity_matrix: Array whose row ``i`` holds the scores of LHS row ``i`` against every RHS row.
        df1: LHS frame with the columns ``'Field Name'``, ``'Data_Type'`` and ``column1``.
        df2: RHS frame with the columns ``'Attribute'``, ``'Data_Type'`` and ``column2``.
        column1: Name of the description column in ``df1``.
        column2: Name of the description column in ``df2``.
        type_mapping: Maps an LHS data type to the RHS data types regarded as compatible.
        filter_compatible: When true, incompatible matches are dropped (their ranks are
            not reassigned); when false, every match is returned with a compatibility flag.

    Returns:
        dict: ``{"results": [result]}`` where ``result`` carries the LHS details and a
        ``"matches"`` list ordered by descending score.
    """
    similarities = similarity_matrix[selected_index]
    # Reverse first, then slice: `[-top_n:]` with top_n == 0 would select every entry instead of none.
    top_indices = np.argsort(similarities)[::-1][:max(top_n, 0)]

    # argsort yields positions, so rows are read positionally (.iloc). Label-based .loc
    # only coincides with positions while the frames keep their default RangeIndex.
    lhs_field = df1['Field Name'].iloc[selected_index]
    lhs_desc = df1[column1].iloc[selected_index]
    lhs_type = df1['Data_Type'].iloc[selected_index]
    compatible_types = type_mapping.get(lhs_type, [])  # looked up once, not once per match

    matches = []
    for rank, idx in enumerate(top_indices, start=1):
        rhs_type = df2['Data_Type'].iloc[idx]
        is_compatible = rhs_type in compatible_types
        if filter_compatible and not is_compatible:
            continue
        matches.append({
            "rank": rank,
            "rhs_field_score": float(similarities[idx]),
            "rhs_index": int(df2.index[idx]),
            "rhs_field_name": df2['Attribute'].iloc[idx],
            "rhs_field_desc": df2[column2].iloc[idx],
            "rhs_data_type": rhs_type,
            "compatibility": "Compatible" if is_compatible else "Incompatible"
        })

    # Construct and return the result
    return {
        "results": [
            {
                "lhs_field_index": selected_index,
                "lhs_field_name": lhs_field,
                "lhs_field_description": lhs_desc,
                "lhs_field_data_type": lhs_type,
                "matches": matches
            }
        ]
    }

# Ask for the LHS row, for N (an empty answer falls back to 3) and for the filter switch
enter_index = int(input("Enter the index from LHS to process: "))
top_n = int(input("Enter the number of top similar sentences to retrieve (N): ") or 3)
filter_answer = input("Do you want to filter only compatible matches? (yes/no): ")
filter_compatible = filter_answer.strip().lower() == "yes"

# Run the lookup with explicitly named arguments
output_json = retrieve_top_similar_sentences_json(
    selected_index=enter_index,
    top_n=top_n,
    similarity_matrix=normalized_similarity_matrix,
    df1=df1,
    df2=df2,
    column1=column1,
    column2=column2,
    type_mapping=type_mapping,
    filter_compatible=filter_compatible,
)

# Show the result as indented JSON
print(json.dumps(output_json, indent=2))


JSON-formatted output for a chosen number of consecutive LHS rows: for each row, the chosen number of most similar RHS entries and their fields, in descending order of similarity.


In [None]:
import json

# Function to retrieve top-N most similar sentences with data type validation
def retrieve_top_similar_sentences_json(num_sentences, top_n, normalized_similarity_matrix, df1, df2, column1, column2, type_mapping):
    """Collect, for the first ``num_sentences`` LHS rows, the ``top_n`` best type-compatible RHS matches.

    For every LHS row the RHS rows are visited in descending score order and compatible
    ones are collected until ``top_n`` have been found. ``rank`` is the position of the
    match in the full descending ordering, so it can exceed ``top_n``.

    Args:
        num_sentences: Number of leading LHS rows to process; it is capped at the number
            of rows actually available.
        top_n: Maximum number of compatible matches per LHS row.
        normalized_similarity_matrix: Array whose row ``i`` holds the scores of LHS row ``i``
            against every RHS row.
        df1: LHS frame with the columns ``'Field Name'``, ``'Data_Type'`` and ``column1``.
        df2: RHS frame with the columns ``'Attribute'``, ``'Data_Type'`` and ``column2``.
        column1: Name of the description column in ``df1``.
        column2: Name of the description column in ``df2``.
        type_mapping: Maps an LHS data type to the RHS data types regarded as compatible.

    Returns:
        dict: ``{"results": [...]}`` with one entry per processed LHS row.
    """
    results = []  # List to hold all results

    # Asking for more rows than exist used to end in an IndexError; cap the request instead.
    row_count = min(num_sentences, len(df1), len(normalized_similarity_matrix))

    for selected_index in range(row_count):
        # Similarity scores of this LHS row against every RHS row
        similarities = normalized_similarity_matrix[selected_index]

        # All RHS positions, best score first
        top_indices = np.argsort(similarities)[::-1]

        matches = []
        lhs_type = df1['Data_Type'].iloc[selected_index]
        compatible_types = type_mapping.get(lhs_type, [])  # looked up once per LHS row

        for rank, idx in enumerate(top_indices, start=1):
            if len(matches) >= top_n:  # Stop once top N compatible matches are collected
                break

            rhs_type = df2['Data_Type'].iloc[idx]
            if rhs_type in compatible_types:  # Only add compatible matches
                matches.append({
                    "rank": rank,
                    "rhs_field_score": float(similarities[idx]),
                    "rhs_index": int(df2.index[idx]),
                    "rhs_field_name": df2['Attribute'].iloc[idx],
                    "rhs_field_desc": df2[column2].iloc[idx],
                    "rhs_data_type": rhs_type,
                    "compatibility": "Compatible"
                })

        # Result keys carry the LHS row number as a suffix
        results.append({
            f"lhs_field_{selected_index}": df1['Field Name'].iloc[selected_index],
            f"lhs_desc_{selected_index}": df1[column1].iloc[selected_index],
            f"lhs_data_type_{selected_index}": lhs_type,
            "matches": matches
        })

    # Return the final output as JSON
    return {"results": results}


# Ask how many LHS rows to process and how many matches to keep per row
# (an empty answer falls back to 3 in both cases)
num_sentences_to_process = int(input("Enter the number of sentences from LHS to process: ") or 3)
top_n = int(input("Enter the number of top similar sentences to retrieve (N): ") or 3)

# Run the lookup with explicitly named arguments
output_json = retrieve_top_similar_sentences_json(
    num_sentences=num_sentences_to_process,
    top_n=top_n,
    normalized_similarity_matrix=normalized_similarity_matrix,
    df1=df1,
    df2=df2,
    column1=column1,
    column2=column2,
    type_mapping=type_mapping,
)

# Show the result as indented JSON
print(json.dumps(output_json, indent=2))

Practice from here; don't change the cells above.

In [88]:
# Directory that load_mappings scans for JSON mapping files (absolute, machine-specific path)
mappings_dir = r"C:\Users\Yasvanth.Pamidi\OneDrive - ENCORA\Desktop\VSC\DataMapper\versions\mappings"


In [71]:
# Load mapping files from the mappings directory
def load_mappings(mappings_dir):
    """
    Load all JSON mapping files from the specified directory.

    Args:
        mappings_dir: Directory that contains the JSON mapping files.

    Returns:
        dict: database name (file name without extension) mapped to the parsed JSON
        content, inserted in sorted file-name order.

    Raises:
        FileNotFoundError: if ``mappings_dir`` does not exist.
        ValueError: if the directory contains no ``.json`` file.
    """
    mappings = {}
    if not os.path.exists(mappings_dir):
        raise FileNotFoundError(f"Mappings directory '{mappings_dir}' does not exist.")

    # os.listdir returns entries in arbitrary order; sorting keeps the resulting
    # dictionary order reproducible between runs and machines.
    for filename in sorted(os.listdir(mappings_dir)):
        if filename.endswith(".json"):
            database_name = os.path.splitext(filename)[0]
            # "utf-8-sig" reads plain UTF-8 and also accepts a leading byte-order mark,
            # independent of the platform's default text encoding.
            with open(os.path.join(mappings_dir, filename), "r", encoding="utf-8-sig") as file:
                mappings[database_name] = json.load(file)

    if not mappings:
        raise ValueError(f"No mapping files found in '{mappings_dir}'.")

    return mappings

In [72]:
# Function to normalize data types using mappings
# Earlier draft, kept for reference only:
#def normalize_data_type(data_type, database, mappings):
#    """Normalize a database-specific data type to a SQL:2023 standard using the provided mappings.
#    If no mapping is found, return the original data type.
#    """
#    database_mapping = mappings.get(database, {})
#    return database_mapping.get(data_type.lower(), data_type.lower())  # Use .lower() for case-insensitive matching

def normalize_data_type_with_database(data_type, mappings):
    """Look ``data_type`` up in every database mapping, ignoring case.

    ``mappings`` is read as ``{database: {database_type: normalized_type}}``. The first
    key equal to ``data_type`` (case-insensitively) decides the result.

    Returns:
        tuple: ``(normalized type in upper case, database name)`` for the first hit, or
        ``(data_type in upper case, None)`` when no mapping contains the type.
    """
    wanted = data_type.lower()
    hits = (
        (normalized.upper(), database)
        for database, database_mapping in mappings.items()
        for native, normalized in database_mapping.items()
        if native.lower() == wanted
    )
    # The generator is lazy, so only the first hit (if any) is ever computed.
    return next(hits, (data_type.upper(), None))



In [None]:
import os
import json
import numpy as np
import pandas as pd

# Function to load mapping files from the mappings directory
def load_mappings(mappings_dir):
    """
    Load all JSON mapping files from the specified directory, skipping ``compatibilities.json``.

    Args:
        mappings_dir: Directory that contains the JSON mapping files.

    Returns:
        dict: database name (file name without extension) mapped to the parsed JSON
        content, inserted in sorted file-name order.

    Raises:
        FileNotFoundError: if ``mappings_dir`` does not exist.
        ValueError: if no mapping file is found.
    """
    mappings = {}
    if not os.path.exists(mappings_dir):
        raise FileNotFoundError(f"Mappings directory '{mappings_dir}' does not exist.")

    # os.listdir returns entries in arbitrary order; sorting keeps the resulting
    # dictionary order reproducible between runs and machines.
    for filename in sorted(os.listdir(mappings_dir)):
        if filename.endswith(".json") and filename != "compatibilities.json":
            database_name = os.path.splitext(filename)[0]
            # "utf-8-sig" reads plain UTF-8 and also accepts a leading byte-order mark,
            # independent of the platform's default text encoding.
            with open(os.path.join(mappings_dir, filename), "r", encoding="utf-8-sig") as file:
                mappings[database_name] = json.load(file)

    if not mappings:
        raise ValueError(f"No mapping files found in '{mappings_dir}'.")

    return mappings

# Function to load compatibilities from a JSON file
def load_compatibilities(file_path):
    """
    Load data type compatibilities from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content.
    """
    # "utf-8-sig" reads plain UTF-8 and also accepts a leading byte-order mark,
    # independent of the platform's default text encoding.
    with open(file_path, 'r', encoding="utf-8-sig") as file:
        return json.load(file)
    
# Function to normalize data types using mappings

def normalize_data_type_with_database(data_type, mappings):
    """Resolve ``data_type`` against every database mapping, ignoring case.

    ``mappings`` is read as ``{database: {database_type: normalized_type}}``.

    Returns:
        tuple: ``(normalized type in upper case, database name)`` for the first key that
        equals ``data_type`` case-insensitively, or ``(data_type in upper case, None)``
        when no mapping contains the type.
    """
    needle = data_type.lower()

    for database in mappings:
        for native_type, normalized_type in mappings[database].items():
            if native_type.lower() != needle:
                continue
            # First hit wins: report the normalized type together with its database.
            return normalized_type.upper(), database

    # Nothing matched in any database.
    return data_type.upper(), None

# Function to check compatibility of data types using compatibilities
def are_compatible(lhs_type, rhs_type, compatibilities):
    """
    Tell whether ``rhs_type`` is acceptable for ``lhs_type``.

    Equal types are always compatible; otherwise ``rhs_type`` must be listed under
    ``lhs_type`` in ``compatibilities``. The comparison is case-sensitive and the
    lookup is one-directional (keyed by ``lhs_type`` only).
    """
    if lhs_type == rhs_type:
        return True
    accepted = compatibilities.get(lhs_type, [])
    return rhs_type in accepted


# Function to retrieve top-N most similar fields with data type validation
def retrieve_top_similar_sentences_json(selected_index, top_n, similarity_matrix, df1, df2, column1, column2, type_mapping, filter_compatible=True):
    """Return the ``top_n`` RHS rows most similar to one LHS row after normalizing and checking data types.

    The function relies on ``normalize_data_type``, ``are_compatible`` and a module-level
    ``compatibilities`` dictionary being defined before it is called.

    Args:
        selected_index: Positional row number of the LHS entry (row of ``similarity_matrix`` and ``df1``).
        top_n: Number of best-scoring RHS rows to inspect; values <= 0 yield no matches.
        similarity_matrix: Array of shape ``(len(df1), len(df2))`` with the similarity scores.
        df1: LHS frame with the columns ``'Field Name'``, ``'Data_Type'`` and ``column1``.
        df2: RHS frame with the columns ``'Attribute'``, ``'Data_Type'``, ``column2`` and,
            optionally, ``'Database'``.
        column1: Name of the description column in ``df1``.
        column2: Name of the description column in ``df2``.
        type_mapping: Mappings handed to ``normalize_data_type``.
        filter_compatible: When true, incompatible matches are dropped; when false they
            are kept and flagged ``"Incompatible"``.

    Returns:
        dict: ``{"results": [result]}``; ``result["matches"]`` holds a single message
        entry when nothing is left to report.

    Raises:
        ValueError: if ``similarity_matrix`` does not have the shape ``(len(df1), len(df2))``.
    """
    # Ensure the similarity matrix dimensions match the datasets
    if similarity_matrix.shape != (len(df1), len(df2)):
        raise ValueError("The similarity matrix dimensions must match the LHS and RHS datasets.")

    # Callers in this notebook pass the compatibilities dictionary in this position;
    # treat any dictionary as "filtering on" so those calls keep their behaviour.
    filter_compatible = True if isinstance(filter_compatible, dict) else bool(filter_compatible)

    similarities = similarity_matrix[selected_index]
    # Reverse first, then slice: `[-top_n:]` with top_n == 0 would select every entry instead of none.
    top_indices = np.argsort(similarities)[::-1][:max(top_n, 0)]

    # argsort yields positions, so rows are read positionally (.iloc). Label-based .loc
    # only coincides with positions while the frames keep their default RangeIndex.
    lhs_field = df1['Field Name'].iloc[selected_index]
    lhs_desc = df1[column1].iloc[selected_index]
    lhs_type = df1['Data_Type'].iloc[selected_index]
    # str() keeps a missing (NaN) type from raising AttributeError on .upper()
    lhs_type_normalized = str(lhs_type).upper()
    has_database_column = 'Database' in df2.columns

    matches = []
    for rank, idx in enumerate(top_indices, start=1):
        rhs_field = df2['Attribute'].iloc[idx]
        rhs_desc = df2[column2].iloc[idx]
        rhs_type = df2['Data_Type'].iloc[idx]
        database_context = df2['Database'].iloc[idx] if has_database_column else "default_database"

        # Normalize the RHS data type using the mapping. Depending on which notebook cell
        # defined normalize_data_type last, it returns either a type name or a
        # (type, database) pair; accept both.
        normalized = normalize_data_type(rhs_type, database_context, type_mapping)
        if isinstance(normalized, tuple):
            normalized = normalized[0]
        normalized_rhs_type = normalized.upper()

        # Debug prints for validation
        print(f"LHS Type: {lhs_type} (Normalized: {lhs_type_normalized}), RHS Type: {rhs_type} (Normalized: {normalized_rhs_type})")

        # Check compatibility using the compatibilities dictionary
        is_compatible = are_compatible(lhs_type_normalized, normalized_rhs_type, compatibilities)
        if filter_compatible and not is_compatible:
            print(f"Filtered out: {rhs_field} (LHS: {lhs_type_normalized}, RHS: {normalized_rhs_type})")
            continue

        # Append match details, including the database name
        matches.append({
            "rank": rank,
            "rhs_field_score": float(similarities[idx]),
            "rhs_index": int(df2.index[idx]),
            "rhs_field_name": rhs_field,
            "rhs_field_desc": rhs_desc,
            "rhs_data_type": rhs_type,
            "normalized_rhs_type": normalized_rhs_type,
            "compatibility": "Compatible" if is_compatible else "Incompatible",
            "database_name": database_context if database_context else "Unknown" # Include database name
        })

    # Handle case where no matches are found
    if not matches:
        matches.append({"message": "No compatible matches found."})

    # Construct the result dictionary
    result = {
        "lhs_field_index": selected_index,
        "lhs_field_name": lhs_field,
        "lhs_field_description": lhs_desc,
        "lhs_field_data_type": lhs_type,
        "matches": matches
    }
    return {"results": [result]}

# Load mappings and compatibilities
mappings_dir = 'C:/Users/Yasvanth.Pamidi/OneDrive - ENCORA/Desktop/VSC/DataMapper/versions/mappings'
type_mapping = load_mappings(mappings_dir)  # Load type mappings dynamically
compatibilities_file = os.path.join(mappings_dir, 'compatibilities.json')
compatibilities = load_compatibilities(compatibilities_file)  # Load compatibilities

# Example user inputs
enter_index = int(input("Enter the index from LHS to process: "))
top_n = int(input("Enter the number of top similar sentences to retrieve (N): ") or 3)

# Retrieve results and print JSON output.
# The ninth parameter of retrieve_top_similar_sentences_json is `filter_compatible`, so
# the compatibilities dictionary must not be passed in that position; the function
# reads the module-level `compatibilities` loaded above.
# Note: this call uses the raw `similarity_matrix` (scores from -1 to 1), not the rescaled one.
output_json = retrieve_top_similar_sentences_json(
    enter_index, top_n, similarity_matrix, df1, df2, "Description", "Description", type_mapping,
    filter_compatible=True,
)
print(json.dumps(output_json, indent=2))


In [None]:
# Distinct values of the Data_Type column in df1 (LHS)
print(df1["Data_Type"].unique())

In [None]:
# Distinct values of the Data_Type column in df2 (RHS)
print(df2["Data_Type"].unique())

Fixing the default-database issue

In [None]:
import os
import json
import numpy as np
import pandas as pd

# Function to load mapping files from the mappings directory
def load_mappings(mappings_dir):
    """
    Load all JSON mapping files from the specified directory, skipping ``compatibilities.json``.

    Each loaded mapping is echoed with a debug print.

    Args:
        mappings_dir: Directory that contains the JSON mapping files.

    Returns:
        dict: database name (file name without extension) mapped to the parsed JSON
        content, inserted in sorted file-name order.

    Raises:
        FileNotFoundError: if ``mappings_dir`` does not exist.
        ValueError: if no mapping file is found.
    """
    mappings = {}
    if not os.path.exists(mappings_dir):
        raise FileNotFoundError(f"Mappings directory '{mappings_dir}' does not exist.")

    # os.listdir returns entries in arbitrary order. The order of this dictionary decides
    # which database wins when normalize_data_type finds a type in several mappings, so
    # the file names are sorted to keep that choice reproducible.
    for filename in sorted(os.listdir(mappings_dir)):
        if filename.endswith(".json") and filename != "compatibilities.json":
            database_name = os.path.splitext(filename)[0]
            # "utf-8-sig" reads plain UTF-8 and also accepts a leading byte-order mark,
            # independent of the platform's default text encoding.
            with open(os.path.join(mappings_dir, filename), "r", encoding="utf-8-sig") as file:
                mappings[database_name] = json.load(file)
                # Debug: Log each loaded database mapping
                print(f"Debug: Loaded Mapping for Database: {database_name}, Data: {mappings[database_name]}")

    if not mappings:
        raise ValueError(f"No mapping files found in '{mappings_dir}'.")

    return mappings


# Function to load compatibilities from a JSON file
def load_compatibilities(file_path):
    """
    Load data type compatibilities from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content.
    """
    # "utf-8-sig" reads plain UTF-8 and also accepts a leading byte-order mark,
    # independent of the platform's default text encoding.
    with open(file_path, 'r', encoding="utf-8-sig") as file:
        return json.load(file)
    
# Function to normalize data types using mappings
def normalize_data_type(rhs_type, database, mappings):
    """
    Look ``rhs_type`` up, case-insensitively, in the mapping of every database.

    ``database`` is only echoed in the debug output; it does not restrict the search.

    Returns:
        tuple: ``(mapped type, database name)`` for the first database whose mapping
        yields a truthy value, or ``(rhs_type in upper case, None)`` when none does.
    """
    print(f"Debug: RHS Type: {rhs_type}, Database: {database}")
    print(f"Debug: Mappings Available: {list(mappings.keys())}")

    wanted = rhs_type.lower()
    for db_name, db_mapping in mappings.items():
        # Re-key the mapping in lower case; if two keys differ only in case, the later one wins.
        lowered = {}
        for native_type, standard_type in db_mapping.items():
            lowered[native_type.lower()] = standard_type

        normalized_type = lowered.get(wanted)
        if not normalized_type:
            continue

        print(f"Debug: Match Found - Database: {db_name}, Type: {normalized_type}")
        return normalized_type, db_name

    print(f"Debug: No Match Found for RHS Type: {rhs_type}")
    return rhs_type.upper(), None





# Function to check compatibility of data types using compatibilities
def are_compatible(lhs_type, rhs_type, compatibilities):
    """
    Tell whether ``rhs_type`` is acceptable for ``lhs_type``, comparing in upper case.

    Equal types are always compatible; otherwise the upper-cased ``rhs_type`` must be
    listed under the upper-cased ``lhs_type`` in ``compatibilities``.
    """
    left, right = lhs_type.upper(), rhs_type.upper()
    if left == right:
        return True
    return right in compatibilities.get(left, [])


# Function to retrieve top-N most similar fields with data type validation
def retrieve_top_similar_sentences_json(selected_index, top_n, similarity_matrix, df1, df2, column1, column2, type_mapping, filter_compatible=True):
    """Return the ``top_n`` RHS rows most similar to one LHS row after normalizing and checking data types.

    The function relies on ``normalize_data_type`` (returning a ``(type, database)`` pair),
    ``are_compatible`` and a module-level ``compatibilities`` dictionary being defined
    before it is called.

    Args:
        selected_index: Positional row number of the LHS entry (row of ``similarity_matrix`` and ``df1``).
        top_n: Number of best-scoring RHS rows to inspect; values <= 0 yield no matches.
        similarity_matrix: Array of shape ``(len(df1), len(df2))`` with the similarity scores.
        df1: LHS frame with the columns ``'Field Name'``, ``'Data_Type'`` and ``column1``.
        df2: RHS frame with the columns ``'Attribute'``, ``'Data_Type'`` and ``column2``.
        column1: Name of the description column in ``df1``.
        column2: Name of the description column in ``df2``.
        type_mapping: Mappings handed to ``normalize_data_type``.
        filter_compatible: When true, incompatible matches are dropped; when false they
            are kept and flagged ``"Incompatible"``.

    Returns:
        dict: ``{"results": [result]}``; ``result["matches"]`` holds a single message
        entry when nothing is left to report.

    Raises:
        ValueError: if ``similarity_matrix`` does not have the shape ``(len(df1), len(df2))``.
    """
    # Ensure the similarity matrix dimensions match the datasets
    if similarity_matrix.shape != (len(df1), len(df2)):
        raise ValueError("The similarity matrix dimensions must match the LHS and RHS datasets.")

    # Callers in this notebook pass the compatibilities dictionary in this position;
    # treat any dictionary as "filtering on" so those calls keep their behaviour.
    filter_compatible = True if isinstance(filter_compatible, dict) else bool(filter_compatible)

    similarities = similarity_matrix[selected_index]
    # Reverse first, then slice: `[-top_n:]` with top_n == 0 would select every entry instead of none.
    top_indices = np.argsort(similarities)[::-1][:max(top_n, 0)]

    # argsort yields positions, so rows are read positionally (.iloc). Label-based .loc
    # only coincides with positions while the frames keep their default RangeIndex.
    lhs_field = df1['Field Name'].iloc[selected_index]
    lhs_desc = df1[column1].iloc[selected_index]
    lhs_type = df1['Data_Type'].iloc[selected_index]
    # str() keeps a missing (NaN) type from raising AttributeError on .upper()
    lhs_type_normalized = str(lhs_type).upper()

    matches = []
    for rank, idx in enumerate(top_indices, start=1):
        rhs_field = df2['Attribute'].iloc[idx]
        rhs_desc = df2[column2].iloc[idx]
        rhs_type = df2['Data_Type'].iloc[idx]

        # The database is taken from the mapping that matched the type. The former lookup
        # of df2['Database'] was overwritten immediately and has been removed.
        normalized_rhs_type, database_context = normalize_data_type(rhs_type, None, type_mapping)
        normalized_rhs_type = normalized_rhs_type.upper()

        # Debug prints for validation
        print(f"LHS Type: {lhs_type} (Normalized: {lhs_type_normalized}), "
            f"RHS Type: {rhs_type} (Normalized: {normalized_rhs_type}), "
            f"Database Context: {database_context if database_context else 'Unknown'}")

        # Check compatibility using the compatibilities dictionary
        is_compatible = are_compatible(lhs_type_normalized, normalized_rhs_type, compatibilities)

        # Debugging compatibility logic
        print(f"Debug: Compatibility Check - LHS: {lhs_type_normalized}, RHS: {normalized_rhs_type}, Compatible: {is_compatible}")

        if filter_compatible and not is_compatible:
            print(f"Filtered out: {rhs_field} (LHS: {lhs_type_normalized}, RHS: {normalized_rhs_type})")
            continue

        # Append match details, including the database name
        matches.append({
            "rank": rank,
            "rhs_field_score": float(similarities[idx]),
            "rhs_index": int(df2.index[idx]),
            "rhs_field_name": rhs_field,
            "rhs_field_desc": rhs_desc,
            "rhs_data_type": rhs_type,
            "normalized_rhs_type": normalized_rhs_type,
            "compatibility": "Compatible" if is_compatible else "Incompatible",
            "database_name": database_context if database_context else "Unknown" # Include database name
        })

    # Handle case where no matches are found
    if not matches:
        matches.append({"message": "No compatible matches found."})

    # Construct the result dictionary
    result = {
        "lhs_field_index": selected_index,
        "lhs_field_name": lhs_field,
        "lhs_field_description": lhs_desc,
        "lhs_field_data_type": lhs_type,
        "matches": matches
    }
    return {"results": [result]}

# Load mappings and compatibilities
mappings_dir = 'C:/Users/Yasvanth.Pamidi/OneDrive - ENCORA/Desktop/VSC/DataMapper/versions/mappings'
type_mapping = load_mappings(mappings_dir)  # Load type mappings dynamically
compatibilities_file = os.path.join(mappings_dir, 'compatibilities.json')
compatibilities = load_compatibilities(compatibilities_file)  # Load compatibilities

# Example user inputs
enter_index = int(input("Enter the index from LHS to process: "))
top_n = int(input("Enter the number of top similar sentences to retrieve (N): ") or 3)

# Retrieve results and print JSON output.
# The ninth parameter of retrieve_top_similar_sentences_json is `filter_compatible`, so
# the compatibilities dictionary must not be passed in that position; the function
# reads the module-level `compatibilities` loaded above.
# Note: this call uses the raw `similarity_matrix` (scores from -1 to 1), not the rescaled one.
output_json = retrieve_top_similar_sentences_json(
    enter_index, top_n, similarity_matrix, df1, df2, "Description", "Description", type_mapping,
    filter_compatible=True,
)
print(json.dumps(output_json, indent=2))


In [None]:
import os
import json
import numpy as np
import pandas as pd

# Function to load mapping files from the mappings directory
def load_mappings(mappings_dir):
    """
    Load all JSON mapping files from the specified directory, skipping ``compatibilities.json``.

    Each loaded mapping is echoed with a debug print.

    Args:
        mappings_dir: Directory that contains the JSON mapping files.

    Returns:
        dict: database name (file name without extension) mapped to the parsed JSON
        content, inserted in sorted file-name order.

    Raises:
        FileNotFoundError: if ``mappings_dir`` does not exist.
        ValueError: if no mapping file is found.
    """
    mappings = {}
    if not os.path.exists(mappings_dir):
        raise FileNotFoundError(f"Mappings directory '{mappings_dir}' does not exist.")

    # os.listdir returns entries in arbitrary order. The order of this dictionary decides
    # which database wins when normalize_data_type finds a type in several mappings, so
    # the file names are sorted to keep that choice reproducible.
    for filename in sorted(os.listdir(mappings_dir)):
        if filename.endswith(".json") and filename != "compatibilities.json":
            database_name = os.path.splitext(filename)[0]
            # "utf-8-sig" reads plain UTF-8 and also accepts a leading byte-order mark,
            # independent of the platform's default text encoding.
            with open(os.path.join(mappings_dir, filename), "r", encoding="utf-8-sig") as file:
                mappings[database_name] = json.load(file)
                # Debug: Log each loaded database mapping
                print(f"Debug: Loaded Mapping for Database: {database_name}, Data: {mappings[database_name]}")

    if not mappings:
        raise ValueError(f"No mapping files found in '{mappings_dir}'.")

    return mappings


# Function to load compatibilities from a JSON file
def load_compatibilities(file_path):
    """
    Load data type compatibilities from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content.
    """
    # "utf-8-sig" reads plain UTF-8 and also accepts a leading byte-order mark,
    # independent of the platform's default text encoding.
    with open(file_path, 'r', encoding="utf-8-sig") as file:
        return json.load(file)
    
# Function to normalize data types using mappings
def normalize_data_type(rhs_type, database, mappings):
    """
    Search the mapping of every database for ``rhs_type``, ignoring the case of the keys.

    ``database`` appears only in the debug output and does not narrow the search.

    Returns:
        tuple: ``(mapped type, database name)`` for the first database whose mapping
        yields a truthy value, or ``(rhs_type in upper case, None)`` when none does.
    """
    print(f"Debug: RHS Type: {rhs_type}, Database: {database}")
    print(f"Debug: Mappings Available: {list(mappings.keys())}")

    needle = rhs_type.lower()
    for db_name, db_mapping in mappings.items():
        # Lower-cased view of this database's mapping (later duplicate keys overwrite earlier ones)
        by_lower_key = dict((key.lower(), value) for key, value in db_mapping.items())
        if (normalized_type := by_lower_key.get(needle)):
            print(f"Debug: Match Found - Database: {db_name}, Type: {normalized_type}")
            return normalized_type, db_name

    print(f"Debug: No Match Found for RHS Type: {rhs_type}")
    return rhs_type.upper(), None





# Function to check compatibility of data types using compatibilities
def are_compatible(lhs_type, rhs_type, compatibilities):
    """
    Decide, in upper case, whether ``rhs_type`` may be paired with ``lhs_type``.

    Identical types always pass; any other pair passes only if the upper-cased
    ``rhs_type`` is listed under the upper-cased ``lhs_type`` in ``compatibilities``.
    """
    lhs_key = lhs_type.upper()
    rhs_key = rhs_type.upper()
    same_type = lhs_key == rhs_key
    # The table is consulted only when the two types differ.
    return same_type or rhs_key in compatibilities.get(lhs_key, [])


# Function to retrieve top-N most similar fields with data type validation
def retrieve_top_similar_sentences_json(selected_index, top_n, similarity_matrix, df1, df2, column1, column2, type_mapping, filter_compatible=True, compatibilities=None):
    """
    Build a JSON-serialisable report of the RHS fields most similar to one LHS field.

    Args:
        selected_index: Row position of the LHS field in `df1` and `similarity_matrix`.
        top_n: Number of highest-scoring RHS candidates to examine; values <= 0
            examine none.
        similarity_matrix: Array whose shape must equal ``(len(df1), len(df2))``.
        df1: LHS frame providing 'Field Name', 'Data_Type' and `column1`.
        df2: RHS frame providing 'Attribute', 'Data_Type' and `column2`. An
            optional 'Database' column is forwarded to `normalize_data_type`
            as the database hint.
        column1: Description column name in `df1`.
        column2: Description column name in `df2`.
        type_mapping: Mappings forwarded to `normalize_data_type`.
        filter_compatible: When true (default) incompatible candidates are
            dropped; when false they are kept and labelled "Incompatible".
            For backward compatibility a dict passed here is treated as the
            compatibility table.
        compatibilities: Compatibility table forwarded to `are_compatible`.
            When omitted, the module-level `compatibilities` is used, as the
            original implementation did.

    Returns:
        ``{"results": [result]}`` where `result` describes the LHS field and
        holds a "matches" list. "rank" is the candidate's position among the
        top-N by similarity, assigned before filtering, so ranks may have gaps.
        If nothing survives, "matches" holds a single message entry.

    Raises:
        ValueError: If the similarity matrix shape does not match the frames.
        NameError: If no compatibility table is passed and none exists at
            module level.
    """
    # Backward compatibility: this signature originally had no `compatibilities`
    # parameter and existing calls pass the table as the ninth positional
    # argument, where it binds to `filter_compatible`.
    if isinstance(filter_compatible, dict):
        if compatibilities is None:
            compatibilities = filter_compatible
        filter_compatible = True
    if compatibilities is None:
        # Legacy fallback: the original body read a module-level table.
        try:
            compatibilities = globals()["compatibilities"]
        except KeyError:
            raise NameError("name 'compatibilities' is not defined") from None

    # Ensure the similarity matrix dimensions match the datasets
    if similarity_matrix.shape != (len(df1), len(df2)):
        raise ValueError("The similarity matrix dimensions must match the LHS and RHS datasets.")

    similarities = similarity_matrix[selected_index]
    # Reverse before slicing: `[-top_n:]` would return every column when top_n == 0.
    top_n = max(int(top_n), 0)
    top_indices = np.argsort(similarities)[::-1][:top_n]  # Top-N indices in descending order

    # argsort yields row *positions*, so both frames are read positionally;
    # label-based .loc only agrees with positions for a default RangeIndex.
    lhs_field = df1['Field Name'].iloc[selected_index]
    lhs_desc = df1[column1].iloc[selected_index]
    lhs_type = df1['Data_Type'].iloc[selected_index]
    # str() keeps a missing (NaN) type from raising AttributeError on .upper()
    lhs_type_normalized = str(lhs_type).upper()

    has_database_column = 'Database' in df2.columns

    # Initialize list to store matches
    matches = []
    for rank, idx in enumerate(top_indices):
        rhs_field = df2['Attribute'].iloc[idx]
        rhs_desc = df2[column2].iloc[idx]
        rhs_type = df2['Data_Type'].iloc[idx]
        database_hint = df2['Database'].iloc[idx] if has_database_column else None

        # Normalize the RHS data type using the mapping
        normalized_rhs_type, database_context = normalize_data_type(str(rhs_type), database_hint, type_mapping)
        normalized_rhs_type = normalized_rhs_type.upper()

        # Debug prints for validation
        print(f"LHS Type: {lhs_type} (Normalized: {lhs_type_normalized}), "
            f"RHS Type: {rhs_type} (Normalized: {normalized_rhs_type}), "
            f"Database Context: {database_context if database_context else 'Unknown'}")

        # Check compatibility using the compatibilities dictionary
        is_compatible = are_compatible(lhs_type_normalized, normalized_rhs_type, compatibilities)

        # Debugging compatibility logic
        print(f"Debug: Compatibility Check - LHS: {lhs_type_normalized}, RHS: {normalized_rhs_type}, Compatible: {is_compatible}")

        if filter_compatible and not is_compatible:
            print(f"Filtered out: {rhs_field} (LHS: {lhs_type_normalized}, RHS: {normalized_rhs_type})")
            continue

        compatibility = "Compatible" if is_compatible else "Incompatible"

        # Append match details, including the database name
        matches.append({
            "rank": rank + 1,
            "rhs_field_score": float(similarities[idx]),
            "rhs_index": int(df2.index[idx]),
            "rhs_field_name": rhs_field,
            "rhs_field_desc": rhs_desc,
            "rhs_data_type": rhs_type,
            "normalized_rhs_type": normalized_rhs_type,
            "compatibility": compatibility,
            "database_name": database_context if database_context else "Unknown" # Include database name
        })

    # Handle case where no matches are found
    if not matches:
        matches.append({"message": "No compatible matches found."})

    # Construct the result dictionary
    result = {
        "lhs_field_index": selected_index,
        "lhs_field_name": lhs_field,
        "lhs_field_description": lhs_desc,
        "lhs_field_data_type": lhs_type,
        "matches": matches
    }
    return {"results": [result]}

# Load mappings and compatibilities
# The directory can be overridden without editing the notebook by setting the
# DATAMAPPER_MAPPINGS_DIR environment variable; the original location stays the default.
mappings_dir = os.environ.get(
    'DATAMAPPER_MAPPINGS_DIR',
    'C:/Users/Yasvanth.Pamidi/OneDrive - ENCORA/Desktop/VSC/DataMapper/versions/mappings',
)
type_mapping = load_mappings(mappings_dir)  # Load type mappings dynamically
compatibilities_file = os.path.join(mappings_dir, 'compatibilities.json')
compatibilities = load_compatibilities(compatibilities_file)  # Load compatibilities

# Example user inputs
enter_index = int(input("Enter the index from LHS to process: "))
# Fail early with a readable message instead of an indexing error deep inside the lookup
if not 0 <= enter_index < len(df1):
    raise ValueError(f"LHS index {enter_index} is out of range; expected 0..{len(df1) - 1}.")
top_n = int(input("Enter the number of top similar sentences to retrieve (N): ") or 3)  # blank input -> 3

# Retrieve results and print JSON output
# NOTE: in this cell's function signature the ninth positional parameter is
# `filter_compatible`, so the table passed last binds to that name.
output_json = retrieve_top_similar_sentences_json(
    enter_index, top_n, similarity_matrix, df1, df2, "Description", "Description", type_mapping,compatibilities
)
print(json.dumps(output_json, indent=2))


In [None]:
import os
import json
import numpy as np
import pandas as pd

# Function to load mapping files from the mappings directory
def load_mappings(mappings_dir):
    """
    Load all JSON mapping files from the specified directory.

    Every ``*.json`` file except ``compatibilities.json`` is parsed; the file
    name without its extension becomes the database name.

    Args:
        mappings_dir: Directory containing the mapping files.

    Returns:
        Dict of ``{database_name: parsed JSON content}``, with keys inserted in
        sorted file-name order so iteration order is the same on every machine.

    Raises:
        FileNotFoundError: If `mappings_dir` does not exist.
        ValueError: If the directory contains no mapping files.
    """
    mappings = {}
    if not os.path.exists(mappings_dir):
        raise FileNotFoundError(f"Mappings directory '{mappings_dir}' does not exist.")

    # os.listdir() returns entries in arbitrary order; sort them so the order of
    # the returned dict (which first-match lookups depend on) is deterministic.
    for filename in sorted(os.listdir(mappings_dir)):
        if filename.endswith(".json") and filename != "compatibilities.json":
            database_name = os.path.splitext(filename)[0]
            # Explicit codec: do not depend on the platform default, and accept a BOM.
            with open(os.path.join(mappings_dir, filename), "r", encoding="utf-8-sig") as file:
                mappings[database_name] = json.load(file)
                # Debug: Log each loaded database mapping
                print(f"Debug: Loaded Mapping for Database: {database_name}, Data: {mappings[database_name]}")

    if not mappings:
        raise ValueError(f"No mapping files found in '{mappings_dir}'.")

    return mappings

# Function to load compatibilities from a JSON file
def load_compatibilities(file_path):
    """
    Load data type compatibilities from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 regardless of the platform default codec; the "-sig"
    # variant additionally tolerates a leading byte-order mark.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return json.load(file)

# Function to normalize data types using mappings
def normalize_data_type(rhs_type, database, mappings):
    """
    Normalize a database-specific data type using the provided mappings.

    Matching on the type name is case-insensitive and ignores surrounding
    whitespace. If ``database`` names a loaded mapping (case-insensitively),
    that mapping is searched first and the others follow in their existing
    order; with ``database=None`` the search order is unchanged.

    Args:
        rhs_type: Type name to normalize. Non-string values (such as NaN from
            an empty cell) are converted with ``str`` rather than raising.
        database: Optional database name to prefer, or ``None``.
        mappings: Dict of ``{database_name: {source_type: normalized_type}}``.

    Returns:
        ``(normalized_type, database_name)`` for the first truthy match,
        otherwise ``(upper-cased input type, None)``.
    """
    print(f"Debug: RHS Type: {rhs_type}, Database: {database}")
    print(f"Debug: Mappings Available: {list(mappings.keys())}")

    type_text = (rhs_type if isinstance(rhs_type, str) else str(rhs_type)).strip()
    rhs_type_lower = type_text.lower()  # Ensure input is in lowercase

    # Honour the database hint by moving its mapping to the front of the search
    wanted_db = None if database is None else str(database).strip().lower()
    preferred = [name for name in mappings if str(name).lower() == wanted_db]
    search_order = preferred + [name for name in mappings if name not in preferred]

    for db_name in search_order:
        # Normalize mapping keys for case-insensitivity
        normalized_type = {k.lower(): v for k, v in mappings[db_name].items()}.get(rhs_type_lower)
        if normalized_type:
            print(f"Debug: Match Found - Database: {db_name}, Type: {normalized_type}")
            return normalized_type, db_name  # Return normalized type and database name

    print(f"Debug: No Match Found for RHS Type: {rhs_type}")
    return type_text.upper(), None  # Return original type in uppercase if no match

# Function to check compatibility of data types using compatibilities
def are_compatible(lhs_type, rhs_type, compatibilities):
    """
    Decide whether an RHS type is acceptable for an LHS type.

    The two names are compared in upper case. Equal names are compatible;
    otherwise the RHS name must be a member of ``compatibilities[LHS]``,
    with a missing LHS entry treated as an empty list.
    """
    source, target = lhs_type.upper(), rhs_type.upper()
    return True if source == target else target in compatibilities.get(source, [])

# Function to retrieve top-N most similar fields with data type validation
def retrieve_top_similar_sentences_json(selected_index, top_n, similarity_matrix, df1, df2, column1, column2, type_mapping, compatibilities, filter_compatible=True):
    """
    Build a JSON-serialisable report of the RHS fields most similar to one LHS field.

    Args:
        selected_index: Row position of the LHS field in `df1` and `similarity_matrix`.
        top_n: Number of highest-scoring RHS candidates to examine; values <= 0
            examine none.
        similarity_matrix: Array whose shape must equal ``(len(df1), len(df2))``.
        df1: LHS frame providing 'Field Name', 'Data_Type' and `column1`.
        df2: RHS frame providing 'Attribute', 'Data_Type' and `column2`. An
            optional 'Database' column is forwarded to `normalize_data_type`
            as the database hint.
        column1: Description column name in `df1`.
        column2: Description column name in `df2`.
        type_mapping: Mappings forwarded to `normalize_data_type`.
        compatibilities: Compatibility table forwarded to `are_compatible`.
        filter_compatible: When true (default) incompatible candidates are
            dropped; when false they are kept and labelled "Incompatible".

    Returns:
        ``{"results": [result]}`` where `result` describes the LHS field and
        holds a "matches" list. "rank" is the candidate's position among the
        top-N by similarity, assigned before filtering, so ranks may have gaps.
        If nothing survives, "matches" holds a single message entry.

    Raises:
        ValueError: If the similarity matrix shape does not match the frames.
    """
    # Ensure the similarity matrix dimensions match the datasets
    if similarity_matrix.shape != (len(df1), len(df2)):
        raise ValueError("The similarity matrix dimensions must match the LHS and RHS datasets.")

    similarities = similarity_matrix[selected_index]
    # Reverse before slicing: `[-top_n:]` would return every column when top_n == 0.
    top_n = max(int(top_n), 0)
    top_indices = np.argsort(similarities)[::-1][:top_n]  # Top-N indices in descending order

    # argsort yields row *positions*, so both frames are read positionally;
    # label-based .loc only agrees with positions for a default RangeIndex.
    lhs_field = df1['Field Name'].iloc[selected_index]
    lhs_desc = df1[column1].iloc[selected_index]
    lhs_type = df1['Data_Type'].iloc[selected_index]
    # str() keeps a missing (NaN) type from raising AttributeError on .upper()
    lhs_type_normalized = str(lhs_type).upper()

    has_database_column = 'Database' in df2.columns

    # Initialize list to store matches
    matches = []
    for rank, idx in enumerate(top_indices):
        rhs_field = df2['Attribute'].iloc[idx]
        rhs_desc = df2[column2].iloc[idx]
        rhs_type = df2['Data_Type'].iloc[idx]
        database_hint = df2['Database'].iloc[idx] if has_database_column else None

        # Normalize the RHS data type using the mapping
        normalized_rhs_type, database_context = normalize_data_type(str(rhs_type), database_hint, type_mapping)
        normalized_rhs_type = normalized_rhs_type.upper()

        # Debug prints for validation
        print(f"LHS Type: {lhs_type} (Normalized: {lhs_type_normalized}), "
            f"RHS Type: {rhs_type} (Normalized: {normalized_rhs_type}), "
            f"Database Context: {database_context if database_context else 'Unknown'}")

        # Check compatibility using the compatibilities dictionary
        is_compatible = are_compatible(lhs_type_normalized, normalized_rhs_type, compatibilities)

        # Debugging compatibility logic
        print(f"Debug: Compatibility Check - LHS: {lhs_type_normalized}, RHS: {normalized_rhs_type}, Compatible: {is_compatible}")

        if filter_compatible and not is_compatible:
            print(f"Filtered out: {rhs_field} (LHS: {lhs_type_normalized}, RHS: {normalized_rhs_type})")
            continue

        compatibility = "Compatible" if is_compatible else "Incompatible"

        # Append match details, including the database name
        matches.append({
            "rank": rank + 1,
            "rhs_field_score": float(similarities[idx]),
            "rhs_index": int(df2.index[idx]),
            "rhs_field_name": rhs_field,
            "rhs_field_desc": rhs_desc,
            "rhs_data_type": rhs_type,
            "normalized_rhs_type": normalized_rhs_type,
            "compatibility": compatibility,
            "database_name": database_context if database_context else "Unknown" # Include database name
        })

    # Handle case where no matches are found
    if not matches:
        matches.append({"message": "No compatible matches found."})

    # Construct the result dictionary
    result = {
        "lhs_field_index": selected_index,
        "lhs_field_name": lhs_field,
        "lhs_field_description": lhs_desc,
        "lhs_field_data_type": lhs_type,
        "matches": matches
    }
    return {"results": [result]}

# Load mappings and compatibilities
# The directory can be overridden without editing the notebook by setting the
# DATAMAPPER_MAPPINGS_DIR environment variable; the original location stays the default.
mappings_dir = os.environ.get(
    'DATAMAPPER_MAPPINGS_DIR',
    'C:/Users/Yasvanth.Pamidi/OneDrive - ENCORA/Desktop/VSC/DataMapper/versions/mappings',
)
type_mapping = load_mappings(mappings_dir)  # Load type mappings dynamically
compatibilities_file = os.path.join(mappings_dir, 'compatibilities.json')
compatibilities = load_compatibilities(compatibilities_file)  # Load compatibilities

# Example user inputs
enter_index = int(input("Enter the index from LHS to process: "))
# Fail early with a readable message instead of an indexing error deep inside the lookup
if not 0 <= enter_index < len(df1):
    raise ValueError(f"LHS index {enter_index} is out of range; expected 0..{len(df1) - 1}.")
top_n = int(input("Enter the number of top similar sentences to retrieve (N): ") or 3)  # blank input -> 3

# Retrieve results and print JSON output
output_json = retrieve_top_similar_sentences_json(
    enter_index, top_n, similarity_matrix, df1, df2, "Description", "Description", type_mapping, compatibilities
)
print(json.dumps(output_json, indent=2))


This Python script finds, for one field of the LHS dataset, the most similar fields of the RHS dataset and validates that their data types are compatible, using type mappings and predefined compatibility rules. It begins by importing the necessary libraries: `os` for file handling, `json` for working with JSON files, `numpy` for numerical operations, and `pandas` for handling tabular data. The `load_mappings` function dynamically loads database-specific type mappings from a directory of JSON files (one file per database, named after it), while `load_compatibilities` loads the data type compatibility rules from `compatibilities.json` in the same directory. The `normalize_data_type` function looks a database-specific type name up case-insensitively in the loaded mappings to translate it into its SQL:2023 standard type, returning the normalized type together with the name of the database whose mapping matched; if no mapping knows the type, the upper-cased input is returned with no database. The compatibility check is handled by `are_compatible`, which treats two types as compatible when they are equal (ignoring case) or when the RHS type is listed under the LHS type in the compatibility dictionary. The main processing occurs in `retrieve_top_similar_sentences_json`, which identifies the top-N most similar fields from a similarity matrix between the two datasets, validates their data type compatibility, and constructs a detailed JSON output. This function includes safeguards such as checking that the similarity matrix dimensions match the datasets, normalizing data types before comparing them, and filtering out incompatible matches. User inputs, namely the index of the LHS field to process and the number of top matches to retrieve, drive the run. Finally, the results are printed in JSON format for further analysis. Debugging messages are used extensively to log key steps, such as loading mappings, normalizing data types, and checking compatibility, which aids traceability and troubleshooting.


In [None]:
import os
import json
import numpy as np
import pandas as pd

def load_mappings(mappings_dir):
    """
    Load every JSON mapping file found in `mappings_dir`.

    Each ``*.json`` file except ``compatibilities.json`` is parsed, and the
    file name without its extension is used as the database name.

    Args:
        mappings_dir: Directory containing the mapping files.

    Returns:
        Dict of ``{database_name: parsed JSON content}``, with keys inserted in
        sorted file-name order so iteration order is the same on every machine.

    Raises:
        FileNotFoundError: If `mappings_dir` does not exist.
        ValueError: If the directory contains no mapping files.
    """
    mappings = {}
    if not os.path.exists(mappings_dir):
        raise FileNotFoundError(f"Mappings directory '{mappings_dir}' does not exist.")
    # os.listdir() returns entries in arbitrary order; sorting makes the order of
    # the returned dict (and of anything iterating over it) deterministic.
    for filename in sorted(os.listdir(mappings_dir)):
        if filename.endswith(".json") and filename != "compatibilities.json":
            database_name = os.path.splitext(filename)[0]
            # Explicit codec: do not depend on the platform default, and accept a BOM.
            with open(os.path.join(mappings_dir, filename), "r", encoding="utf-8-sig") as file:
                mappings[database_name] = json.load(file)
    if not mappings:
        raise ValueError(f"No mapping files found in '{mappings_dir}'.")
    return mappings

def load_compatibilities(file_path):
    """
    Load data type compatibilities from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 regardless of the platform default codec; the "-sig"
    # variant additionally tolerates a leading byte-order mark.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return json.load(file)

def normalize_data_type(rhs_type, database, mappings):
    """
    Normalize a database-specific type and report every database that knows it.

    Matching on the type name is case-insensitive and ignores surrounding
    whitespace. All mappings are scanned. When several databases map the type,
    the normalized value comes from the mapping named by ``database`` if that
    one matched (name compared case-insensitively); otherwise, as before, from
    the last matching mapping.

    Args:
        rhs_type: Type name to normalize. Non-string values (such as NaN from
            an empty cell) are converted with ``str`` rather than raising.
        database: Optional database name whose normalized type should win, or
            ``None`` for no preference.
        mappings: Dict of ``{database_name: {source_type: normalized_type}}``.

    Returns:
        ``(normalized_type, [matching database names])`` when at least one
        mapping has a truthy entry, otherwise ``(upper-cased input type, None)``.
    """
    type_text = (rhs_type if isinstance(rhs_type, str) else str(rhs_type)).strip()
    rhs_type_lower = type_text.lower()
    wanted_db = None if database is None else str(database).strip().lower()

    matching_databases = []
    normalized_type = None
    hinted_type = None

    for db_name, db_mapping in mappings.items():
        normalized = {k.lower(): v for k, v in db_mapping.items()}.get(rhs_type_lower)
        if normalized:
            normalized_type = normalized
            matching_databases.append(db_name)
            # Remember the value from the hinted database so it can take precedence
            if hinted_type is None and str(db_name).lower() == wanted_db:
                hinted_type = normalized

    if matching_databases:
        return (hinted_type or normalized_type), matching_databases
    return type_text.upper(), None


def are_compatible(lhs_type, rhs_type, compatibilities):
    """
    Return True when `rhs_type` can stand in for `lhs_type`.

    Comparison is done on the upper-cased names: equal names are compatible,
    and otherwise the RHS name has to appear in the list stored under the LHS
    name in `compatibilities` (no entry means no extra compatible types).
    """
    normalized_lhs = lhs_type.upper()
    normalized_rhs = rhs_type.upper()
    if normalized_lhs != normalized_rhs:
        return normalized_rhs in compatibilities.get(normalized_lhs, [])
    return True

def retrieve_top_similar_sentences_json(selected_index, top_n, normalized_similarity_matrix, df1, df2, column1, column2, type_mapping, compatibilities, filter_compatible=True):
    """
    Rank the type-compatible RHS fields for one LHS field and return them as a JSON-ready dict.

    Unlike a plain top-N cut, every RHS row is checked for compatibility first,
    and the surviving rows are then sorted by similarity and truncated to
    `top_n`, so ranks are consecutive.

    Args:
        selected_index: Row position of the LHS field in `df1` and the matrix.
        top_n: Maximum number of matches to return; values <= 0 return none.
        normalized_similarity_matrix: Array whose shape must equal
            ``(len(df1), len(df2))``.
        df1: LHS frame providing 'Field Name', 'Data_Type' and `column1`.
        df2: RHS frame providing 'Attribute', 'Data_Type' and `column2`. An
            optional 'Database' column is forwarded to `normalize_data_type`
            as the database hint.
        column1: Description column name in `df1`.
        column2: Description column name in `df2`.
        type_mapping: Mappings forwarded to `normalize_data_type`.
        compatibilities: Compatibility table forwarded to `are_compatible`.
        filter_compatible: When true (default) incompatible rows are dropped;
            when false they are kept and labelled "Incompatible".

    Returns:
        ``{"results": [result]}`` where `result` describes the LHS field and
        holds the ranked "matches"; if there are none, "matches" holds a
        single message entry.

    Raises:
        ValueError: If the similarity matrix shape does not match the frames.
    """
    if normalized_similarity_matrix.shape != (len(df1), len(df2)):
        raise ValueError("The similarity matrix dimensions must match the LHS and RHS datasets.")

    similarities = normalized_similarity_matrix[selected_index]

    # The matrix is addressed by position, so the frames are read positionally
    # too; label-based .loc only agrees for a default RangeIndex.
    lhs_field = df1['Field Name'].iloc[selected_index]
    lhs_desc = df1[column1].iloc[selected_index]
    lhs_type = df1['Data_Type'].iloc[selected_index]
    # Loop-invariant, so computed once; str() tolerates a missing (NaN) type.
    lhs_type_normalized = str(lhs_type).upper()
    has_database_column = 'Database' in df2.columns

    matches = []
    for idx in range(len(similarities)):
        rhs_type = df2['Data_Type'].iloc[idx]
        database_hint = df2['Database'].iloc[idx] if has_database_column else None
        normalized_rhs_type, database_context = normalize_data_type(str(rhs_type), database_hint, type_mapping)
        normalized_rhs_type = normalized_rhs_type.upper()
        is_compatible = are_compatible(lhs_type_normalized, normalized_rhs_type, compatibilities)
        if filter_compatible and not is_compatible:
            continue
        matches.append({
            "rank": None,  # Placeholder to set rank later
            "rhs_index": int(df2.index[idx]),
            "rhs_field_name": df2['Attribute'].iloc[idx],
            "rhs_field_desc": df2[column2].iloc[idx],
            "rhs_data_type": rhs_type,
            "normalized_rhs_type": normalized_rhs_type,
            "database_name": database_context if database_context else "Unknown",
            "similarity_score": float(similarities[idx]),
            "compatibility": "Compatible" if is_compatible else "Incompatible"
        })

    # Stable sort: rows with equal scores keep their original RHS order
    matches.sort(key=lambda match: match["similarity_score"], reverse=True)
    # Clamp so a negative top_n cannot turn into a "drop the last k" slice
    matches = matches[:max(int(top_n), 0)]
    for rank, match in enumerate(matches, start=1):
        match["rank"] = rank
    if not matches:
        matches.append({"message": "No compatible matches found."})

    result = {
        "lhs_field_index": selected_index,
        "lhs_field_name": lhs_field,
        "lhs_field_description": lhs_desc,
        "lhs_field_data_type": lhs_type,
        "matches": matches
    }
    return {"results": [result]}

# Load mappings and compatibilities.
# The directory can be overridden without editing the notebook by setting the
# DATAMAPPER_MAPPINGS_DIR environment variable; the original location stays the default.
mappings_dir = os.environ.get(
    'DATAMAPPER_MAPPINGS_DIR',
    'C:/Users/Yasvanth.Pamidi/OneDrive - ENCORA/Desktop/VSC/DataMapper/versions/mappings',
)
type_mapping = load_mappings(mappings_dir)
compatibilities_file = os.path.join(mappings_dir, 'compatibilities.json')
compatibilities = load_compatibilities(compatibilities_file)

# Interactive inputs: LHS row position and number of matches (blank -> 3)
enter_index = int(input("Enter the index from LHS to process: "))
# Fail early with a readable message instead of an indexing error deep inside the lookup
if not 0 <= enter_index < len(df1):
    raise ValueError(f"LHS index {enter_index} is out of range; expected 0..{len(df1) - 1}.")
top_n = int(input("Enter the number of top similar sentences to retrieve (N): ") or 3)

# Retrieve results and print them as JSON
output_json = retrieve_top_similar_sentences_json(
    enter_index, top_n, normalized_similarity_matrix, df1, df2, "Description", "Description", type_mapping, compatibilities
)
print(json.dumps(output_json, indent=2))
