In [21]:
from pathlib import Path
from dotenv import dotenv_values
import os
from makersite_ml.query_utils import query_to_dataframe
import pandas as pd 


In [22]:
from pathlib import Path
from dotenv import dotenv_values
import os
makersite_ml_repo = Path("~/codebase/makersite-ml").expanduser()
assert makersite_ml_repo.exists()
config = dotenv_values(makersite_ml_repo / ".env")

for _env, _value in config.items():
    os.environ[_env] = _value

In [24]:
import pandas as pd
import json

# Sample data frame (replace this with your actual DataFrame 'ralevent_df')
data = {
    'JESD-30 Code': ['R-PDSO-G3', 'R-PDSO-G4'],
    'Configuration': ['SINGLE', 'DOUBLE'],
    'JESD-609 Code': ['e3', 'e4'],
    'Package Shape': ['RECTANGULAR', 'SQUARE'],
    'Package Style': ['SMALL OUTLINE', 'LARGE OUTLINE'],
    'Surface Mount': ['YES', 'NO'],
    'Terminal Form': ['GULL WING', 'LEADLESS'],
    'Terminal Finish': ['TIN', 'SILVER'],
    'Terminal Position': ['DUAL', 'SINGLE'],
    'mpn': ['123', '456'],
    'NumericColumn': [10, 20]  # Example of a numerical column
}

ralevent_df = pd.DataFrame(data)

# Identify categorical columns
def is_categorical(column):
    try:
        # Try converting the column to float
        ralevent_df[column].astype(float)
        return False
    except ValueError:
        return True

categorical_columns = [col for col in ralevent_df.columns if is_categorical(col) and col != 'mpn']

# Convert each row to JSON, only for categorical columns
print(ralevent_df.apply(
    lambda row: json.dumps({col: row[col] for col in categorical_columns}),
    axis=1
) )




0    {"JESD-30 Code": "R-PDSO-G3", "Configuration":...
1    {"JESD-30 Code": "R-PDSO-G4", "Configuration":...
dtype: object


In [27]:
# Load the dictionary from the pickle file
category_tech_specs_dict = pd.read_pickle('category_technical_specs_above_threshold.pkl')
category_name = "{Semiconductors, Transistors, Small Signal Bipolar Transistors}"
new_component_specs = [{"key": "VCEsat-Max", "value": "0.4", "unit": "V"}, {"key": "JESD-30 Code", "value": "R-PDSO-G3"}, {"key": "Configuration", "value": "SINGLE"}, {"key": "JESD-609 Code", "value": "e3"}, {"key": "Package Shape", "value": "RECTANGULAR"}, {"key": "Package Style", "value": "SMALL OUTLINE", "unit": "Meter"}, {"key": "Surface Mount", "value": "YES"}, {"key": "Terminal Form", "value": "GULL WING"}, {"key": "Terminal Finish", "value": "TIN"}, {"key": "Terminal Position", "value": "DUAL"}, {"key": "Number of Elements", "value": "1"}, {"key": "Reference Standard", "value": "AEC-Q101; IEC-60134"}, {"key": "Number of Terminals", "value": "3"}, {"key": "Package Body Material", "value": "PLASTIC/EPOXY"}, {"key": "Polarity/Channel Type", "value": "NPN"}, {"key": "Transistor Application", "value": "SWITCHING"}, {"key": "DC Current Gain-Min (hFE)", "value": "200"}, {"key": "Operating Temperature-Max", "value": "150", "unit": "Cel"}, {"key": "Operating Temperature-Min", "value": "-65", "unit": "Cel"}, {"key": "Collector Current-Max (IC)", "value": "0.1", "unit": "A"}, {"key": "Moisture Sensitivity Level", "value": "1"}, {"key": "Power Dissipation-Max (Abs)", "value": "0.2", "unit": "W"}, {"key": "Transistor Element Material", "value": "SILICON"}, {"key": "Collector-emitter Voltage-Max", "value": "45", "unit": "V"}, {"key": "Peak Reflow Temperature (Cel)", "value": "260"}, {"key": "Transition Frequency-Nom (fT)", "value": "100", "unit": "MHz"}, {"key": "Collector-base Capacitance-Max", "value": "1.5", "unit": "pF"}, {"key": "Time@Peak Reflow Temperature-Max (s)", "value": "30"}]

mpn =  'BC847BW-Q'

 
query = f'''
    SELECT
        t.mpn,
        t.sourcengine_details,
        c.sourcengine_category_names,
        s.substance_names,
        s.amount
    FROM
        makersite.technical_specifications_data t
    JOIN
        makersite.part_category_lookup c ON t.mpn = c.mpn
    JOIN
        makersite.substance_summaries_of_full_material_declarations s ON (
            REPLACE(c.manufacturer, ' ', '_') || '/' || REPLACE(c.mpn, '/', '%2F') || '.xml') = s.source_file
    WHERE
        c.sourcengine_category_names = '{category_name}'
        AND s.amount > 0 AND s.amount <= 1;
'''

df = query_to_dataframe(query)

df['substance_names'] = df['substance_names'].apply(lambda x: tuple(x) if x is not None else None)
df['sourcengine_category_names'] = df['sourcengine_category_names'].apply(lambda x: tuple(x) if x is not None else None)
df = df.drop_duplicates(subset=['mpn','substance_names','amount'], keep='first')

#display(df)



def convert_to_tuple(category_string):
    cleaned_string = category_string.strip('{}')
    words = cleaned_string.split(', ')
    formatted_words = [word.strip() for word in words]
    result_tuple = tuple(formatted_words)
    return result_tuple

def categorize_specs(category_name, new_component_specs, category_tech_specs_dict):
    
    numerical_specs_present = []
    categorical_specs_present = []

    if category_tuple in category_tech_specs_dict:
      
        numerical_specs_dict = category_tech_specs_dict[category_tuple].get('numerical', [])
        categorical_specs_dict = category_tech_specs_dict[category_tuple].get('categorical', [])

       
        for spec in new_component_specs:
            key = spec['key']
            value = spec['value']

           
            is_numerical = False
            try:
                float(value)  
                is_numerical = True
            except ValueError:
                pass

            
            if key in numerical_specs_dict and is_numerical:
                numerical_specs_present.append(spec)
            elif key in categorical_specs_dict and not is_numerical:
                categorical_specs_present.append(spec)
    else:
        print("Category name not found in the dictionary.")

    return numerical_specs_present, categorical_specs_present



category_tuple = convert_to_tuple(category_name)


numerical_specs_present, categorical_specs_present = categorize_specs(category_name, new_component_specs, category_tech_specs_dict)

print("Numerical Specs Present:", numerical_specs_present)
print("Categorical Specs Present:", categorical_specs_present)


def compare_specs(numerical_specs_present, categorical_specs_present, df, tolerance=20):
    match_results = []
    matched_mpns = set() 


    def is_within_range(new_value, old_value, tolerance):
        try:
            new_value = float(new_value)
            old_value = float(old_value)
            tolerance_factor = tolerance / 100
            lower_bound = new_value * (1 - tolerance_factor)
            upper_bound = new_value * (1 + tolerance_factor)
            return lower_bound <= old_value <= upper_bound
        except ValueError:
            return False 


    for index, row in df.iterrows():
        mpn = row['mpn'] 

        if mpn in matched_mpns:
            continue

        numerical_matches = 0
        categorical_matches = 0
        matched_numerical_specs = []  # List to store matched numerical specifications
        matched_categorical_specs = []  # List to store matched categorical specifications
        unmatched_numerical_specs = []  # List to store unmatched numerical specifications
        unmatched_categorical_specs = []  # List to store unmatched categorical specifications

      
        old_comp_specs = row['sourcengine_details']
        old_comp = {spec['key']: spec for spec in old_comp_specs if 'key' in spec}

        def compare_and_record_matches(spec_list, match_counter, matched_specs, unmatched_specs, is_numerical=False):
            for new_spec in spec_list:
                if new_spec['key'] in old_comp:
                    old_spec = old_comp[new_spec['key']]
                    if 'value' in new_spec and 'value' in old_spec:
                        match_found = False
                        if is_numerical:
                            match_found = is_within_range(new_spec['value'], old_spec['value'], tolerance)
                        else:
                            match_found = str(new_spec['value']) == str(old_spec['value'])
                        if match_found:
                            match_counter += 1
                            matched_specs.append(old_spec)
                        else:
                            unmatched_specs.append(new_spec)
                    else:
                        unmatched_specs.append(new_spec)
            return match_counter

        numerical_matches = compare_and_record_matches(numerical_specs_present, numerical_matches,
                                                       matched_numerical_specs, unmatched_numerical_specs,
                                                       is_numerical=True)
        categorical_matches = compare_and_record_matches(categorical_specs_present, categorical_matches,
                                                         matched_categorical_specs, unmatched_categorical_specs)

        total_matches = numerical_matches + categorical_matches
        if total_matches > 0: 
            match_results.append((mpn, total_matches, matched_numerical_specs, matched_categorical_specs,
                                  unmatched_numerical_specs, unmatched_categorical_specs))
            matched_mpns.add(mpn) 

  
    top_5_matches = sorted(match_results, key=lambda x: x[1], reverse=True)[:5]

    
    print("**Top 5 matches with their MPN and matched specifications:**")
    for mpn, count, num_specs, cat_specs, unmatched_num_specs, unmatched_cat_specs in top_5_matches:
        print(f"MPN: {mpn}, Matches: {count}")
        print("Matched Numerical Specs:")
        for spec in num_specs:
            print(spec)
        print("Matched Categorical Specs:")
        for spec in cat_specs:
            print(spec)
        print("Unmatched Numerical Specs:")
        for spec in unmatched_num_specs:
            print(spec)
        print("Unmatched Categorical Specs:")
        for spec in unmatched_cat_specs:
            print(spec)
        print() 





compare_specs(numerical_specs_present, categorical_specs_present, df, tolerance=20)







  return pd.read_sql(sql_query, con=connection)


Numerical Specs Present: [{'key': 'Collector Current-Max (IC)', 'value': '0.1', 'unit': 'A'}]
Categorical Specs Present: [{'key': 'JESD-609 Code', 'value': 'e3'}, {'key': 'Surface Mount', 'value': 'YES'}, {'key': 'Terminal Finish', 'value': 'TIN'}, {'key': 'Polarity/Channel Type', 'value': 'NPN'}]
**Top 5 matches with their MPN and matched specifications:**
MPN: EMG2DXV5T1G, Matches: 5
Matched Numerical Specs:
{'key': 'Collector Current-Max (IC)', 'unit': 'A', 'value': '0.1'}
Matched Categorical Specs:
{'key': 'JESD-609 Code', 'value': 'e3'}
{'key': 'Surface Mount', 'value': 'YES'}
{'key': 'Terminal Finish', 'value': 'TIN'}
{'key': 'Polarity/Channel Type', 'value': 'NPN'}
Unmatched Numerical Specs:
Unmatched Categorical Specs:

MPN: NSBC144WDP6T5G, Matches: 4
Matched Numerical Specs:
{'key': 'Collector Current-Max (IC)', 'unit': 'A', 'value': '0.1'}
Matched Categorical Specs:
{'key': 'JESD-609 Code', 'value': 'e3'}
{'key': 'Surface Mount', 'value': 'YES'}
{'key': 'Polarity/Channel Type

In [28]:

def convert_to_numerical(value):
    try:
        return float(value)
    except ValueError:
        return None

def process_row(row):
    processed_row = {}
    processed_row['mpn'] = row['mpn'] 
    processed_row['substance_names'] = row['substance_names'] 
    for spec in row['sourcengine_details']:
        key = spec['key']
        value = spec['value']
       
        if any(d['key'] == key for d in numerical_specs_present):
            numerical_value = convert_to_numerical(value)
            processed_row[key] = numerical_value if numerical_value is not None else value
        

        elif any(d['key'] == key for d in categorical_specs_present):
            processed_row[key] = value
    
    return processed_row

processed_specs_list = []


for index, row in df.iterrows():
    processed_specs_list.append(process_row(row))


relevant_df = pd.DataFrame(processed_specs_list)


display(relevant_df)


Unnamed: 0,mpn,substance_names,JESD-609 Code,Surface Mount,Terminal Finish,Polarity/Channel Type,Collector Current-Max (IC)
0,2SB1204T-TL-E,"(Ortho-Cresol Novolac Resin,)",e6,YES,TIN BISMUTH,PNP,8.0
1,2SB1204T-TL-E,"(Fused Silica (SiO2),)",e6,YES,TIN BISMUTH,PNP,8.0
2,2SB1204T-TL-E,"(Carbon Black (C),)",e6,YES,TIN BISMUTH,PNP,8.0
3,2SB1204T-TL-E,"(Antimony Trioxide (Sb2O3),)",e6,YES,TIN BISMUTH,PNP,8.0
4,2SB1204T-TL-E,"(Gold (Au),)",e6,YES,TIN BISMUTH,PNP,8.0
...,...,...,...,...,...,...,...
18246,SMUN5214DW1T1G,"(Boron zinc hydroxide oxide,)",e3,YES,MATTE TIN,NPN,0.1
18247,SMUN5214DW1T1G,"(Copper (Cu),)",e3,YES,MATTE TIN,NPN,0.1
18248,SMUN5214DW1T1G,"(Iron (Fe),)",e3,YES,MATTE TIN,NPN,0.1
18249,SMUN5214DW1T1G,"(Nickel (Ni),)",e3,YES,MATTE TIN,NPN,0.1


In [30]:
Substance_rankings = {
    5: [
        ('Lead (Pb)',),
        ('Lead Borate Glass',),
        ('Lead Monoxide',),
        ('Brominated epoxy resin',),
        ('Brominated Bisphenol A Diglycidyl Ether',),
        ('Ortho-Cresol Novolac Resin',),
        ('Antimony Trioxide (Sb2O3)',),
        ('Bismuth (Bi)',),
        ('Chromium (Cr)',),
        ('Nickel (Ni)',),
        ('Cobalt (Co)',),
        ('Phosphorus (P)',)
    ],
    4: [
        ('Phenol Resin',),
        ('Phenolic Resin (Novolac)',),
        ('Phenolic Resin',),
        ('Phenolic Resin-2',),
        ('Silica Crystalline (SiO2)',),
        ('Silica Amorphous (SiO2)',),
        ('Silica (SiO2)',),
        ('Fused Silica (SiO2)',),
        ('Silica vitreous',),
        ('Silica, vitreous',),
        ('Zinc Monoxide (ZnO)',),
        ('Zinc Stannate',),
        ('Zinc (Zn)',),
        ('Zinc(Zn)',),
        ('Copper (Cu)',),
        ('Copper(CU)',),
        ('Copper(Cu)', 'Copper(Cu)'),
        ('Copper Oxide',),
        ('Gold (Au)',),
        ('Gold(Au)',),
        ('Palladium (Pd)',),
        ('Palladium',),
        ('Silver (Ag)',),
        ('Silver  (Ag)', 'Silver (Ag)'),
        ('Silver(Ag)',),
        ('Silver  (Ag)', 'Silver(Ag)')
    ],
    3: [
        ('Aluminum (Al)',),
        ('Aluminium (Al)',),
        ('Aluminum Hydroxide (Al(OH)3)',),
        ('Aluminum oxide',),
        ('Aluminium(Al)',),
        ('Carbon Black (C)',),
        ('Carbon black',),
        ('carbon black',),
        ('Iron (Fe)',),
        ('Iron(Fe)',),
        ('Ferric Oxide',),
        ('Iron phosphide',),
        ('Iron Phosphide (FeP)',),
        ('Iron Cobalt Chromite Black Spinel',),
        ('Manganese (Mn)',),
        ('Manganese',),
        ('Manganese Dioxide',),
        ('Nickel(Ni)',),
        ('Tin (Sn)',),
        ('Tin',),
        ('Phosphorus metal',),
        ('Phosphorus(P)',),
        ('Diglycidyl Ether of Bisphenol-A',),
        ('Diglycidyl phenyl allyl ether',),
        ('Doped Silicon',),
        ('Doped Silicon', 'Silicon'),
        ('Organic Phosphorus',),
        ('Other Filler',),
        ('Other (Glass w/out Declarable Substance)',),
        ('Proprietary Non Halide Flame Retardant',),
        ('Proprietary Filler',),
        ('Random E-Glass',),
        ('Formaldehyde, polymer with amiline',),
        ('Formaldehyde Polymer',)
    ],
    2: [
        ('Acrylate polymer',),
        ('Amines, C36-alkylenedi-, polymers with 5,5\'-[(1-methylethylidene)bis(4,1-phenyleneoxy)]bis[1,3-isobenzofurandione], maleated',),
        ('Isobornyl Methacrylate',),
        ('Isobornyl acrylate',),
        ('Methyl acrylate polymer',),
        ('Dicyclopentenyloxyethyl methacrylate',),
        ('Epoxy',),
        ('Epoxy resin',),
        ('Epoxy Resin',),
        ('Epoxy resins',),
        ('Epoxy   Phenol Resin',),
        ('EpoxyNovolaCresins (Cresolic)',),
        ('Epoxy type resin',),
        ('Epoxy type resin', 'Phenol type resin'),
        ('Epoxy type resin', 'Phenol resin'),
        ('Epoxy Cresol Novolak',),
        ('Epoxy Cresol Novolak', 'Phenol type resin'),
        ('Epoxized Condensate Of Para-Hydrobenzaldehyde And Alkyl Phenol',),
        ('Biphenyl epoxy resin',),
        ('Bismaleimide',),
        ('Bismaleimide polymer',),
        ('Bismaleimide resin',),
        ('Boron zinc hydroxide oxide',),
        ('Boron Oxide',),
        ('Carbon',),
        ('Barium Sulfate',),
        ('Phenol Novolac resin',),
        ('Phenol, polymer with 1,4-bis(methoxymethyl)benzene',),
        ('Phenol type resin', 'additive'),
        ('Polyoxypropylenediamine',),
        ('Trimethoxysilylpropanethiol',),
        ('Oxiranylmethoxy-phenyl-methylene-bisoxirane',),
        ('Tert-Butyl peroxy(2-ethyl)-hexanoate',),
        ('Dicyandiamine',),
        ('2,4,6-triamino-1,3,5-triazine isocyanuric acid',),
        ('2,6-dibromo-4-[1-(3-bromo-4-hydroxyphenyl)-1-methylethyl]phenol',),
        ('Additive', 'phenol resin'),
        ('Additive',),
        ('Polymer',),
        ('Bismuth (Bi)',),
        ('Boron Oxide',),
        ('Boron zinc hydroxide oxide',),
        ('Silicate Acid, Zinc Salt (4:3)',)
    ],
    1: [
        ('Aluminum Hydroxide (Al(OH)3)',),
        ('Aluminum (Al)',),
        ('Aluminium(Al)',),
        ('Aluminum oxide',),
        ('Ferric Oxide',),
        ('Epoxy and Phenolic Resin',),
        ('Phenol resin',),
        ('Epoxy type resin',),
        ('Epoxypropoxy butane polymer',),
        ('Epoxycyclohexylethyltrimethoxysilane',),
        ('Formaldehyde Polymer',),
        ('Diglycidyl phenyl allyl ether',),
        ('Dicyclopentenyloxyethyl methacrylate',),
        ('Urethane Methacrylate Resin',),
        ('Isobornyl Methacrylate',),
        ('Dicyandiamine',),
        ('Methyl acrylate polymer',),
        ('Bisphenol A, epichlorohydrin polymer',),
        ('Bisphenol A_Epichlorohydrin Polymer',),
        ('Fused Silica',),
        ('Silicon (Si)',),
        ('Silicon(Si)',),
        ('Silicon Oxide',),
        ('Silicon oxide', 'Silicon oxide'),
        ('Silicon Dioxide',),
        ('Silicon Nitride (SiN)',),
        ('Silica (SiO2)',),
        ('Silica vitreous',),
        ('Silicone',),
        ('Silicone dioxide (Fused Silica)',),
        ('Silicate Acid, Zinc Salt (4:3)',),
        ('Iron phosphide',),
        ('Sulfur',),
        ('Magnesium  (Mg)',),
        ('Titanium oxide',),
        ('Titanium Oxide',),
        ('Chromium Oxide',),
        ('Chromium oxide',),
        ('Silicon Nitride',),
        ('Silicon oxide',),
        ('Iron',),
        ('Iron phosphide',),
        ('Iron Phosphide (FeP)',),
        ('Zinc',),
        ('Zinc Monoxide',),
        ('Bismaleimide',),
        ('Bismaleimide resin',),
        ('Bismaleimide polymer',),
        ('Phenol resin',),
        ('Diglycidyl Ether of Bisphenol-A',),
        ('Diglycidyl phenyl allyl ether',),
        ('2,4,6-triamino-1,3,5-triazine isocyanuric acid',),
        ('2,6-dibromo-4-[1-(3-bromo-4-hydroxyphenyl)-1-methylethyl]phenol',),
        ('Isobornyl Methacrylate',),
        ('Isobornyl acrylate',),
        ('Methyl acrylate polymer',),
        ('Dicyclopentenyloxyethyl methacrylate',),
        ('Oxiranylmethoxy-phenyl-methylene-bisoxirane',),
        ('Urethane Methacrylate Resin',),
        ('Polyoxypropylenediamine',),
        ('Biphenyl epoxy resin',),
        ('Tert-Butyl peroxy(2-ethyl)-hexanoate',),
        ('Carbon',),
        ('Carbon black',),
        ('Bismuth (Bi)',),
        ('Palladium',),
        ('Palladium (Pd)',),
        ('Silver  (Ag)',),
        ('Silver(Ag)',),
        ('Silica Crystalline',),
        ('Ortho-Cresol Novolac Epoxy resin',)
    ]
}

def get_ranking(substance_name):
    for rank, substances in Substance_rankings.items():
        if substance_name in substances:
            return rank
    return None

# Adding the new column
relevant_df['substance_name_rankings'] = relevant_df['substance_names'].apply(get_ranking)


In [31]:
from makersite_ml_services_utils.text_embeddings import get_text_embeddings
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import json
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances


relevant_df = relevant_df.drop_duplicates(keep='first')
embeddings_cache = {}

def cache_embeddings_in_batches(df, json_column='json_representation'):
    global embeddings_cache
    json_strings = df[json_column].unique()
    uncached_jsons = [json_str for json_str in json_strings if json_str not in embeddings_cache]
    if uncached_jsons:
        batch_embeddings = get_text_embeddings(uncached_jsons).embeddings
        for json_str, embedding in zip(uncached_jsons, batch_embeddings):
            if isinstance(embedding, list):
                embedding = np.array(embedding)
            if embedding.ndim == 1:
                embedding = embedding.reshape(1, -1)
            embeddings_cache[json_str] = embedding

def prepare_categorical_features(df, categorical_columns):

    df['json_representation'] = df.apply(lambda row: json.dumps({col: row[col] for col in categorical_columns}), axis=1)
    #display(df )
    cache_embeddings_in_batches(df)
    embeddings_list = df['json_representation'].apply(lambda json_str: embeddings_cache[json_str])
    return np.vstack(embeddings_list)

def prepare_numerical_features(df, numerical_specs):
    numerical_columns = [spec['key'] for spec in numerical_specs]
    df_numerical = df[numerical_columns].copy()
    df_numerical.fillna(df_numerical.mean(), inplace=True)
    scaler = StandardScaler()
    return scaler.fit_transform(df_numerical.values)

def calculate_distances(features_matrix, query_point, distance_type='euclidean'):
    if distance_type == 'cosine':
        distances = cosine_distances(query_point, features_matrix)[0]
    else:
        distances = euclidean_distances(query_point, features_matrix)[0]
    return distances

def find_nearest_components(df, distances, columns, top_n=10):
    nearest_indices = np.argsort(distances)[:top_n]
    nearest_distances = distances[nearest_indices]
    results = df.iloc[nearest_indices][columns + ['mpn', 'substance_names']]
    results['Distance'] = nearest_distances  
    return results


features_matrix_cat = prepare_categorical_features(relevant_df, [spec['key'] for spec in categorical_specs_present])
features_matrix_num = prepare_numerical_features(relevant_df, numerical_specs_present)



query_cat = features_matrix_cat[0:1, :]
query_num = features_matrix_num[0:1, :]
#print(query_cat , query_num)

cat_distances = calculate_distances(features_matrix_cat, query_cat, 'cosine')
num_distances = calculate_distances(features_matrix_num, query_num, 'euclidean')


categorical_columns = [spec['key'] for spec in categorical_specs_present]
numerical_columns = [spec['key'] for spec in numerical_specs_present]


nearest_components_cat = find_nearest_components(relevant_df, cat_distances, categorical_columns)
nearest_components_num = find_nearest_components(relevant_df, num_distances, numerical_columns)


combined_distances = (cat_distances + num_distances) / 2
nearest_components_combined = find_nearest_components(relevant_df, combined_distances, categorical_columns + numerical_columns)

print("Nearest Components based on categorical data:")
display(nearest_components_cat)

print("Nearest Components based on numerical data:")
display(nearest_components_num)

print("Nearest Components based on combined distances:")
display(nearest_components_combined)


Nearest Components based on categorical data:


Unnamed: 0,JESD-609 Code,Surface Mount,Terminal Finish,Polarity/Channel Type,mpn,substance_names,Distance
6804,e6,YES,TIN BISMUTH,PNP,2SB1123T-TD-E,"(Lead (Pb),)",0.0
6963,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Ortho-Cresol Novolac Resin,)",0.0
6964,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Fused Silica (SiO2),)",0.0
6965,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Carbon Black (C),)",0.0
6966,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Antimony Trioxide (Sb2O3),)",0.0
6967,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Brominated epoxy resin,)",0.0
6968,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Silicon (Si),)",0.0
6969,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Silver (Ag),)",0.0
6796,e6,YES,TIN BISMUTH,PNP,2SB1123T-TD-E,"(Silica Crystalline (SiO2),)",0.0
6797,e6,YES,TIN BISMUTH,PNP,2SB1123T-TD-E,"(Ortho-Cresol Novolac Resin,)",0.0


Nearest Components based on numerical data:


Unnamed: 0,Collector Current-Max (IC),mpn,substance_names,Distance
0,8.0,2SB1204T-TL-E,"(Ortho-Cresol Novolac Resin,)",0.0
1,8.0,2SB1204T-TL-E,"(Fused Silica (SiO2),)",0.0
2,8.0,2SB1204T-TL-E,"(Carbon Black (C),)",0.0
3,8.0,2SB1204T-TL-E,"(Antimony Trioxide (Sb2O3),)",0.0
4,8.0,2SB1204T-TL-E,"(Gold (Au),)",0.0
5,8.0,2SB1204T-TL-E,"(Bismuth (Bi),)",0.0
6,8.0,2SB1204T-TL-E,"(Lead (Pb),)",0.0
7,8.0,2SB1204T-TL-E,"(Silver (Ag),)",0.0
8,8.0,2SB1204T-TL-E,"(Brominated epoxy resin,)",0.0
9,8.0,2SB1204T-TL-E,"(Copper (Cu),)",0.0


Nearest Components based on combined distances:


Unnamed: 0,JESD-609 Code,Surface Mount,Terminal Finish,Polarity/Channel Type,Collector Current-Max (IC),mpn,substance_names,Distance
0,e6,YES,TIN BISMUTH,PNP,8.0,2SB1204T-TL-E,"(Ortho-Cresol Novolac Resin,)",0.0
1,e6,YES,TIN BISMUTH,PNP,8.0,2SB1204T-TL-E,"(Fused Silica (SiO2),)",0.0
2,e6,YES,TIN BISMUTH,PNP,8.0,2SB1204T-TL-E,"(Carbon Black (C),)",0.0
4,e6,YES,TIN BISMUTH,PNP,8.0,2SB1204T-TL-E,"(Gold (Au),)",0.0
15395,e6,YES,TIN BISMUTH,PNP,8.0,2SB1204S-TL-E,"(Silicon (Si),)",0.0
15385,e6,YES,TIN BISMUTH,PNP,8.0,2SB1204S-TL-E,"(Fused Silica (SiO2),)",0.0
15386,e6,YES,TIN BISMUTH,PNP,8.0,2SB1204S-TL-E,"(Carbon Black (C),)",0.0
15393,e6,YES,TIN BISMUTH,PNP,8.0,2SB1204S-TL-E,"(Bismuth (Bi),)",0.0
15394,e6,YES,TIN BISMUTH,PNP,8.0,2SB1204S-TL-E,"(Gold (Au),)",0.0
15390,e6,YES,TIN BISMUTH,PNP,8.0,2SB1204S-TL-E,"(Tin (Sn),)",0.0


In [32]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import StandardScaler
from makersite_ml_services_utils.text_embeddings import get_text_embeddings
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances



relevant_df = relevant_df.drop_duplicates(keep='first')
embeddings_cache = {}

def cache_embeddings_in_batches(df, json_column='json_representation'):
    global embeddings_cache
    json_strings = df[json_column].unique()
    uncached_jsons = [json_str for json_str in json_strings if json_str not in embeddings_cache]
    if uncached_jsons:
        batch_embeddings = get_text_embeddings(uncached_jsons).embeddings
        for json_str, embedding in zip(uncached_jsons, batch_embeddings):
            if isinstance(embedding, list):
                embedding = np.array(embedding)
            if embedding.ndim == 1:
                embedding = embedding.reshape(1, -1)
            embeddings_cache[json_str] = embedding

def prepare_categorical_features(df, categorical_columns):
    df['json_representation'] = df.apply(lambda row: json.dumps({col: row[col] for col in categorical_columns}), axis=1)
    cache_embeddings_in_batches(df)
    embeddings_list = df['json_representation'].apply(lambda json_str: embeddings_cache[json_str])
    return np.vstack(embeddings_list)

def prepare_numerical_features(df, numerical_columns):
    df_numerical = df[numerical_columns].copy()
    df_numerical.fillna(df_numerical.mean(), inplace=True)
    scaler = StandardScaler()
    return scaler.fit_transform(df_numerical.values)

def get_new_data_point(df, index, categorical_columns, numerical_specs):
    numerical_columns = [spec['key'] for spec in numerical_specs]
    if index >= len(df):
        raise IndexError("Index out of bounds for the dataframe.")
    row_df = df.iloc[[index]]
    categorical_features = prepare_categorical_features(row_df, categorical_columns)
    numerical_features = prepare_numerical_features(row_df, numerical_columns)
    return np.vstack([categorical_features]), np.vstack([numerical_features])

def calculate_distances(features_matrix, query_point, distance_type='euclidean'):
    if distance_type == 'cosine':
        return cosine_distances(query_point, features_matrix)[0]
    else:
        return euclidean_distances(query_point, features_matrix)[0]

def find_nearest_components(df, distances, columns, top_n=10):
    nearest_indices = np.argsort(distances)[:top_n]
    nearest_distances = distances[nearest_indices]
    results = df.iloc[nearest_indices][columns + ['mpn', 'substance_names','substance_name_rankings' ]]
    results['Distance'] = nearest_distances  
    return results


def evaluate_x_values(relevant_df, new_data_point_cat, new_data_point_num, categorical_columns, numerical_columns, substance_name, x_values):
    results = []
    for x in x_values:
        
        cat_distances = calculate_distances(features_matrix_cat, new_data_point_cat, 'cosine')
        num_distances = calculate_distances(features_matrix_num, new_data_point_num, 'euclidean')
        combined_distances = x * cat_distances + (1 - x) * num_distances
        
      
        nearest_components = find_nearest_components(relevant_df, combined_distances, categorical_columns + numerical_columns)
        
       
        filtered_nearest = nearest_components[nearest_components['substance_names'] == substance_name]
        
      
        results.append((x, filtered_nearest, combined_distances))
    
    return results


categorical_columns = [spec['key'] for spec in categorical_specs_present]
numerical_columns = [spec['key'] for spec in numerical_specs_present]

features_matrix_cat = prepare_categorical_features(relevant_df, categorical_columns)
features_matrix_num = prepare_numerical_features(relevant_df, numerical_columns)


index_of_new_data_point = 0
new_data_point_cat, new_data_point_num = get_new_data_point(relevant_df, index_of_new_data_point, categorical_columns, numerical_specs_present)

new_data_substance_name = relevant_df.iloc[index_of_new_data_point]['substance_names']


x_values = np.linspace(0, 1, 11) 


evaluation_results = evaluate_x_values(relevant_df, new_data_point_cat, new_data_point_num, categorical_columns, numerical_columns, new_data_substance_name, x_values)

best_result = max(evaluation_results, key=lambda item: len(item[1]))


best_x = best_result[0]
best_nearest_components = best_result[1]

print(f"Best value of x: {best_x}")
print("Nearest Components based on combined distances with the same substance name:")
display(best_nearest_components)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['json_representation'] = df.apply(lambda row: json.dumps({col: row[col] for col in categorical_columns}), axis=1)


Best value of x: 0.7000000000000001
Nearest Components based on combined distances with the same substance name:


Unnamed: 0,JESD-609 Code,Surface Mount,Terminal Finish,Polarity/Channel Type,Collector Current-Max (IC),mpn,substance_names,substance_name_rankings,Distance
5687,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-6-TB-E,"(Ortho-Cresol Novolac Resin,)",5.0,0.013852
15682,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-7-TB-E,"(Ortho-Cresol Novolac Resin,)",5.0,0.013852


In [33]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import StandardScaler
from makersite_ml_services_utils.text_embeddings import get_text_embeddings
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

relevant_df = relevant_df.drop_duplicates(keep='first')
embeddings_cache = {}

def cache_embeddings_in_batches(df, json_column='json_representation'):
    global embeddings_cache
    json_strings = df[json_column].unique()
    uncached_jsons = [json_str for json_str in json_strings if json_str not in embeddings_cache]
    if uncached_jsons:
        batch_embeddings = get_text_embeddings(uncached_jsons).embeddings
        for json_str, embedding in zip(uncached_jsons, batch_embeddings):
            if isinstance(embedding, list):
                embedding = np.array(embedding)
            if embedding.ndim == 1:
                embedding = embedding.reshape(1, -1)
            embeddings_cache[json_str] = embedding

def prepare_categorical_features(df, categorical_columns):
    df['json_representation'] = df.apply(lambda row: json.dumps({col: row[col] for col in categorical_columns}), axis=1)
    cache_embeddings_in_batches(df)
    embeddings_list = df['json_representation'].apply(lambda json_str: embeddings_cache[json_str])
    return np.vstack(embeddings_list)

def prepare_numerical_features(df, numerical_columns):
    df_numerical = df[numerical_columns].copy()
    df_numerical.fillna(df_numerical.mean(), inplace=True)
    scaler = StandardScaler()
    return scaler.fit_transform(df_numerical.values)

def get_new_data_point(df, index, categorical_columns, numerical_columns):
    if index >= len(df):
        raise IndexError("Index out of bounds for the dataframe.")
    row_df = df.iloc[[index]]
    categorical_features = prepare_categorical_features(row_df, categorical_columns)
    numerical_features = prepare_numerical_features(row_df, numerical_columns)
    return np.vstack([categorical_features]), np.vstack([numerical_features])

def calculate_distances(features_matrix, query_point, distance_type='euclidean'):
    if distance_type == 'cosine':
        return cosine_distances(query_point, features_matrix)[0]
    else:
        return euclidean_distances(query_point, features_matrix)[0]

def find_nearest_components(df, distances, columns, top_n=10):
    nearest_indices = np.argsort(distances)[:top_n]
    nearest_distances = distances[nearest_indices]
    results = df.iloc[nearest_indices][columns + ['mpn', 'substance_names', 'substance_name_rankings']]
    results['Distance'] = nearest_distances
    return results

def evaluate_alpha_values(relevant_df, new_data_point_cat, new_data_point_num, categorical_columns, numerical_columns, new_data_substance_rank, alpha_values):
    results = []
    for alpha in alpha_values:
        cat_distances = calculate_distances(features_matrix_cat, new_data_point_cat, 'cosine')
        num_distances = calculate_distances(features_matrix_num, new_data_point_num, 'euclidean')
        
        combined_distances = alpha * cat_distances + (1 - alpha) * num_distances
        nearest_components = find_nearest_components(relevant_df, combined_distances, categorical_columns + numerical_columns)
        
        nearest_components['RankingScore'] = nearest_components['substance_name_rankings'].apply(
            lambda rank: 0 if rank == new_data_substance_rank else 
            1 if abs(rank - new_data_substance_rank) == 1 else
            2 if abs(rank - new_data_substance_rank) == 2 else 3
        )
        
        nearest_components = nearest_components.sort_values(by=['RankingScore', 'Distance'])
        results.append((alpha, nearest_components))
    
    return results

categorical_columns = [spec['key'] for spec in categorical_specs_present]
numerical_columns = [spec['key'] for spec in numerical_specs_present]

features_matrix_cat = prepare_categorical_features(relevant_df, categorical_columns)
features_matrix_num = prepare_numerical_features(relevant_df, numerical_columns)

index_of_new_data_point = 5
new_data_point_cat, new_data_point_num = get_new_data_point(relevant_df, index_of_new_data_point, categorical_columns, numerical_columns)

new_data_substance_rank = relevant_df.iloc[index_of_new_data_point]['substance_name_rankings']

alpha_values = np.linspace(0, 1, 11)

evaluation_results = evaluate_alpha_values(relevant_df, new_data_point_cat, new_data_point_num, categorical_columns, numerical_columns, new_data_substance_rank, alpha_values)

# Select the best result based on the RankingScore and Distance
best_result = min(evaluation_results, key=lambda item: (item[1]['RankingScore'].sum(), item[1]['Distance'].sum()))

best_alpha = best_result[0]
best_nearest_components = best_result[1].head(10)

print(f"Best alpha: {best_alpha}")

print("Nearest Components based on categorical data:")
nearest_components_cat = find_nearest_components(relevant_df, cat_distances, categorical_columns)
display(nearest_components_cat)

print("Nearest Components based on numerical data:")
nearest_components_num = find_nearest_components(relevant_df, num_distances, numerical_columns)
display(nearest_components_num)

print("Nearest Components based on combined distances and adjusted rankings:")
display(best_nearest_components)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['json_representation'] = df.apply(lambda row: json.dumps({col: row[col] for col in categorical_columns}), axis=1)


Best alpha: 0.7000000000000001
Nearest Components based on categorical data:


Unnamed: 0,JESD-609 Code,Surface Mount,Terminal Finish,Polarity/Channel Type,mpn,substance_names,substance_name_rankings,Distance
6804,e6,YES,TIN BISMUTH,PNP,2SB1123T-TD-E,"(Lead (Pb),)",5.0,0.0
6963,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Ortho-Cresol Novolac Resin,)",5.0,0.0
6964,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Fused Silica (SiO2),)",4.0,0.0
6965,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Carbon Black (C),)",3.0,0.0
6966,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Antimony Trioxide (Sb2O3),)",5.0,0.0
6967,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Brominated epoxy resin,)",5.0,0.0
6968,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Silicon (Si),)",1.0,0.0
6969,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Silver (Ag),)",4.0,0.0
6796,e6,YES,TIN BISMUTH,PNP,2SB1123T-TD-E,"(Silica Crystalline (SiO2),)",4.0,0.0
6797,e6,YES,TIN BISMUTH,PNP,2SB1123T-TD-E,"(Ortho-Cresol Novolac Resin,)",5.0,0.0


Nearest Components based on numerical data:


Unnamed: 0,Collector Current-Max (IC),mpn,substance_names,substance_name_rankings,Distance
0,8.0,2SB1204T-TL-E,"(Ortho-Cresol Novolac Resin,)",5.0,0.0
1,8.0,2SB1204T-TL-E,"(Fused Silica (SiO2),)",4.0,0.0
2,8.0,2SB1204T-TL-E,"(Carbon Black (C),)",3.0,0.0
3,8.0,2SB1204T-TL-E,"(Antimony Trioxide (Sb2O3),)",5.0,0.0
4,8.0,2SB1204T-TL-E,"(Gold (Au),)",4.0,0.0
5,8.0,2SB1204T-TL-E,"(Bismuth (Bi),)",5.0,0.0
6,8.0,2SB1204T-TL-E,"(Lead (Pb),)",5.0,0.0
7,8.0,2SB1204T-TL-E,"(Silver (Ag),)",4.0,0.0
8,8.0,2SB1204T-TL-E,"(Brominated epoxy resin,)",5.0,0.0
9,8.0,2SB1204T-TL-E,"(Copper (Cu),)",4.0,0.0


Nearest Components based on combined distances and adjusted rankings:


Unnamed: 0,JESD-609 Code,Surface Mount,Terminal Finish,Polarity/Channel Type,Collector Current-Max (IC),mpn,substance_names,substance_name_rankings,Distance,RankingScore
5698,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-6-TB-E,"(Antimony Trioxide (Sb2O3),)",5.0,0.013852,0
12249,e6,YES,TIN BISMUTH,PNP,0.7,30A02CH-TL-E,"(Antimony Trioxide (Sb2O3),)",5.0,0.013852,0
5687,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-6-TB-E,"(Ortho-Cresol Novolac Resin,)",5.0,0.013852,0
5688,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-6-TB-E,"(Brominated epoxy resin,)",5.0,0.013852,0
15679,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-7-TB-E,"(Brominated Bisphenol A Diglycidyl Ether,)",5.0,0.013852,0
15680,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-7-TB-E,"(Antimony Trioxide (Sb2O3),)",5.0,0.013852,0
15682,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-7-TB-E,"(Ortho-Cresol Novolac Resin,)",5.0,0.013852,0
5697,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-6-TB-E,"(Carbon Black (C),)",3.0,0.013852,2
12248,e6,YES,TIN BISMUTH,PNP,0.7,30A02CH-TL-E,"(Carbon Black (C),)",3.0,0.013852,2
15681,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-7-TB-E,"(Carbon Black (C),)",3.0,0.013852,2


In [34]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import StandardScaler
from makersite_ml_services_utils.text_embeddings import get_text_embeddings
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

relevant_df = relevant_df.drop_duplicates(keep='first')
embeddings_cache = {}

def cache_embeddings_in_batches(df, json_column='json_representation'):
    global embeddings_cache
    json_strings = df[json_column].unique()
    uncached_jsons = [json_str for json_str in json_strings if json_str not in embeddings_cache]
    if uncached_jsons:
        batch_embeddings = get_text_embeddings(uncached_jsons).embeddings
        for json_str, embedding in zip(uncached_jsons, batch_embeddings):
            if isinstance(embedding, list):
                embedding = np.array(embedding)
            if embedding.ndim == 1:
                embedding = embedding.reshape(1, -1)
            embeddings_cache[json_str] = embedding

def prepare_categorical_features(df, categorical_columns):
    df['json_representation'] = df.apply(lambda row: json.dumps({col: row[col] for col in categorical_columns}), axis=1)
    cache_embeddings_in_batches(df)
    embeddings_list = df['json_representation'].apply(lambda json_str: embeddings_cache[json_str])
    return np.vstack(embeddings_list)

def prepare_numerical_features(df, numerical_columns):
    df_numerical = df[numerical_columns].copy()
    df_numerical.fillna(df_numerical.mean(), inplace=True)
    scaler = StandardScaler()
    return scaler.fit_transform(df_numerical.values)

def get_new_data_point(df, index, categorical_columns, numerical_columns):
    if index >= len(df):
        raise IndexError("Index out of bounds for the dataframe.")
    row_df = df.iloc[[index]]
    categorical_features = prepare_categorical_features(row_df, categorical_columns)
    numerical_features = prepare_numerical_features(row_df, numerical_columns)
    return np.vstack([categorical_features]), np.vstack([numerical_features])

def calculate_distances(features_matrix, query_point, distance_type='euclidean'):
    if distance_type == 'cosine':
        return cosine_distances(query_point, features_matrix)[0]
    else:
        return euclidean_distances(query_point, features_matrix)[0]

def find_nearest_components(df, distances, columns, top_n=10):
    nearest_indices = np.argsort(distances)[:top_n]
    nearest_distances = distances[nearest_indices]
    results = df.iloc[nearest_indices][columns + ['mpn', 'substance_names', 'substance_name_rankings']]
    results['Distance'] = nearest_distances
    return results

def evaluate_alpha_values(relevant_df, new_data_point_cat, new_data_point_num, categorical_columns, numerical_columns, new_data_substance_rank, alpha_values):
    results = []
    for alpha in alpha_values:
        cat_distances = calculate_distances(features_matrix_cat, new_data_point_cat, 'cosine')
        num_distances = calculate_distances(features_matrix_num, new_data_point_num, 'euclidean')
        
        combined_distances = alpha * cat_distances + (1 - alpha) * num_distances
        nearest_components = find_nearest_components(relevant_df, combined_distances, categorical_columns + numerical_columns)
        
        nearest_components['RankingScore'] = nearest_components['substance_name_rankings'].apply(
            lambda rank: 0 if rank == new_data_substance_rank else 
            1 if abs(rank - new_data_substance_rank) == 1 else
            2 if abs(rank - new_data_substance_rank) == 2 else 3
        )
        
        nearest_components = nearest_components.sort_values(by=['RankingScore', 'Distance'])
        results.append((alpha, nearest_components))
    
    return results

categorical_columns = [spec['key'] for spec in categorical_specs_present]
numerical_columns = [spec['key'] for spec in numerical_specs_present]

features_matrix_cat = prepare_categorical_features(relevant_df, categorical_columns)
features_matrix_num = prepare_numerical_features(relevant_df, numerical_columns)

index_of_new_data_point = 5
new_data_point_cat, new_data_point_num = get_new_data_point(relevant_df, index_of_new_data_point, categorical_columns, numerical_columns)

new_data_substance_rank = relevant_df.iloc[index_of_new_data_point]['substance_name_rankings']

alpha_values = np.linspace(0, 1, 11)

evaluation_results = evaluate_alpha_values(relevant_df, new_data_point_cat, new_data_point_num, categorical_columns, numerical_columns, new_data_substance_rank, alpha_values)

# Select the best result based on the RankingScore and Distance
best_result = min(evaluation_results, key=lambda item: (item[1]['RankingScore'].sum(), item[1]['Distance'].sum()))

best_alpha = best_result[0]
best_nearest_components = best_result[1].head(10)

print(f"Best alpha: {best_alpha}")

print("Nearest Components based on categorical data:")
nearest_components_cat = find_nearest_components(relevant_df, cat_distances, categorical_columns)
display(nearest_components_cat)

print("Nearest Components based on numerical data:")
nearest_components_num = find_nearest_components(relevant_df, num_distances, numerical_columns)
display(nearest_components_num)

print("Nearest Components based on combined distances and adjusted rankings:")
display(best_nearest_components)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['json_representation'] = df.apply(lambda row: json.dumps({col: row[col] for col in categorical_columns}), axis=1)


Best alpha: 0.7000000000000001
Nearest Components based on categorical data:


Unnamed: 0,JESD-609 Code,Surface Mount,Terminal Finish,Polarity/Channel Type,mpn,substance_names,substance_name_rankings,Distance
6804,e6,YES,TIN BISMUTH,PNP,2SB1123T-TD-E,"(Lead (Pb),)",5.0,0.0
6963,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Ortho-Cresol Novolac Resin,)",5.0,0.0
6964,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Fused Silica (SiO2),)",4.0,0.0
6965,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Carbon Black (C),)",3.0,0.0
6966,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Antimony Trioxide (Sb2O3),)",5.0,0.0
6967,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Brominated epoxy resin,)",5.0,0.0
6968,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Silicon (Si),)",1.0,0.0
6969,e6,YES,TIN BISMUTH,PNP,12A02CH-TL-E,"(Silver (Ag),)",4.0,0.0
6796,e6,YES,TIN BISMUTH,PNP,2SB1123T-TD-E,"(Silica Crystalline (SiO2),)",4.0,0.0
6797,e6,YES,TIN BISMUTH,PNP,2SB1123T-TD-E,"(Ortho-Cresol Novolac Resin,)",5.0,0.0


Nearest Components based on numerical data:


Unnamed: 0,Collector Current-Max (IC),mpn,substance_names,substance_name_rankings,Distance
0,8.0,2SB1204T-TL-E,"(Ortho-Cresol Novolac Resin,)",5.0,0.0
1,8.0,2SB1204T-TL-E,"(Fused Silica (SiO2),)",4.0,0.0
2,8.0,2SB1204T-TL-E,"(Carbon Black (C),)",3.0,0.0
3,8.0,2SB1204T-TL-E,"(Antimony Trioxide (Sb2O3),)",5.0,0.0
4,8.0,2SB1204T-TL-E,"(Gold (Au),)",4.0,0.0
5,8.0,2SB1204T-TL-E,"(Bismuth (Bi),)",5.0,0.0
6,8.0,2SB1204T-TL-E,"(Lead (Pb),)",5.0,0.0
7,8.0,2SB1204T-TL-E,"(Silver (Ag),)",4.0,0.0
8,8.0,2SB1204T-TL-E,"(Brominated epoxy resin,)",5.0,0.0
9,8.0,2SB1204T-TL-E,"(Copper (Cu),)",4.0,0.0


Nearest Components based on combined distances and adjusted rankings:


Unnamed: 0,JESD-609 Code,Surface Mount,Terminal Finish,Polarity/Channel Type,Collector Current-Max (IC),mpn,substance_names,substance_name_rankings,Distance,RankingScore
5698,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-6-TB-E,"(Antimony Trioxide (Sb2O3),)",5.0,0.013852,0
12249,e6,YES,TIN BISMUTH,PNP,0.7,30A02CH-TL-E,"(Antimony Trioxide (Sb2O3),)",5.0,0.013852,0
5687,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-6-TB-E,"(Ortho-Cresol Novolac Resin,)",5.0,0.013852,0
5688,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-6-TB-E,"(Brominated epoxy resin,)",5.0,0.013852,0
15679,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-7-TB-E,"(Brominated Bisphenol A Diglycidyl Ether,)",5.0,0.013852,0
15680,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-7-TB-E,"(Antimony Trioxide (Sb2O3),)",5.0,0.013852,0
15682,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-7-TB-E,"(Ortho-Cresol Novolac Resin,)",5.0,0.013852,0
5697,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-6-TB-E,"(Carbon Black (C),)",3.0,0.013852,2
12248,e6,YES,TIN BISMUTH,PNP,0.7,30A02CH-TL-E,"(Carbon Black (C),)",3.0,0.013852,2
15681,e6,YES,TIN BISMUTH,PNP,0.7,2SB815-7-TB-E,"(Carbon Black (C),)",3.0,0.013852,2


In [35]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import StandardScaler
from makersite_ml_services_utils.text_embeddings import get_text_embeddings
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

relevant_df = relevant_df.drop_duplicates(keep='first')
embeddings_cache = {}

def cache_embeddings_in_batches(df, json_column='json_representation'):
    global embeddings_cache
    json_strings = df[json_column].unique()
    uncached_jsons = [json_str for json_str in json_strings if json_str not in embeddings_cache]
    if uncached_jsons:
        batch_embeddings = get_text_embeddings(uncached_jsons).embeddings
        for json_str, embedding in zip(uncached_jsons, batch_embeddings):
            if isinstance(embedding, list):
                embedding = np.array(embedding)
            if embedding.ndim == 1:
                embedding = embedding.reshape(1, -1)
            embeddings_cache[json_str] = embedding

def prepare_categorical_features(df, categorical_columns):
    df['json_representation'] = df.apply(lambda row: json.dumps({col: row[col] for col in categorical_columns}), axis=1)
    cache_embeddings_in_batches(df)
    embeddings_list = df['json_representation'].apply(lambda json_str: embeddings_cache[json_str])
    return np.vstack(embeddings_list)

def prepare_numerical_features(df, numerical_columns):
    df_numerical = df[numerical_columns].copy()
    df_numerical.fillna(df_numerical.mean(), inplace=True)
    scaler = StandardScaler()
    return scaler.fit_transform(df_numerical.values)

def prepare_ranking_features(df, ranking_column='normalized_ranking'):
    return df[[ranking_column]].values

def get_new_data_point(df, index, categorical_columns, numerical_columns, ranking_column='normalized_ranking'):
    if index >= len(df):
        raise IndexError("Index out of bounds for the dataframe.")
    row_df = df.iloc[[index]]
    categorical_features = prepare_categorical_features(row_df, categorical_columns)
    numerical_features = prepare_numerical_features(row_df, numerical_columns)
    ranking_features = prepare_ranking_features(row_df, ranking_column)
    return np.vstack([categorical_features]), np.vstack([numerical_features]), ranking_features

def calculate_distances(features_matrix, query_point, distance_type='euclidean'):
    if distance_type == 'cosine':
        return cosine_distances(query_point, features_matrix)[0]
    else:
        return euclidean_distances(query_point, features_matrix)[0]

def find_nearest_components(df, distances, columns, top_n=10):
    nearest_indices = np.argsort(distances)[:top_n]
    nearest_distances = distances[nearest_indices]
    results = df.iloc[nearest_indices][columns + ['mpn', 'substance_names','substance_name_rankings']]
    results['Distance'] = nearest_distances  
    return results


relevant_df['normalized_ranking'] = relevant_df['substance_name_rankings'] / relevant_df['substance_name_rankings'].max()


relevant_df['normalized_ranking'].fillna(relevant_df['normalized_ranking'].mean(), inplace=True)


categorical_columns = [spec['key'] for spec in categorical_specs_present]
numerical_columns = [spec['key'] for spec in numerical_specs_present]

features_matrix_cat = prepare_categorical_features(relevant_df, categorical_columns)
features_matrix_num = prepare_numerical_features(relevant_df, numerical_columns)
features_matrix_rank = prepare_ranking_features(relevant_df)


new_data_point_cat, new_data_point_num, new_data_point_rank = get_new_data_point(relevant_df, 310, categorical_columns, numerical_columns)


cat_distances = calculate_distances(features_matrix_cat, new_data_point_cat, 'cosine')
num_distances = calculate_distances(features_matrix_num, new_data_point_num, 'euclidean')
rank_distances = calculate_distances(features_matrix_rank, new_data_point_rank, 'euclidean')


best_alpha = None
best_score = float('inf')

for alpha in np.linspace(0, 1, 11): 
    combined_distances = alpha * rank_distances + (1 - alpha) * (cat_distances + num_distances) / 2
    nearest_components_combined = find_nearest_components(relevant_df, combined_distances, categorical_columns + numerical_columns)
   
    score = nearest_components_combined['Distance'].sum()
    if score < best_score:
        best_score = score
        best_alpha = alpha

print(f"Best alpha: {best_alpha}")


combined_distances = best_alpha * rank_distances + (1 - best_alpha) * (cat_distances + num_distances) / 2
nearest_components_combined = find_nearest_components(relevant_df, combined_distances, categorical_columns + numerical_columns)


print("Nearest Components based on categorical data:")
nearest_components_cat = find_nearest_components(relevant_df, cat_distances, categorical_columns)
display(nearest_components_cat)

print("Nearest Components based on numerical data:")
nearest_components_num = find_nearest_components(relevant_df, num_distances, numerical_columns)
display(nearest_components_num)

print("Nearest Components based on combined distances:")
display(nearest_components_combined)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  relevant_df['normalized_ranking'].fillna(relevant_df['normalized_ranking'].mean(), inplace=True)


Best alpha: 1.0
Nearest Components based on categorical data:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['json_representation'] = df.apply(lambda row: json.dumps({col: row[col] for col in categorical_columns}), axis=1)


Unnamed: 0,JESD-609 Code,Surface Mount,Terminal Finish,Polarity/Channel Type,mpn,substance_names,substance_name_rankings,Distance
12489,e3,YES,MATTE TIN,NPN,MMBT3416LT3G,"(Phenolic Resin (Novolac),)",4.0,0.0
5171,e3,YES,MATTE TIN,NPN,SMUN5215T1G,"(Zinc Monoxide (ZnO),)",4.0,0.0
5170,e3,YES,MATTE TIN,NPN,SMUN5215T1G,"(2,4,6-triamino-1,3,5-triazine isocyanuric acid,)",2.0,0.0
5169,e3,YES,MATTE TIN,NPN,SMUN5215T1G,"(Silica Amorphous (SiO2),)",4.0,0.0
5168,e3,YES,MATTE TIN,NPN,SMUN5215T1G,"(Silicon (Si),)",1.0,0.0
5167,e3,YES,MATTE TIN,NPN,SMUN5215T1G,"(Nickel (Ni),)",5.0,0.0
5099,e3,YES,MATTE TIN,NPN,NSBC143EDP6T5G,"(Nickel (Ni),)",5.0,0.0
5098,e3,YES,MATTE TIN,NPN,NSBC143EDP6T5G,"(Iron (Fe),)",3.0,0.0
5097,e3,YES,MATTE TIN,NPN,NSBC143EDP6T5G,"(Copper (Cu),)",4.0,0.0
12482,e3,YES,MATTE TIN,NPN,MMBT3416LT3G,"(Iron (Fe),)",3.0,0.0


Nearest Components based on numerical data:


Unnamed: 0,Collector Current-Max (IC),mpn,substance_names,substance_name_rankings,Distance
9642,,NSVBC144EDXV6T1G,"(Phenolic Resin (Novolac),)",4.0,0.0
16355,,2SA2169-E,"(Tin (Sn),)",3.0,0.0
16354,,2SA2169-E,"(Fused Silica (SiO2),)",4.0,0.0
16353,,2SA2169-E,"(Copper (Cu),)",4.0,0.0
16352,,2SA2169-E,"(Aluminum (Al),)",3.0,0.0
16310,,NSVMMBT5087LT3G,"(Copper (Cu),)",4.0,0.0
16309,,NSVMMBT5087LT3G,"(Ortho Cresol Novolac Resin,)",,0.0
16308,,NSVMMBT5087LT3G,"(Carbon Black (C),)",3.0,0.0
16307,,NSVMMBT5087LT3G,"(Aluminum Hydroxide (Al(OH)3),)",3.0,0.0
16306,,NSVMMBT5087LT3G,"(Fused Silica (SiO2),)",4.0,0.0


Nearest Components based on combined distances:


Unnamed: 0,JESD-609 Code,Surface Mount,Terminal Finish,Polarity/Channel Type,Collector Current-Max (IC),mpn,substance_names,substance_name_rankings,Distance
11886,e3,YES,MATTE TIN,PNP,0.2,MMBT3906TT1G,"(Aluminum Hydroxide (Al(OH)3),)",3.0,0.0
4181,,,,,,SOC2222AHRG,"(Manganese (Mn),)",3.0,0.0
4179,,,,,,SOC2222AHRG,"(Iron (Fe),)",3.0,0.0
11872,e6,,TIN BISMUTH,,,MCH3109-TL-E,"(Carbon Black (C),)",3.0,0.0
11858,e3,YES,MATTE TIN,NPN,0.5,FMB2222A,"(Carbon Black (C),)",3.0,0.0
4206,e3,YES,MATTE TIN,NPN,0.1,MUN2211T1G,"(Carbon Black (C),)",3.0,0.0
4203,e3,YES,MATTE TIN,NPN,0.05,MMBT2484LT3G,"(Aluminum Hydroxide (Al(OH)3),)",3.0,0.0
4200,e3,YES,MATTE TIN,NPN,0.05,MMBT2484LT3G,"(Tin (Sn),)",3.0,0.0
4198,e3,YES,MATTE TIN,NPN,0.05,MMBT2484LT3G,"(Iron (Fe),)",3.0,0.0
11830,e3,YES,MATTE TIN,PNP,0.1,BSS63LT1G,"(Carbon Black (C),)",3.0,0.0


In [36]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import StandardScaler
from makersite_ml_services_utils.text_embeddings import get_text_embeddings
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

relevant_df = relevant_df.drop_duplicates(keep='first')
embeddings_cache = {}

def cache_embeddings_in_batches(df, json_column='json_representation'):
    global embeddings_cache
    json_strings = df[json_column].unique()
    uncached_jsons = [json_str for json_str in json_strings if json_str not in embeddings_cache]
    if uncached_jsons:
        batch_embeddings = get_text_embeddings(uncached_jsons).embeddings
        for json_str, embedding in zip(uncached_jsons, batch_embeddings):
            if isinstance(embedding, list):
                embedding = np.array(embedding)
            if embedding.ndim == 1:
                embedding = embedding.reshape(1, -1)
            embeddings_cache[json_str] = embedding

def prepare_categorical_features(df, categorical_columns):
    df['json_representation'] = df.apply(lambda row: json.dumps({col: row[col] for col in categorical_columns}), axis=1)
    cache_embeddings_in_batches(df)
    embeddings_list = df['json_representation'].apply(lambda json_str: embeddings_cache[json_str])
    return np.vstack(embeddings_list)

def prepare_numerical_features(df, numerical_columns):
    df_numerical = df[numerical_columns].copy()
    df_numerical.fillna(df_numerical.mean(), inplace=True)
    scaler = StandardScaler()
    return scaler.fit_transform(df_numerical.values)

def calculate_ranking_distances(df):
    rankings = df['normalized_ranking'].values.reshape(-1, 1)
    return euclidean_distances(rankings, rankings)

def get_new_data_point(df, index, categorical_columns, numerical_columns):
    if index >= len(df):
        raise IndexError("Index out of bounds for the dataframe.")
    row_df = df.iloc[[index]]
    categorical_features = prepare_categorical_features(row_df, categorical_columns)
    numerical_features = prepare_numerical_features(row_df, numerical_columns)
    ranking_feature = row_df['normalized_ranking'].values[0]
    return np.vstack([categorical_features]), np.vstack([numerical_features]), ranking_feature

def calculate_distances(features_matrix, query_point, distance_type='euclidean'):
    if distance_type == 'cosine':
        return cosine_distances(query_point, features_matrix)[0]
    else:
        return euclidean_distances(query_point, features_matrix)[0]

def normalize_distances(distances):
    return distances / np.max(distances)

def find_nearest_components(df, distances, columns, top_n=10):
    nearest_indices = np.argsort(distances)[:top_n]
    nearest_distances = distances[nearest_indices]
    results = df.iloc[nearest_indices][columns + ['mpn', 'substance_names','substance_name_rankings']]
    results['Distance'] = nearest_distances  
    return results


relevant_df['normalized_ranking'] = relevant_df['substance_name_rankings'] / relevant_df['substance_name_rankings'].max()


relevant_df['normalized_ranking'].fillna(relevant_df['normalized_ranking'].mean(), inplace=True)


categorical_columns = [spec['key'] for spec in categorical_specs_present]
numerical_columns = [spec['key'] for spec in numerical_specs_present]

features_matrix_cat = prepare_categorical_features(relevant_df, categorical_columns)
features_matrix_num = prepare_numerical_features(relevant_df, numerical_columns)
ranking_distances = calculate_ranking_distances(relevant_df)


new_data_point_cat, new_data_point_num, new_data_point_rank = get_new_data_point(relevant_df, 310, categorical_columns, numerical_columns)


cat_distances = calculate_distances(features_matrix_cat, new_data_point_cat, 'cosine')
num_distances = calculate_distances(features_matrix_num, new_data_point_num, 'euclidean')
env_distances = ranking_distances[310]


cat_distances_norm = normalize_distances(cat_distances)
num_distances_norm = normalize_distances(num_distances)
env_distances_norm = normalize_distances(env_distances)


total_distances = cat_distances_norm + num_distances_norm + env_distances_norm


nearest_components = find_nearest_components(relevant_df, total_distances, categorical_columns + numerical_columns)


print("Nearest Components based on categorical data:")
nearest_components_cat = find_nearest_components(relevant_df, cat_distances, categorical_columns)
display(nearest_components_cat)

print("Nearest Components based on numerical data:")
nearest_components_num = find_nearest_components(relevant_df, num_distances, numerical_columns)
display(nearest_components_num)

print("Nearest Components based on total distances:")
display(nearest_components)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  relevant_df['normalized_ranking'].fillna(relevant_df['normalized_ranking'].mean(), inplace=True)


Nearest Components based on categorical data:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['json_representation'] = df.apply(lambda row: json.dumps({col: row[col] for col in categorical_columns}), axis=1)


Unnamed: 0,JESD-609 Code,Surface Mount,Terminal Finish,Polarity/Channel Type,mpn,substance_names,substance_name_rankings,Distance
12489,e3,YES,MATTE TIN,NPN,MMBT3416LT3G,"(Phenolic Resin (Novolac),)",4.0,0.0
5171,e3,YES,MATTE TIN,NPN,SMUN5215T1G,"(Zinc Monoxide (ZnO),)",4.0,0.0
5170,e3,YES,MATTE TIN,NPN,SMUN5215T1G,"(2,4,6-triamino-1,3,5-triazine isocyanuric acid,)",2.0,0.0
5169,e3,YES,MATTE TIN,NPN,SMUN5215T1G,"(Silica Amorphous (SiO2),)",4.0,0.0
5168,e3,YES,MATTE TIN,NPN,SMUN5215T1G,"(Silicon (Si),)",1.0,0.0
5167,e3,YES,MATTE TIN,NPN,SMUN5215T1G,"(Nickel (Ni),)",5.0,0.0
5099,e3,YES,MATTE TIN,NPN,NSBC143EDP6T5G,"(Nickel (Ni),)",5.0,0.0
5098,e3,YES,MATTE TIN,NPN,NSBC143EDP6T5G,"(Iron (Fe),)",3.0,0.0
5097,e3,YES,MATTE TIN,NPN,NSBC143EDP6T5G,"(Copper (Cu),)",4.0,0.0
12482,e3,YES,MATTE TIN,NPN,MMBT3416LT3G,"(Iron (Fe),)",3.0,0.0


Nearest Components based on numerical data:


Unnamed: 0,Collector Current-Max (IC),mpn,substance_names,substance_name_rankings,Distance
9642,,NSVBC144EDXV6T1G,"(Phenolic Resin (Novolac),)",4.0,0.0
16355,,2SA2169-E,"(Tin (Sn),)",3.0,0.0
16354,,2SA2169-E,"(Fused Silica (SiO2),)",4.0,0.0
16353,,2SA2169-E,"(Copper (Cu),)",4.0,0.0
16352,,2SA2169-E,"(Aluminum (Al),)",3.0,0.0
16310,,NSVMMBT5087LT3G,"(Copper (Cu),)",4.0,0.0
16309,,NSVMMBT5087LT3G,"(Ortho Cresol Novolac Resin,)",,0.0
16308,,NSVMMBT5087LT3G,"(Carbon Black (C),)",3.0,0.0
16307,,NSVMMBT5087LT3G,"(Aluminum Hydroxide (Al(OH)3),)",3.0,0.0
16306,,NSVMMBT5087LT3G,"(Fused Silica (SiO2),)",4.0,0.0


Nearest Components based on total distances:


Unnamed: 0,JESD-609 Code,Surface Mount,Terminal Finish,Polarity/Channel Type,Collector Current-Max (IC),mpn,substance_names,substance_name_rankings,Distance
15026,e3,YES,MATTE TIN,NPN,0.7,NSS30071MR6T1G,"(Tin (Sn),)",3.0,0.006769
15017,e3,YES,MATTE TIN,NPN,0.7,NSS30071MR6T1G,"(Aluminum Hydroxide (Al(OH)3),)",3.0,0.006769
15018,e3,YES,MATTE TIN,NPN,0.7,NSS30071MR6T1G,"(Carbon Black (C),)",3.0,0.006769
15021,e3,YES,MATTE TIN,NPN,0.7,NSS30071MR6T1G,"(Iron (Fe),)",3.0,0.006769
3893,e3,YES,MATTE TIN,NPN,0.6,MMBT2222ATT3G,"(Iron (Fe),)",3.0,0.006837
5199,e3,YES,MATTE TIN,NPN,0.6,SMMBT2222ALT3G,"(Aluminum Hydroxide (Al(OH)3),)",3.0,0.006837
5198,e3,YES,MATTE TIN,NPN,0.6,SMMBT2222ALT3G,"(Carbon Black (C),)",3.0,0.006837
5195,e3,YES,MATTE TIN,NPN,0.6,SMMBT2222ALT3G,"(Iron (Fe),)",3.0,0.006837
14381,e3,YES,MATTE TIN,NPN,0.6,NSVMMBT5550LT1G,"(Tin (Sn),)",3.0,0.006837
14378,e3,YES,MATTE TIN,NPN,0.6,NSVMMBT5550LT1G,"(Carbon Black (C),)",3.0,0.006837


In [37]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import StandardScaler
from makersite_ml_services_utils.text_embeddings import get_text_embeddings
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

relevant_df = relevant_df.drop_duplicates(keep='first')
embeddings_cache = {}

def cache_embeddings_in_batches(df, json_column='json_representation'):
    global embeddings_cache
    json_strings = df[json_column].unique()
    uncached_jsons = [json_str for json_str in json_strings if json_str not in embeddings_cache]
    if uncached_jsons:
        batch_embeddings = get_text_embeddings(uncached_jsons).embeddings
        for json_str, embedding in zip(uncached_jsons, batch_embeddings):
            if isinstance(embedding, list):
                embedding = np.array(embedding)
            if embedding.ndim == 1:
                embedding = embedding.reshape(1, -1)
            embeddings_cache[json_str] = embedding

def prepare_categorical_features(df, categorical_columns):
    df['json_representation'] = df.apply(lambda row: json.dumps({col: row[col] for col in categorical_columns}), axis=1)
    cache_embeddings_in_batches(df)
    embeddings_list = df['json_representation'].apply(lambda json_str: embeddings_cache[json_str])
    return np.vstack(embeddings_list)

def prepare_numerical_features(df, numerical_columns):
    df_numerical = df[numerical_columns].copy()
    df_numerical.fillna(df_numerical.mean(), inplace=True)
    scaler = StandardScaler()
    return scaler.fit_transform(df_numerical.values)

def calculate_ranking_distances(df):
    rankings = df['normalized_ranking'].values.reshape(-1, 1)
    return euclidean_distances(rankings, rankings)

def get_new_data_point(df, index, categorical_columns, numerical_columns):
    if index >= len(df):
        raise IndexError("Index out of bounds for the dataframe.")
    row_df = df.iloc[[index]]
    categorical_features = prepare_categorical_features(row_df, categorical_columns)
    numerical_features = prepare_numerical_features(row_df, numerical_columns)
    ranking_feature = row_df['normalized_ranking'].values[0]
    return np.vstack([categorical_features]), np.vstack([numerical_features]), ranking_feature

def calculate_distances(features_matrix, query_point, distance_type='euclidean'):
    if distance_type == 'cosine':
        return cosine_distances(query_point, features_matrix)[0]
    else:
        return euclidean_distances(query_point, features_matrix)[0]

def normalize_distances(distances):
    return distances / np.max(distances)

def find_nearest_components(df, distances, columns, top_n=10):
    nearest_indices = np.argsort(distances)[:top_n]
    nearest_distances = distances[nearest_indices]
    results = df.iloc[nearest_indices][columns + ['mpn', 'substance_names','substance_name_rankings']]
    results['Distance'] = nearest_distances  
    return results

relevant_df['normalized_ranking'] = relevant_df['substance_name_rankings'] / relevant_df['substance_name_rankings'].max()

relevant_df['normalized_ranking'].fillna(relevant_df['normalized_ranking'].mean(), inplace=True)

categorical_columns = [spec['key'] for spec in categorical_specs_present]
numerical_columns = [spec['key'] for spec in numerical_specs_present]

features_matrix_cat = prepare_categorical_features(relevant_df, categorical_columns)
features_matrix_num = prepare_numerical_features(relevant_df, numerical_columns)
ranking_distances = calculate_ranking_distances(relevant_df)

new_data_point_cat, new_data_point_num, new_data_point_rank = get_new_data_point(relevant_df, 310, categorical_columns, numerical_columns)

cat_distances = calculate_distances(features_matrix_cat, new_data_point_cat, 'cosine')
num_distances = calculate_distances(features_matrix_num, new_data_point_num, 'euclidean')
env_distances = ranking_distances[310]


cat_distances_norm = normalize_distances(cat_distances)
num_distances_norm = normalize_distances(num_distances)
env_distances_norm = normalize_distances(env_distances)


weight_cat = 0.4  # Example weight for categorical distances
weight_num = 0.4  # Example weight for numerical distances
weight_env = 0.2  # Example weight for environmental distances

total_distances = (weight_cat * cat_distances_norm + 
                  weight_num * num_distances_norm + 
                  weight_env * env_distances_norm)


nearest_components = find_nearest_components(relevant_df, total_distances, categorical_columns + numerical_columns)


print("Nearest Components based on categorical data:")
nearest_components_cat = find_nearest_components(relevant_df, cat_distances, categorical_columns)
display(nearest_components_cat)

print("Nearest Components based on numerical data:")
nearest_components_num = find_nearest_components(relevant_df, num_distances, numerical_columns)
display(nearest_components_num)

print("Nearest Components based on total distances:")
display(nearest_components)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  relevant_df['normalized_ranking'].fillna(relevant_df['normalized_ranking'].mean(), inplace=True)


: 