Load the Dependencies

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


Load the embedding model from sentence-transformers

In [4]:
# Identifier of the model handed to SentenceTransformer; kept in a variable so
# the choice of checkpoint is visible and easy to change in one place.
embedding_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)



Load the materials.csv data

In [3]:
# Location of the raw materials table.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run unchanged on a machine that has this exact file — consider making it
# configurable.
materials_source_path = r"D:\personal\constructiveiq\assignment\materials.csv"
df = pd.read_csv(materials_source_path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,ID,Material_Description
0,1,"INSULATION GASKET KIT - 2"" - 300# - DOUBLE COM..."
1,2,"ASSEMBLY COMPRESSOR - 10"" - 150# - HOT DIP GAL..."
2,3,"SPUR GEAR PINION SHAFT - 10"" - 150# - SCH.XS A..."
3,4,"SUCTION HEADER - 6"" - 600# - HOT DIP GALVANIZE..."
4,5,"MOVABLE STOOL - 6"" - 150# - DUAL CERTIFIED, DR..."


Calculate Embeddings for each Material Description

In [7]:
# Calculate embeddings for all material descriptions in one batched call.
# model.encode accepts a list of sentences and returns one embedding row per
# sentence, which avoids invoking the model separately for every DataFrame row.
descriptions = df['Material_Description'].tolist()
embedding_rows = model.encode(descriptions).tolist()

# Store each embedding as a plain Python list of floats, aligned to df's index
# so every row receives the embedding of its own description.
df['Embeddings'] = pd.Series(embedding_rows, index=df.index)
df.head()

Unnamed: 0,ID,Material_Description,Embeddings
0,1,"INSULATION GASKET KIT - 2"" - 300# - DOUBLE COM...","[-0.12933023273944855, 0.10615091025829315, -0..."
1,2,"ASSEMBLY COMPRESSOR - 10"" - 150# - HOT DIP GAL...","[-0.14430809020996094, 0.07952915877103806, -0..."
2,3,"SPUR GEAR PINION SHAFT - 10"" - 150# - SCH.XS A...","[-0.08783309906721115, 0.027707230299711227, -..."
3,4,"SUCTION HEADER - 6"" - 600# - HOT DIP GALVANIZE...","[-0.10966168344020844, 0.08991578966379166, -0..."
4,5,"MOVABLE STOOL - 6"" - 150# - DUAL CERTIFIED, DR...","[0.003951432183384895, -0.04436597600579262, -..."


Save the Embeddings to materials.csv

In [9]:
# Write df (with the 'Embeddings' column added above) to materials.csv in the
# current working directory; index=False leaves the row index out of the file.
# NOTE(review): if the working directory is the folder the source CSV was read
# from, this overwrites the input file — confirm that is intended.
embeddings_csv_path = 'materials.csv'
df.to_csv(embeddings_csv_path, index=False)

Calculate the cosine similarity between each pair of IDs given in test_pairs.csv and save the similarity for each pair in submission.csv

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

import ast  # imported here so this cell is self-contained

# Load the materials with embeddings from the first CSV file
materials_df = pd.read_csv('materials.csv')

# Load the pairs of material IDs from the second CSV file
pairs_df = pd.read_csv('test_pairs.csv')

# The CSV holds each embedding as the text of a Python list literal, so it must
# be parsed back into a list. ast.literal_eval only accepts literals, whereas
# eval would execute any expression that happens to be in the file.
materials_df['Embeddings'] = materials_df['Embeddings'].apply(ast.literal_eval)

# Create a dictionary to quickly access embeddings by material ID
embeddings_dict = dict(zip(materials_df['ID'], materials_df['Embeddings']))

# Function to calculate similarity between two material IDs
def calculate_similarity(id1, id2):
    """Return the cosine similarity between the embeddings of two material IDs.

    Both IDs are looked up in the module-level ``embeddings_dict``.

    Returns:
        The similarity score, or ``None`` when either ID has no entry in
        ``embeddings_dict``.
    """
    vec_a = embeddings_dict.get(id1)
    vec_b = embeddings_dict.get(id2)
    if vec_a is None or vec_b is None:
        return None

    # cosine_similarity is given each vector wrapped as a one-row matrix, so the
    # score is the single cell of the resulting 1x1 matrix.
    score_matrix = cosine_similarity([vec_a], [vec_b])
    return score_matrix[0][0]

def _cosine_for_row(pair_row):
    """Return calculate_similarity for the ID_1 / ID_2 values of one row."""
    return calculate_similarity(pair_row['ID_1'], pair_row['ID_2'])

# Score every row of pairs_df and keep the result in a new 'similarity' column.
pairs_df['similarity'] = pairs_df.apply(_cosine_for_row, axis=1)

# Write the pairs together with their scores to submission.csv; the row index
# is left out of the file.
submission_path = 'submission.csv'
pairs_df.to_csv(submission_path, index=False)

print("Similarity scores calculated and saved successfully.")


Similarity scores calculated and saved successfully.


Euclidean Distance

In [2]:
import pandas as pd
from numpy import linalg as LA

import ast  # imported here so this cell is self-contained

# Load the materials with embeddings from the first CSV file
materials_df = pd.read_csv('materials.csv')

# Load the pairs of material IDs from the second CSV file
pairs_df = pd.read_csv('test_pairs.csv')

# The CSV holds each embedding as the text of a Python list literal, so it must
# be parsed back into a list. ast.literal_eval only accepts literals, whereas
# eval would execute any expression that happens to be in the file.
materials_df['Embeddings'] = materials_df['Embeddings'].apply(ast.literal_eval)

# Create a dictionary to quickly access embeddings by material ID
embeddings_dict = dict(zip(materials_df['ID'], materials_df['Embeddings']))

# Function to calculate Euclidean distance between two material IDs
def euclidean_distance(id1, id2):
    """Return the Euclidean distance between the embeddings of two material IDs.

    Both IDs are looked up in the module-level ``embeddings_dict``.

    Returns:
        The norm of the component-wise difference of the two embeddings, or
        ``None`` when either ID has no entry in ``embeddings_dict``.
    """
    vec_a = embeddings_dict.get(id1)
    vec_b = embeddings_dict.get(id2)
    if vec_a is None or vec_b is None:
        return None

    # Component-wise differences; zip pairs components positionally and stops
    # at the end of the shorter vector.
    deltas = [a - b for a, b in zip(vec_a, vec_b)]
    return LA.norm(deltas)

def _distance_for_row(pair_row):
    """Return euclidean_distance for the ID_1 / ID_2 values of one row."""
    return euclidean_distance(pair_row['ID_1'], pair_row['ID_2'])

# Compute the distance for every row of pairs_df and keep it in a new
# 'euclidean_distance' column.
pairs_df['euclidean_distance'] = pairs_df.apply(_distance_for_row, axis=1)

# Write the pairs together with their distances to a separate CSV file; the row
# index is left out of the file.
distance_output_path = 'test_pairs_with_euclidean_distance.csv'
pairs_df.to_csv(distance_output_path, index=False)

print("Euclidean distances calculated and saved successfully.")


Euclidean distances calculated and saved successfully.


In [4]:
import pandas as pd
from numpy import linalg as LA

import ast  # imported here so this cell is self-contained

# Load the materials with embeddings from the first CSV file
materials_df = pd.read_csv('materials.csv')

# Load the pairs of material IDs from the second CSV file
pairs_df = pd.read_csv('test_pairs.csv')

# The CSV holds each embedding as the text of a Python list literal, so it must
# be parsed back into a list. ast.literal_eval only accepts literals, whereas
# eval would execute any expression that happens to be in the file.
materials_df['Embeddings'] = materials_df['Embeddings'].apply(ast.literal_eval)

# Create a dictionary to quickly access embeddings by material ID
embeddings_dict = dict(zip(materials_df['ID'], materials_df['Embeddings']))

# Function to calculate Euclidean distance-based similarity between two material IDs
def euclidean_similarity(id1, id2):
    """Return a distance-based similarity between two material IDs.

    Both IDs are looked up in the module-level ``embeddings_dict``. The
    Euclidean distance ``d`` between their embeddings is mapped to
    ``1 / (1 + d)``, so identical embeddings score 1 and the score shrinks
    toward 0 as the distance grows.

    Returns:
        The similarity score, or ``None`` when either ID has no entry in
        ``embeddings_dict``.
    """
    first = embeddings_dict.get(id1)
    second = embeddings_dict.get(id2)
    if first is None or second is None:
        return None

    # Positional component differences; zip stops at the shorter vector.
    gaps = [x - y for x, y in zip(first, second)]
    separation = LA.norm(gaps)
    return 1 / (1 + separation)

def _euclidean_similarity_for_row(pair_row):
    """Return euclidean_similarity for the ID_1 / ID_2 values of one row."""
    return euclidean_similarity(pair_row['ID_1'], pair_row['ID_2'])

# Score every row of pairs_df and keep the result in a new
# 'euclidean_similarity' column.
pairs_df['euclidean_similarity'] = pairs_df.apply(_euclidean_similarity_for_row, axis=1)

# Write the pairs together with their scores to a separate CSV file; the row
# index is left out of the file.
similarity_output_path = 'test_pairs_with_euclidean_similarity.csv'
pairs_df.to_csv(similarity_output_path, index=False)

print("Euclidean similarity scores calculated and saved successfully.")


Euclidean similarity scores calculated and saved successfully.


I also calculated Euclidean similarity, but it was not as effective as cosine similarity.