<a href="https://colab.research.google.com/github/SandipTheCoder/ConstructivIQ/blob/main/ConstructivIQ_Code_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this runtime. google.colab is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory to the project folder on the mounted Drive so
# the relative CSV paths used in later cells resolve against it.
%cd /content/drive/My Drive/ConstructivIQ_Code/

/content/drive/My Drive/ConstructivIQ_Code


Main Code

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets (paths are relative to the current working directory)
materials = pd.read_csv("materials.csv")
test_pairs = pd.read_csv("test_pairs.csv")

# Display dataset info
print("Materials Dataset Head:")
print(materials.head())

print("\nTest Pairs Dataset Head:")
print(test_pairs.head())

# Step 1: Preprocess the material descriptions using TF-IDF
# TfidfVectorizer rejects missing (NaN) documents, so blank them out and coerce
# every entry to str before fitting; a blank description simply yields an
# all-zero TF-IDF row instead of aborting the whole run.
material_descriptions = materials['Material_Description'].fillna('').astype(str)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(material_descriptions)

# Create a mapping from material ID to its TF-IDF vector (one sparse row each).
# Rows are fetched by explicit position rather than by iterating the sparse
# matrix, so the ID <-> row pairing is visible in the code. If an ID occurs
# more than once, the last occurrence wins.
material_id_to_vector = {
    material_id: tfidf_matrix[row_idx]
    for row_idx, material_id in enumerate(materials['ID'])
}

# Step 2: Compute similarity scores for each test pair
def compute_similarity(id1, id2):
    """
    Return the cosine similarity between the TF-IDF vectors of two materials.

    Both IDs are looked up in the module-level ``material_id_to_vector``
    mapping.

    Args:
        id1: ID of the first material.
        id2: ID of the second material.

    Returns:
        float: Similarity clamped to the closed range [0.0, 1.0], or 0.0 when
        either ID has no vector in the mapping.
    """
    # Retrieve the TF-IDF vectors for the material IDs
    vector1 = material_id_to_vector.get(id1)
    vector2 = material_id_to_vector.get(id2)
    if vector1 is None or vector2 is None:
        return 0.0  # Default to 0 if vector is not found

    # cosine_similarity returns a 1x1 array for two single-row inputs
    similarity = float(cosine_similarity(vector1, vector2)[0][0])

    # Floating-point round-off can land marginally outside [0, 1] (e.g.
    # 1.0000000000000002 for identical descriptions); clamp so downstream
    # range validation of the submission does not flag such rows.
    return min(1.0, max(0.0, similarity))

# Score every candidate pair, one row at a time, via compute_similarity
test_pairs['Similarity_Score'] = test_pairs.apply(
    lambda pair: compute_similarity(pair['ID_1'], pair['ID_2']),
    axis=1,
)

# Step 3: Generate predictions for MAP@K
# Order the rows by query ID with the highest score first, then collect each
# query's candidates into a ranked list: {ID_1: [ID_2, ID_2, ...]}
pairs_by_rank = test_pairs.sort_values(
    by=['ID_1', 'Similarity_Score'],
    ascending=[True, False],
)
predicted_rankings = pairs_by_rank.groupby('ID_1')['ID_2'].apply(list).to_dict()

# Ground truth: every ID_2 listed for an ID_1 in the test set is treated as
# relevant. NOTE: this is the same set of pairs that is being ranked, so a
# score computed against it is a pipeline sanity check rather than a measure
# of ranking quality.
ground_truth = test_pairs.groupby('ID_1')['ID_2'].apply(list).to_dict()

# Step 4: Define MAP@K function
def apk(actual, predicted, k):
    """
    Computes the Average Precision at K (AP@K).

    Args:
        actual: Collection of relevant items (order irrelevant). Items must be
            hashable; repeated entries count as a single relevant item.
        predicted: Sequence of predicted items, best first. Only the first
            ``k`` entries are scored, and a repeated prediction is credited
            at most once (at its first position).
        k: Number of top predictions to evaluate.

    Returns:
        float: AP@K in [0.0, 1.0]. Returns 0.0 when ``actual`` or
        ``predicted`` is empty, or when ``k`` is not positive.
    """
    # k <= 0 means no positions are evaluated; without this guard the final
    # division would be by zero (k == 0) or by a negative number (k < 0).
    if not actual or not predicted or k <= 0:
        return 0.0

    # Sets give O(1) membership tests and collapse duplicate relevant items so
    # the denominator matches how duplicate predictions are treated below.
    relevant = set(actual)
    seen = set()
    score = 0.0
    num_hits = 0

    for rank, p in enumerate(predicted[:k], start=1):
        if p in relevant and p not in seen:  # Avoid duplicates
            num_hits += 1
            score += num_hits / rank  # precision at this rank
        seen.add(p)

    return score / min(len(relevant), k)

def mapk(actual_dict, predicted_dict, k):
    """
    Computes Mean Average Precision at K (MAP@K).

    Args:
        actual_dict: Mapping of query key -> list of relevant items.
        predicted_dict: Mapping of query key -> ranked list of predictions.
        k: Number of top predictions evaluated per query.

    Returns:
        float: Mean of ``apk`` over the keys of ``predicted_dict``. A key
        missing from ``actual_dict`` contributes 0.0; keys that appear only in
        ``actual_dict`` are not part of the average. Returns 0.0 when
        ``predicted_dict`` is empty.
    """
    # Nothing to average: avoid dividing by zero on an empty prediction set.
    if not predicted_dict:
        return 0.0

    apk_scores = [
        apk(actual_dict.get(key, []), ranked, k)
        for key, ranked in predicted_dict.items()
    ]
    return sum(apk_scores) / len(apk_scores)

# Step 5: Compute MAP@K
k = 10  # Number of top-ranked candidates evaluated per ID_1
mapk_score = mapk(actual_dict=ground_truth, predicted_dict=predicted_rankings, k=k)

print(f"MAP@{k}: {mapk_score:.4f}")


# Step 6: Save the results in the required submission format
# Only the pair IDs and their score are written; the row index is omitted.
submission_columns = ['ID_1', 'ID_2', 'Similarity_Score']
test_pairs.loc[:, submission_columns].to_csv("submission.csv", index=False)

print("Submission saved to submission.csv!")


Materials Dataset Head:
   ID                               Material_Description
0   1  INSULATION GASKET KIT - 2" - 300# - DOUBLE COM...
1   2  ASSEMBLY COMPRESSOR - 10" - 150# - HOT DIP GAL...
2   3  SPUR GEAR PINION SHAFT - 10" - 150# - SCH.XS A...
3   4  SUCTION HEADER - 6" - 600# - HOT DIP GALVANIZE...
4   5  MOVABLE STOOL - 6" - 150# - DUAL CERTIFIED, DR...

Test Pairs Dataset Head:
   ID_1  ID_2
0   375   932
1   588    22
2   876   724
3   270   154
4   512   544
MAP@10: 1.0000
Submission saved to submission.csv!


Validation Code

In [None]:
# Load the submission file
submission_file = "submission.csv"
submission = pd.read_csv(submission_file)

# Step 1: Check the first few rows
print("Submission File Head:")
print(submission.head())

# Step 2: Check columns
required_columns = ['ID_1', 'ID_2', 'Similarity_Score']
if all(col in submission.columns for col in required_columns):
    print("\nAll required columns are present.")
else:
    print("\nError: Missing required columns!")

# Step 3: Validate similarity scores are within the range [0, 1]
# Skipped when the score column is absent: Step 2 has already reported that,
# and indexing the missing column would abort the remaining checks.
if 'Similarity_Score' in submission.columns:
    # between() is inclusive on both ends and evaluates to False for NaN, so
    # negating it flags missing scores as well as out-of-range ones. (Separate
    # "< 0" / "> 1" comparisons are both False for NaN and would let it pass.)
    in_range = submission['Similarity_Score'].between(0, 1)
    invalid_scores = submission[~in_range]
    if invalid_scores.empty:
        print("\nAll similarity scores are valid (between 0 and 1).")
    else:
        print("\nInvalid similarity scores found:")
        print(invalid_scores)

# Step 4: Check number of rows matches the test_pairs dataset
# NOTE: this rebinds test_pairs to a fresh read of the CSV, replacing any
# in-memory frame of the same name (including columns added to it earlier).
test_pairs = pd.read_csv("test_pairs.csv")
if len(submission) == len(test_pairs):
    print("\nThe number of rows matches the test_pairs dataset.")
else:
    print(f"\nMismatch in row count: Submission has {len(submission)} rows, "
          f"but test_pairs has {len(test_pairs)} rows.")

print("\nValidation completed!")


Submission File Head:
   ID_1  ID_2  Similarity_Score
0   375   932          0.071704
1   588    22          0.110973
2   876   724          0.059701
3   270   154          0.162412
4   512   544          0.026452

All required columns are present.

All similarity scores are valid (between 0 and 1).

The number of rows matches the test_pairs dataset.

Validation completed!
