In [14]:
# Importing necessary Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

In [15]:
# Necessary NLTK data
# 'punkt' is kept for older NLTK installs; recent NLTK releases make
# nltk.word_tokenize load its models from 'punkt_tab' instead, so both are
# fetched to keep the tokenizer working on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')
# WordNet backs WordNetLemmatizer; 'stopwords' backs stopwords.words('english').
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Loading the data
# Read the three input tables in one pass: the material catalogue, the labelled
# pairs used as ground truth, and the pairs that still need a prediction.
materials, ground_truth, test_pairs = (
    pd.read_csv(csv_name)
    for csv_name in ('materials.csv', 'submission.csv', 'test_pairs.csv')
)

In [17]:
# Text preprocessing
# The lemmatizer and the stop-word set are created on first use and then
# reused, so applying preprocess_text to a whole column does not reload the
# stop-word corpus for every row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (lemmatizer, stop_words) pair, building it on first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise a free-text description into a space-separated token string.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, tokenize with NLTK, remove English stop words and lemmatize
    the remaining tokens.

    Parameters
    ----------
    text : object
        The raw description. Non-string values are converted with ``str``.
        ``None`` and float NaN (what pandas uses for empty CSV cells) are
        treated as an empty description.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` for missing input.
    """
    # Without this guard str(None) / str(nan) would be tokenised as the
    # literal words "none" / "nan", making all missing descriptions look alike.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase and remove special characters (digits included)
    text = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and lemmatize
    lemmatizer, stop_words = _get_preprocess_resources()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Clean every raw description, in row order, and store the result alongside it.
materials['cleaned_description'] = [
    preprocess_text(raw_description)
    for raw_description in materials['Material_Description']
]

In [18]:
# Create TF-IDF vectors with parameters
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),   # use single words and adjacent word pairs as terms
    min_df=2,             # ignore terms found in fewer than 2 descriptions
    max_df=0.95,          # ignore terms found in more than 95% of descriptions
    max_features=5000,    # cap the vocabulary at the 5000 most frequent terms
)
# One sparse row per material, in the same order as the rows of `materials`.
tfidf_matrix = vectorizer.fit_transform(materials['cleaned_description'])

In [19]:
# Function to calculate similarity between two material IDs
# Cache of {material ID -> row position}. It is rebuilt whenever `materials`
# is rebound to a different DataFrame object; edits made in place to the ID
# column after the first lookup are NOT detected, so re-run this cell then.
_MATERIAL_ROWS = {'frame': None, 'rows': {}}


def _material_row(material_id):
    """Return the row position of ``material_id`` within ``materials``."""
    if _MATERIAL_ROWS['frame'] is not materials:
        rows = {}
        for position, known_id in enumerate(materials['ID']):
            # Keep the first occurrence of a duplicated ID, matching the
            # previous `.index[0]` behaviour.
            rows.setdefault(known_id, position)
        _MATERIAL_ROWS['frame'] = materials
        _MATERIAL_ROWS['rows'] = rows
    try:
        return _MATERIAL_ROWS['rows'][material_id]
    except KeyError:
        # IndexError is kept because that is what the original lookup raised.
        raise IndexError(
            f"material ID {material_id!r} not found in materials['ID']"
        ) from None


def calculate_similarity(id1, id2):
    """Cosine similarity between the TF-IDF vectors of two materials.

    Rows are located by position rather than by index label, so the lookup
    stays aligned with ``tfidf_matrix`` even if ``materials`` does not carry a
    default 0..n-1 index. ``tfidf_matrix`` is expected to hold one row per row
    of ``materials``, in the same order.

    Parameters
    ----------
    id1, id2 : hashable
        Values from ``materials['ID']`` identifying the two materials.

    Returns
    -------
    float
        The cosine similarity of the two TF-IDF rows.

    Raises
    ------
    IndexError
        If either ID is not present in ``materials['ID']``.
    """
    vec1 = tfidf_matrix[_material_row(id1)]
    vec2 = tfidf_matrix[_material_row(id2)]
    return cosine_similarity(vec1, vec2)[0][0]


In [20]:
# Calculate similarities for ground truth data
# Iterate the two ID columns directly instead of DataFrame.apply(axis=1):
# row-wise apply builds a Series per row and, when the frame also holds float
# columns, hands the IDs over upcast to float. A list also assigns cleanly when
# the frame is empty.
ground_truth['Predicted_Similarity'] = [
    calculate_similarity(id_1, id_2)
    for id_1, id_2 in zip(ground_truth['ID_1'], ground_truth['ID_2'])
]

# MAP@K implementation
def apk(actual, predicted, k=10):
    """Average precision at k for one ranked prediction list.

    Parameters
    ----------
    actual : sequence
        The relevant items. An empty sequence yields a score of 0.0.
    predicted : sequence
        Items in ranked order, best first; only the first ``k`` are scored.
    k : int, default 10
        Cut-off rank.

    Returns
    -------
    float
        Sum of precision-at-rank over each rank holding a relevant item seen
        for the first time, divided by ``min(len(actual), k)``.
    """
    if not actual:
        return 0.0

    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        # A repeated prediction only counts the first time it appears.
        first_time_hit = item in actual and item not in top_k[:rank - 1]
        if first_time_hit:
            hits += 1.0
            precision_sum += hits / rank

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """Mean average precision at k over paired queries.

    ``actual`` and ``predicted`` are parallel sequences: element i of each is
    the relevant-item list and the ranked prediction list for query i. Pairs
    are formed with ``zip``, so any surplus entries in the longer sequence are
    ignored. Returns the mean of the per-query ``apk`` scores.
    """
    per_query_scores = []
    for relevant_items, ranked_items in zip(actual, predicted):
        per_query_scores.append(apk(relevant_items, ranked_items, k))
    return np.mean(per_query_scores)


In [25]:
# MAP@K calculation with adjusted threshold
def prepare_map_data(df, threshold=0.3):  # Lowered threshold
    """Turn a scored pair table into single-query inputs for ``mapk``.

    The whole frame is treated as one query whose "items" are row positions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Similarity_Score`` (reference score) and
        ``Predicted_Similarity`` (model score) columns.
    threshold : float, default 0.3
        Rows whose reference score is strictly greater than this are relevant.

    Returns
    -------
    tuple[list[list[int]], list[list[int]]]
        ``(actual, predicted)``: a one-element list holding the relevant row
        positions, and a one-element list holding every row position ordered
        by descending predicted score (ties keep their original row order).
    """
    reference_scores = list(df['Similarity_Score'])
    relevant_rows = [
        position
        for position, reference in enumerate(reference_scores)
        if reference > threshold
    ]

    predicted_scores = list(df['Predicted_Similarity'])
    ranked_rows = sorted(
        range(len(df)), key=predicted_scores.__getitem__, reverse=True
    )

    return [relevant_rows], [ranked_rows]

In [26]:
# Calculate MAP@K for different K values
# The relevant/ranked lists do not depend on K, so they are built once instead
# of being recomputed (including a full sort) on every loop iteration.
actual, predicted = prepare_map_data(ground_truth)
for k in [1, 3, 5, 10]:
    map_score = mapk(actual, predicted, k=k)
    print(f"MAP@{k}: {map_score}")

MAP@1: 1.0
MAP@3: 0.5555555555555555
MAP@5: 0.4833333333333333
MAP@10: 0.5325396825396825


In [27]:
# Generate predictions for the test set
# Iterate the two ID columns directly instead of DataFrame.apply(axis=1):
# row-wise apply builds a Series per row (slow, and it can upcast integer IDs
# to float when the frame also holds float columns), and a list assigns
# cleanly even when there are no pairs to score.
test_pairs['Predicted_Similarity'] = [
    calculate_similarity(id_1, id_2)
    for id_1, id_2 in zip(test_pairs['ID_1'], test_pairs['ID_2'])
]

# Create submission file
# The score column is written under the name 'Similarity_Score'.
submission = test_pairs[['ID_1', 'ID_2', 'Predicted_Similarity']].rename(
    columns={'Predicted_Similarity': 'Similarity_Score'}
)
submission.to_csv('submission_final.csv', index=False)
