In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import numpy as np

# Load the material catalogue and the ID pairs that need a similarity score.
materials = pd.read_csv('materials.csv')
test_pairs = pd.read_csv('test_pairs.csv')

# read_csv turns empty cells into NaN, and TfidfVectorizer raises on NaN
# documents. Replace missing descriptions with an empty string and coerce
# everything to str so the vectorizer always receives text.
descriptions = materials['Material_Description'].fillna('').astype(str)

# One TF-IDF row per material, in the same row order as `materials`.
# The vocabulary is capped at 1000 terms and English stop words are dropped.
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(descriptions)

def _material_row_index(material_id):
    """Translate a 1-based material ID into a row position of ``tfidf_matrix``."""
    as_int = int(material_id)
    # Accept integer-valued floats (e.g. 3.0 coming out of DataFrame.iterrows),
    # but refuse genuinely fractional IDs instead of silently truncating them.
    if as_int != material_id:
        raise ValueError(f"Material ID must be integer-valued, got {material_id!r}")
    row_idx = as_int - 1
    # Without this check an ID of 0 (or a negative ID) would wrap around and
    # silently select a row counted from the end of the matrix.
    if not 0 <= row_idx < tfidf_matrix.shape[0]:
        raise IndexError(
            f"Material ID {as_int} is outside the valid range 1..{tfidf_matrix.shape[0]}"
        )
    return row_idx


def compute_similarity_features(id1, id2):
    """Return the feature vector describing a pair of materials.

    The single feature is the cosine similarity between the two materials'
    rows in the module-level ``tfidf_matrix``. An ID ``k`` is mapped to row
    ``k - 1``.
    NOTE(review): this assumes IDs are contiguous 1..N in the same order as
    the rows of materials.csv -- confirm against the data.

    Args:
        id1: ID of the first material (an int or an integer-valued number).
        id2: ID of the second material (an int or an integer-valued number).

    Returns:
        A one-element list ``[cosine_sim]``.

    Raises:
        ValueError: if an ID is not integer-valued.
        IndexError: if an ID does not fall within ``1..tfidf_matrix.shape[0]``.
    """
    vec1 = tfidf_matrix[_material_row_index(id1)]
    vec2 = tfidf_matrix[_material_row_index(id2)]
    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    cosine_sim = cosine_similarity(vec1, vec2)[0][0]
    return [cosine_sim]

# Build the feature matrix for the pairs that must be scored, one row per
# (ID_1, ID_2) pair in test_pairs order.
#
# The two ID columns are iterated directly rather than via DataFrame.iterrows():
# iterrows() packs each row into a single Series and can upcast integer IDs to
# float when another column is floating point, which then breaks row indexing
# of the sparse TF-IDF matrix. Column iteration keeps each column's own dtype.
X_test = np.array([
    compute_similarity_features(id1, id2)
    for id1, id2 in zip(test_pairs['ID_1'], test_pairs['ID_2'])
])



# Draw 2000 random (id, id) pairs, with IDs sampled uniformly from
# 1..len(materials), to act as a synthetic training set. The global NumPy
# seed is fixed so the same pairs are drawn on every run.
np.random.seed(42)
train_pairs = np.random.randint(1, len(materials)+1, size=(2000, 2))

# Pseudo-labels: a pair is labelled 1 when both IDs are identical or when its
# cosine similarity exceeds 0.8, and 0 otherwise.
# NOTE(review): the label is a deterministic function of the only feature, so
# a model trained on it can at best reproduce this threshold rule.
X_train, y_train = [], []
for left_id, right_id in train_pairs:
    pair_features = compute_similarity_features(left_id, right_id)
    X_train.append(pair_features)
    is_match = left_id == right_id or pair_features[0] > 0.8
    y_train.append(1 if is_match else 0)

X_train = np.array(X_train)
y_train = np.array(y_train)

# Hold out 20% of the synthetic pairs for validation. X_train / y_train are
# rebound to the remaining 80%.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest regressor on the training split
# (fit() returns the estimator itself, so rf_model is the fitted model).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out split and on the pairs to be scored, clamping the
# regressor output to the [0, 1] range.
# NOTE(review): val_predictions is not compared against y_val anywhere in the
# visible code, so no validation metric is reported.
val_predictions = np.clip(rf_model.predict(X_val), 0, 1)
test_predictions = np.clip(rf_model.predict(X_test), 0, 1)

# Attach the predicted scores to a copy of the test pairs (test_pairs itself
# is left untouched) and write the result out without the index column.
submission = test_pairs.assign(Similarity_Score=test_predictions)
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created.")


Submission file 'submission.csv' created.
