In [None]:
!pip install faiss-cpu
!pip install sentence-transformers

In [None]:
import os

import faiss
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

In [None]:
# Read the preprocessed problems/solutions table from the Kaggle input mount.
csv_file_path = "/kaggle/input/ipc-preprocessed-solutions/preprocessed_solutions_v1.csv"
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [None]:
class DocumentSearch:
    """Semantic nearest-neighbour search over a list of documents.

    Documents are embedded with a SentenceTransformer model and stored in a
    FAISS inner-product index. Embeddings are L2-normalised, so the inner
    product reported by the index is the cosine similarity.

    Positions returned by the index are used as positional (``iloc``) row
    numbers into ``dataframe``, so ``docs_text`` and ``dataframe`` must be in
    the same order and of the same length as the indexed vectors.
    """

    # Default on-disk locations read by ``load_model`` / ``load_faiss_index``.
    DEFAULT_MODEL_DIR = "/kaggle/working/model_directory"
    DEFAULT_INDEX_PATH = "/kaggle/working/faiss_index.bin"

    def __init__(self, model_name, docs_text, dataframe):
        """Set up the model and the index, reusing saved artifacts if present.

        Args:
            model_name: SentenceTransformer model identifier, used when no
                saved model directory exists at ``DEFAULT_MODEL_DIR``.
            docs_text: list of strings to embed (one per dataframe row).
            dataframe: frame with ``problem_statement`` and ``problem_link``
                columns, read by ``query`` when printing hits.
        """
        self.model_name = model_name
        self.docs_text = docs_text
        self.df = dataframe
        self.embedding_model = None
        self.index = None
        self.docs_embeddings = None
        self.dim = 0

        # Prefer previously saved artifacts; otherwise build from scratch so
        # the object can be constructed on a fresh kernel with no saved files.
        if os.path.isdir(self.DEFAULT_MODEL_DIR):
            self.load_model()
        else:
            self.embedding_model = SentenceTransformer(self.model_name)

        if os.path.isfile(self.DEFAULT_INDEX_PATH):
            self.load_faiss_index()
        else:
            self.do_embedding()

    def cosine_similarity(self, text1, text2):
        """Return the cosine similarity between the embeddings of two texts."""
        embeddings = self.embedding_model.encode([text1, text2], show_progress_bar=False)
        # Normalise each row to unit length so the dot product is the cosine.
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        similarity = np.dot(embeddings[0], embeddings[1])
        return similarity

    def do_embedding(self):
        """Embed ``docs_text`` and (re)build the FAISS inner-product index.

        Raises:
            ValueError: if ``docs_text`` is empty.
        """
        if len(self.docs_text) == 0:
            raise ValueError("docs_text is empty; nothing to index")

        embeddings = self.embedding_model.encode(
            self.docs_text,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True,  # unit vectors => inner product == cosine
        )
        # FAISS expects a C-contiguous float32 matrix.
        self.docs_embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        self.dim = self.docs_embeddings.shape[1]  # Dimension of the embeddings
        print(f"Dimensions = {self.dim}")
        self.index = faiss.IndexFlatIP(self.dim)
        self.index.add(self.docs_embeddings)

    def save_model(self, directory="model_directory"):
        """Save the embedding model to ``directory``.

        The default is relative to the current working directory, whereas
        ``load_model`` defaults to an absolute ``/kaggle/working`` path.
        """
        self.embedding_model.save(directory)

    def save_faiss_index(self, file_path="faiss_index.bin"):
        """Write the FAISS index to ``file_path`` (default: relative to the cwd)."""
        faiss.write_index(self.index, file_path)

    def load_model(self, directory=DEFAULT_MODEL_DIR):
        """Load the embedding model from a directory written by ``save_model``."""
        self.embedding_model = SentenceTransformer(directory)

    def load_faiss_index(self, file_path=DEFAULT_INDEX_PATH):
        """Load a FAISS index from ``file_path`` and record its dimensionality."""
        self.index = faiss.read_index(file_path)
        self.dim = self.index.d

    def query(self, query_text, k=10):
        """Print up to ``k`` indexed problems most similar to ``query_text``.

        Each line shows the rank, the problem statement, its link and the
        similarity score reported by the index.
        """
        query_embedding = self.embedding_model.encode(
            [query_text],
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,  # match the normalisation used for the documents
        )
        query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)
        scores, positions = self.index.search(query_embedding, k)  # each is shape (1, k)

        print("Top similar problems:")
        rank = 0
        for score, pos in zip(scores[0], positions[0]):
            # FAISS pads the result with -1 when fewer than k vectors are
            # available; iloc[-1] would silently print the last row instead.
            if pos < 0:
                continue
            rank += 1
            print(f"{rank}: {self.df['problem_statement'].iloc[pos]} {self.df['problem_link'].iloc[pos]} (Similarity: {score})")

In [None]:
# Sentence-Transformers model identifier handed to DocumentSearch below.
model_name = "sentence-transformers/all-mpnet-base-v2"

In [None]:
# Corpus to embed: one summarized statement per dataframe row.
# Missing cells are replaced with "" (rather than dropped) so the list stays
# position-aligned with `df` and contains only strings the encoder can tokenize.
docs_text = df['summarized_problem_statement'].fillna("").astype(str).tolist()

In [None]:
# Instantiate the search engine; `docs_text` is the corpus to embed and `df`
# supplies the statement/link printed for each hit.
search_engine = DocumentSearch(model_name, docs_text, df)

In [None]:
# Persist the embedding model to the default "model_directory" (relative to the cwd).
search_engine.save_model()

In [None]:
# Persist the FAISS index to the default "faiss_index.bin" (relative to the cwd).
search_engine.save_faiss_index()

In [None]:
# Sample problem statement used as the search query below.
query = """
One hot summer day Pete and his friend Billy decided to buy a watermelon. They chose the biggest and the ripest one, in their opinion. After that the watermelon was weighed, and the scales showed w kilos. They rushed home, dying of thirst, and decided to divide the berry, however they faced a hard problem.

Pete and Billy are great fans of even numbers, that's why they want to divide the watermelon in such a way that each of the two parts weighs even number of kilos, at the same time it is not obligatory that the parts are equal. The boys are extremely tired and want to start their meal as soon as possible, that's why you should help them and find out, if they can divide the watermelon in the way they want. For sure, each of them should get a part of positive weight.
"""

In [None]:
# Print the 5 indexed problems most similar to the sample statement,
# each with its link and similarity score.
search_engine.query(query, k=5)  # Retrieve top 5 similar problems

In [None]:
!zip -r fuck.zip /kaggle/working/*