# Bibliotecas

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

# Problema
Dado um conjunto de dados, como encontrar potenciais plágios e/ou duplicatas para remoção?

## Dados Base

In [None]:
# Sample corpus for the duplicate-detection demo: short English sentences, mostly about
# machine learning. It deliberately mixes one exact duplicate pair with several paraphrases
# (SVM / hyperplane, deep learning patterns, neural networks needing labeled data).
# The order of this list is the order in which the texts are handed to the vector store.
texts = [
    "Gradient boosting ensembles sequentially add models to correct errors of prior ones.",
    "SVMs operate by locating the hyperplane that maximally divides different categories.",
    "Support vector machines work by finding a hyperplane that best separates classes.",
    "Deep learning models can extract intricate patterns from data.",
    "Overfitting happens when a model learns noise instead of the underlying pattern.",
    "Decision trees split data based on feature thresholds to make predictions.",
    "Activation functions introduce non-linearity into neural network layers.",
    "Principal component analysis reduces dimensionality by projecting onto orthogonal axes.",
    "This is an unrelated sentence about cooking recipes.",
    "Data preprocessing is essential for machine learning success.",
    "Batch normalization stabilizes learning by normalizing layer inputs.",
    "Data preprocessing is essential for machine learning success.",  # exact duplicate of the entry two lines above
    "From data, deep learning models are able to extract complex patterns.",
    "Neural networks require large amounts of labeled training data.",
    "Support vector machines classify data by identifying the optimal separating hyperplane between classes.",
    "Reinforcement learning agents improve through trial and error interactions.",
    "Cross-validation helps assess model generalization on unseen data.",
    "Hyperparameter tuning finds the best configuration for model performance.",
    "Neural networks demand substantial labeled data for training.",
    "Clustering algorithms group points that are close in feature space."
]

# Solução (Busca Vetorial entre Itens)
A ideia é vetorizar todos os textos e checar a similaridade de cada um deles em relação aos demais.
Aqueles que tiverem distância (dissimilaridade) próxima de `0` provavelmente são duplicatas ou tratam de temas extremamente parecidos.

## Carregando Modelo

In [None]:
# Choose the sentence-embedding model by name and wrap it in LangChain's
# SentenceTransformerEmbeddings so it can be passed to the vector store.
model_name = "all-MiniLM-L6-v2"
embedder = SentenceTransformerEmbeddings(model_name=model_name)

## Construindo o Banco Vetorial

In [None]:
# Build a FAISS vector store from `texts` through LangChain, using `embedder`
# to turn each text into a vector.
vectordb = FAISS.from_texts(texts, embedding=embedder)

## Checando Semelhança Entre Itens

In [None]:
# Hits whose distance to the query is below this value are flagged as potential duplicates
# (lower distance = more similar; a distance close to 0 means near-identical text).
THRESHOLD = 1.0
# Number of neighbours inspected per document, NOT counting the document itself.
TOP_K = 3
# List of (text, [suspected duplicate texts]) tuples, one per document that has suspects.
potential_dups = []

for i in range(vectordb.index.ntotal):
    # Look up the i-th indexed document in the docstore
    doc_id = vectordb.index_to_docstore_id[i]
    doc = vectordb.docstore.search(doc_id)

    # Request one extra hit because the document is expected to match itself
    results = vectordb.similarity_search_with_score(query=doc.page_content, k=TOP_K + 1)

    # Keep every hit below the threshold, except the document itself.
    # Only ONE hit with identical text is discarded as "self": filtering out every hit whose
    # text equals the query would also hide exact duplicates, which are precisely what this
    # search is meant to find.
    suspects = []
    self_skipped = False
    for other, score in results:
        if not self_skipped and other.page_content == doc.page_content:
            self_skipped = True
            continue
        if score < THRESHOLD:
            suspects.append(other.page_content)

    # Store the "original" together with its potential duplicates
    if suspects:
        potential_dups.append((doc.page_content, suspects))


In [None]:
# Report: one paragraph per flagged text, listing its suspected duplicates,
# with a blank line separating consecutive paragraphs.
for original, duplicates in potential_dups:
    report_lines = [f"Original: {original}"]
    report_lines.extend(f"    Dup.: {dup}" for dup in duplicates)
    print("\n".join(report_lines))
    print()