In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
df = pd.read_csv(r"C:\Users\spide\OneDrive\Desktop\Bachlorz\Data\extracted_data.csv")

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to generate embeddings for a given text
def generate_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.numpy()

# Generate embeddings for each tender content
df['embeddings'] = df['content'].apply(lambda x: generate_embeddings(x))

# Function to handle search query and find the most relevant tenders
def recommend_tenders(query, df):
    query_embedding = generate_embeddings(query)
    
    # Compute similarity scores between the query and each document
    similarities = df['embeddings'].apply(lambda x: cosine_similarity([query_embedding], [x])[0][0])
    
    # Sort tenders based on similarity and return the top 5 recommendations
    top_recommendations = df.iloc[similarities.argsort()[-5:]]
    return top_recommendations[['filename', 'source_format', 'content']]

# Example usage
query = "cloud computing"
recommended_tenders = recommend_tenders(query, df)
print(recommended_tenders)


In [3]:
# Check the shape of an embedding for content and query
sample_content = df['content'].iloc[0]  # Get the first content sample
sample_embedding = generate_embeddings(sample_content)
print("Sample content embedding shape:", sample_embedding.shape)

sample_query = "cloud computing"
query_embedding = generate_embeddings(sample_query)
print("Query embedding shape:", query_embedding.shape)


Sample content embedding shape: (768,)
Query embedding shape: (768,)


In [4]:
# Check cosine similarity for the first document
similarity_score = cosine_similarity([query_embedding], [sample_embedding])[0][0]
print(f"Cosine similarity between query and sample content: {similarity_score}")


Cosine similarity between query and sample content: 0.20795033872127533


In [5]:
# Updated recommend_tenders function to return the top recommendations with content similarity
def recommend_tenders(query, df):
    query_embedding = generate_embeddings(query)
    
    # Compute similarity scores between the query and each document
    similarities = df['embeddings'].apply(lambda x: cosine_similarity([query_embedding], [x])[0][0])
    
    # Sort tenders based on similarity and return the top 5 recommendations
    top_recommendations = df.iloc[similarities.argsort()[-5:]]
    
    # Debugging output: Print similarity scores
    print(f"Top 5 similarity scores: {similarities[similarities.argsort()[-5:]]}")
    
    return top_recommendations[['filename', 'source_format', 'content']]

# Run the example to check recommendations
query = "cloud computing"
recommended_tenders = recommend_tenders(query, df)
print(recommended_tenders)


Top 5 similarity scores: 74     0.242805
178    0.254729
13     0.270323
303    0.275314
424    0.321805
Name: embeddings, dtype: float32
                                              filename source_format  \
74   C:\Users\spide\OneDrive\Desktop\Bachlorz\2022 ...           pdf   
178  C:\Users\spide\OneDrive\Desktop\Bachlorz\2022 ...           pdf   
13   C:\Users\spide\OneDrive\Desktop\Bachlorz\2022 ...           pdf   
303  C:\Users\spide\OneDrive\Desktop\Bachlorz\2022 ...          xlsx   
424  C:\Users\spide\OneDrive\Desktop\Bachlorz\2022 ...           pdf   

                                               content  
74   Stor-Elvdal kommune Enhet Forvaltning ABAKUS A...  
178  Dato: 11.05.2022 Side 1 Hovedkontor Sandvika S...  
13   4 3 2 1 MISSINGMEASUREMENTSARETOBETAKEFROM3DMO...  
303  Tabular response general requirements Unnamed:...  
424  Dokument «Integrasjonsrammeverk», ID 2945 - EQ...  


In [7]:
import gradio as gr

def recommend_tenders_gradio(query):
    recommended_tenders = recommend_tenders(query, df)
    return recommended_tenders[['content']]

# Create Gradio Interface
interface = gr.Interface(fn=recommend_tenders_gradio, inputs="text", outputs="dataframe")
interface.launch()


* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


