In [23]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords


# Load the data
df = pd.read_csv(r"C:\Users\spide\OneDrive\Desktop\Bachlorz\extracted_data.csv")

# Load pre-trained multilingual BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertModel.from_pretrained('bert-base-multilingual-uncased')

# Load Norwegian stopwords from NLTK
norwegian_stopwords = set(stopwords.words('norwegian'))

# Function to remove stopwords from text
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in norwegian_stopwords]
    return ' '.join(filtered_words)

# Function to generate embeddings for a given text
def generate_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.numpy()

# Apply stopword removal to the content
df['content'] = df['content'].apply(remove_stopwords)

# Generate embeddings for each tender content
df['embeddings'] = df['content'].apply(lambda x: generate_embeddings(x))

# Function to handle search query and find the most relevant tenders
def recommend_tenders(query, df):
    query = remove_stopwords(query)  # Remove stopwords from the query as well
    query_embedding = generate_embeddings(query)
    
    # Compute similarity scores between the query and each document
    similarities = df['embeddings'].apply(lambda x: cosine_similarity([query_embedding], [x])[0][0])
    
    # Sort tenders based on similarity and return the top 5 recommendations
    top_recommendations = df.iloc[similarities.argsort()[-5:]]
    return top_recommendations[['filename', 'source_format', 'content']]

# Example usage
query = "cloud computing"
recommended_tenders = recommend_tenders(query, df)
print(recommended_tenders)


                                               filename source_format  \
635   C:\Users\spide\OneDrive\Desktop\Bachlorz\Tende...           pdf   
867   C:\Users\spide\OneDrive\Desktop\Bachlorz\Tende...           pdf   
963   C:\Users\spide\OneDrive\Desktop\Bachlorz\Tende...           pdf   
1354  C:\Users\spide\OneDrive\Desktop\Bachlorz\Tende...           pdf   
877   C:\Users\spide\OneDrive\Desktop\Bachlorz\Tende...           pdf   

                                                content  
635   EUROPEAN UNION SATELLITE CENTRE REFERENCE: SAT...  
867   Ref. Ares(2023)7327302 - 27/10/2023 EASA/2023/...  
963   EASA.2023.HVP.01: Mobile Communication Service...  
1354  PPMT REFERENCE: SATCEN/2024/OP/0013 INTERNAL R...  
877   CEDEFOP/2023/OP/0002 - ICT Helpdesk Services F...  


In [24]:
import gradio as gr
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
df = pd.read_csv("C:/Users/spide/OneDrive/Desktop/Bachlorz/extracted_data.csv")

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to generate embeddings for a given text
def generate_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings.numpy()

# Generate embeddings for each tender content
df['embeddings'] = df['content'].apply(lambda x: generate_embeddings(x))

# Function to handle search query and find the most relevant tenders
def recommend_tenders(query, df):
    query_embedding = generate_embeddings(query)
    
    # Compute similarity scores between the query and each document
    similarities = df['embeddings'].apply(lambda x: cosine_similarity([query_embedding], [x])[0][0])
    
    # Sort tenders based on similarity and return the top 5 recommendations
    top_recommendations = df.iloc[similarities.argsort()[-5:]]
    
    # Add cosine similarity score to the result
    top_recommendations['cosine_similarity'] = similarities[similarities.argsort()[-5:]]
    
    return top_recommendations[['filename', 'source_format', 'content', 'cosine_similarity']]

# Gradio function for displaying recommendations
def recommend_tenders_gradio(query):
    recommended_tenders = recommend_tenders(query, df)
    return recommended_tenders

# Create Gradio Interface
interface = gr.Interface(fn=recommend_tenders_gradio, inputs="text", outputs="dataframe", title="Tender Recommendation System", description="Enter a search query to find the most relevant tenders.")

# Launch the interface
interface.launch()


* Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_recommendations['cosine_similarity'] = similarities[similarities.argsort()[-5:]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_recommendations['cosine_similarity'] = similarities[similarities.argsort()[-5:]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_recommendations['cosine_similar

In [22]:
# Gradio function for displaying recommendations
def recommend_tenders_gradio(query):
    recommended_tenders = recommend_tenders(query, df)
    return recommended_tenders

# Create Gradio Interface
interface = gr.Interface(fn=recommend_tenders_gradio, inputs="text", outputs="dataframe", live=True, title="Tender Recommendation System", description="Enter a search query to find the most relevant tenders.")

# Launch the interface
interface.launch()


* Running on local URL:  http://127.0.0.1:7865

To create a public link, set `share=True` in `launch()`.




In [21]:
query = "cloud computing krav" 
query_embedding = generate_embeddings(query)

# Compare similarity
similarity = cosine_similarity([query_embedding])
print("Cosine Similarity:", similarity[0][0])  # Expect a value between 0 and 1


Cosine Similarity: 0.99999976


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_recommendations['cosine_similarity'] = similarities[similarities.argsort()[-5:]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_recommendations['cosine_similarity'] = similarities[similarities.argsort()[-5:]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_recommendations['cosine_similar