In [1]:
import pandas as pd
import numpy as np
import string
import scipy
import sklearn
import spacy
import nltk
import re
import os

In [2]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

In [3]:
# Matches a digit or punctuation character, unless that character is the "2"
# of "co2": the negative look-behind is evaluated after the character has been
# consumed, so it inspects the three characters ending at the match.
_NOISE_PATTERN = re.compile(r'[\d' + re.escape(string.punctuation) + '](?<![cC][oO]2)')


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, built on first use and then reused."""
    cached = getattr(_english_stop_words, '_cached', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cached = cached
    return cached


def preprocess_text(text):
    """Normalise a piece of text for embedding.

    Steps: lowercase, strip digits and punctuation (keeping the "2" in "co2"),
    tokenize with ``word_tokenize``, drop English stopwords, and re-join the
    remaining tokens with single spaces.

    Parameters
    ----------
    text : str or any scalar
        The text to clean. ``None`` and float NaN (e.g. an empty spreadsheet
        cell) are treated as empty text; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned, space-joined tokens (``''`` for missing input).
    """
    # Missing cells would otherwise fail on ``.lower()``; `x != x` is the NaN test.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = str(text).lower()

    # Remove numbers, symbols, and punctuation (except for the case where 2 follows CO)
    text = _NOISE_PATTERN.sub('', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords (the set is cached, so it is not rebuilt for every row)
    stop_words = _english_stop_words()

    # Join the surviving tokens back into a single string
    return ' '.join(word for word in tokens if word not in stop_words)

In [4]:
# SentenceTransformer instances keyed by model name, so repeated calls reuse
# the already-loaded model instead of constructing it again.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model


def text_embed(text_col_name, dataframe, vector_col_name, model_name="thenlper/gte-small"):
    """Embed a text column and store one vector per row in ``dataframe``.

    Parameters
    ----------
    text_col_name : str
        Name of the column holding the text to embed.
    dataframe : pandas.DataFrame
        Frame to read from and write to. It is modified in place: the text
        column is replaced by its ``str``-cast version and ``vector_col_name``
        is added (or overwritten).
    vector_col_name : str
        Name of the column that receives the embedding vectors.
    model_name : str, optional
        SentenceTransformer model identifier; defaults to the model this
        notebook has always used.

    Returns
    -------
    None
    """
    model = _get_embedding_model(model_name)
    dataframe[text_col_name] = dataframe[text_col_name].astype(str)

    texts = dataframe[text_col_name].tolist()
    # Encode the whole column in a single call rather than one call per row.
    # NOTE(review): batched encoding may differ from per-row encoding at
    # floating-point rounding level - confirm that is acceptable.
    vectors = list(model.encode(texts)) if texts else []

    # An explicit object Series keeps one array per cell and preserves the index.
    dataframe[vector_col_name] = pd.Series(vectors, index=dataframe.index, dtype=object)
    return

In [5]:
# Load the worksheet at position 1 of the demo workbook. reset_index() turns
# the row index into column 0, so positions 2-3 select the sheet's second and
# third columns as the working frame.
df = (
    pd.read_excel(
        io=r"C:\Users\RedHat\Downloads\Tiiqu\Def to be transformed for Library demo- with_gpt_data.xlsx",
        sheet_name=1,
    )
    .reset_index()
    .iloc[:, 2:4]
)
df

Unnamed: 0,Question,Answer
0,How do operational strategies like bid price c...,Both bid price control and wind forecast contr...
1,What is the purpose of Power-to-Hydrogen (P2H)...,P2H technology is used to store excess electri...
2,How does P2H contribute to the integration of ...,P2H technology helps to integrate ARE into the...
3,Why is surplus ARE alone insufficient for an e...,Surplus ARE alone is insufficient for an econo...
4,What is the best route for P2H in terms of dis...,The best route for P2H is to displace diesel i...
...,...,...
295,How did Zhang et al.'s (2016) dual-layer bus n...,Zhang et al. (2016) advanced a dual-layer bus ...
296,"How did Sun et al., 2018 utilize network theor...","Sun et al., 2018 utilized network theory to an..."
297,How do planners ensure accuracy when construct...,Planners ensure accuracy by incorporating vari...
298,What considerations inform the establishment o...,Connections between stations in bus and subway...


In [6]:
# Build the combined corpus column: question text, one space, then answer text
df['QA'] = df['Question'].add(' ').add(df['Answer'])

In [7]:
def find_most_relevant_qna(qna_index, qna_df, corpus_column, top_n=5):
    """Return the QnA rows most similar to the row labelled ``qna_index``.

    The corpus is preprocessed with ``preprocess_text``, embedded with
    ``text_embed``, and ranked by cosine similarity to the query row.

    Parameters
    ----------
    qna_index : hashable
        Index label (not position) of the query row in ``qna_df``.
    qna_df : pandas.DataFrame
        Frame with ``Question`` and ``Answer`` columns. It is modified in
        place: ``QA_prep`` and ``QA_prep_vector`` columns are added or
        overwritten on every call.
    corpus_column : pandas.Series
        Text to preprocess and embed, aligned with ``qna_df``'s index.
    top_n : int, optional
        Maximum number of similar rows to return (default 5).

    Returns
    -------
    pandas.DataFrame
        ``Question``, ``Answer`` and ``Similarity Score`` for the best
        matches, most similar first. The query row is never included; other
        rows are kept even if their score is 1.

    Raises
    ------
    KeyError
        If ``qna_index`` is not a label of ``qna_df``.
    ValueError
        If ``top_n`` is negative or ``qna_index`` matches more than one row.

    Side effects
    ------------
    Prints the query row's ``Question`` and ``Answer``.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # preprocess corpus
    qna_df['QA_prep'] = corpus_column.apply(preprocess_text)

    # text embedding for preprocessed corpus
    text_embed('QA_prep', qna_df, 'QA_prep_vector')

    # Resolve the label to a row position once, so the similarity row, the
    # self-exclusion and the final .iloc lookup all agree even when the index
    # is not a default 0..n-1 range.
    query_pos = qna_df.index.get_loc(qna_index)
    if not isinstance(query_pos, (int, np.integer)):
        raise ValueError("qna_index must identify exactly one row of qna_df")

    # Read the query from the frame that was passed in, not from a global.
    qna = qna_df.loc[qna_index, ['Question', 'Answer']]

    # Get vector embeddings for QnA pairs in the QnA DataFrame
    qna_embeddings = qna_df['QA_prep_vector'].tolist()

    # Compute cosine similarity between the query row and all QnA pairs
    similarities = cosine_similarity([qna_embeddings[query_pos]], qna_embeddings)[0]

    # Rank all rows, drop the query by position (not by assuming it is ranked
    # first or scores exactly 1), then keep the best top_n.
    ranked = np.argsort(similarities)[::-1]
    top_indices = ranked[ranked != query_pos][:top_n]

    # Extract top similar QnA pairs along with their similarity scores
    top_similar_qna = qna_df.iloc[top_indices][['Question', 'Answer']].copy()
    top_similar_qna['Similarity Score'] = similarities[top_indices]

    print(qna)

    return top_similar_qna

In [8]:
# Rank the corpus against the QnA pair labelled 109 and display the matches
qna_index = 109
top_similar_qna = find_most_relevant_qna(
    qna_index=qna_index,
    qna_df=df,
    corpus_column=df['QA'],
)
top_similar_qna

Question    What impact did increasing traffic volumes hav...
Answer      Increasing traffic volumes led to more dispers...
Name: 109, dtype: object


Unnamed: 0,Question,Answer,Similarity Score
9,How did the growth of traffic volumes in Amste...,The growth of traffic volumes in Amsterdam and...,0.97387
209,In what ways has the proliferation of traffic ...,The proliferation of traffic has led to sprawl...,0.946693
115,What initiatives did Groningen undertake to al...,Groningen shifted from car-centric planning to...,0.900802
15,How did Groningen change its spatial planning ...,Groningen changed its car-oriented spatial pla...,0.890152
215,In what ways did Groningen reframe its urban d...,Groningen shifted focus from cars to revitaliz...,0.884207
