In [None]:
import pandas as pd
import numpy as np
import h5py
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import corpus_bleu
import time
# Fetch the NLTK data packages this notebook relies on, in a fixed order:
# 'punkt' for word_tokenize, 'stopwords' for the stop-word list and
# 'wordnet' for WordNetLemmatizer.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# English stop words, held in a set so membership tests are cheap
stop_words = set(stopwords.words('english'))

# Regular expressions compiled once and shared by every preprocess_text call.
# html_tags_regex matches the shortest '<...>' span that contains no newline.
html_tags_regex = re.compile(r'<.*?>')
# Despite its name this also matches digits: it matches every character that
# is neither an ASCII letter nor whitespace.
non_alpha_numeric_regex = re.compile(r'[^a-zA-Z\s]')

# Text preprocessing
# One lemmatizer shared by all calls instead of a new instance per text.
_lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Turn raw text into a list of lower-cased, lemmatized content words.

    Steps, in order:
      1. replace HTML tags with a space,
      2. replace every character that is not an ASCII letter or whitespace
         with a space,
      3. lower-case and tokenize with ``word_tokenize``,
      4. drop tokens found in ``stop_words``,
      5. lemmatize the remaining tokens.

    Tags and punctuation are replaced by a space rather than removed so that
    the words on either side are not glued together
    (``"heart-attack"`` -> ``heart``, ``attack`` instead of ``heartattack``).

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the float NaN that pandas
        uses for an empty CSV cell) is treated as having no words.

    Returns
    -------
    list of str
        The processed tokens; empty when nothing usable remains.
    """
    # Missing cells arrive as float NaN; the regex calls below need a str.
    if not isinstance(text, str):
        return []

    text = html_tags_regex.sub(' ', text)
    text = non_alpha_numeric_regex.sub(' ', text)
    tokens = word_tokenize(text.lower())
    return [
        _lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

# Read the question/answer pairs, keep the first 2000 rows and derive the two
# working columns: a copy of the answer text and the tokenized question.
data = (
    pd.read_csv('chat_health.csv')
    .head(2000)
    .assign(
        original_answer=lambda frame: frame['short_answer'],
        tokenized_questions=lambda frame: frame['short_question'].apply(preprocess_text),
    )
)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Fit Word2Vec on the tokenized questions, write the model to disk and read
# it straight back so `model` is the persisted version.
w2v_model_path = "word2vec_model_2000_pairs.model"
model = Word2Vec(
    sentences=data['tokenized_questions'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.save(w2v_model_path)
model = Word2Vec.load(w2v_model_path)

# Get a vector representation of the question from the word list
def get_question_vector(words, model):
    """Average the embeddings of a question's in-vocabulary words.

    Parameters
    ----------
    words : iterable of str
        Tokens of the question.
    model : object
        Anything exposing ``model.wv`` that supports ``word in model.wv`` and
        ``model.wv[word]``.

    Returns
    -------
    numpy.ndarray or float
        Element-wise mean of the vectors of the words found in ``model.wv``.
        When no word is found, ``np.nan`` is returned as a "no vector"
        sentinel, so ``Series.dropna()`` / ``Series.notna()`` can filter such
        rows. The sentinel is returned explicitly instead of letting
        ``np.mean`` hit an empty list, which yields the same NaN but emits a
        RuntimeWarning.
    """
    known_vectors = [model.wv[word] for word in words if word in model.wv]
    if not known_vectors:
        return np.nan
    return np.mean(known_vectors, axis=0)

# Compute the question vector of every row
data['question_vec'] = data['tokenized_questions'].apply(lambda words: get_question_vector(words, model))

# Rows whose question produced no vector (NaN) cannot be indexed. They are
# removed from BOTH datasets below: generate_answer looks an answer up by the
# row position of its best-matching vector, so 'vectors' and 'answers' must
# have the same length and order.
indexed_rows = data[data['question_vec'].notna()]
if indexed_rows.empty:
    raise ValueError("no question produced a vector; nothing to index")

# Save vector data to HDF5 file
with h5py.File('question_vectors_2000_pairs.h5', 'w') as f:
    vectors = np.stack(indexed_rows['question_vec'].tolist(), axis=0)
    f.create_dataset('vectors', data=vectors)
    dt = h5py.string_dtype(encoding='utf-8')
    f.create_dataset('answers', data=indexed_rows['original_answer'].to_numpy(dtype=dt))

In [None]:
import numpy as np
import h5py
from sklearn.metrics.pairwise import cosine_similarity



# NOTE: generate_answer() binds its own local `best_index`; this module-level
# name is never updated by it and stays None.
best_index = None


def generate_answer(question, model, index_path='question_vectors_2000_pairs.h5', fallback=''):
    """Return the stored answer whose question is most similar to ``question``.

    The question is preprocessed and embedded with ``preprocess_text`` and
    ``get_question_vector``, then compared by cosine similarity against every
    vector in the HDF5 index. The answer stored at the position of the most
    similar vector is returned.

    Parameters
    ----------
    question : str
        Raw question text.
    model : object
        Embedding model passed through to ``get_question_vector``.
    index_path : str, optional
        HDF5 file holding the ``'vectors'`` and ``'answers'`` datasets.
    fallback : str, optional
        Returned when the question cannot be embedded (for example, none of
        its words is known to the model). Without this guard the NaN
        sentinel would be handed to ``cosine_similarity``, which raises.

    Returns
    -------
    str
        The best-matching answer, or ``fallback``.
    """
    question_words = preprocess_text(question)
    question_vec = np.asarray(get_question_vector(question_words, model))

    # A usable embedding is a 1-D array of finite numbers; the "no vector"
    # sentinel is a 0-d NaN. The ndim check runs first so np.isfinite is
    # only applied to a real vector.
    if question_vec.ndim != 1 or not np.isfinite(question_vec).all():
        return fallback

    with h5py.File(index_path, 'r') as f:
        vectors = f['vectors'][:]
        cosine_similarities = cosine_similarity(question_vec.reshape(1, -1), vectors)
        best_index = int(np.argmax(cosine_similarities))
        best_answer = f['answers'][best_index]

    # Variable-length strings come back as bytes or as str depending on the
    # h5py version; decode only when needed.
    if isinstance(best_answer, bytes):
        best_answer = best_answer.decode('utf-8')
    return best_answer



In [None]:
# Load the evaluation set from testset.csv; the cells below read its
# 'short_question' and 'original_answer' columns.
testData = pd.read_csv('testset.csv')

In [None]:
# Answer every test question with the retrieval model
testData['generated_answer'] = [
    generate_answer(test_question, model)
    for test_question in testData['short_question']
]

# corpus_bleu expects, per sample, a list of tokenized references and one
# tokenized candidate; each sample here has exactly one reference.
references = [[expected.split()] for expected in testData['original_answer']]
candidates = [produced.split() for produced in testData['generated_answer']]

bleu_score = corpus_bleu(references, candidates)
print("BLEU Score:", bleu_score)

BLEU Score: 0.761963965471042


In [None]:
# Measure the average time to answer one question, using at most the first
# 50 test questions.
timed_questions = testData['short_question'].head(50)

# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; start_time / end_time are therefore not wall-clock timestamps.
start_time = time.perf_counter()
# The answers go into their own variable: assigning this shorter series back
# to testData['generated_answer'] would overwrite the full set of generated
# answers and leave NaN in every row that was not timed.
timed_answers = timed_questions.apply(lambda q: generate_answer(q, model))
end_time = time.perf_counter()

response_time = end_time - start_time
# Divide by the number of questions actually timed, which is smaller than 50
# when the test set has fewer rows.
timed_count = len(timed_questions)
average_response_time = response_time / timed_count if timed_count else float('nan')

print(f"Average response time per record: {average_response_time} seconds")

Average response time per record: 0.005222439765930176 seconds
