In [4]:
import json

# Opening JSON file
f = open('/content/MSDialog-Intent.json')

# returns JSON object as
# a dictionary
data = json.load(f)
indexes = list(data.keys())

# Query formulation and Preprocessing Dataset

In [123]:
titles = [data[indexes[i]]['title'] for i in range(len(indexes))]
titles_clean =[]
for title in titles:
  temp = ''
  for word in title.split():
    temp += word.lower() + ' '
  titles_clean.append(temp)

In [120]:
# I have written about 50 queries leveraging the use of title from above
queries = [
    "Why does 'backgroundtaskhost.exe' stop working?",
    "How to improve system performance and maintenance for Microsoft Edge on Windows 10?",
    "Troubleshooting start menu issues in Windows 10.",
    "Fixing screen sketch problems.",
    "Dealing with high CPU usage by 'remsh.exe' in the task manager.",
    "Steps to remove Skype for Business from Windows 10.",
    "Resolving the error 'Something went wrong' during Windows 10 factory reset.",
    "Fixing network setup issues on Windows.",
    "Understanding the difference between legacy and UEFI in Dell E6420 BIOS.",
    "Explaining the functionality of the charms menu in Windows.",
    "Best practices for backup in Windows 10 on an HP x360 laptop.",
    "How to fix sudden screen dimming while playing games on a laptop.",
    "Addressing graphics issues after an update in Microsoft Edge.",
    "Troubleshooting restart issues after an update on October 1.",
    "Resolving BSOD loop on new hardware.",
    "Understanding the '0x801901f4' error.",
    "Exploring features of the Creator's Update for a tablet.",
    "Fixing keyboard issues on Toshiba Satellite S50-C after Creators Update.",
    "Resolving 'Bad System Config File' stop code.",
    "Registering '.ocx' files in Windows.",
    "Fixing application error 0xc00000e5.",
    "Making ITNS-300 negative scanner work on Windows 10.",
    "Troubleshooting Xbox One controller issues after Windows 10 anniversary update.",
    "Transferring Windows 10 Pro license on hard drive failure.",
    "Handling the Cumulative Update error (KB4040724) in Windows 10.",
    "Improving super slow and unresponsive Windows 10.",
    "Fixing Event ID 2 Kernel-EventTracing error.",
    "Making Windows Explorer always open in details view.",
    "Resolving Microsoft .NET Framework error after laptop comes out of sleep.",
    "Avoiding crashes of third-party applications when changing directories in open file dialog.",
    "Fixing Windows 10 update stuck at 89%.",
    "Addressing issues with invisible settings in Windows 10.",
    "Best practices for backing up everything on a laptop.",
    "Restoring data from Quick Access in Windows 10.",
    "Handling USB reset failure in Windows 10.",
    "Fixing overlapping taskbar in Windows 10.",
    "Troubleshooting Windows 10 update issues.",
    "Seeking help with scans on a computer.",
    "Resolving force shutdowns during Windows update.",
    "Recovering lost contacts after upgrading to Windows 10.",
    "Troubleshooting Windows 10 Start Menu problems.",
    "Creating a backup CD in Windows 10.",
    "Addressing Windows Installer loop after upgrading to Office 2016.",
    "Disabling full row select in Windows 10.",
    "Unable to download from the Windows Store – how to fix?",
    "General queries about Windows 10.",
    "Understanding normal file structure in Windows 10.",
    "Dealing with Windows 10 hanging issues.",
    "Fixing encrypted USB drive issues after Windows 10 update.",
    "Resolving desktop stuck and never loading issues in Windows 10."
]

In [94]:
follow_up = {}
f_id = 0
potential_answer = {}
p_id = 0
for i in range(1000):
  values = data[indexes[i]]
  for utterance in values['utterances']:
    if utterance['affiliation'] != 'Common User' and ('CQ' in utterance['tags'] or 'FD' in utterance['tags'] or 'FQ' in utterance['tags']):

      follow_up[f_id] = ' '.join([word.lower() for word in utterance['utterance'].split()])
      f_id += 1
    if utterance['affiliation'] != 'Common User' and ('IR' in utterance['tags'] or 'PA' in utterance['tags'] ):
      potential_answer[f_id] = ' '.join([word.lower() for word in utterance['utterance'].split()])
      p_id += 1

In [95]:
print('Total follow up questions:', f_id+1)
print('Total answers:', p_id+1)

Total follow up questions: 562
Total answers: 1665


# Baseline Binary Relevance

In [73]:
from gensim.parsing.preprocessing import preprocess_string

# Preprocess documents
preprocessed_documents = [preprocess_string(doc) for doc in follow_up.values()]
print(preprocessed_documents[10])
# Define functions for AND and OR operations
def AND(query1, query2):
    results1 = [i for i, doc in enumerate(preprocessed_documents) if set(query1).issubset(set(doc))]
    results2 = [i for i, doc in enumerate(preprocessed_documents) if set(query2).issubset(set(doc))]
    return list(set(results1) & set(results2))

def OR(query1, query2):
    results1 = [i for i, doc in enumerate(preprocessed_documents) if set(query1).issubset(set(doc))]
    results2 = [i for i, doc in enumerate(preprocessed_documents) if set(query2).issubset(set(doc))]
    return list(set(results1) | set(results2))

# Example usage
query1 = preprocess_string('Windows 10 Microsoft Edge is slow - System Performance and maintenance')[:2]
query2 = preprocess_string('Windows 10 factory reset error "Something Went Wrong" after selecting Language, Keyboard and Language.')[:2]
print(query1, query2)
and_results = AND(query1, query2)
or_results = OR(query1, query2)

print(f"Documents matching both queries: {and_results}")
print(f"Documents matching at least one query: {or_results}")


['hello', 'window', 'new', 'laptop', 'ultra', 'confus', 'understand', 'wai', 'dai', 'window', 'backup', 'window', 'imag', 'creat', 'backup', 'medium', 'futur', 'recoveri', 'need', 'told', 'gather', 'confus', 'blog', 'window', 'file', 'histori', 'backup', 'maintain', 'version', 'file', 'window', 'imag', 'cover', 'window', 'file', 'histori', 'backup', 'suggest', 'got', 'us', 'macrium', 'reflect', 'free', 'version', 'window', 'imag', 'recoveri', 'purpos', 'separ', 'us', 'window', 'file', 'histori', 'backup', 'privat', 'creat', 'file', 'folder', 'sound', 'interest', 'impli', 'think', 'keep', 'distinct', 'separ', 'backup', 'window', 'imag', 'import', 'file', 'advic', 'tip', 'suggest', 'proper', 'approach', 'highli', 'appreci', 'thank', 'advanc']
['window', 'microsoft'] ['window', 'factori']
Documents matching both queries: [648]
Documents matching at least one query: [512, 260, 516, 648, 1034, 524, 655, 400, 532, 22, 155, 28, 32, 33, 34, 417, 1057, 294, 696, 313, 58, 189, 829, 454, 329, 844

In [74]:
follow_up[648]

'  I beg you Microsoft in the Mighty name of God  please send or activate my Lenovo Yoga 11 Window RT8.1. I have done factory reset severally and still NO PRODUCT KEY, NO PRODUCT ID. If I try to do Window Update it will reply me unable to get window update, if I try to access Microsoft office I will get MICROSOFT CAN"T READ THE LICENSE OF THE COMPUTER AND CAN"T REPAIR IT. I tried to do window Activate Window by PHONE which display STEP1 Call one of the Numbers to get confirmation ID STEP 2: Provide this number when you are asked; 1 2 3 4 5 6 7 8 9 STEP3: Enter the Confirmation ID you get Over the Phone; A B C D E F G H with a box under each alphabets  The most suprising thing is they connected me to even their technical department and they still can\'t solve me problems. No what do I do? will I just throw away this computer or what do you advise PLEASE HELP '

Very less query matches are returned as its just a basic comparison and it requires that all the query words be present in the document, which is not most of the case, unless the query is very small of the size 2 or 3 words

# BM25

In [121]:
import math
from collections import Counter

class BM25:
    def __init__(self, corpus, k1=1.5, b=0.75):
        self.corpus = corpus
        self.k1 = k1
        self.b = b
        self.doc_lengths = [len(doc) for doc in corpus]
        self.avg_doc_length = sum(self.doc_lengths) / len(self.doc_lengths)
        self.doc_count = len(corpus)
        self.doc_freqs = [Counter(doc) for doc in corpus]

    def idf(self, term):
        doc_count_with_term = sum(1 for doc in self.corpus if term in doc)
        return math.log((self.doc_count - doc_count_with_term + 0.5) / (doc_count_with_term + 0.5) + 1.0)

    def bm25_score(self, query, doc, doc_index):
        score = 0
        for term in query:
            tf = doc.count(term)
            idf = self.idf(term)
            doc_length = self.doc_lengths[doc_index]
            numerator = tf * (self.k1 + 1)
            denominator = tf + self.k1 * (1 - self.b + self.b * (doc_length / self.avg_doc_length))
            score += idf * numerator / denominator
        return score

    def rank_documents(self, query):
        scores = [(doc_index, self.bm25_score(query, self.corpus[doc_index], doc_index)) for doc_index in range(self.doc_count)]
        ranked_docs = sorted(scores, key=lambda x: x[1], reverse=True)
        return ranked_docs


corpus = list(potential_answer.values())
bm25_model = BM25(corpus)
bm25_follow_up = BM25(titles_clean)
for query in queries[:20]:
  print(query)
  query = [word.lower() for word in query.split()]
  ranked_documents = bm25_model.rank_documents(query)
  for index, score in ranked_documents[:5]:
      print(f"Document {index + 1}: BM25 Score - {score}")
      print(corpus[index])
  print('\nFollow Up\n')
  follow_up_documents = bm25_follow_up.rank_documents(query)
  for index, score in follow_up_documents[:5]:
    print(f"Document {index + 1}: BM25 Score - {score}")
    print(titles_clean[index])
  print('\n\n')

# Below are the 50 queries with top 5 relevant documents displayed using BM25 Model
# Followed by this is the set of 5 potential follow up queries the user can select as the next question in a conversation

Why does 'backgroundtaskhost.exe' stop working?
Document 430: BM25 Score - 7.590244908013002
hi paul, welcome to the skype community forum. i know how important it is for you to use the 4.17.135 skype version. however, we have fully stopped supporting this version and upgraded the app to 4.18.15 and that's the reason why you're getting the same error message all over again. we always recommend to use the latest version of skype for the best possible experience. there are few reasons why you should update your application(s): i hope this helps. kindly let us know if you have questions. regards, rhiza_emonica
Document 25: BM25 Score - 6.413318909496925
hi denise, there are several possible reasons why you are unable to send an email in outlook.com. it can be because the message or attachments are too large or your inbox is full. for information and steps on how to resolve the issue with sending emails in outlook.com, you can refer to the article on this link. to stop or cancel an email t

# Query-likelihood with dirichlet smoothing

In [118]:
import math
from collections import Counter

class QueryLikelihoodModel:
    def __init__(self, corpus, mu=2000):
        self.corpus = corpus
        self.mu = mu
        self.term_frequencies = [Counter(doc) for doc in corpus]
        self.document_lengths = [len(doc) for doc in corpus]
        self.collection_size = sum(self.document_lengths)

    def dirichlet_smoothed_score(self, query, document):
        score = 0
        doc_length = len(document)

        for term in query:
            term_frequency = document.count(term)
            collection_frequency = sum(doc_freq[term] for doc_freq in self.term_frequencies)
            background_prob = collection_frequency / self.collection_size

            smoothed_term_prob = (term_frequency + self.mu * background_prob) / (doc_length + self.mu)
            # Add a small constant to avoid log(0)
            smoothed_term_prob = max(smoothed_term_prob, 1e-10)
            score += math.log(smoothed_term_prob)

        return score

    def rank_documents(self, query):
        scores = [(doc_index, self.dirichlet_smoothed_score(query, self.corpus[doc_index])) for doc_index in range(len(self.corpus))]
        ranked_docs = sorted(scores, key=lambda x: x[1], reverse=True)
        return ranked_docs


corpus = list(potential_answer.values())
ql_model = QueryLikelihoodModel(corpus)
ql_follow_up = QueryLikelihoodModel(titles_clean)
for query in queries[:20]:
  print(query)
  query = [word.lower() for word in query.split()]
  ranked_documents = ql_model.rank_documents(query)
  for index, score in ranked_documents[:5]:
      print(f"Document {index + 1}: Query Likelihood Score - {score}")
      print(corpus[index])
  print('\nFollow Up\n')
  follow_up_documents = ql_follow_up.rank_documents(query)
  for index, score in follow_up_documents[:5]:
    print(f"Document {index + 1}: Query Likelihood Score - {score}")
    print(titles_clean[index])
  print('\n\n')

# Below are the 50 queries with top 5 relevant documents displayed using Query-likelihood Model
# Followed by this is the set of potential follow up queries the user can select as the next question in a conversation

backgroundtaskhost.exe stopped working 
Document 15: Query Likelihood Score - -53.62111365233163
hello . i'm tara, an 8 year microsoft windows mvp, here to help you. 1) please work through this checklist to make sure the install is set up correctly, optimized for best performance, and any needed repairs are done: http://answers.microsoft.com/en-us/windows/wiki/windows_10-performance/windows-10-performance-and-install-integrity/75529fd4-fac7-4653-893a-dd8cd4b4db00 2) if any steps cannot be performed then try whichever method works to enter safe mode with networking which may work better to perform the steps: http://www.pcworld.com/article/2984712/windows/how-to-enter-windows-10s-safe-mode.html 3) if no safe mode, from the advanced startup options use the command prompt at boot to run the repair commands in the checklist: https://www.tenforums.com/tutorials/2294-advanced-startup-options-boot-windows-10-a.html 3) if nothing else works then you can reinstall windows while keeping your file

# TF_IDF Vector Space Model

In [137]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample documents
documents = list(potential_answer.values())

# Step 1: Text Preprocessing
vectorizer = TfidfVectorizer(stop_words='english')
follow_up_vectorizer = TfidfVectorizer(stop_words='english')
# Step 2: Building the TF-IDF Matrix
tfidf_matrix = vectorizer.fit_transform(documents)
follow_up_tfidf_matrix = follow_up_vectorizer.fit_transform(titles)

for query in queries[:20]:
  # Step 3: Query Representation
  query_vector = vectorizer.transform([query])
  follow_up_query_vector = follow_up_vectorizer.transform([query])

  # Step 4: Similarity Calculation
  cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
  follow_up_cosine_similarities = cosine_similarity(follow_up_query_vector, follow_up_tfidf_matrix).flatten()

  # Get document indices sorted by similarity
  sorted_indices = cosine_similarities.argsort()[::-1]
  follow_up_sorted_indices = follow_up_cosine_similarities.argsort()[::-1]

  # Print the most similar document
  print('Query: ', query)
  for i in sorted_indices[:5]:
    print('Document',i,': ', documents[i])
  print('')
  for i in follow_up_sorted_indices[:5]:
    print('Document',i,': ', titles[i])
  print('\n\n')

Query:  Why does 'backgroundtaskhost.exe' stop working?
Document 161 :  internet explorer is iexplorer.exe explorer.exe is the file manager
Document 41 :  there is a backend issue and microsoft is working on it. no eta when it will be fixed.
Document 134 :  hello antonio, thank you for the update on this issue. i suggest you the following link to know more about microsoft account and computer specific microsoft account verification refer to the link to know about microsoft account and its configuration. get the new account created by following the article. ensure to get the microsoft account verified on your computer. refer to the link to verify your computer for microsoft account. after creating new user account please try to install all pending updates after running dism (checksur.exe). run the system update readiness tool (checksur.exe) download and run the system update readiness tool. this tool runs a one-time scan for inconsistencies that may prevent future servicing operations. 

# Evaluation of Models

In [None]:
def calculate_precision_at_1(queries, results, relevance_judgments):
  """
  Calculates the Precision@1 score for a set of queries, results, and relevance judgments.

  Args:
    queries: A list of query strings.
    results: A list of lists, where each sublist contains the ranked IDs of documents for a corresponding query.
    relevance_judgments: A list of dictionaries, where each dictionary maps a document ID to a relevance score (e.g., 1 for relevant, 0 for non-relevant).

  Returns:
    A list of Precision@1 scores for each query.
  """

  precisions = []
  for query, documents, judgments in zip(queries, results, relevance_judgments):
    top_doc_id = documents[0]
    precision = judgments[top_doc_id] / len(documents)
    precisions.append(precision)
  return precisions

def calculate_mrr(queries, results, relevance_judgments):
  """
  Calculates the MRR score for a set of queries, results, and relevance judgments.

  Args:
    queries: A list of query strings.
    results: A list of lists, where each sublist contains the ranked IDs of documents for a corresponding query.
    relevance_judgments: A list of dictionaries, where each dictionary maps a document ID to a relevance score (e.g., 1 for relevant, 0 for non-relevant).

  Returns:
    The MRR score as a float between 0 and 1.
  """

  reciprocal_ranks = []
  for query, documents, judgments in zip(queries, results, relevance_judgments):
    for document_id in documents:
      if judgments[document_id] > 0:
        rank = documents.index(document_id) + 1
        reciprocal_rank = 1 / rank
        reciprocal_ranks.append(reciprocal_rank)
        break
  mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
  return mrr

# Example usage
queries = ["query1", "query2", "query3"]
results = [[123, 456, 789], [987, 654, 321], [10, 20, 30]]
relevance_judgments = [{123: 1}, {987: 1}, {10: 0}]

precisions = calculate_precision_at_1(queries, results, relevance_judgments)
mrr = calculate_mrr(queries, results, relevance_judgments)

print(f"Precision@1 for each query: {precisions}")
print(f"Overall MRR: {mrr}")
