In [1]:
#pip install fuzzywuzzy scikit-learn nltk

In [2]:
#pip install python-Levenshtein

In [89]:
import numpy as np
import pandas as pd
import pandas as pd
import re
import nltk
from fuzzywuzzy import fuzz, process
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

## Preprocessing the data

In [90]:
solved = pd.read_csv('resolved_queries.csv')
solved.head()

Unnamed: 0,Query_ID,Pre_Resolved_Query
0,1,Unable to connect to the internet
1,2,Payment failed during checkout
2,3,App crashes when opening settings
3,4,Forgot password and unable to reset
4,5,Unable to upload files to the server


In [91]:
solved.describe()

Unnamed: 0,Query_ID
count,5.0
mean,3.0
std,1.581139
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,5.0


In [92]:
new = pd.read_csv('new_queries.csv')
new.head()

Unnamed: 0,Variation_Query,Matches_With_Query_ID
0,Unabel to conect to the internet,1
1,Can’t connect to internet,1
2,Intenet not working,1
3,Payment failed while chekout,2
4,Payment did not go through during chckout,2


In [93]:
new.shape

(20, 2)

In [94]:
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\W', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

In [95]:
solved['Pre_Resolved_Query'] = solved['Pre_Resolved_Query'].apply(preprocess_text)
new['Variation_Query'] = new['Variation_Query'].apply(preprocess_text)

## Fuzzy Search

In [100]:
def fuzzy_match(unresolved_query, resolved_queries, resolved_query_ids, threshold=80):
    best_match = None
    best_match_id = None
    best_score = 0
    for idx, resolved_query in enumerate(resolved_queries):
        score = fuzz.token_set_ratio(unresolved_query, resolved_query)  # Using Token Set Ratio
        if score > best_score and score >= threshold:
            best_score = score
            best_match = resolved_query
            best_match_id = resolved_query_ids[idx]  # Get the corresponding Query_ID
    return best_match_id, best_score

In [104]:
fuzzy_results = []
for uq in new['Variation_Query']:
    match_id, score = fuzzy_match(uq, solved['Pre_Resolved_Query'], solved['Query_ID'], threshold=80)
    fuzzy_results.append({'Variation_Query': uq, 'Matches_With_Query_ID': match_id, 'Score': score})

In [105]:
fuzzy_df = pd.DataFrame(fuzzy_results)
fuzzy_df

Unnamed: 0,Variation_Query,Matches_With_Query_ID,Score
0,unabel conect internet,1.0,93
1,connect internet,1.0,100
2,intenet working,,0
3,payment failed chekout,2.0,98
4,payment go chckout,,0
5,payment issue check,,0
6,application crashes opening setings,3.0,86
7,app crash going settings,3.0,88
8,settings cause app chrash,,0
9,forgot passwrd cant reset,,0


In [106]:
fuzzy_df.shape

(20, 3)

In [108]:
correct = 0
for i in range(new.shape[0]):
    if new['Matches_With_Query_ID'][i] == fuzzy_df['Matches_With_Query_ID'][i]:
        correct += 1

accuracy = (correct/new.shape[0])*100
print("Accuracy: ",accuracy)

Accuracy:  55.00000000000001


## Cosine Similarity

In [30]:
vectorizer = TfidfVectorizer()  # Use CountVectorizer() for BoW instead of Tf-IDF
resolved_vectors = vectorizer.fit_transform(solved['Pre_Resolved_Query'])
unresolved_vectors = vectorizer.transform(new['Variation_Query'])

In [31]:
cosine_sim_matrix = cosine_similarity(unresolved_vectors, resolved_vectors)

In [33]:
cosine_results = []
for idx, uq in enumerate(new['Variation_Query']):
    similarity_scores = cosine_sim_matrix[idx]
    best_match_idx = similarity_scores.argmax()
    best_match_id = solved['Query_ID'].iloc[best_match_idx]  # Get the corresponding Query_ID
    best_score = similarity_scores[best_match_idx]
    cosine_results.append({'Variation_Query': uq, 'Matches_With_Query_ID': best_match_id, 'Cosine_Similarity': best_score})

In [34]:
cosine_df = pd.DataFrame(cosine_results)
cosine_df

Unnamed: 0,Variation_Query,Matches_With_Query_ID,Cosine_Similarity
0,unabel conect internet,1,0.63907
1,connect internet,1,0.903782
2,intenet working,1,0.0
3,payment failed chekout,2,0.816497
4,payment go chckout,2,0.57735
5,payment issue check,2,0.57735
6,application crashes opening setings,3,0.707107
7,app crash going settings,3,0.707107
8,settings cause app chrash,3,0.707107
9,forgot passwrd cant reset,4,0.761551


In [35]:
cosine_df.shape

(20, 3)

In [41]:
correct = 0
for i in range(new.shape[0]):
    if new['Matches_With_Query_ID'][i] == cosine_df['Matches_With_Query_ID'][i]:
        correct += 1

accuracy = (correct/new.shape[0])*100
print("Accuracy: ",accuracy)

Accuracy:  100.0
