In [1]:
import pandas as pd
from fuzzywuzzy import fuzz, process
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d', '', text)

    text = text.lower()

    tokens = text.split()
    filtered_words = [word for word in tokens if word not in stop_words]
    
    return ' '.join(filtered_words)

In [3]:
resolved_queries_path = '/kaggle/input/textsearch/resolved_queries (1).csv'  
new_queries_path = '/kaggle/input/textsearch/new_queries (1).csv'

resolved_queries_df = pd.read_csv(resolved_queries_path)
new_queries_df = pd.read_csv(new_queries_path)

resolved_queries_df['Pre_Resolved_Query'] = resolved_queries_df['Pre_Resolved_Query'].apply(preprocess_text)
new_queries_df['Variation_Query'] = new_queries_df['Variation_Query'].apply(preprocess_text)

# Fuzzy Matching
def fuzzy_match_queries(new_query, resolved_queries, method='token_set_ratio', threshold=70):
    matches = process.extractOne(new_query, resolved_queries, scorer=fuzz.token_set_ratio)
    if matches[1] >= threshold:
        return matches
    else:
        return (None, 0)

resolved_queries_list = resolved_queries_df['Pre_Resolved_Query'].tolist()
new_queries_df['Fuzzy_Matched_Query'] = None
new_queries_df['Fuzzy_Match_Score'] = None

threshold = 90
for idx, row in new_queries_df.iterrows():
    new_query = row['Variation_Query']
    best_match, score = fuzzy_match_queries(new_query, resolved_queries_list, method='token_set_ratio', threshold=threshold)
    new_queries_df.at[idx, 'Fuzzy_Matched_Query'] = best_match
    new_queries_df.at[idx, 'Fuzzy_Match_Score'] = score

In [4]:
# BoW and TF-IDF with Cosine Similarity

combined_queries = resolved_queries_df['Pre_Resolved_Query'].tolist() + new_queries_df['Variation_Query'].tolist()

# Bag of Words (BoW) 
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(combined_queries)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_queries)

In [5]:
resolved_bow_matrix = bow_matrix[:len(resolved_queries_df)]
new_bow_matrix = bow_matrix[len(resolved_queries_df):]

resolved_tfidf_matrix = tfidf_matrix[:len(resolved_queries_df)]
new_tfidf_matrix = tfidf_matrix[len(resolved_queries_df):]

bow_cosine_sim = cosine_similarity(new_bow_matrix, resolved_bow_matrix)
tfidf_cosine_sim = cosine_similarity(new_tfidf_matrix, resolved_tfidf_matrix)

In [9]:
print("BoW Matching Example (Different Set):")
for i, query in enumerate(new_queries_df['Variation_Query'].iloc[5:10]):
    new_query_bow = bow_vectorizer.transform([query])
    cosine_sim = cosine_similarity(new_query_bow, resolved_bow_matrix).flatten()
    best_idx = cosine_sim.argmax()
    best_match = resolved_queries_list[best_idx]
    best_score = cosine_sim[best_idx]
    print(f"New Query: {query}\nBest Match: {best_match}\nScore: {best_score}\n")

print("Fuzzy Matching Example (Different Set):")
for i, query in enumerate(new_queries_df['Variation_Query'].iloc[5:10]):
    match, score = fuzzy_match_queries(query, resolved_queries_list, threshold=70)
    print(f"New Query: {query}\nBest Match: {match}\nScore: {score}\n")

print("TF-IDF Matching Example (Different Set):")
for i, query in enumerate(new_queries_df['Variation_Query'].iloc[5:10]):
    new_query_tfidf = tfidf_vectorizer.transform([query])
    cosine_sim = cosine_similarity(new_query_tfidf, resolved_tfidf_matrix).flatten()
    best_idx = cosine_sim.argmax()
    best_match = resolved_queries_list[best_idx]
    best_score = cosine_sim[best_idx]
    print(f"New Query: {query}\nBest Match: {best_match}\nScore: {best_score}\n")



BoW Matching Example (Different Set):
New Query: payment issue check
Best Match: payment failed checkout
Score: 0.3333333333333334

New Query: application crashes opening setings
Best Match: app crashes opening settings
Score: 0.5

New Query: app crash going settings
Best Match: app crashes opening settings
Score: 0.5

New Query: settings cause app chrash
Best Match: app crashes opening settings
Score: 0.5

New Query: forgot passwrd cant reset
Best Match: forgot password unable reset
Score: 0.5

Fuzzy Matching Example (Different Set):
New Query: payment issue check
Best Match: payment failed checkout
Score: 76

New Query: application crashes opening setings
Best Match: app crashes opening settings
Score: 86

New Query: app crash going settings
Best Match: app crashes opening settings
Score: 88

New Query: settings cause app chrash
Best Match: None
Score: 0

New Query: forgot passwrd cant reset
Best Match: forgot password unable reset
Score: 75

TF-IDF Matching Example (Different Set):


In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def find_best_matches(cosine_sim, resolved_df, new_df, threshold=0.5):
    matches = []
    for i, sims in enumerate(cosine_sim):
        best_idx = np.argmax(sims)
        best_score = sims[best_idx]
        if best_score >= threshold:
            matches.append({
                'Variation_Query': new_df.iloc[i]['Variation_Query'],
                'Resolved_Query': resolved_df.iloc[best_idx]['Pre_Resolved_Query'],
                'Similarity_Score': best_score
            })
    return pd.DataFrame(matches)

cosine_sim = cosine_similarity(new_tfidf_matrix, resolved_tfidf_matrix)

matches_df = find_best_matches(cosine_sim, resolved_queries_df, new_queries_df, threshold=0.5)

matches_df = matches_df.sort_values('Similarity_Score', ascending=False)

matches_df.head()

Unnamed: 0,Variation_Query,Resolved_Query,Similarity_Score
4,upload files server,unable upload files server,0.894358
0,connect internet,unable connect internet,0.858168
5,checkout page says payment failed,payment failed checkout,0.719198
3,reset password,forgot password unable reset,0.670173
2,forgotten password unable reset,forgot password unable reset,0.628965


In [12]:
def get_best_cosine_match(cosine_sim_matrix, resolved_queries):
    best_matches = []
    for row in cosine_sim_matrix:
        best_idx = row.argmax()
        best_match_score = row[best_idx]
        best_matches.append((resolved_queries[best_idx], best_match_score))
    return best_matches

bow_best_matches = get_best_cosine_match(bow_cosine_sim, resolved_queries_list)
tfidf_best_matches = get_best_cosine_match(tfidf_cosine_sim, resolved_queries_list)

new_queries_df['BoW_Matched_Query'] = [match[0] for match in bow_best_matches]
new_queries_df['BoW_Match_Score'] = [match[1] for match in bow_best_matches]

new_queries_df['TFIDF_Matched_Query'] = [match[0] for match in tfidf_best_matches]
new_queries_df['TFIDF_Match_Score'] = [match[1] for match in tfidf_best_matches]

new_queries_df[['Variation_Query', 'Fuzzy_Matched_Query', 'Fuzzy_Match_Score', 
                'BoW_Matched_Query', 'BoW_Match_Score', 'TFIDF_Matched_Query', 'TFIDF_Match_Score']].head()

Unnamed: 0,Variation_Query,Fuzzy_Matched_Query,Fuzzy_Match_Score,BoW_Matched_Query,BoW_Match_Score,TFIDF_Matched_Query,TFIDF_Match_Score
0,unabel conect internet,unable connect internet,93,unable connect internet,0.333333,unable connect internet,0.25641
1,connect internet,unable connect internet,100,unable connect internet,0.816497,unable connect internet,0.858168
2,intenet working,,0,unable connect internet,0.0,unable connect internet,0.0
3,payment failed chekout,payment failed checkout,98,payment failed checkout,0.666667,payment failed checkout,0.558907
4,payment go chckout,,0,payment failed checkout,0.333333,payment failed checkout,0.219802
