In [1]:
!pip install fuzzywuzzy
!pip install python-Levenshtein

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.7/162.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rap

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process

In [3]:
#Creating a Function to clean text
def clean_text(text):
    if isinstance(text, str):
        return text.lower().strip()
    return ""

In [4]:
#implementing TF-IDF and fuzzy matching
def get_best_match(text, category_list, threshold=0.3):
    if not text or not category_list:
        return None

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2,4))
    tfidf_matrix = vectorizer.fit_transform([text] + category_list)
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    best_match_index = np.argmax(similarities)
    best_score = similarities[best_match_index]
    if best_score < threshold:
        fuzzy_match, fuzzy_score = process.extractOne(text, category_list)
        if fuzzy_score > (threshold * 100):
            return fuzzy_match

    return category_list[best_match_index] if best_score >= threshold else None

In [5]:
def tag_data(task_df, taxonomy_df):
    taxonomy_df.columns = taxonomy_df.columns.str.strip()
    taxonomy_dict = {col: taxonomy_df[col].dropna().unique().tolist() for col in taxonomy_df.columns}

    combined_text = task_df[['Complaint', 'Cause', 'Correction']].fillna('').astype(str).agg(' '.join, axis=1)

    #Mapping task dataset columns to taxonomy categories
    column_mapping = {'Root Cause': 'Root Cause','Symptom Condition 1': 'Symptom Condition','Symptom Component 1': 'Symptom Component','Fix Condition 1': 'Fix Condition','Fix Component 1': 'Fix Component'}

    #Applying matching function to each related fields
    for task_col, taxonomy_col in column_mapping.items():
        category_list = taxonomy_dict.get(taxonomy_col, [])
        task_df[task_col] = combined_text.apply(lambda x: get_best_match(clean_text(x), category_list))

    return task_df

In [6]:
# Function to calculate tagging accuracy for multiple fields
def calculate_accuracy(tagged_df, original_df, fields):
    accuracy_results = {}
    for field in fields:
        correct_matches = (tagged_df[field].fillna('') == original_df[field].fillna('')).sum()
        total_entries = original_df[field].notna().sum()
        accuracy = (correct_matches / total_entries) * 100 if total_entries > 0 else 0
        accuracy_results[field] = accuracy
    return accuracy_results

In [7]:
# Load dataset and taxonomy from the Excel file
xls = pd.ExcelFile("DA - Task 1..xlsx")
task_df = pd.read_excel(xls, sheet_name="Task")
taxonomy_df = pd.read_excel(xls, sheet_name="Taxonomy")

In [8]:
tagged_df = tag_data(task_df, taxonomy_df)

In [9]:
fields_to_evaluate = ['Root Cause', 'Symptom Condition 1', 'Symptom Component 1', 'Fix Condition 1', 'Fix Component 1']
accuracy_results = calculate_accuracy(tagged_df, task_df, fields_to_evaluate)

In [10]:
# Printing the accuracy
for field, accuracy in accuracy_results.items():
    print(f"{field} Tagging Accuracy: {accuracy:.2f}%")

Root Cause Tagging Accuracy: 100.00%
Symptom Condition 1 Tagging Accuracy: 100.00%
Symptom Component 1 Tagging Accuracy: 100.00%
Fix Condition 1 Tagging Accuracy: 100.00%
Fix Component 1 Tagging Accuracy: 100.00%


In [11]:
# Saving the data tagged dataset to an Excel file
tagged_df.to_excel("datatagged_Dataset.xlsx", index=False)