In [3]:
import pandas as pd
import nltk
from nltk.corpus import wordnet
from tqdm import tqdm
import gensim.downloader as api
from gensim.models import KeyedVectors
import numpy as np
from scipy.spatial.distance import cosine
from fuzzywuzzy import process

# Load pre-trained word embeddings
word_vectors = api.load("glove-wiki-gigaword-100")

# Load your dataset
df = pd.read_csv('final_data_exploration.csv')

# # Expanded manufacturing taxonomy
# manufacturing_taxonomy = {
#     "equipment": ["machine", "tool", "device", "gauge", "lamp", "filament", "wire", "desk", "station", "nut", "chipper", "sander", "grinder", "press", "CNC", "drill", "mixer", "saw", "chisel", "caliper", "bolt", "key", "stand", "hook", "pliers", "lathe", "oscilloscope", "feeler", "milling", "jack", "crane", "clamp", "screw", "gage", "mill", "cutter", "workbench", "mallet", "whiteout", "dispenser", "voltmeter", "chainsaw", "anvil", "horseshoe", "corkscrew", "vial", "pin", "wrench", "square", "tailstock", "secateur", "knurling", "roadheader", "rasper", "telehandler", "trephine", "fissurometer", "optometer"],
#     "process": ["manufacturing", "assembly", "fabrication", "casting", "printing", "welding", "chopping", "wiring", "foundry", "molding", "processing", "snow blower", "track hardware", "rail fastening", "grinding", "milling", "drilling", "cutting", "sawing", "pumping", "clinching", "imprinting", "stamping", "damascening", "enchain"],
#     "material": ["metal", "plastic", "composite", "brick", "wood", "rope", "ceramic", "mold", "steel", "material", "paint", "marble", "concrete", "rail", "track"],
#     "personnel": ["worker", "operator", "engineer", "welder", "crewmate", "jeweller"],
#     "quality": ["inspection", "testing", "measurement", "gage", "gauge"],
#     "safety": ["protection", "hazard", "precaution", "helmet", "muff", "protector", "light", "safety", "mask", "voltage detector", "cone", "interlock"],
#     "maintenance": ["repair", "upkeep", "sander", "grinder", "maintenance", "threadlocker"],
#     "logistics": ["transport", "shipping", "truck", "vehicle", "utility", "clip", "carabiner", "twowheeler"],
#     "automation": ["control", "CNC", "robot", "automated", "process", "computer numerical"],
#     "electrical": ["voltage", "power supply", "bulb", "halogen", "incandescent", "voltmeter"],
#     "specialized": ["cowcatcher", "smithingmark", "dheesterboom", "kailyard"],
#     "historical": ["dorfmuseum", "industriedenkmal", "wehrhahn"],
# }

manufacturing_taxonomy = {  
     "machines": ["CNC", "press", "drill", "mixer", "lathe", "mill", "chainsaw", "roadheader", "telehandler", "snow blower", "pump"],
    "tools": ["chipper", "sander", "grinder", "chisel", "pliers", "mallet", "cutter", "wrench", "saw", "secateur", "rasper", "jack", "crane", "clip", "carabiner"],
    "quality": ["gauge", "gage", "caliper", "oscilloscope", "feeler", "voltmeter", "square", "optometer", "fissurometer", "inspection", "testing", "measurement"],
    "fasteners": ["nut", "bolt", "screw", "hook", "clamp", "pin", "rail fastening"],
    "electrical": ["lamp", "filament", "wire", "bulb", "halogen", "incandescent", "power supply", "voltage detector"],
    "safety": ["helmet", "muff", "protector", "mask", "cone", "interlock"],
    "specialized": ["anvil", "horseshoe", "corkscrew", "knurling", "trephine","cowcatcher", "smithingmark", "dheesterboom"],
    "processes": ["manufacturing", "assembly", "fabrication", "casting", "printing", "welding", "chopping", "wiring", "foundry", "molding", "processing", "grinding", "milling", "drilling", "cutting", "sawing", "pumping", "clinching", "imprinting", "stamping", "damascening", "enchain"],
    "materials": ["metal", "plastic", "composite", "brick", "wood", "rope", "ceramic", "mold", "steel", "paint", "marble", "concrete", "rail", "track"],
    "maintenance": ["repair", "upkeep", "threadlocker", "worker", "operator", "engineer", "welder", "crewmate", "jeweller", "desk", "station", "workbench", "stand", "control", "robot", "automated", "process", "computer numerical", "transport", "shipping", "truck", "vehicle", "utility", "twowheeler"],
    "miscellaneous": ["dorfmuseum", "industriedenkmal", "wehrhahn", "kailyard"]
}

def analyze_label(label):
    return label.lower().split()

df['analyzed_labels'] = df['label'].apply(analyze_label)

def find_most_similar_category(concept, categories, threshold=0.3):
    if concept in word_vectors.key_to_index:
        similarities = []
        for category in categories:
            if category in word_vectors.key_to_index:
                similarity = 1 - cosine(word_vectors[concept], word_vectors[category])
                if similarity > threshold:
                    similarities.append((category, similarity))
        if similarities:
            return max(similarities, key=lambda x: x[1])[0]
    return None

def fuzzy_match(word, category_terms, threshold=80):
    matches = process.extractBests(word, category_terms, score_cutoff=threshold)
    return matches[0][0] if matches else None

def map_to_taxonomy(concepts, taxonomy):
    for i in range(len(concepts)):
        concept = concepts[i]
        
        # Direct match
        for category, terms in taxonomy.items():
            if concept in terms:
                return category
        
        # Fuzzy match
        for category, terms in taxonomy.items():
            fuzzy_match_result = fuzzy_match(concept, terms)
            if fuzzy_match_result:
                return category
        
        # Word similarity
        category = find_most_similar_category(concept, taxonomy.keys())
        if category:
            return category
        
        # Check word combinations
        if i < len(concepts) - 1:
            two_word_concept = concepts[i] + " " + concepts[i+1]
            for category, terms in taxonomy.items():
                if two_word_concept in terms:
                    return category
    
    # Fallback: return the category with the highest average similarity
    avg_similarities = []
    for category, terms in taxonomy.items():
        similarities = [word_vectors.similarity(concept, term) for concept in concepts for term in terms if concept in word_vectors.key_to_index and term in word_vectors.key_to_index]
        if similarities:
            avg_similarities.append((category, np.mean(similarities)))
    
    if avg_similarities:
        return max(avg_similarities, key=lambda x: x[1])[0]
    
    # If still unknown, assign to the most common category
    return "miscellaneous"

df['taxonomy_label'] = df['analyzed_labels'].apply(lambda concepts: map_to_taxonomy(concepts, manufacturing_taxonomy))

# Print statistics
print(df['taxonomy_label'].value_counts(normalize=True))

# Check remaining unknown labels
unknown_labels = df[df['taxonomy_label'] == 'unknown']['label'].unique()
print("\nSample of remaining 'unknown' labels:")
print(unknown_labels[:20])  # Print first 20 unknown labels

# Print absolute counts
print("\nAbsolute counts:")
print(df['taxonomy_label'].value_counts())


taxonomy_label
machines         0.164671
tools            0.158405
processes        0.112098
fasteners        0.107077
materials        0.099695
quality          0.099438
electrical       0.087035
maintenance      0.065019
safety           0.062401
specialized      0.038797
miscellaneous    0.005365
Name: proportion, dtype: float64

Sample of remaining 'unknown' labels:
[]

Absolute counts:
taxonomy_label
machines         3837
tools            3691
processes        2612
fasteners        2495
materials        2323
quality          2317
electrical       2028
maintenance      1515
safety           1454
specialized       904
miscellaneous     125
Name: count, dtype: int64


In [4]:
# Save the results
df.to_csv('kevin__data_exploration_taxonomy.csv', index=False)

In [5]:
# import pandas as pd
# import nltk
# from nltk.corpus import wordnet
# from tqdm import tqdm
# import gensim.downloader as api
# from gensim.models import KeyedVectors
# import numpy as np
# from scipy.spatial.distance import cosine

# # Download necessary NLTK data
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

# # Load pre-trained word embeddings
# print("Loading word embeddings...")
# word_vectors = api.load("glove-wiki-gigaword-100")

# # Load your dataset
# df = pd.read_csv('final_data_exploration.csv')

# # Define a basic manufacturing taxonomy
# manufacturing_taxonomy = {  
#      "machines": ["CNC", "press", "drill", "mixer", "lathe", "mill", "chainsaw", "roadheader", "telehandler", "snow blower", "pump"],
#     "tools": ["chipper", "sander", "grinder", "chisel", "pliers", "mallet", "cutter", "wrench", "saw", "secateur", "rasper", "jack", "crane", "clip", "carabiner"],
#     "quality": ["gauge", "gage", "caliper", "oscilloscope", "feeler", "voltmeter", "square", "optometer", "fissurometer", "inspection", "testing", "measurement"],
#     "fasteners": ["nut", "bolt", "screw", "hook", "clamp", "pin", "rail fastening"],
#     "electrical": ["lamp", "filament", "wire", "bulb", "halogen", "incandescent", "power supply", "voltage detector"],
#     "safety": ["helmet", "muff", "protector", "mask", "cone", "interlock"],
#     "specialized": ["anvil", "horseshoe", "corkscrew", "knurling", "trephine","cowcatcher", "smithingmark", "dheesterboom"],
#     "processes": ["manufacturing", "assembly", "fabrication", "casting", "printing", "welding", "chopping", "wiring", "foundry", "molding", "processing", "grinding", "milling", "drilling", "cutting", "sawing", "pumping", "clinching", "imprinting", "stamping", "damascening", "enchain"],
#     "materials": ["metal", "plastic", "composite", "brick", "wood", "rope", "ceramic", "mold", "steel", "paint", "marble", "concrete", "rail", "track"],
#     "maintenance": ["repair", "upkeep", "threadlocker", "worker", "operator", "engineer", "welder", "crewmate", "jeweller", "desk", "station", "workbench", "stand", "control", "robot", "automated", "process", "computer numerical", "transport", "shipping", "truck", "vehicle", "utility", "twowheeler"],
#     "miscellaneous": ["dorfmuseum", "industriedenkmal", "wehrhahn", "kailyard"]
# }

# # Step 1: Label Analysis
# def analyze_label(label):
#     tokens = nltk.word_tokenize(label)
#     pos_tags = nltk.pos_tag(tokens)
    
#     concepts = []
#     current_concept = []
    
#     for word, pos in pos_tags:
#         if pos.startswith('NN') or pos.startswith('JJ'):
#             current_concept.append(word.lower())
#         else:
#             if current_concept:
#                 concepts.append(' '.join(current_concept))
#                 current_concept = []
    
#     if current_concept:
#         concepts.append(' '.join(current_concept))
    
#     return concepts

# df['analyzed_labels'] = df['label'].apply(analyze_label)

# # Step 2: Taxonomy Creation
# class TaxonomyNode:
#     def __init__(self, name):
#         self.name = name
#         self.children = {}
    
#     def add_child(self, child_name):
#         if child_name not in self.children:
#             self.children[child_name] = TaxonomyNode(child_name)
#         return self.children[child_name]

# root = TaxonomyNode('root')

# # Initialize the taxonomy with the manufacturing-specific categories
# for category in manufacturing_taxonomy.keys():
#     root.add_child(category)

# # Create a frequency dictionary for concepts
# concept_freq = {}
# for concepts in df['analyzed_labels']:
#     for concept in concepts:
#         concept_freq[concept] = concept_freq.get(concept, 0) + 1

# # Sort concepts by frequency
# sorted_concepts = sorted(concept_freq.items(), key=lambda x: x[1], reverse=True)

# # Function to find the most similar category
# def find_most_similar_category(concept, categories):
#     if concept in word_vectors.key_to_index:
#         similarities = [
#             (category, 1 - cosine(word_vectors[concept], word_vectors[category]))
#             for category in categories
#             if category in word_vectors.key_to_index
#         ]
#         if similarities:
#             return max(similarities, key=lambda x: x[1])[0]
#     return None

# # Create taxonomy based on frequency, word similarity, and manufacturing categories
# for concept, _ in sorted_concepts:
#     parts = concept.split()
#     current_node = root
#     for part in parts:
#         # Check if the part fits into any existing manufacturing category
#         category = find_most_similar_category(part, manufacturing_taxonomy.keys())
#         if category and category in current_node.children:
#             current_node = current_node.children[category]
#         current_node = current_node.add_child(part)

# # Add subcategories to their respective categories
# for category, subcategories in manufacturing_taxonomy.items():
#     category_node = root.children[category]
#     for subcategory in subcategories:
#         if subcategory not in category_node.children:
#             category_node.add_child(subcategory)

# # Step 3: Data Annotation
# def get_path(node, label):
#     label_parts = label.lower().split()
#     path = ['root']
#     current_node = node
    
#     for part in label_parts:
#         if part in current_node.children:
#             path.append(part)
#             current_node = current_node.children[part]
#         else:
#             # Find the most similar child using word embeddings
#             most_similar = None
#             highest_similarity = -1
#             for child in current_node.children:
#                 if child in word_vectors.key_to_index and part in word_vectors.key_to_index:
#                     similarity = word_vectors.similarity(child, part)
#                     if similarity > highest_similarity:
#                         highest_similarity = similarity
#                         most_similar = child
            
#             if most_similar and highest_similarity > 0.5:
#                 path.append(most_similar)
#                 current_node = current_node.children[most_similar]
#             else:
#                 break
    
#     return path if len(path) > 1 else None

# df['hierarchical_label'] = df['label'].apply(lambda x: get_path(root, x))

# # Save the annotated dataset
# df.to_csv('annotated_dataset.csv', index=False)

# # Print the taxonomy
# def print_taxonomy(node, level=0):
#     print('  ' * level + node.name)
#     for child in node.children.values():
#         print_taxonomy(child, level + 1)

# print("Taxonomy structure:")
# print_taxonomy(root)

# # Validation step
# def validate_taxonomy(node, path=[]):
#     issues = []
#     current_path = path + [node.name]
    
#     # Check for very deep branches (e.g., more than 5 levels)
#     if len(current_path) > 5:
#         issues.append(f"Deep branch detected: {' > '.join(current_path)}")
    
#     # Check for nodes with too many children (e.g., more than 10)
#     if len(node.children) > 10:
#         issues.append(f"Node with many children: {' > '.join(current_path)}, Children count: {len(node.children)}")
    
#     # Recursive call for children
#     for child in node.children.values():
#         issues.extend(validate_taxonomy(child, current_path))
    
#     return issues

# print("\nValidating taxonomy...")
# validation_issues = validate_taxonomy(root)

# if validation_issues:
#     print("Validation issues found:")
#     for issue in validation_issues:
#         print(f"- {issue}")
# else:
#     print("No validation issues found.")

# # Manual adjustment function
# def manual_adjust_taxonomy():
#     while True:
#         action = input("\nEnter action (add/move/delete/done): ").lower()
#         if action == 'done':
#             break
#         elif action in ['add', 'move', 'delete']:
#             path = input("Enter path (e.g., root > equipment > machine): ").split(' > ')
#             if action in ['move', 'delete']:
#                 node_name = path.pop()
#             if action == 'add':
#                 new_node = input("Enter new node name: ")
            
#             current_node = root
#             for node in path:
#                 if node in current_node.children:
#                     current_node = current_node.children[node]
#                 else:
#                     print(f"Path not found: {' > '.join(path)}")
#                     break
#             else:
#                 if action == 'add':
#                     current_node.add_child(new_node)
#                     print(f"Added {new_node} to {' > '.join(path)}")
#                 elif action == 'move':
#                     if node_name in current_node.children:
#                         node_to_move = current_node.children.pop(node_name)
#                         new_parent = input("Enter new parent path: ").split(' > ')
#                         new_current = root
#                         for new_node in new_parent:
#                             if new_node in new_current.children:
#                                 new_current = new_current.children[new_node]
#                             else:
#                                 print(f"New parent path not found: {' > '.join(new_parent)}")
#                                 break
#                         else:
#                             new_current.children[node_name] = node_to_move
#                             print(f"Moved {node_name} to {' > '.join(new_parent)}")
#                     else:
#                         print(f"Node not found: {node_name}")
#                 elif action == 'delete':
#                     if node_name in current_node.children:
#                         del current_node.children[node_name]
#                         print(f"Deleted {node_name} from {' > '.join(path)}")
#                     else:
#                         print(f"Node not found: {node_name}")
#         else:
#             print("Invalid action. Please enter add, move, delete, or done.")

# print("\nManual Taxonomy Adjustment")
# print("You can now manually adjust the taxonomy.")
# # manual_adjust_taxonomy()

# # # Re-annotate the dataset after manual adjustments
# # df['hierarchical_label'] = df['label'].apply(lambda x: get_path(root, x))

# # # Save the final annotated dataset
# # df.to_csv('final_annotated_dataset.csv', index=False)

# # print("\nFinal Taxonomy structure:")
# print_taxonomy(root)

# # print("\nAnnotation complete. Final dataset saved as 'final_annotated_dataset.csv'")

[nltk_data] Downloading package wordnet to /home/trkosire/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/trkosire/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Loading word embeddings...
Taxonomy structure:
root
  machines
    lathe
      information
      operator
      chuck
        insert
      tailstock
        assembly
        center
      tool
        post
      way
      cutter
      center
      carriage
      headstock
        assembly
      bit
      tail
        stock
      compound
        rest
      workpiece
      mandrel
      goalpost
      dog
      gearbox
      control
      tumbler
      head
        stock
      chunk
      work
        piece
      outside
      feedstock
      jaw
      clamp
      machine
    machine
      tool
      screw
      shop
        wheel
      cutter
        storage
      phonography
      pres
      workpiece
      rotary
        table
      table
      collet
      chuck
      tread
      tooling
      motor
      surface
      stone
        chisel
          bit
      part
        diagram
      label
      loader
      center
      hammer
        peeing
      wheel
      boring
        tool
 

In [73]:
# import pandas as pd
# import nltk
# from nltk.corpus import wordnet
# from tqdm import tqdm
# import gensim.downloader as api
# from gensim.models import KeyedVectors
# import numpy as np
# from scipy.spatial.distance import cosine
# from fuzzywuzzy import process

# # Load pre-trained word embeddings
# word_vectors = api.load("glove-wiki-gigaword-100")

# # Load your dataset
# df = pd.read_csv('final_data_exploration.csv')

# # Expanded manufacturing taxonomy
# manufacturing_taxonomy = {
#     "equipment": ["machine", "tool", "device", "gauge", "lamp", "filament", "wire", "desk", "station", "nut", "chipper", "sander", "grinder", "press", "CNC", "drill", "mixer", "saw", "chisel", "caliper", "bolt", "key", "stand", "hook", "pliers", "lathe", "oscilloscope", "feeler", "milling", "jack", "crane", "clamp", "screw", "gage", "mill", "cutter", "workbench", "mallet", "whiteout", "dispenser", "voltmeter", "chainsaw", "anvil", "horseshoe", "corkscrew", "vial", "pin", "wrench", "square"],
#     "process": ["manufacturing", "assembly", "fabrication", "casting", "printing", "welding", "chopping", "wiring", "foundry", "molding", "processing", "snow blower", "track hardware", "rail fastening", "grinding", "milling", "drilling", "cutting", "sawing", "pumping", "clinching", "imprinting", "stamping"],
#     "material": ["metal", "plastic", "composite", "brick", "wood", "rope", "ceramic", "mold", "steel", "material", "paint", "marble", "concrete", "rail", "track"],
#     "personnel": ["worker", "operator", "engineer", "welder", "crewmate", "jeweller"],
#     "quality": ["inspection", "testing", "measurement", "gage", "gauge"],
#     "safety": ["protection", "hazard", "precaution", "helmet", "muff", "protector", "light", "safety", "mask", "voltage detector", "cone", "interlock"],
#     "maintenance": ["repair", "upkeep", "sander", "grinder", "maintenance"],
#     "logistics": ["transport", "shipping", "truck", "vehicle", "utility", "clip", "carabiner"],
#     "automation": ["control", "CNC", "robot", "automated", "process", "computer numerical"],
#     "electrical": ["voltage", "power supply", "bulb", "halogen", "incandescent", "voltmeter"],
# }

# def analyze_label(label):
#     return label.lower().split()

# df['analyzed_labels'] = df['label'].apply(analyze_label)

# def find_most_similar_category(concept, categories, threshold=0.3):
#     if concept in word_vectors.key_to_index:
#         similarities = []
#         for category in categories:
#             if category in word_vectors.key_to_index:
#                 similarity = 1 - cosine(word_vectors[concept], word_vectors[category])
#                 if similarity > threshold:
#                     similarities.append((category, similarity))
#         if similarities:
#             return max(similarities, key=lambda x: x[1])[0]
#     return None

# def fuzzy_match(word, category_terms, threshold=80):
#     matches = process.extractBests(word, category_terms, score_cutoff=threshold)
#     return matches[0][0] if matches else None

# def map_to_taxonomy(concepts, taxonomy):
#     for i in range(len(concepts)):
#         # Check single words
#         concept = concepts[i]
        
#         # Direct match
#         for category, terms in taxonomy.items():
#             if concept in terms:
#                 return category
        
#         # Fuzzy match
#         for category, terms in taxonomy.items():
#             fuzzy_match_result = fuzzy_match(concept, terms)
#             if fuzzy_match_result:
#                 return category
        
#         # Word similarity
#         category = find_most_similar_category(concept, taxonomy.keys())
#         if category:
#             return category
        
#         # Check word combinations (for multi-word labels)
#         if i < len(concepts) - 1:
#             two_word_concept = concepts[i] + " " + concepts[i+1]
#             for category, terms in taxonomy.items():
#                 if two_word_concept in terms:
#                     return category
    
#     # Fallback: return the category with the highest average similarity
#     avg_similarities = []
#     for category, terms in taxonomy.items():
#         similarities = [word_vectors.similarity(concept, term) for concept in concepts for term in terms if concept in word_vectors.key_to_index and term in word_vectors.key_to_index]
#         if similarities:
#             avg_similarities.append((category, np.mean(similarities)))
    
#     if avg_similarities:
#         return max(avg_similarities, key=lambda x: x[1])[0]
    
#     return "unknown"

# df['taxonomy_label'] = df['analyzed_labels'].apply(lambda concepts: map_to_taxonomy(concepts, manufacturing_taxonomy))

# # Print statistics
# print(df['taxonomy_label'].value_counts(normalize=True))

# # Check remaining unknown labels
# unknown_labels = df[df['taxonomy_label'] == 'unknown']['label'].unique()
# print("\nSample of remaining 'unknown' labels:")
# print(unknown_labels[:20])  # Print first 20 unknown labels

# # Print absolute counts
# print("\nAbsolute counts:")
# print(df['taxonomy_label'].value_counts())




taxonomy_label
equipment      0.371443
process        0.188404
material       0.144886
electrical     0.099567
safety         0.057508
quality        0.042402
personnel      0.033389
maintenance    0.024076
automation     0.021158
logistics      0.010944
unknown        0.006223
Name: proportion, dtype: float64

Sample of remaining 'unknown' labels:
['tailstock' 'dorfmuseum unterammergau' 'cowcatcher' 'secateur' 'knurling'
 'smithingmark' 'roadheader' 'threadlocker'
 'industriedenkmal riemenfallhmmer breckerfeld' 'rasper' 'telehandler'
 'damascening' 'trephine' 'dheesterboom' 'fissurometer' 'twowheeler'
 'wehrhahn' 'enchain' 'optometer' 'kailyard']

Absolute counts:
taxonomy_label
equipment      8655
process        4390
material       3376
electrical     2320
safety         1340
quality         988
personnel       778
maintenance     561
automation      493
logistics       255
unknown         145
Name: count, dtype: int64


In [None]:
# import pandas as pd
# import nltk
# from nltk.corpus import wordnet
# from tqdm import tqdm
# import gensim.downloader as api
# from gensim.models import KeyedVectors
# import numpy as np
# from scipy.spatial.distance import cosine

# # Load pre-trained word embeddings
# word_vectors = api.load("glove-wiki-gigaword-100")

# # Load your dataset
# df = pd.read_csv('final_data_exploration.csv')

# # Expanded manufacturing taxonomy
# manufacturing_taxonomy = {
#     "equipment": ["machine", "tool", "device", "gauge", "lamp", "filament", "wire", "desk", "station", "nut", "chipper", "sander", "grinder", "press", "CNC", "drill", "mixer", "saw", "chisel", "caliper", "bolt", "key", "stand", "hook", "pliers", "lathe", "oscilloscope", "feeler", "milling", "jack", "crane", "clamp", "screw", "gage", "mill", "cutter", "workbench"],
#     "process": ["manufacturing", "assembly", "fabrication", "casting", "printing", "welding", "chopping", "wiring", "foundry", "molding", "processing", "snow blower", "track hardware", "rail fastening", "grinding", "milling", "drilling", "cutting", "sawing", "pumping"],
#     "material": ["metal", "plastic", "composite", "brick", "wood", "rope", "ceramic", "mold", "steel", "material", "paint", "marble", "concrete", "rail", "track"],
#     "personnel": ["worker", "operator", "engineer", "welder", "crewmate", "jeweller"],
#     "quality": ["inspection", "testing", "measurement", "gage", "gauge"],
#     "safety": ["protection", "hazard", "precaution", "helmet", "muff", "protector", "light", "safety", "mask", "voltage detector", "cone"],
#     "maintenance": ["repair", "upkeep", "sander", "grinder", "maintenance"],
#     "logistics": ["transport", "shipping", "truck", "vehicle", "utility", "clip", "carabiner"],
#     "automation": ["control", "CNC", "robot", "automated", "process", "computer numerical"],
#     "electrical": ["voltage", "power supply", "bulb", "halogen", "incandescent"],
# }

# def analyze_label(label):
#     tokens = nltk.word_tokenize(label.lower())
#     return tokens

# df['analyzed_labels'] = df['label'].apply(analyze_label)

# def find_most_similar_category(concept, categories, threshold=0.3):
#     if concept in word_vectors.key_to_index:
#         similarities = []
#         for category in categories:
#             if category in word_vectors.key_to_index:
#                 similarity = 1 - cosine(word_vectors[concept], word_vectors[category])
#                 if similarity > threshold:
#                     similarities.append((category, similarity))
#         if similarities:
#             return max(similarities, key=lambda x: x[1])[0]
#     return None

# def map_to_taxonomy(concepts, taxonomy):
#     for concept in concepts:
#         # First, check if the concept is directly in any category
#         for category, terms in taxonomy.items():
#             if concept in terms:
#                 return category
        
#         # If not, use word similarity
#         category = find_most_similar_category(concept, taxonomy.keys())
#         if category:
#             return category
        
#         # If still no match, check similarity with terms in each category
#         for category, terms in taxonomy.items():
#             for term in terms:
#                 if find_most_similar_category(concept, [term], threshold=0.5):
#                     return category
    
#     return "unknown"

# df['taxonomy_label'] = df['analyzed_labels'].apply(lambda concepts: map_to_taxonomy(concepts, manufacturing_taxonomy))

# # Print statistics
# print(df['taxonomy_label'].value_counts(normalize=True))

# # Check remaining unknown labels
# unknown_labels = df[df['taxonomy_label'] == 'unknown']['label'].unique()
# print("\nSample of remaining 'unknown' labels:")
# print(unknown_labels[:20])  # Print first 20 unknown labels


In [65]:
# import pandas as pd
# import nltk
# from nltk.corpus import wordnet
# from tqdm import tqdm
# import gensim.downloader as api
# from gensim.models import KeyedVectors
# import numpy as np
# from scipy.spatial.distance import cosine

# # Download necessary NLTK data
# # nltk.download('wordnet')
# # nltk.download('averaged_perceptron_tagger')

# # Load pre-trained word embeddings
# # print("Loading word embeddings...")
# word_vectors = api.load("glove-wiki-gigaword-100")

# # Load your dataset
# df = pd.read_csv('final_data_exploration.csv')

# # Define a basic manufacturing taxonomy
# manufacturing_taxonomy = {
#     "equipment": ["machine", "tool", "device", "gauge", "lamp", "filament", "wire", "desk", "station", "nut", "chipper", "sander", "grinder", "press", "CNC", "drill", "mixer", "saw", "chisel", "caliper", "bolt", "key", "stand", "hook", "pliers", "lathe", "oscilloscope"],
#     "process": ["manufacturing", "assembly", "fabrication", "casting", "printing", "welding", "chopping", "wiring", "foundry", "molding", "processing", "snow blower", "track hardware", "rail fastening"],
#     "material": ["metal", "plastic", "composite", "brick", "wood", "rope", "ceramic", "mold", "steel", "material", "paint", "marble"],
#     "personnel": ["worker", "operator", "engineer", "welder", "crewmate"],
#     "quality": ["inspection", "testing", "measurement", "gage"],
#     "safety": ["protection", "hazard", "precaution", "helmet", "muff", "protector", "light", "safety", "mask", "voltage detector"],
#     "maintenance": ["repair", "upkeep", "sander", "grinder", "maintenance"],
#     "logistics": ["transport", "shipping", "truck", "vehicle", "utility", "clip"],
#     "automation": ["control", "CNC", "robot", "automated", "process"],
# }


# # Step 1: Label Analysis
# def analyze_label(label):
#     tokens = nltk.word_tokenize(label)
#     pos_tags = nltk.pos_tag(tokens)
    
#     concepts = []
#     current_concept = []
    
#     for word, pos in pos_tags:
#         if pos.startswith('NN') or pos.startswith('JJ'):
#             current_concept.append(word.lower())
#         else:
#             if current_concept:
#                 concepts.append(' '.join(current_concept))
#                 current_concept = []
    
#     if current_concept:
#         concepts.append(' '.join(current_concept))
    
#     return concepts

# df['analyzed_labels'] = df['label'].apply(analyze_label)

# # Step 2: Taxonomy Creation
# class TaxonomyNode:
#     def __init__(self, name):
#         self.name = name
#         self.children = {}
    
#     def add_child(self, child_name):
#         if child_name not in self.children:
#             self.children[child_name] = TaxonomyNode(child_name)
#         return self.children[child_name]

# root = TaxonomyNode('root')

# # Initialize the taxonomy with the manufacturing-specific categories
# for category in manufacturing_taxonomy.keys():
#     root.add_child(category)

# # Create a frequency dictionary for concepts
# concept_freq = {}
# for concepts in df['analyzed_labels']:
#     for concept in concepts:
#         concept_freq[concept] = concept_freq.get(concept, 0) + 1

# # Sort concepts by frequency
# sorted_concepts = sorted(concept_freq.items(), key=lambda x: x[1], reverse=True)

# # Function to find the most similar category
# def find_most_similar_category(concept, categories):
#     if concept in word_vectors.key_to_index:
#         similarities = [
#             (category, 1 - cosine(word_vectors[concept], word_vectors[category]))
#             for category in categories
#             if category in word_vectors.key_to_index
#         ]
#         if similarities:
#             return max(similarities, key=lambda x: x[1])[0]
#     return None

# # Create taxonomy based on frequency, word similarity, and manufacturing categories
# for concept, _ in sorted_concepts:
#     parts = concept.split()
#     current_node = root
#     for part in parts:
#         # Check if the part fits into any existing manufacturing category
#         category = find_most_similar_category(part, manufacturing_taxonomy.keys())
#         if category and category in current_node.children:
#             current_node = current_node.children[category]
#         current_node = current_node.add_child(part)

# # Add subcategories to their respective categories
# for category, subcategories in manufacturing_taxonomy.items():
#     category_node = root.children[category]
#     for subcategory in subcategories:
#         if subcategory not in category_node.children:
#             category_node.add_child(subcategory)


# df['analyzed_labels'] = df['label'].apply(analyze_label)


# # If there are common terms that should be categorized, we can add them to the taxonomy
# # change the labels with the manufacturing_taxonomy there should be in total  5 labels
# # equipment, process, material, personnel, quality, safety
# # for example machine should be equipment, manufacturing should be process, metal should be material, worker should be personnel, inspection should be quality, protection should be safety
# # Function to find the most similar category
# def find_most_similar_category(concept, categories):
#     if concept in word_vectors.key_to_index:
#         similarities = []
#         for category in categories:
#             if category in word_vectors.key_to_index:
#                 similarity = 1 - cosine(word_vectors[concept], word_vectors[category])
#                 similarities.append((category, similarity))
#         if similarities:
#             return max(similarities, key=lambda x: x[1])[0]
#     return None

# # Function to map labels to the closest manufacturing taxonomy category
# def map_to_taxonomy(concepts, taxonomy):
#     for concept in concepts:
#         category = find_most_similar_category(concept, taxonomy.keys())
#         if category:
#             return category
#     return "unknown"  # If no match found, return 'unknown' or any default label

# # Update the DataFrame with the new labels
# df['taxonomy_label'] = df['analyzed_labels'].apply(lambda concepts: map_to_taxonomy(concepts, manufacturing_taxonomy))



In [66]:
df

Unnamed: 0,label,x1,y1,height,width,source,file_name,analyzed_labels,taxonomy_label
0,railway grinding machine,0.0,0.0,720.0,720.0,user,1711591656377,"[railway, machine]",maintenance
1,rail grinder,0.0,0.0,720.0,720.0,user,1711591656377,[rail grinder],unknown
2,rail grinder with worker,0.0,0.0,720.0,720.0,user,1711591656377,"[rail grinder, worker]",maintenance
3,railway grinding machine with worker,0.0,0.0,720.0,720.0,user,1711591656377,"[railway, machine, worker]",maintenance
4,worker,0.0,0.0,720.0,720.0,user,1711591656377,[worker],maintenance
...,...,...,...,...,...,...,...,...,...
23296,minibar t,0.0,0.0,2660.0,4000.0,wikimedia,1711580020679,[minibar t],unknown
23297,rail clamp,0.0,0.0,605.0,907.0,user,1711581093292,[rail clamp],unknown
23298,patrol rail fastening,0.0,0.0,2304.0,3456.0,wikimedia,1711581093292,[patrol rail fastening],unknown
23299,drill bit,0.0,0.0,651.0,907.0,user,1711566352395,[bit],quality


In [67]:
df['taxonomy_label'].value_counts()

taxonomy_label
unknown        11704
automation      2592
equipment       2343
material        1957
process         1786
maintenance      938
safety           919
quality          846
personnel        125
logistics         91
Name: count, dtype: int64

In [68]:
# Check a sample of the 'unknown' labels to understand why they weren't matched
unknown_labels = df[df['taxonomy_label'] == 'unknown']['label'].unique()
print("Sample of 'unknown' labels:")
print(unknown_labels[:100])  # Print first 50 unknown labels


Sample of 'unknown' labels:
['rail grinder' 'railgrinder of the united state' 'ceramic mold'
 'feeler gage' 'feeler gauge' 'milling machine'
 'computer numerical control' 'printing pres' 'marinoni rotary pres cam'
 'wood drill' 'work desk' 'work station' 'paint bottle' 'ear muff'
 'hearing protector' 'desk lamp' 'jeweller bench' 'wood chipper'
 'traffic cone' 'square nut' 'utility vehicle' 'jack stand' 'scissor jack'
 'crane hook' 'safety helmet' 'grinder spark' 'door handle' 'door lock'
 'from woltersum' 'steel foundry' 'ladle metallurgy' 'hand sander'
 'random orbital sander' 'angle grinder' 'light bulb' 'halogen light bulb'
 'incandescent light bulb' 'machine screw' 'gage block' 'gauge block'
 'portable hand coffee mill' 'hand tool' 'outside caliper' 'hand chisel'
 'hack saw' 'c clamp' 'try square' 'industrial machine' 'vertical mixer'
 'heavyduty mixer' 'material mixer' 'industrial processor' 'batch mixer'
 'mechanical mixer' 'casting machine' 'chamberlain key' 'correction tape'
 '