In [1]:
# import libraries
import os 
import csv
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from PIL import Image
import glob
from PIL import ImageFile
from sklearn.cluster import KMeans
import transformers
from sentence_transformers import SentenceTransformer
from PIL import Image, UnidentifiedImageError
import sklearn
ImageFile.LOAD_TRUNCATED_IMAGES = True
import re
import random
from spellchecker import SpellChecker
import inflect

# don't show the warning
import warnings
warnings.filterwarnings("ignore")

In [1]:
import pandas as pd
import nltk
from nltk.corpus import wordnet
from tqdm import tqdm
import gensim.downloader as api
from gensim.models import KeyedVectors
import numpy as np
from scipy.spatial.distance import cosine

# Download the NLTK resources this notebook relies on.
# `nltk.word_tokenize` (used by analyze_label) needs the Punkt tokenizer data and
# `nltk.pos_tag` needs the perceptron tagger. Newer NLTK releases look these up
# under the `punkt_tab` / `averaged_perceptron_tagger_eng` names, so both the
# legacy and the current identifiers are requested. With default arguments
# `nltk.download` reports an unavailable identifier instead of raising.
for nltk_resource in (
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'punkt_tab',
):
    nltk.download(nltk_resource)

# Load pre-trained word embeddings
print("Loading word embeddings...")
word_vectors = api.load("glove-wiki-gigaword-100")

# Load your dataset
df = pd.read_csv('final_data_exploration.csv')

# Define a basic manufacturing taxonomy: top-level category -> seed subcategories.
manufacturing_taxonomy = {
    "equipment": ["machine", "tool", "device"],
    "process": ["manufacturing", "assembly", "fabrication"],
    "material": ["metal", "plastic", "composite"],
    "personnel": ["worker", "operator", "engineer"],
    "quality": ["inspection", "testing", "measurement"],
    "safety": ["protection", "hazard", "precaution"]
}

# Step 1: Label Analysis
def analyze_label(label):
    """Extract noun/adjective phrases ("concepts") from a raw label.

    The label is tokenized and POS-tagged with NLTK. Every maximal run of
    consecutive tokens whose tag starts with ``NN`` or ``JJ`` is lower-cased
    and joined with single spaces into one concept.

    Args:
        label: Label text. Non-string values (for example the ``NaN`` that
            pandas produces for empty CSV cells) are treated as having no
            concepts instead of crashing the tokenizer.

    Returns:
        list[str]: Concepts in order of appearance; empty if none were found.
    """
    if not isinstance(label, str):
        return []

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(label))

    concepts = []
    current_concept = []
    for word, pos in tagged_tokens:
        if pos.startswith(('NN', 'JJ')):
            current_concept.append(word.lower())
        elif current_concept:
            # Any other tag closes the phrase collected so far.
            concepts.append(' '.join(current_concept))
            current_concept = []

    # Flush a phrase that runs to the end of the label.
    if current_concept:
        concepts.append(' '.join(current_concept))

    return concepts

# One list of extracted concepts per row, stored next to the raw label.
df['analyzed_labels'] = df['label'].map(analyze_label)

# Step 2: Taxonomy Creation
class TaxonomyNode:
    """A named node of the taxonomy tree, holding its children by name."""

    def __init__(self, name):
        """Create a node called ``name`` with no children."""
        self.name = name
        # Maps child name -> TaxonomyNode, in insertion order.
        self.children = {}

    def add_child(self, child_name):
        """Return the child called ``child_name``, creating it if absent.

        An existing child is returned unchanged, so repeated calls with the
        same name always yield the same node.
        """
        existing = self.children.get(child_name)
        if existing is None:
            existing = TaxonomyNode(child_name)
            self.children[child_name] = existing
        return existing

root = TaxonomyNode('root')

# The manufacturing categories become the first level below the root.
for category in manufacturing_taxonomy:
    root.add_child(category)

# Count how often each concept occurs across all analyzed labels.
concept_freq = {}
for concepts in df['analyzed_labels']:
    for concept in concepts:
        if concept in concept_freq:
            concept_freq[concept] += 1
        else:
            concept_freq[concept] = 1

# Most frequent concepts first; the stable sort keeps first-seen order among ties.
sorted_concepts = sorted(concept_freq.items(), key=lambda pair: -pair[1])

# Function to find the most similar category
def find_most_similar_category(concept, categories):
    """Pick the category whose embedding is closest to ``concept``.

    Closeness is ``1 - cosine(...)`` between the ``word_vectors`` entries.
    Categories without an embedding are skipped.

    Args:
        concept: Word to classify.
        categories: Iterable of candidate category names.

    Returns:
        The best-scoring category name, or ``None`` when ``concept`` has no
        embedding or none of the categories has one.
    """
    vocabulary = word_vectors.key_to_index
    if concept not in vocabulary:
        return None

    concept_vector = word_vectors[concept]
    scored = []
    for name in categories:
        if name in vocabulary:
            scored.append((name, 1 - cosine(concept_vector, word_vectors[name])))

    if not scored:
        return None

    best_name, _best_score = max(scored, key=lambda pair: pair[1])
    return best_name

# Create taxonomy based on frequency, word similarity, and manufacturing categories
for concept, _ in sorted_concepts:
    parts = concept.split()
    current_node = root
    for part in parts:
        # Which manufacturing category does this word resemble most (if any)?
        category = find_most_similar_category(part, manufacturing_taxonomy.keys())
        # Step into that category only when it hangs directly off the current node.
        category_node = current_node.children.get(category) if category else None
        if category_node is not None:
            current_node = category_node
        current_node = current_node.add_child(part)

# Make sure every predefined subcategory exists under its category.
# add_child leaves an already-present child untouched, so no membership test is needed.
for category, subcategories in manufacturing_taxonomy.items():
    category_node = root.children[category]
    for subcategory in subcategories:
        category_node.add_child(subcategory)

# Step 3: Data Annotation
def get_path(node, label, similarity_threshold=0.5):
    """Map a label onto a path of node names through the taxonomy.

    The label is lower-cased and split on whitespace. Starting at ``node``,
    each word is matched against the current node in this order:

    1. exact match with a direct child;
    2. exact match with a grandchild. The intermediate child is added to the
       path too. The tree is built with concept words stored *below* a
       category node, so without this step a label's first word could never
       reach its own node from the root;
    3. the direct child with the highest ``word_vectors.similarity`` to the
       word, accepted only if that similarity exceeds ``similarity_threshold``.

    Matching stops at the first word that cannot be placed.

    Args:
        node: Node to start from; must expose ``children`` (name -> node).
        label: Label text. Non-string values (e.g. ``NaN`` from an empty CSV
            cell) yield ``None``.
        similarity_threshold: Minimum (exclusive) similarity for step 3.

    Returns:
        list[str] starting with ``'root'`` followed by the matched node names,
        or ``None`` when not even the first word could be placed.
    """
    if not isinstance(label, str):
        return None

    vocabulary = word_vectors.key_to_index
    path = ['root']
    current_node = node

    for part in label.lower().split():
        children = current_node.children

        # 1. Direct child with exactly this name.
        if part in children:
            path.append(part)
            current_node = children[part]
            continue

        # 2. Exact match one level down (e.g. root > equipment > chainsaw).
        via = next(
            (name for name, child in children.items() if part in child.children),
            None,
        )
        if via is not None:
            path.extend((via, part))
            current_node = children[via].children[part]
            continue

        # 3. Fall back to the most similar direct child by word embedding.
        if part not in vocabulary:
            break
        most_similar = None
        highest_similarity = -1
        for child in children:
            if child in vocabulary:
                similarity = word_vectors.similarity(child, part)
                if similarity > highest_similarity:
                    highest_similarity = similarity
                    most_similar = child

        if most_similar is None or not highest_similarity > similarity_threshold:
            break
        path.append(most_similar)
        current_node = children[most_similar]

    return path if len(path) > 1 else None

# Attach the taxonomy path (or None) for every row's label.
df['hierarchical_label'] = df['label'].map(lambda label: get_path(root, label))

# Persist the annotated rows without the index column.
df.to_csv('annotated_dataset.csv', index=False)

# Print the taxonomy
def print_taxonomy(node, level=0):
    """Print the subtree rooted at ``node``, one name per line.

    Each line is indented by two spaces per depth level, starting at
    ``level``; children follow their parent in insertion order.
    """
    pending = [(node, level)]
    while pending:
        current, depth = pending.pop()
        print('  ' * depth + current.name)
        # Push in reverse so the first child is popped (and printed) first.
        for child in reversed(list(current.children.values())):
            pending.append((child, depth + 1))

# Dump the automatically built tree so it can be inspected in the cell output.
print("Taxonomy structure:")
print_taxonomy(root)

# Validation step
def validate_taxonomy(node, path=None, max_depth=5, max_children=10):
    """Collect structural warnings for the subtree rooted at ``node``.

    Two conditions are reported, visiting each node before its children:

    * the path from the starting node down to this node has more than
      ``max_depth`` entries;
    * the node has more than ``max_children`` direct children.

    Args:
        node: Subtree root; must expose ``name`` and ``children``.
        path: Names of the ancestors above ``node`` (``None`` means no
            ancestors). The argument is never modified.
        max_depth: Largest path length that is not reported.
        max_children: Largest child count that is not reported.

    Returns:
        list[str]: Human-readable issue descriptions; empty if none.
    """
    issues = []
    # Build a fresh list so neither the caller's list nor a shared default is touched.
    current_path = list(path or ()) + [node.name]
    trail = ' > '.join(current_path)

    # Check for very deep branches
    if len(current_path) > max_depth:
        issues.append(f"Deep branch detected: {trail}")

    # Check for nodes with too many children
    if len(node.children) > max_children:
        issues.append(f"Node with many children: {trail}, Children count: {len(node.children)}")

    # Recursive call for children, carrying the same limits down
    for child in node.children.values():
        issues.extend(validate_taxonomy(child, current_path, max_depth, max_children))

    return issues

print("\nValidating taxonomy...")
validation_issues = validate_taxonomy(root)

# Report each finding as a bullet, or state that the tree is clean.
if not validation_issues:
    print("No validation issues found.")
else:
    print("Validation issues found:")
    for issue in validation_issues:
        print("- " + str(issue))

# Manual adjustment function
def _split_taxonomy_path(raw):
    """Turn ``"root > a > b"`` into ``['a', 'b']``: trimmed names, leading root dropped."""
    parts = [piece.strip() for piece in raw.split('>')]
    parts = [piece for piece in parts if piece]
    # Traversal always starts at the root, so a leading root token is redundant.
    if parts and parts[0] == root.name:
        parts = parts[1:]
    return parts


def _resolve_taxonomy_path(parts):
    """Walk ``parts`` down from ``root``; return the node reached, or ``None`` if a step is missing."""
    current = root
    for name in parts:
        current = current.children.get(name)
        if current is None:
            return None
    return current


def _subtree_contains(top, target):
    """Return True when ``target`` is ``top`` itself or one of its descendants."""
    stack = [top]
    while stack:
        current = stack.pop()
        if current is target:
            return True
        stack.extend(current.children.values())
    return False


def manual_adjust_taxonomy():
    """Interactively add, move or delete nodes of the global ``root`` taxonomy.

    Repeatedly prompts for an action until ``done`` is entered. Paths are
    typed as names separated by ``>``; surrounding whitespace is ignored and
    the leading ``root`` shown in the prompt's example is optional.

    * ``add``    - path of the parent, then the new node's name.
    * ``delete`` - path of the node to remove (its subtree goes with it).
    * ``move``   - path of the node, then the path of its new parent. The
      node is detached only after the destination has been validated, so a
      mistyped destination leaves the tree unchanged. Moving a node into its
      own subtree, or onto an existing child of the same name, is refused.

    Returns:
        None. The tree is modified in place and progress is printed.
    """
    while True:
        action = input("\nEnter action (add/move/delete/done): ").strip().lower()
        if action == 'done':
            break
        if action not in ('add', 'move', 'delete'):
            print("Invalid action. Please enter add, move, delete, or done.")
            continue

        path = _split_taxonomy_path(input("Enter path (e.g., root > equipment > machine): "))

        if action == 'add':
            new_name = input("Enter new node name: ").strip()
            parent = _resolve_taxonomy_path(path)
            shown = ' > '.join([root.name] + path)
            if parent is None:
                print(f"Path not found: {shown}")
            elif not new_name:
                print("Node name must not be empty.")
            else:
                parent.add_child(new_name)
                print(f"Added {new_name} to {shown}")
            continue

        # move / delete: the last path element names the node, the rest its parent.
        if not path:
            print("The root node cannot be moved or deleted.")
            continue
        node_name = path.pop()
        shown = ' > '.join([root.name] + path)
        parent = _resolve_taxonomy_path(path)
        if parent is None:
            print(f"Path not found: {shown}")
            continue
        if node_name not in parent.children:
            print(f"Node not found: {node_name}")
            continue

        if action == 'delete':
            del parent.children[node_name]
            print(f"Deleted {node_name} from {shown}")
            continue

        # action == 'move': validate the destination BEFORE detaching anything.
        node_to_move = parent.children[node_name]
        new_parent_path = _split_taxonomy_path(input("Enter new parent path: "))
        new_shown = ' > '.join([root.name] + new_parent_path)
        new_parent = _resolve_taxonomy_path(new_parent_path)
        if new_parent is None:
            print(f"New parent path not found: {new_shown}")
        elif _subtree_contains(node_to_move, new_parent):
            print(f"Cannot move {node_name} into itself or one of its descendants.")
        elif new_parent is not parent and node_name in new_parent.children:
            print(f"{new_shown} already has a child named {node_name}.")
        else:
            del parent.children[node_name]
            new_parent.children[node_name] = node_to_move
            print(f"Moved {node_name} to {new_shown}")

print("\nManual Taxonomy Adjustment")
print("You can now manually adjust the taxonomy.")
# The interactive editor is disabled here (presumably so the cell runs
# unattended); uncomment the call to adjust the tree by hand.
# manual_adjust_taxonomy()

# Recompute the paths so they reflect the tree as it stands now, then save.
df['hierarchical_label'] = df['label'].map(lambda label: get_path(root, label))
df.to_csv('final_annotated_dataset.csv', index=False)

print("\nFinal Taxonomy structure:")
print_taxonomy(root)

print("\nAnnotation complete. Final dataset saved as 'final_annotated_dataset.csv'")

[nltk_data] Downloading package wordnet to /home/trkosire/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/trkosire/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Loading word embeddings...
Taxonomy structure:
root
  equipment
    scissors
    chainsaw
      chain
      chap
      blade
      pie
      safety
        clothing
      tooth
      link
    tool
      bit
      post
      belt
      chest
      grinder
      box
      crib
      battery
      insert
      bench
      tip
      end
      kit
      head
      handler
      organization
      cabinet
      holder
      storage
        area
      presenter
      cutter
        grinder
    lathe
      information
      operator
      chuck
        insert
      tailstock
        assembly
        center
      tool
        post
      way
      cutter
      center
      carriage
      headstock
        assembly
      bit
      tail
        stock
      compound
        rest
      workpiece
      mandrel
      goalpost
      dog
      gearbox
      control
      tumbler
      head
        stock
      chunk
      work
        piece
      outside
      feedstock
      jaw
      clamp
      machin

In [2]:
# Re-read the source CSV and rebind `df`. Columns added to the previous `df`
# by earlier cells ('analyzed_labels', 'hierarchical_label') are not carried over.
df = pd.read_csv('final_data_exploration.csv')

In [3]:
# Cluster the distinct labels by sentence-embedding similarity, choose the
# number of clusters by silhouette score, and store each row's cluster id.

# `sklearn.metrics` is imported explicitly: a bare `import sklearn` does not
# guarantee that the submodule is available as an attribute.
from sklearn.metrics import silhouette_score

model = SentenceTransformer('all-mpnet-base-v2')

# Embed every distinct label exactly once. The embeddings, the cluster ids and
# the label -> cluster mapping built below then share one order and length.
# (Embedding all rows but zipping against the unique labels pairs labels with
# the wrong cluster ids as soon as a label repeats.)
unique_labels = df['label'].dropna().unique().tolist()
label_embeddings = model.encode(unique_labels)

max_clusters = 300
# The silhouette score is only defined for 2 <= k <= n_samples - 1.
largest_k = min(max_clusters, len(unique_labels) - 1)
if largest_k < 2:
    raise ValueError(
        f"Need at least 3 distinct labels to choose a cluster count, got {len(unique_labels)}"
    )

all_silhouette_scores = []
best_score = None
for num_clusters in range(2, largest_k + 1):
    candidate = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
    candidate_labels = candidate.fit_predict(label_embeddings)
    silhouette_sc = silhouette_score(label_embeddings, candidate_labels)
    all_silhouette_scores.append(silhouette_sc)
    # Keep the first best-scoring fit itself, so the clustering that is used is
    # exactly the one that was evaluated (no refit with different settings).
    if best_score is None or silhouette_sc > best_score:
        best_score = silhouette_sc
        optimal_clusters = num_clusters
        kmeans = candidate
        cluster_labels = candidate_labels

label_to_cluster = dict(zip(unique_labels, cluster_labels))
df[f'optimal_clusters_{optimal_clusters}'] = df['label'].map(label_to_cluster)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
