# New Implementation

In [1]:
import pandas as pd
import re

# Predefined mapping from technology to abbreviation.
# abbreviations = {
#     "Ada": "Ada",
#     "Apex": "Apex",
#     "Assembly": "ASM",  
#     "Bash/Shell (all shells)": "Bash",
#     "C": "C",
#     "C#": "C#",
#     "C++": "C++",
#     "Clojure": "Clojure",
#     "Cobol": "COBOL",
#     "Crystal": "Crystal",
#     "Dart": "Dart",
#     "Delphi": "Delphi",
#     "Elixir": "Elixir",
#     "Erlang": "Erlang",
#     "F#": "F#",
#     "Fortran": "Fortran",
#     "GDScript": "GDScript",
#     "Go": "Go",
#     "Groovy": "Groovy",
#     "Haskell": "Haskell",
#     "HTML/CSS": "HTML/CSS",
#     "Java": "Java",
#     "JavaScript": "JS",
#     "Julia": "Julia",
#     "Kotlin": "Kotlin",
#     "Lisp": "Lisp",
#     "Lua": "Lua",
#     "MATLAB": "MATLAB",
#     "MicroPython": "MicroPython",
#     "Nim": "Nim",
#     "Objective-C": "Obj-C",
#     "OCaml": "OCaml",
#     "Perl": "Perl",
#     "PHP": "PHP",
#     "PowerShell": "PowerShell",
#     "Prolog": "Prolog",
#     "Python": "Python",
#     "R": "R",
#     "Ruby": "Ruby",
#     "Rust": "Rust",
#     "Scala": "Scala",
#     "Solidity": "Solidity",
#     "SQL": "SQL",
#     "Swift": "Swift",
#     "TypeScript": "TS",
#     "VBA": "VBA",
#     "Visual Basic (.Net)": "VB.NET",
#     "Zephyr": "Zephyr",
#     "Zig": "Zig"
# }

# def update_technology_abbreviations(csv_path, target_column):
#     df = pd.read_csv(csv_path)
#     summary_changes = {}
    
#     def process_cell(cell):
#         if not isinstance(cell, str):
#             return cell
#         items = [item.strip() for item in cell.split(";") if item.strip()]
#         new_items = []
#         for item in items:
#             if re.search(r'\(.*\)', item):
#                 new_items.append(item)
#                 continue
#             updated_item = item
#             for tech, abbr in abbreviations.items():
#                 pattern = re.compile(rf"^{re.escape(tech)}$", re.IGNORECASE)
#                 if pattern.match(item):
#                     updated_item = f"{tech} ({abbr})"
#                     summary_changes[item] = updated_item
#                     break
#             new_items.append(updated_item)
#         return "; ".join(new_items)
    
#     df[target_column] = df[target_column].apply(process_cell)
#     return df, summary_changes

if __name__ == "__main__":
    csv_file = "clean_v2.csv"
    target_col = "LanguageHaveWorkedWith"
    updated_df, changes = update_technology_abbreviations(csv_file, target_col)
    print("Summary of changes:")
    for orig, updated in changes.items():
        print(f"{orig} -> {updated}")
    updated_df.to_csv("clean_v3.csv", index=False)


NameError: name 'update_technology_abbreviations' is not defined

In [17]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
import networkx as nx
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Ensure required NLTK resources are available.
# nltk.download('punkt')
# nltk.download('stopwords')

class ConceptMatcher:
    def __init__(
        self,
        csv_path="clean_v2_updated.csv",  # Use updated CSV with abbreviations
        columns=None,
        model_name="all-mpnet-base-v2",
        similarity_threshold_graph=0.7,
        ngram_threshold=0.5,
        filter_similarity_threshold=0.85
    ):
        if columns is None:
            columns = [
                "LanguageHaveWorkedWith", "DatabaseHaveWorkedWith", "PlatformHaveWorkedWith",
                "WebframeHaveWorkedWith", "EmbeddedHaveWorkedWith", "MiscTechHaveWorkedWith",
                "ToolsTechHaveWorkedWith"
            ]
        self.csv_path = csv_path
        self.columns = columns
        self.model_name = model_name
        self.similarity_threshold_graph = similarity_threshold_graph
        self.ngram_threshold = ngram_threshold
        self.filter_similarity_threshold = filter_similarity_threshold

        # Initialize NLTK stop words and custom filter words.
        self.stop_words = set(stopwords.words('english'))
        self.custom_filter_words = {
            'additionally', 'also', 'furthermore', 'moreover', 'including', 'like', 'career', 'etc'
        }

        # Initialize the SentenceTransformer model.
        self.model = SentenceTransformer(self.model_name)

        # Placeholders for later processing.
        self.stack_concepts = []          # List of concept dictionaries.
        self.concept_embeddings = None    # Numpy array of concept embeddings.
        self.candidate_phrases = []       # Candidate n‑gram phrases from input text.
        self.candidate_embeddings = None  # Numpy array of candidate embeddings.
        self.recognized_candidates_ngram = []  # Matched candidates with similarity scores.
        self.filtered_by_concept = {}     # Final grouped output after global filtering.
        self.graph = None                 # Optional similarity graph.

    def clean_text(self, text):
        """
        Lowercase the text and remove punctuation except for hyphens and parentheses.
        """
        # Preserve '-' and parentheses by removing other punctuation.
        punctuation_to_remove = "".join(ch for ch in string.punctuation if ch not in "-()")
        text = text.lower().translate(str.maketrans("", "", punctuation_to_remove))
        tokens = word_tokenize(text)
        tokens = [token for token in tokens if token not in self.stop_words]
        return " ".join(tokens)

    def is_meaningful(self, phrase):
        tokens = [t.lower() for t in word_tokenize(phrase) if t.isalpha()]
        if not tokens:
            return False
        if any(token in self.custom_filter_words for token in tokens):
            return False
        if len(tokens) == 1 and tokens[0] in self.stop_words:
            return False
        if tokens and sum(1 for t in tokens if t in self.stop_words) / len(tokens) > 0.5:
            return False
        return True

    def load_concepts(self):
        df = pd.read_csv(self.csv_path)
        concepts_dict = {}
        for col in self.columns:
            cells = df[col].dropna().tolist()
            for cell in cells:
                items = [item.strip() for item in cell.split(";") if item.strip()]
                for item in items:
                    concepts_dict[item] = col
        self.stack_concepts = [{"name": name, "type": ctype} for name, ctype in concepts_dict.items()]
        print(f"Total StackOverflow Concepts: {len(self.stack_concepts)}")

    def generate_concept_embeddings(self, save_embeddings=True):
        concept_texts = [concept["name"] for concept in self.stack_concepts]
        self.concept_embeddings = self.model.encode(concept_texts, convert_to_numpy=True)
        if save_embeddings:
            filename = f"stack_concept_embeddings_{self.model_name.replace('/', '_')}.npy"
            np.save(filename, self.concept_embeddings)
            print(f"Concept embeddings saved to {filename}")

    def build_similarity_graph(self):
        self.graph = nx.Graph()
        concept_texts = [concept["name"] for concept in self.stack_concepts]
        for concept in self.stack_concepts:
            self.graph.add_node(concept["name"], category=concept["type"])
        sim_matrix = cosine_similarity(self.concept_embeddings)
        for i in range(len(concept_texts)):
            for j in range(i + 1, len(concept_texts)):
                if sim_matrix[i][j] >= self.similarity_threshold_graph:
                    self.graph.add_edge(concept_texts[i], concept_texts[j], weight=sim_matrix[i][j])
        print(f"Graph contains {len(self.graph.nodes)} nodes and {len(self.graph.edges)} edges.")

    def prepare_candidate_phrases(self, long_text):
        cleaned_full_text = self.clean_text(long_text)
        tokens_clean = word_tokenize(cleaned_full_text)
        candidate_phrases = []
        for n in [3, 2, 1]:
            for gram in ngrams(tokens_clean, n):
                phrase = " ".join(gram)
                if phrase.strip() and self.is_meaningful(phrase):
                    candidate_phrases.append(phrase)
        self.candidate_phrases = list(set(candidate_phrases))
        print(f"Total candidate phrases generated: {len(self.candidate_phrases)}")

    def vectorized_match_candidates(self):
        self.candidate_embeddings = self.model.encode(self.candidate_phrases, convert_to_numpy=True)
        similarity_matrix = cosine_similarity(self.candidate_embeddings, self.concept_embeddings)
        max_similarities = similarity_matrix.max(axis=1)
        max_indices = similarity_matrix.argmax(axis=1)
        valid_indices = np.where(max_similarities >= self.ngram_threshold)[0]
        self.recognized_candidates_ngram = []
        for idx in valid_indices:
            max_sim = max_similarities[idx]
            max_idx = max_indices[idx]
            concept_name = self.stack_concepts[max_idx]["name"]
            concept_type = self.stack_concepts[max_idx]["type"]
            phrase = self.candidate_phrases[idx]
            n_val = len(phrase.split())
            tokens_phrase = phrase.split()
            self.recognized_candidates_ngram.append(
                (concept_name, concept_type, phrase, max_sim, n_val, tokens_phrase)
            )
        print(f"Total recognized candidate matches: {len(self.recognized_candidates_ngram)}")

    def global_filtering(self):
        recognized = sorted(self.recognized_candidates_ngram, key=lambda x: x[3], reverse=True)
        global_used_words = set()
        filtered_candidates = []
        for candidate in recognized:
            concept_name, concept_type, phrase, score, n_val, tokens_phrase = candidate
            if any(token in global_used_words for token in tokens_phrase):
                continue
            filtered_candidates.append(candidate)
            if score > self.filter_similarity_threshold:
                global_used_words.update(tokens_phrase)
        self.filtered_by_concept = {}
        for concept_name, concept_type, phrase, score, n_val, tokens_phrase in filtered_candidates:
            self.filtered_by_concept.setdefault(concept_name, {"type": concept_type, "phrases": []})
            self.filtered_by_concept[concept_name]["phrases"].append((phrase, score, n_val, tokens_phrase))
        print("Global filtering completed.")

    def print_results(self):
        print("\nGlobally Filtered Recognized Concepts using n‑gram detection (from StackOverflow data):")
        print("=" * 60)
        for concept, info in self.filtered_by_concept.items():
            concept_type = info["type"]
            print(f"Concept: {concept} ({concept_type})")
            for phrase, score, n_val, tokens_phrase in sorted(info["phrases"], key=lambda x: x[1], reverse=True):
                print(f"    Detected {n_val}-gram: '{phrase}' with similarity {score:.2f}")
            print("-" * 60)

# Example usage:
if __name__ == "__main__":
    matcher = ConceptMatcher(
        csv_path="clean_v2_updated.csv",
        model_name="all-mpnet-base-v2",
        similarity_threshold_graph=0.7,
        ngram_threshold=0.5,
        filter_similarity_threshold=0.85
    )
    matcher.load_concepts()
    matcher.generate_concept_embeddings()
    matcher.build_similarity_graph()
    
    # Sample text now includes "Obj-C" as written.
    sample_text = """
    I have extensive experience in data analysis and have worked with a variety of technologies including Microsoft SQL Server,
    Python, Obj-C, cloud computing platforms like AWS, and I am proficient with machine learning techniques. My background also includes
    developing user interfaces with modern tools. Additionally, I have hands-on experience with business intelligence and
    graphical user interface design.
    """
    matcher.prepare_candidate_phrases(sample_text)
    matcher.vectorized_match_candidates()
    matcher.global_filtering()
    matcher.print_results()


Total StackOverflow Concepts: 238
Concept embeddings saved to stack_concept_embeddings_all-mpnet-base-v2.npy
Graph contains 238 nodes and 5 edges.
Total candidate phrases generated: 86
Total recognized candidate matches: 22
Global filtering completed.

Globally Filtered Recognized Concepts using n‑gram detection (from StackOverflow data):
Concept: Microsoft SQL Server (DatabaseHaveWorkedWith)
    Detected 3-gram: 'microsoft sql server' with similarity 1.00
------------------------------------------------------------
Concept: Amazon Web Services (AWS) (PlatformHaveWorkedWith)
    Detected 1-gram: 'aws' with similarity 0.81
    Detected 3-gram: 'aws proficient machine' with similarity 0.59
    Detected 2-gram: 'aws proficient' with similarity 0.57
------------------------------------------------------------
Concept: Python (Python) (LanguageHaveWorkedWith)
    Detected 1-gram: 'python' with similarity 0.81
------------------------------------------------------------
Concept: Google Cloud