# Stack Overflow Technology Embedding and Matching

In [None]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
import networkx as nx
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Ensure required NLTK resources are available.
# nltk.download('punkt')
# nltk.download('stopwords')

class ConceptMatcher:
    """Recognize Stack Overflow technologies mentioned in free text.

    The intended call order, with each step storing its result on the
    instance, is: ``load_concepts`` -> ``generate_concept_embeddings`` ->
    (optionally) ``build_similarity_graph`` -> ``prepare_candidate_phrases``
    -> ``vectorized_match_candidates`` -> ``global_filtering`` ->
    ``print_results`` / ``get_recognized_technologies``.
    """

    def __init__(
        self,
        csv_path="../data/processed/technologies_with_abbreviations.csv",  # CSV with technology names and abbreviations
        columns=None,
        model_name="all-mpnet-base-v2",
        similarity_threshold_graph=0.7,
        ngram_threshold=0.5,
        filter_similarity_threshold=0.85
    ):
        """
        Store the configuration and load the stop words and embedding model.

        Parameters:
        - csv_path: CSV read by ``load_concepts``; it is accessed through its
          ``Technology`` and ``abrv`` columns.
        - columns: Survey column names; stored on the instance but not read by
          any method of this class.
        - model_name: Name handed to ``SentenceTransformer``.
        - similarity_threshold_graph: Minimum cosine similarity for an edge in
          ``build_similarity_graph``.
        - ngram_threshold: Minimum cosine similarity for a candidate phrase to
          count as a match.
        - filter_similarity_threshold: Score above which an accepted match
          reserves its tokens in ``global_filtering``.
        """
        if columns is None:
            columns = [
                "LanguageHaveWorkedWith", "DatabaseHaveWorkedWith", "PlatformHaveWorkedWith",
                "WebframeHaveWorkedWith", "EmbeddedHaveWorkedWith", "MiscTechHaveWorkedWith",
                "ToolsTechHaveWorkedWith"
            ]
        self.csv_path = csv_path
        self.columns = columns
        self.model_name = model_name
        self.similarity_threshold_graph = similarity_threshold_graph
        self.ngram_threshold = ngram_threshold
        self.filter_similarity_threshold = filter_similarity_threshold

        # Initialize NLTK stop words and custom filter words.
        self.stop_words = set(stopwords.words('english'))
        self.custom_filter_words = {
            'additionally', 'also', 'furthermore', 'moreover', 'including', 'like', 'career', 'etc'
        }

        # Initialize the SentenceTransformer model.
        self.model = SentenceTransformer(self.model_name)

        # Placeholders for later processing.
        self.tech_data = None          # DataFrame to store technology names and abbreviations
        self.stack_concepts = []       # List of concept dictionaries.
        self.concept_embeddings = None # Numpy array of concept embeddings.
        self.candidate_phrases = []    # Candidate n-gram phrases from input text.
        self.candidate_embeddings = None  # Numpy array of candidate embeddings.
        self.recognized_candidates_ngram = []  # Matched candidates with similarity scores.
        self.filtered_by_concept = {}     # Final grouped output after global filtering.
        self.graph = None                 # Optional similarity graph.

    def clean_text(self, text):
        """
        Lowercase the text, remove punctuation except for hyphens and
        parentheses, and drop English stop words.

        Returns the remaining tokens joined by single spaces.
        """
        # Preserve '-' and parentheses by removing other punctuation.
        punctuation_to_remove = "".join(ch for ch in string.punctuation if ch not in "-()")
        text = text.lower().translate(str.maketrans("", "", punctuation_to_remove))
        tokens = word_tokenize(text)
        tokens = [token for token in tokens if token not in self.stop_words]
        return " ".join(tokens)

    def is_meaningful(self, phrase):
        """
        Decide whether a candidate phrase is worth embedding.

        Only purely alphabetic tokens are inspected. The phrase is rejected
        when it has no such token, contains a custom filter word, or more than
        half of those tokens are stop words.
        """
        tokens = [t.lower() for t in word_tokenize(phrase) if t.isalpha()]
        if not tokens:
            return False
        if any(token in self.custom_filter_words for token in tokens):
            return False
        # A lone stop word has a ratio of 1.0, so this also covers the
        # single-token case.
        stop_word_count = sum(1 for t in tokens if t in self.stop_words)
        return stop_word_count / len(tokens) <= 0.5

    def load_concepts(self):
        """
        Read the technology CSV and build ``self.stack_concepts``.

        Every technology name is added first, followed by one entry per
        non-empty abbreviation. The position of an entry in the list is the row
        index of its embedding, so this order must stay stable.
        """
        # Load the technology data with abbreviations
        self.tech_data = pd.read_csv(self.csv_path)
        print(f"Loaded technologies data with {len(self.tech_data)} entries")

        technologies = self.tech_data['Technology']

        # Add technology names first
        self.stack_concepts = [
            {"name": tech_name, "type": "Technology", "original": tech_name}
            for tech_name in technologies
        ]

        # Add abbreviations if they exist
        for tech_name, abbr in zip(technologies, self.tech_data['abrv']):
            if pd.isna(abbr):
                continue
            # str() guards against abbreviations that pandas parsed as numbers.
            abbr = str(abbr).strip()
            if abbr:
                self.stack_concepts.append({"name": abbr, "type": "Abbreviation", "original": tech_name})

        print(f"Total StackOverflow Concepts (including abbreviations): {len(self.stack_concepts)}")

    def generate_concept_embeddings(self, save_embeddings=True, load_if_exists=True):
        """
        Generate embeddings for all technology concepts or load existing ones if available.

        Cached embeddings are only reused when their row count matches the
        loaded concepts; a stale cache would otherwise map similarity rows to
        the wrong concept.

        Parameters:
        - save_embeddings: Whether to save newly generated embeddings
        - load_if_exists: Whether to try loading existing embeddings first
        """
        filename = f"stack_concept_embeddings_{self.model_name.replace('/', '_')}.npy"

        # Try to load existing embeddings if requested
        if load_if_exists:
            try:
                cached = np.load(filename)
            except FileNotFoundError:
                print(f"No existing embeddings found at {filename}, generating new ones...")
            else:
                # With no concepts loaded there is nothing to validate against,
                # so the cache is accepted as before.
                if not self.stack_concepts or len(cached) == len(self.stack_concepts):
                    self.concept_embeddings = cached
                    print(f"Loaded existing concept embeddings from {filename}")
                    return
                print(
                    f"Cached embeddings at {filename} have {len(cached)} rows but "
                    f"{len(self.stack_concepts)} concepts are loaded, generating new ones..."
                )

        # Generate new embeddings
        concept_texts = [concept["name"] for concept in self.stack_concepts]
        self.concept_embeddings = self.model.encode(concept_texts, convert_to_numpy=True)

        if save_embeddings:
            np.save(filename, self.concept_embeddings)
            print(f"Concept embeddings saved to {filename}")

    def build_similarity_graph(self):
        """
        Build ``self.graph``: one node per concept name and an edge between
        every pair whose embedding cosine similarity reaches
        ``similarity_threshold_graph``.
        """
        self.graph = nx.Graph()
        concept_texts = [concept["name"] for concept in self.stack_concepts]
        for concept in self.stack_concepts:
            self.graph.add_node(concept["name"], category=concept["type"], original=concept.get("original", concept["name"]))
        sim_matrix = cosine_similarity(self.concept_embeddings)

        # Visit each unordered pair once (strict upper triangle) and let numpy
        # do the thresholding instead of a Python double loop.
        rows, cols = np.triu_indices(len(concept_texts), k=1)
        weights = sim_matrix[rows, cols]
        keep = weights >= self.similarity_threshold_graph
        for i, j, weight in zip(rows[keep], cols[keep], weights[keep]):
            self.graph.add_edge(concept_texts[i], concept_texts[j], weight=weight)
        print(f"Graph contains {len(self.graph.nodes)} nodes and {len(self.graph.edges)} edges.")

    def prepare_candidate_phrases(self, long_text):
        """
        Clean ``long_text`` and store its unique, meaningful 3-, 2- and 1-grams
        in ``self.candidate_phrases``.
        """
        cleaned_full_text = self.clean_text(long_text)
        tokens_clean = word_tokenize(cleaned_full_text)
        candidate_phrases = []
        for n in [3, 2, 1]:
            for gram in ngrams(tokens_clean, n):
                phrase = " ".join(gram)
                if phrase.strip() and self.is_meaningful(phrase):
                    candidate_phrases.append(phrase)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result (and later tie-breaking between equal scores) is reproducible.
        self.candidate_phrases = list(dict.fromkeys(candidate_phrases))
        print(f"Total candidate phrases generated: {len(self.candidate_phrases)}")

    def vectorized_match_candidates(self):
        """
        Embed the candidate phrases and keep, for each one, its best-matching
        concept when the cosine similarity reaches ``ngram_threshold``.

        Each match is stored in ``self.recognized_candidates_ngram`` as
        ``(original_name, concept_type, phrase, score, n, tokens)``.
        """
        self.recognized_candidates_ngram = []
        if not self.candidate_phrases:
            # Nothing to embed; the similarity computation below cannot
            # handle an empty candidate set.
            self.candidate_embeddings = None
            print(f"Total recognized candidate matches: {len(self.recognized_candidates_ngram)}")
            return

        self.candidate_embeddings = self.model.encode(self.candidate_phrases, convert_to_numpy=True)
        similarity_matrix = cosine_similarity(self.candidate_embeddings, self.concept_embeddings)
        max_similarities = similarity_matrix.max(axis=1)
        max_indices = similarity_matrix.argmax(axis=1)
        valid_indices = np.where(max_similarities >= self.ngram_threshold)[0]
        for idx in valid_indices:
            concept = self.stack_concepts[max_indices[idx]]
            concept_name = concept["name"]
            phrase = self.candidate_phrases[idx]
            tokens_phrase = phrase.split()
            self.recognized_candidates_ngram.append(
                (
                    concept.get("original", concept_name),
                    concept["type"],
                    phrase,
                    max_similarities[idx],
                    len(tokens_phrase),
                    tokens_phrase,
                )
            )
        print(f"Total recognized candidate matches: {len(self.recognized_candidates_ngram)}")

    def global_filtering(self):
        """
        Resolve overlapping matches and group the survivors by concept.

        Matches are visited from highest to lowest score. A match is dropped
        when it shares a token with an earlier match that scored above
        ``filter_similarity_threshold``. The result is stored in
        ``self.filtered_by_concept`` as
        ``{concept: {"type": ..., "phrases": [(phrase, score, n, tokens), ...]}}``.
        """
        recognized = sorted(self.recognized_candidates_ngram, key=lambda x: x[3], reverse=True)
        global_used_words = set()
        filtered_candidates = []
        for candidate in recognized:
            concept_name, concept_type, phrase, score, n_val, tokens_phrase = candidate
            if any(token in global_used_words for token in tokens_phrase):
                continue
            filtered_candidates.append(candidate)
            # Only confident matches reserve their tokens.
            if score > self.filter_similarity_threshold:
                global_used_words.update(tokens_phrase)
        self.filtered_by_concept = {}
        for concept_name, concept_type, phrase, score, n_val, tokens_phrase in filtered_candidates:
            self.filtered_by_concept.setdefault(concept_name, {"type": concept_type, "phrases": []})
            self.filtered_by_concept[concept_name]["phrases"].append((phrase, score, n_val, tokens_phrase))
        print("Global filtering completed.")

    def print_results(self):
        """Print every recognized concept with its matched phrases, best score first."""
        print("\nGlobally Filtered Recognized Concepts using n‑gram detection (from StackOverflow data):")
        print("=" * 60)
        for concept, info in self.filtered_by_concept.items():
            concept_type = info["type"]
            print(f"Concept: {concept} ({concept_type})")
            for phrase, score, n_val, tokens_phrase in sorted(info["phrases"], key=lambda x: x[1], reverse=True):
                print(f"    Detected {n_val}-gram: '{phrase}' with similarity {score:.2f}")
            print("-" * 60)

    def get_recognized_technologies(self):
        """Get a simple list of recognized technologies."""
        return list(self.filtered_by_concept.keys())

# SentenceTransformer checkpoints of interest; the demo below does not read this list.
model_names = [
    "all-MiniLM-L6-v2",
    "sentence-transformers/msmarco-distilbert-base-v4",
    "all-mpnet-base-v2",
]
# Example usage:
if __name__ == "__main__":
    # Settings for the demo run, spelled out in one place.
    demo_settings = {
        "csv_path": "../data/processed/technologies_with_abbreviations.csv",
        "model_name": "all-mpnet-base-v2",
        "similarity_threshold_graph": 0.7,
        "ngram_threshold": 0.5,
        "filter_similarity_threshold": 0.85,
    }
    matcher = ConceptMatcher(**demo_settings)

    # Concept side: load names, embed them, build the similarity graph.
    matcher.load_concepts()
    matcher.generate_concept_embeddings(save_embeddings=True, load_if_exists=True)
    matcher.build_similarity_graph()

    # Text side: the sample deliberately mentions abbreviations.
    sample_text = """
    I have extensive experience in data analysis and have worked with a variety of technologies including MSSQL,
    Py, ObjC, cloud computing platforms like AWS, and I am proficient with machine learning techniques using TF and Sklearn. My background also includes
    developing user interfaces with React and Vue. Additionally, I have hands-on experience with business intelligence and
    graphical user interface design using Node and TS.
    """
    matcher.prepare_candidate_phrases(sample_text)
    matcher.vectorized_match_candidates()
    matcher.global_filtering()
    matcher.print_results()


  _warn(("h5py is running against HDF5 {0} when it was built against {1}, "


# Career Path Prediction Model Implementation

This notebook demonstrates the implementation of an efficient machine learning model for predicting career paths based on technology skills and years of coding experience.

In [None]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate
from tensorflow.keras.optimizers import Adam
import joblib

# Seed NumPy and TensorFlow with one shared value for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [None]:
# Load and prepare the data
def load_data(file_path="../data/processed/clean_v3.csv"):
    """Read the survey CSV at ``file_path``, print its shape, and return the DataFrame."""
    survey = pd.read_csv(file_path)
    shape = survey.shape
    print(f"Loaded dataset with shape: {shape}")
    return survey

# Load the dataset from the default path; `df` is the working frame that the
# following cells read and reassign.
df = load_data()
# Preview the first two rows.
df.head(2)

## Data Preprocessing

Before training, the data must be preprocessed. In particular, the `YearsCode` column mixes numbers with text labels such as "Less than 1 year", so it has to be converted into a proper numerical feature.

In [None]:
# Inspect the raw YearsCode column (most frequent values and dtype) before converting it
if 'YearsCode' not in df.columns:
    print("YearsCode column not found!")
else:
    print("YearsCode column exists. Checking value distribution...")
    print(df['YearsCode'].value_counts().head(10))
    print(f"Data type: {df['YearsCode'].dtype}")

In [None]:
# Handle YearsCode column - convert to numeric
def preprocess_years_code(df):
    """Convert the YearsCode column to numeric values.

    The two text labels are mapped to numbers ('Less than 1 year' -> 0.5,
    'More than 50 years' -> 51), anything else that is not numeric becomes
    NaN, and NaN is then replaced by the column median.

    Returns a modified copy of ``df``. When ``df`` has no 'YearsCode' column
    it is returned unchanged (the same object, not a copy).
    """
    if 'YearsCode' not in df.columns:
        return df

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Replace text values with numeric equivalents
    years = df['YearsCode'].replace({
        'Less than 1 year': '0.5',
        'More than 50 years': '51',
    })

    # Convert to numeric; unparseable entries become NaN
    years = pd.to_numeric(years, errors='coerce')

    # Fill missing values with median. Assign the result back instead of
    # calling fillna(inplace=True) on the column selection: that chained form
    # warns on recent pandas and does nothing under Copy-on-Write.
    median_years = years.median()
    df['YearsCode'] = years.fillna(median_years)

    print(f"YearsCode converted to numeric. Median value: {median_years}")
    return df

# Preprocess YearsCode column (rebinds `df` to the converted copy)
df = preprocess_years_code(df)

# Visualize YearsCode distribution as a 20-bin histogram
plt.figure(figsize=(10, 6))
sns.histplot(df['YearsCode'], bins=20)
plt.title('Distribution of Years of Coding Experience')
plt.xlabel('Years of Coding Experience')
plt.ylabel('Count')
plt.show()

In [None]:
# Define the technology columns and target column
# Survey columns holding semicolon-separated technology names.
TECH_COLUMNS = [
    "LanguageHaveWorkedWith",
    "DatabaseHaveWorkedWith",
    "PlatformHaveWorkedWith",
    "WebframeHaveWorkedWith",
    "MiscTechHaveWorkedWith",
    "ToolsTechHaveWorkedWith"
]
# Survey column holding the semicolon-separated developer roles.
TARGET_COLUMN = "DevType"


def _split_semicolon_list(value):
    """Split one cell into a list of stripped, non-empty entries."""
    # A cell that is already a list (the notebook cell was re-run) is
    # cleaned rather than split again.
    parts = value if isinstance(value, (list, tuple)) else value.split(';')
    return [part.strip() for part in parts if part.strip()]


# Process technology columns (convert from semicolon-separated strings to lists)
def process_tech_columns(df, tech_columns=TECH_COLUMNS):
    """Process technology columns by splitting semicolon-separated values.

    Parameters:
    - df: Survey DataFrame; it is not modified.
    - tech_columns: Columns to convert; names missing from ``df`` are skipped.

    Returns a copy of ``df`` in which each processed cell is a list of
    technology names. Missing cells become ``[]`` and empty entries (from
    ``a;;b`` or a trailing ``;``) are dropped.
    """
    df = df.copy()
    for col in tech_columns:
        if col in df.columns:
            df[col] = df[col].fillna('').apply(_split_semicolon_list)
    return df

# Split the technology columns into lists (rebinds `df`)
df = process_tech_columns(df)

# Preview the result for the first two technology columns only
print("Sample of processed technology columns:")
for col in TECH_COLUMNS[:2]:
    if col not in df.columns:
        continue
    print(f"\n{col}:")
    print(df[col].head(2))

In [None]:
# Extract unique career paths from the DevType column
def extract_career_paths(df, target_col=TARGET_COLUMN):
    """Return the sorted unique role names found in ``target_col``.

    The column is expected to still hold semicolon-separated strings. Missing
    values and blank entries are ignored. An empty list is returned when the
    column does not exist.
    """
    if target_col not in df.columns:
        return []

    # Collect every stripped, non-empty role across all rows.
    unique_roles = set()
    for devtypes in df[target_col].fillna('').str.split(';'):
        for devtype in devtypes:
            cleaned = devtype.strip()
            if cleaned:
                unique_roles.add(cleaned)

    career_paths = sorted(unique_roles)
    print(f"Found {len(career_paths)} unique career paths")
    return career_paths

# Get unique career paths (must run while DevType still holds raw strings,
# i.e. before process_target_column converts the column to lists)
career_paths = extract_career_paths(df)
print("\nSample career paths:")
print(career_paths[:10])

In [None]:
# Process target column (convert from semicolon-separated strings to lists)
def process_target_column(df, target_col=TARGET_COLUMN):
    """Process target column by splitting semicolon-separated values.

    Parameters:
    - df: Survey DataFrame; it is not modified.
    - target_col: Column to convert; nothing happens if it is missing.

    Returns a copy of ``df`` in which each target cell is a list of role
    names. Missing cells become ``[]`` and empty entries (from ``a;;b`` or a
    trailing ``;``) are dropped, so no blank role reaches the label encoder.
    """
    def _split_roles(value):
        # A cell that is already a list (the notebook cell was re-run) is
        # cleaned rather than split again.
        parts = value if isinstance(value, (list, tuple)) else value.split(';')
        return [role.strip() for role in parts if role.strip()]

    df = df.copy()
    if target_col in df.columns:
        df[target_col] = df[target_col].fillna('').apply(_split_roles)
    return df

# Process target column (rebinds `df`; DevType now holds lists of roles)
df = process_target_column(df)

# Check a sample of processed target data
print("Sample of processed target column:")
print(df[TARGET_COLUMN].head(5))

## Feature Engineering

We'll now create features from the technology columns and the YearsCode column.

In [None]:
# One-hot encode technology columns
def one_hot_encode_tech(df, tech_columns=TECH_COLUMNS):
    """One-hot encode the technology columns using MultiLabelBinarizer.

    All listed columns share one vocabulary: the sorted set of every
    non-empty technology name found in them. A technology's feature is 1 for a
    row when the name appears in any of that row's technology columns.

    Parameters:
    - df: DataFrame whose technology columns already hold lists of names.
    - tech_columns: Columns to encode; names missing from ``df`` are skipped.

    Returns ``(tech_features_df, mlb)``: the feature frame (indexed like
    ``df``, one column per technology) and the fitted binarizer.
    """
    present_columns = [col for col in tech_columns if col in df.columns]

    # Build the shared vocabulary for fitting the MultiLabelBinarizer
    vocabulary = set()
    for col in present_columns:
        for techs in df[col]:
            vocabulary.update(tech for tech in techs if tech)

    unique_techs = sorted(vocabulary)
    mlb = MultiLabelBinarizer(classes=unique_techs)
    mlb.fit([unique_techs])

    # Every column is encoded against the same classes, so the per-column
    # matrices line up and can be merged with an element-wise maximum. This
    # replaces concatenating duplicate-named frames and collapsing them with
    # groupby(axis=1), which newer pandas releases deprecate.
    encoded = np.zeros((len(df), len(unique_techs)), dtype=int)
    for col in present_columns:
        encoded = np.maximum(encoded, mlb.transform(df[col]))

    tech_features_df = pd.DataFrame(encoded, columns=mlb.classes_, index=df.index)

    print(f"Created {tech_features_df.shape[1]} technology features")
    return tech_features_df, mlb

# Get one-hot encoded tech features; `mlb` is kept because save_model stores it later
tech_features, mlb = one_hot_encode_tech(df)

# Display a sample of the encoded features
print("\nSample of one-hot encoded technology features:")
print(tech_features.iloc[:2, :10])  # Show first 2 rows and 10 columns

In [None]:
# Prepare features and target
def prepare_features_target(df, tech_features, target_col=TARGET_COLUMN):
    """Prepare final features (including YearsCode) and target for model training.

    Parameters:
    - df: Processed survey DataFrame.
    - tech_features: One-hot technology features indexed like ``df``.
    - target_col: Name of the target column.

    Returns ``(X, y, scaler)``:
    - X: ``tech_features`` with a standardized 'YearsCode' column appended
      when ``df`` has that column.
    - y: The target column as found in ``df``, or None if it is missing.
    - scaler: The fitted StandardScaler, or None if 'YearsCode' is missing.
    """
    # Prepare numerical features (YearsCode)
    numerical_features = pd.DataFrame(index=df.index)

    # Defined up front so the return below works when YearsCode is absent.
    scaler = None

    # Include YearsCode as a feature if available
    if 'YearsCode' in df.columns:
        # NOTE(review): the scaler is fitted on every row passed in; in this
        # notebook that happens before the train/test split, so test rows
        # influence the scaling statistics.
        scaler = StandardScaler()
        numerical_features['YearsCode'] = scaler.fit_transform(df[['YearsCode']]).ravel()

        print(f"YearsCode scaled. Mean: {scaler.mean_[0]:.2f}, Std: {scaler.scale_[0]:.2f}")
    else:
        print("Warning: YearsCode column not found!")

    # Combine tech features and numerical features
    X = pd.concat([tech_features, numerical_features], axis=1)

    # Prepare target variable
    y = df[target_col] if target_col in df.columns else None

    return X, y, scaler

# Get features and target
X, y, scaler = prepare_features_target(df, tech_features)

# Display feature information
print(f"\nFeature matrix shape: {X.shape}")
# y is None when the target column is missing, in which case this line raises.
print(f"Target variable shape: {y.shape}")
print("\nFeature columns:")
print(X.columns[-10:])  # Show last 10 columns including YearsCode

In [None]:
# One-hot encode the target variable
def encode_target(y, career_paths=None):
    """Multi-hot encode the target lists with a MultiLabelBinarizer.

    When ``career_paths`` is given it fixes the classes and their order;
    otherwise the classes are taken from ``y``. Returns the encoded matrix and
    the fitted binarizer.
    """
    # classes=None is the binarizer's default, so one call covers both cases.
    target_mlb = MultiLabelBinarizer(classes=career_paths)
    y_encoded = target_mlb.fit_transform(y)

    n_classes = y_encoded.shape[1]
    print(f"Target encoded with {n_classes} classes")
    return y_encoded, target_mlb

# Encode the target using the career paths extracted earlier as the class list
y_encoded, target_mlb = encode_target(y, career_paths)

# Split the data into train and test sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

print(f"\nTraining data shape: {X_train.shape}, {y_train.shape}")
print(f"Test data shape: {X_test.shape}, {y_test.shape}")

## Model Implementation

Now we'll implement a two-branch deep learning model: one branch processes the one-hot technology features, the other processes the scaled years of coding experience, and the two are concatenated before the output layer.

In [None]:
# Implement a hybrid deep learning model that processes tech skills and YearsCode separately
def build_hybrid_model(tech_input_dim, num_classes):
    """Build and compile the two-input Keras model.

    Parameters:
    - tech_input_dim: Width of the technology feature vector.
    - num_classes: Number of output units (one sigmoid per career path).

    The model takes ``[tech_input, years_input]`` and is compiled with Adam
    (learning rate 0.001), binary cross-entropy and the 'accuracy' metric.
    """
    # Technology branch: two Dense+Dropout stages (512 then 256 units).
    tech_input = Input(shape=(tech_input_dim,), name='tech_input')
    tech_branch = tech_input
    for units in (512, 256):
        tech_branch = Dense(units, activation='relu')(tech_branch)
        tech_branch = Dropout(0.3)(tech_branch)

    # YearsCode branch: a single small Dense layer on the scalar input.
    years_input = Input(shape=(1,), name='years_input')
    years_branch = Dense(32, activation='relu')(years_input)

    # Merge both branches and map to per-class probabilities.
    merged = Concatenate()([tech_branch, years_branch])
    hidden = Dense(128, activation='relu')(merged)
    output = Dense(num_classes, activation='sigmoid')(hidden)

    model = Model(inputs=[tech_input, years_input], outputs=output)
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# Get dimensions for the model
# The "- 1" assumes X contains exactly one non-technology column (YearsCode).
tech_input_dim = X_train.shape[1] - 1  # All columns except YearsCode
num_classes = y_train.shape[1]

# Build the model
model = build_hybrid_model(tech_input_dim, num_classes)
model.summary()

In [None]:
# Prepare inputs for the hybrid model: split each feature frame into the
# technology matrix and a (n, 1) YearsCode column, matching the two model inputs
X_train_tech = X_train.drop(columns=['YearsCode']).values
X_train_years = X_train['YearsCode'].values.reshape(-1, 1)
X_test_tech = X_test.drop(columns=['YearsCode']).values
X_test_years = X_test['YearsCode'].values.reshape(-1, 1)

# Train the model
# NOTE(review): the test split is also passed as validation data, so the
# validation curves and the later test metrics come from the same rows.
history = model.fit(
    [X_train_tech, X_train_years], y_train,
    epochs=20,
    batch_size=64,
    validation_data=([X_test_tech, X_test_years], y_test),
    verbose=1
)

In [None]:
# Plot training history: one panel per metric, training vs. validation curve
plt.figure(figsize=(12, 5))

history_panels = [("loss", "Loss"), ("accuracy", "Accuracy")]
for panel_position, (metric_key, metric_label) in enumerate(history_panels, start=1):
    plt.subplot(1, 2, panel_position)
    plt.plot(history.history[metric_key], label=f'Training {metric_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.title(f'Model {metric_label}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Evaluate the model
def evaluate_model(model, X_test_tech, X_test_years, y_test, target_mlb):
    """Score the model on the test inputs and print a short report.

    Probabilities above 0.5 count as positive predictions. Prints accuracy,
    micro-F1 and macro-F1, followed by the ten classes predicted most often.
    Returns ``(y_pred, y_pred_proba)``.
    """
    y_pred_proba = model.predict([X_test_tech, X_test_years])

    # Threshold the probabilities into 0/1 labels.
    y_pred = (y_pred_proba > 0.5).astype(int)

    scores = [
        ("Accuracy", accuracy_score(y_test, y_pred)),
        ("Micro-F1", f1_score(y_test, y_pred, average='micro')),
        ("Macro-F1", f1_score(y_test, y_pred, average='macro')),
    ]
    for score_name, score_value in scores:
        print(f"{score_name}: {score_value:.4f}")

    # Number of positive predictions per class, most frequent first.
    class_counts = y_pred.sum(axis=0)
    most_predicted = np.argsort(-class_counts)[:10]
    print("\nTop 10 predicted career paths:")
    for class_idx in most_predicted:
        print(f"{target_mlb.classes_[class_idx]}: {class_counts[class_idx]} predictions")

    return y_pred, y_pred_proba

# Evaluate the model on the held-out split; keeps both the thresholded
# predictions and the raw probabilities
y_pred, y_pred_proba = evaluate_model(model, X_test_tech, X_test_years, y_test, target_mlb)

In [None]:
# Analyze years of experience for different career paths
def analyze_years_by_career_path(df, scaler):
    """Analyze the relationship between YearsCode and career paths.

    Parameters:
    - df: DataFrame whose target column holds lists of roles and whose
      'YearsCode' column is numeric.
    - scaler: Not used by this function; kept for call compatibility.

    Prints and plots the mean years of coding experience per role, limited to
    roles with at least 50 respondents. Returns the per-role frame with the
    columns 'Role', 'mean', 'median' and 'count', sorted by mean descending.
    """
    # One row per (respondent, role). explode() yields a missing role for
    # respondents with an empty role list; those rows are dropped.
    years_df = (
        df[[TARGET_COLUMN, 'YearsCode']]
        .explode(TARGET_COLUMN)
        .dropna(subset=[TARGET_COLUMN])
        .rename(columns={TARGET_COLUMN: 'Role'})
    )

    # Group by role and calculate summary statistics
    role_years = years_df.groupby('Role')['YearsCode'].agg(['mean', 'median', 'count']).reset_index()
    role_years = role_years.sort_values('mean', ascending=False)

    # Only keep roles with at least 50 respondents so that an average is not
    # driven by a handful of people
    role_years = role_years[role_years['count'] >= 50]

    print("Average years of coding experience by career path:")
    print(role_years.head(10))

    if role_years.empty:
        # Nothing to draw; skip the plot rather than rendering an empty chart.
        print("No career path has at least 50 respondents; skipping plot.")
        return role_years

    # Plot the results
    plt.figure(figsize=(12, 8))
    sns.barplot(x='mean', y='Role', data=role_years.head(15), palette='viridis')
    plt.title('Average Years of Coding Experience by Career Path')
    plt.xlabel('Years of Coding Experience')
    plt.ylabel('Career Path')
    plt.tight_layout()
    plt.show()

    return role_years

# Analyze years by career path (uses the unscaled YearsCode values in `df`)
role_years = analyze_years_by_career_path(df, scaler)

## Save Model and Related Components

Now we'll save the model and related components for use in the FastAPI backend.

In [None]:
# Save the model and related components
def save_model(model, mlb, target_mlb, scaler, save_dir="../models"):
    """Persist the trained model and its pre/post-processing objects.

    Writes the model to ``<save_dir>/career_path_model`` and a joblib bundle
    with the technology binarizer, target binarizer, scaler and class names to
    ``<save_dir>/model_components.joblib``. ``save_dir`` is created if needed.
    """
    os.makedirs(save_dir, exist_ok=True)

    # NOTE(review): the path has no file extension; some Keras versions
    # require '.keras' or '.h5' here. Confirm against the installed version,
    # and keep the loader's path in sync if this changes.
    model_path = os.path.join(save_dir, "career_path_model")
    model.save(model_path)
    print(f"Model saved to {model_path}")

    # Everything the serving code needs besides the network itself.
    components_path = os.path.join(save_dir, "model_components.joblib")
    joblib.dump(
        dict(
            tech_mlb=mlb,
            target_mlb=target_mlb,
            scaler=scaler,
            career_paths=list(target_mlb.classes_),
        ),
        components_path,
    )
    print(f"Model components saved to {components_path}")
    
# Save the model and components to the default ../models directory
save_model(model, mlb, target_mlb, scaler)

In [None]:
# Test model loading and prediction
def _extract_years_code(text, default=5.0):
    """Return the first '<number> year(s) ...' figure in ``text``, else ``default``."""
    import re

    # Accepts integers, decimals ("2.5 years") and a trailing plus ("8+ years").
    # Without the decimal part, "2.5 years" would be read as 5.
    years_pattern = r'(\d+(?:\.\d+)?)\+?\s+years?\s+(?:of\s+)?(?:coding|programming|experience|work)'
    match = re.search(years_pattern, text, re.IGNORECASE)
    return float(match.group(1)) if match else default


# Test model loading and prediction
def test_model_loading(model_dir="../models", sample_text=None):
    """Test loading the model and making predictions.

    Parameters:
    - model_dir: Directory holding ``career_path_model`` and
      ``model_components.joblib``.
    - sample_text: Optional free-text description. When given, technologies
      and years of experience are extracted from it and the top 5 predicted
      career paths are printed.

    Returns ``(loaded_model, components)``.
    """
    # Load model
    model_path = os.path.join(model_dir, "career_path_model")
    loaded_model = tf.keras.models.load_model(model_path)

    # Load components
    components_path = os.path.join(model_dir, "model_components.joblib")
    components = joblib.load(components_path)

    tech_mlb = components['tech_mlb']
    scaler = components['scaler']
    career_paths = components['career_paths']

    print(f"Model loaded successfully. Can predict {len(career_paths)} career paths.")

    # Without a sample text there is nothing to predict.
    if not sample_text:
        return loaded_model, components

    from src.features import ConceptMatcher

    # Initialize technology matcher
    matcher = ConceptMatcher()
    matcher.load_concepts()
    matcher.generate_concept_embeddings()

    # Extract technologies from text
    matcher.prepare_candidate_phrases(sample_text)
    matcher.vectorized_match_candidates()
    matcher.global_filtering()

    # Get recognized technologies
    recognized_techs = matcher.get_recognized_technologies()
    print(f"\nRecognized technologies: {recognized_techs}")

    # Extract years of experience using simple pattern matching
    # (defaults to 5 years if not found)
    years_code = _extract_years_code(sample_text, default=5.0)
    print(f"Extracted years of experience: {years_code}")

    # Prepare features: a single all-zero row with a 1 for each recognized
    # technology the model was trained on; other names are ignored.
    tech_features = pd.DataFrame(np.zeros((1, len(tech_mlb.classes_))), columns=tech_mlb.classes_)
    for tech in recognized_techs:
        if tech in tech_features.columns:
            tech_features[tech] = 1

    # Scale YearsCode with the scaler fitted during training
    years_code_scaled = scaler.transform([[years_code]])[0][0]

    # Prepare inputs for the model
    tech_input = tech_features.values
    years_input = np.array([[years_code_scaled]])

    # Make prediction
    y_pred_proba = loaded_model.predict([tech_input, years_input])[0]

    # Get top 5 predictions
    top_indices = np.argsort(-y_pred_proba)[:5]
    print("\nTop 5 predicted career paths:")
    for idx in top_indices:
        role = career_paths[idx]
        prob = y_pred_proba[idx]
        print(f"{role}: {prob:.4f} probability")

    return loaded_model, components

# Test model loading with a sample text that mentions both technologies and
# a number of years of experience
sample_text = """
I have 8 years of experience as a software developer, primarily working with Python, JavaScript, and SQL. 
I've built several web applications using React and Node.js, and I've worked extensively with AWS cloud services. 
I also have experience with Docker, Kubernetes, and CI/CD pipelines.
"""

# Reload the saved artifacts and print the top predictions for the sample
loaded_model, components = test_model_loading(sample_text=sample_text)

## FastAPI Implementation

The cell below sketches the structure of the FastAPI service. It is shown as a non-executed code example; the actual implementation will live in `src/services/api.py`.

In [None]:
# This is a code example for the FastAPI implementation
# The actual implementation will be in src/services/api.py

'''
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from typing import List, Dict, Optional
import uvicorn
import tensorflow as tf
import numpy as np
import pandas as pd
import joblib
import re
import os

from src.features import ConceptMatcher

# Load model and components
MODEL_DIR = "../models"
model = tf.keras.models.load_model(os.path.join(MODEL_DIR, "career_path_model"))
components = joblib.load(os.path.join(MODEL_DIR, "model_components.joblib"))
tech_mlb = components["tech_mlb"]
target_mlb = components["target_mlb"]
scaler = components["scaler"]
career_paths = components["career_paths"]

# Initialize ConceptMatcher
matcher = ConceptMatcher()
matcher.load_concepts()
matcher.generate_concept_embeddings(load_if_exists=True)

# Create FastAPI app
app = FastAPI(
    title="CareerConnect API",
    description="API for career path recommendations based on technical skills and experience",
    version="1.0.0"
)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Input models
class TextInput(BaseModel):
    """Input model for text-based predictions."""
    text: str = Field(..., 
        description="Text describing skills, technologies, and years of coding experience",
        example="I have 5 years of experience with Python, JavaScript, and React. I've also worked with AWS and Docker.")
    
class FeatureInput(BaseModel):
    """Input model for feature-based predictions."""
    features: Dict[str, float] = Field(..., 
        description="Dictionary of technology skills as keys and 1.0 as values to indicate presence")
    years_code: float = Field(..., 
        description="Years of coding experience", 
        example=5.0)

# Output models
class CareerPrediction(BaseModel):
    """Model for career path prediction result."""
    # One entry of the "predictions" list returned by the prediction endpoints.
    career_path: str  # label taken from the loaded career_paths component
    probability: float  # model output for that label, as a plain float
    
class PredictionResponse(BaseModel):
    """Response model for prediction endpoints."""
    # predictions: the endpoints fill this with at most five entries, highest
    # probability first, and drop entries whose probability is below 0.1, so
    # the list may be empty.
    predictions: List[CareerPrediction]
    # recognized_techs: technology names associated with the request;
    # optional, defaults to None when an endpoint does not provide it.
    recognized_techs: Optional[List[str]] = None

@app.get("/")
async def root():
    """Root endpoint."""
    # Static greeting; handy as a liveness probe because it touches neither
    # the model nor the matcher.
    welcome = {"message": "Welcome to CareerConnect API"}
    return welcome

# Captures the number in phrases such as "5 years of experience",
# "2.5 years coding" or "10+ years of work". The optional fractional part is
# matched together with the integer part so that "2.5 years" is read as 2.5
# rather than as "5 years". Compiled once instead of on every request.
_YEARS_PATTERN = re.compile(
    r"(\d+(?:\.\d+)?)\+?\s+years?\s+(?:of\s+)?(?:coding|programming|experience|work)",
    re.IGNORECASE,
)


@app.post("/predict/text", response_model=PredictionResponse)
async def predict_from_text(input_data: TextInput):
    """Predict career paths from text description of skills and experience."""
    # The docstring above is published as the endpoint description in the
    # OpenAPI schema, so implementation notes live in comments instead.
    #
    # Input:   TextInput with a free-form `text` field.
    # Returns: dict with "predictions" (up to five {career_path, probability}
    #          entries, best first, probabilities below 0.1 dropped) and
    #          "recognized_techs" (whatever the matcher reports).
    # Raises:  HTTPException(500) wrapping any error raised while predicting.
    try:
        # Extract technologies from text. These calls pass no data to each
        # other, so they presumably communicate through state kept on the
        # shared module-level matcher; keep them back-to-back with no
        # `await` in between so requests cannot interleave on that state.
        matcher.prepare_candidate_phrases(input_data.text)
        matcher.vectorized_match_candidates()
        matcher.global_filtering()
        recognized_techs = matcher.get_recognized_technologies()

        # Extract years of experience; the first mention in the text wins.
        years_match = _YEARS_PATTERN.search(input_data.text)
        years_code = float(years_match.group(1)) if years_match else 5.0  # Default to 5 years if not found

        # Prepare features: a one-row frame with one column per technology
        # known to the model, set to 1 for every recognized technology.
        tech_features = pd.DataFrame(np.zeros((1, len(tech_mlb.classes_))), columns=tech_mlb.classes_)
        for tech in recognized_techs:
            if tech in tech_features.columns:
                tech_features[tech] = 1

        # Scale YearsCode with the scaler loaded alongside the model.
        years_code_scaled = scaler.transform([[years_code]])[0][0]

        # Prepare inputs for the model: [technology row, scaled years].
        tech_input = tech_features.values
        years_input = np.array([[years_code_scaled]])

        # Make prediction; [0] selects the single row of the batch.
        y_pred_proba = model.predict([tech_input, years_input])[0]

        # Get top 5 predictions (argsort on the negated scores = descending).
        top_indices = np.argsort(-y_pred_proba)[:5]
        predictions = []
        for idx in top_indices:
            prob = float(y_pred_proba[idx])
            if prob < 0.1:  # Skip predictions with very low probability
                continue
            predictions.append({
                "career_path": career_paths[idx],
                "probability": prob
            })

        # Format response
        return {
            "predictions": predictions,
            "recognized_techs": recognized_techs
        }

    except Exception as e:
        # Chain the original exception so the server log keeps its traceback.
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}") from e

@app.post("/predict/features", response_model=PredictionResponse)
async def predict_from_features(input_data: FeatureInput):
    """Predict career paths from explicit feature dictionary."""
    # The docstring above is published as the endpoint description in the
    # OpenAPI schema, so implementation notes live in comments instead.
    #
    # Input:   FeatureInput with `features` (technology -> value) and the
    #          unscaled `years_code`.
    # Returns: dict with "predictions" (up to five {career_path, probability}
    #          entries, best first, probabilities below 0.1 dropped) and
    #          "recognized_techs" (only the submitted technologies that exist
    #          in the model vocabulary and were therefore actually used).
    # Raises:  HTTPException(500) wrapping any error raised while predicting.
    try:
        # Prepare features: a one-row frame with one column per technology
        # known to the model. Unknown technology names are skipped, and only
        # the ones that were applied are reported back to the caller.
        tech_features = pd.DataFrame(np.zeros((1, len(tech_mlb.classes_))), columns=tech_mlb.classes_)
        applied_techs = []
        for tech, value in input_data.features.items():
            if tech in tech_features.columns:
                tech_features[tech] = value
                applied_techs.append(tech)

        # Scale YearsCode with the scaler loaded alongside the model.
        years_code_scaled = scaler.transform([[input_data.years_code]])[0][0]

        # Prepare inputs for the model: [technology row, scaled years].
        tech_input = tech_features.values
        years_input = np.array([[years_code_scaled]])

        # Make prediction; [0] selects the single row of the batch.
        y_pred_proba = model.predict([tech_input, years_input])[0]

        # Get top 5 predictions (argsort on the negated scores = descending).
        top_indices = np.argsort(-y_pred_proba)[:5]
        predictions = []
        for idx in top_indices:
            prob = float(y_pred_proba[idx])
            if prob < 0.1:  # Skip predictions with very low probability
                continue
            predictions.append({
                "career_path": career_paths[idx],
                "probability": prob
            })

        # Format response
        return {
            "predictions": predictions,
            "recognized_techs": applied_techs
        }

    except Exception as e:
        # Chain the original exception so the server log keeps its traceback.
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}") from e

@app.get("/career-paths", response_model=List[str])
async def get_career_paths():
    """Get all possible career paths from the model."""
    # Return a plain list, mirroring the /technologies endpoint. The loaded
    # `career_paths` component may be an array-like rather than a list
    # (NOTE(review): confirm what the training notebook stores); converting
    # keeps the response compatible with the List[str] response model and
    # avoids handing out the module-level object itself.
    return list(career_paths)

@app.get("/technologies", response_model=List[str])
async def get_technologies():
    """Get all technologies that can be used for prediction."""
    # The technology vocabulary is the class list of the loaded binarizer.
    known_technologies = tech_mlb.classes_
    return list(known_technologies)

# Start a development server when this module is executed directly.
if __name__ == "__main__":
    # The app is passed as an import string ("module:attribute"), which
    # uvicorn needs in order to re-import it when reload=True.
    # NOTE(review): "api:app" only resolves when the directory containing
    # api.py is importable as top level; confirm this matches how the
    # module is launched alongside the `src.features` import.
    # host="0.0.0.0" listens on all interfaces and reload=True is a
    # development convenience; review both before deploying.
    uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)
'''

## Conclusion

This notebook demonstrates the implementation of a career path prediction model that uses technology skills and years of coding experience (`YearsCode`) as features. The model processes the `YearsCode` column in its own branch, separately from the technology skills, to reflect its importance for roles such as Engineering Manager and Project Manager that require coding experience.

The implementation includes:

1. **Data preprocessing** with proper handling of the YearsCode numerical column
2. **Feature engineering** that combines technology skills and years of experience
3. **Efficient deep learning model** with a hybrid architecture that processes tech skills and YearsCode in separate branches
4. **FastAPI backend** design for serving the model through REST API endpoints

The model demonstrates how years of coding experience is a critical factor in determining appropriate career paths, especially for management roles that require technical expertise.