<a href="https://colab.research.google.com/github/SwarupD21/IBM-Project/blob/main/UCE_Core_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the opendatasets library to download datasets from Kaggle
%pip install opendatasets





In [None]:
import opendatasets as od
# Download the Kaggle "multi-platform online courses" dataset into
# ./multi-platform-online-courses-dataset. When that folder already holds the files the
# call reports "Skipping, found downloaded files" instead of downloading again.
# NOTE(review): presumably prompts for Kaggle credentials on first download — confirm.
od.download("https://www.kaggle.com/datasets/everydaycodings/multi-platform-online-courses-dataset")

Skipping, found downloaded files in "./multi-platform-online-courses-dataset" (use force=True to force download)


In [None]:
import pandas as pd
import os

# Location of the Coursera CSV inside the folder created by the download cell.
# Adjust this if your copy of the dataset lives somewhere else.
dataset_path = './multi-platform-online-courses-dataset/Coursera.csv'

# Guard first: report a missing file instead of letting read_csv raise.
if not os.path.exists(dataset_path):
    print(f"Error: Dataset not found at {dataset_path}")
    print("Please check the dataset path and filename.")
else:
    df = pd.read_csv(dataset_path)
    print("Dataset loaded successfully.")
    print(df.head())

Dataset loaded successfully.
  partner                                 course  \
0  Google                   Google Cybersecurity   
1  Google                  Google Data Analytics   
2  Google             Google Project Management:   
3  Google  Google Digital Marketing & E-commerce   
4  Google                      Google IT Support   

                                              skills  rating reviewcount  \
0  {" Network Security"," Python Programming"," L...     4.8       16.4k   
1  {" Data Analysis"," R Programming"," SQL"," Bu...     4.8      133.4k   
2  {" Project Management"," Strategy and Operatio...     4.8       97.3k   
3  {" Digital Marketing"," Marketing"," Marketing...     4.8       21.4k   
4  {" Computer Networking"," Network Architecture...     4.8      181.4k   

       level             certificatetype       duration  crediteligibility  
0  Beginner    Professional Certificate    3 - 6 Months              False  
1  Beginner    Professional Certificate    3 - 

In [None]:
# Inspect the raw 'skills' column: sample values, dtype and number of missing entries.
if 'skills' not in df.columns:
    # Column missing: list what is available so the name can be corrected.
    print("The 'skills' column was not found in the DataFrame.")
    print("Please check the column names in your dataset.")
    print("Available columns:", df.columns.tolist())
else:
    print("First 10 entries in the 'skills' column:")
    display(df['skills'].head(10))

    print("\nData type of the 'skills' column:")
    print(df['skills'].dtype)

    # Rows without any skills will later be parsed to an empty list.
    print("\nMissing values in the 'skills' column:")
    print(df['skills'].isna().sum())

First 10 entries in the 'skills' column:


Unnamed: 0,skills
0,"{"" Network Security"","" Python Programming"","" L..."
1,"{"" Data Analysis"","" R Programming"","" SQL"","" Bu..."
2,"{"" Project Management"","" Strategy and Operatio..."
3,"{"" Digital Marketing"","" Marketing"","" Marketing..."
4,"{"" Computer Networking"","" Network Architecture..."
5,"{"" Python Programming"","" Data Science"","" Machi..."
6,"{"" User Experience"","" User Experience Design"",..."
7,"{"" Python Programming"","" Data Visualization"",""..."
8,"{"" Machine Learning"","" Machine Learning Algori..."
9,"{"" Data Science"","" Python Programming"","" Data ..."



Data type of the 'skills' column:
object

Missing values in the 'skills' column:
51


In [None]:
import json

# Function to parse the skills string into a list of strings
def parse_skills(skills_string):
    """Parse one raw ``skills`` cell into a list of skill names.

    The dataset stores skills as a brace-wrapped, comma-separated list of
    double-quoted strings, e.g. ``{" Network Security"," Python Programming"}``.

    Args:
        skills_string: Raw cell value. Missing cells (NaN/None) and any other
            non-string value are treated as "no skills".

    Returns:
        list[str]: Skill names with surrounding whitespace and stray double
        quotes removed. An empty list is returned for missing or unparseable
        input (a warning is printed in the unparseable case).
    """
    if not isinstance(skills_string, str):
        # Covers NaN/None as well as any unexpected non-text cell.
        return []

    # Drop the surrounding braces so the remainder can be wrapped as a JSON array.
    inner = skills_string.strip('{}')

    # Try the text as-is first. Blindly turning every single quote into a double
    # quote would corrupt skills that contain an apostrophe (e.g. "Bachelor's Degree"),
    # so that conversion is only a fallback for single-quoted lists.
    candidates = [inner]
    if "'" in inner:
        candidates.append(inner.replace("'", '"'))

    for candidate in candidates:
        try:
            skills_list = json.loads(f'[{candidate}]')
        except json.JSONDecodeError:
            continue
        # Keep textual items only; numbers/null inside the list are not skill names.
        return [skill.strip().strip('"') for skill in skills_list if isinstance(skill, str)]

    # Report the raw value (not a mutated copy) so the offending row can be found.
    print(f"Warning: Could not parse skills string: {skills_string}")
    return []

# Derive the 'parsed_skills' list column from the raw 'skills' strings.
if 'skills' not in df.columns:
    print("The 'skills' column was not found in the DataFrame.")
else:
    # One list of skill names per course; rows without skills become [].
    df['parsed_skills'] = df['skills'].apply(parse_skills)
    print("\n'skills' column parsed and stored in 'parsed_skills' column.")
    print("First 10 entries in the 'parsed_skills' column:")
    display(df['parsed_skills'].head(10))


'skills' column parsed and stored in 'parsed_skills' column.
First 10 entries in the 'parsed_skills' column:


Unnamed: 0,parsed_skills
0,"[Network Security, Python Programming, Linux, ..."
1,"[Data Analysis, R Programming, SQL, Business C..."
2,"[Project Management, Strategy and Operations, ..."
3,"[Digital Marketing, Marketing, Marketing Manag..."
4,"[Computer Networking, Network Architecture, Ne..."
5,"[Python Programming, Data Science, Machine Lea..."
6,"[User Experience, User Experience Design, User..."
7,"[Python Programming, Data Visualization, Micro..."
8,"[Machine Learning, Machine Learning Algorithms..."
9,"[Data Science, Python Programming, Data Analys..."


In [None]:
from collections import Counter

# One flat list holding every parsed skill of every course (duplicates kept).
all_skills = [
    skill
    for skills_list in df['parsed_skills']
    for skill in skills_list
]

# Distinct skills, plus how often each one occurs.
unique_skills = set(all_skills)
skill_counts = Counter(all_skills)

print(f"Total number of skills extracted: {len(all_skills)}")
print(f"Total number of unique skills: {len(unique_skills)}")

# A set has no defined order, so this is an arbitrary sample of 20 entries.
print("\nFirst 20 unique skills:")
display(list(unique_skills)[:20])

print("\nTop 20 most frequent skills:")
display(skill_counts.most_common(20))

Total number of skills extracted: 12760
Total number of unique skills: 429

First 20 unique skills:


['Collaboration',
 '',
 'Business Research',
 '(30.2k reviews)',
 'Full-Stack Web Development',
 'Big Data',
 'Dimensionality Reduction',
 'Software Engineering',
 'Cost Accounting',
 'Computer Programming',
 'Cash Management',
 'Network Security',
 'Business Design',
 'Knitr',
 'Cloud Clients',
 'Storytelling',
 'Internet Of Things',
 'Machine Learning',
 'People Development',
 'Marketing Design']


Top 20 most frequent skills:


[('Leadership and Management', 357),
 ('Data Analysis', 263),
 ('Computer Programming', 215),
 ('Strategy', 214),
 ('Communication', 211),
 ('Critical Thinking', 193),
 ('Problem Solving', 184),
 ('Strategy and Operations', 184),
 ('Python Programming', 170),
 ('Machine Learning', 153),
 ('Business Analysis', 151),
 ('Planning', 140),
 ('Algorithms', 138),
 ('Finance', 134),
 ('Decision Making', 134),
 ('Data Management', 132),
 ('Marketing', 131),
 ('Probability & Statistics', 113),
 ('Cloud Computing', 112),
 ('General Statistics', 104)]

In [None]:
# Filter scraping artefacts out of the skill list:
#   * empty strings
#   * entries wrapped entirely in parentheses, e.g. "(30.2k reviews)"
# Only fully parenthesised entries are dropped, so genuine skills that merely
# contain parentheses, e.g. "Scrum (Software Development)", are kept.
cleaned_skills_list = [
    skill
    for skill in all_skills
    if skill and not (skill.startswith('(') and skill.endswith(')'))
]

# Get the unique cleaned skills
unique_cleaned_skills = set(cleaned_skills_list)

print(f"Total number of cleaned skills extracted: {len(cleaned_skills_list)}")
print(f"Total number of unique cleaned skills: {len(unique_cleaned_skills)}")

# Display 20 unique cleaned skills as an example (set order is arbitrary)
print("\nFirst 20 unique cleaned skills:")
display(list(unique_cleaned_skills)[:20])

# Count the frequency of each cleaned skill
cleaned_skill_counts = Counter(cleaned_skills_list)

print("\nTop 20 most frequent cleaned skills:")
display(cleaned_skill_counts.most_common(20))

Total number of cleaned skills extracted: 12441
Total number of unique cleaned skills: 328

First 20 unique cleaned skills:


['Collaboration',
 'Business Research',
 'Full-Stack Web Development',
 'Big Data',
 'Dimensionality Reduction',
 'Software Engineering',
 'Cost Accounting',
 'Computer Programming',
 'Cash Management',
 'Network Security',
 'Business Design',
 'Knitr',
 'Cloud Clients',
 'Storytelling',
 'Internet Of Things',
 'Machine Learning',
 'People Development',
 'Marketing Design',
 'DevOps',
 'Calculus']


Top 20 most frequent cleaned skills:


[('Leadership and Management', 357),
 ('Data Analysis', 263),
 ('Computer Programming', 215),
 ('Strategy', 214),
 ('Communication', 211),
 ('Critical Thinking', 193),
 ('Problem Solving', 184),
 ('Strategy and Operations', 184),
 ('Python Programming', 170),
 ('Machine Learning', 153),
 ('Business Analysis', 151),
 ('Planning', 140),
 ('Algorithms', 138),
 ('Finance', 134),
 ('Decision Making', 134),
 ('Data Management', 132),
 ('Marketing', 131),
 ('Probability & Statistics', 113),
 ('Cloud Computing', 112),
 ('General Statistics', 104)]

In [None]:
# Leftover cell from an earlier draft of the recommendation step.
# It executes nothing except the two imports and the two status prints below.
#
# The draft logic that used to be commented out here:
#   1. took the first 10 entries of `skill_gaps`,
#   2. built a prompt asking for personalized learning pathways for them,
#   3. sent it to `gemini_model.generate_content(prompt)` and printed `response.text`,
#      printing an error / "LLM is not configured" message otherwise.
#
# NOTE(review): no `recommend_learning_paths_node` function is defined in this cell even
# though the first print below says so, and the cell ids mentioned in the prints may be
# stale — TODO confirm the messages are still accurate, or delete this cell.

import google.generativeai as genai
from google.colab import userdata

print("Cell defines the recommend_learning_paths_node function for LangGraph.")
print("Run the cell that invokes the compiled graph (e.g., cell b992dd94) to execute this node.")

Cell defines the recommend_learning_paths_node function for LangGraph.
Run the cell that invokes the compiled graph (e.g., cell b992dd94) to execute this node.


In [None]:
# Install necessary libraries
%pip install --quiet -U langchain_google_genai langgraph langchainhub
# Install scikit-learn for cosine similarity
%pip install --quiet scikit-learn

# Import necessary classes
from typing import TypedDict, Annotated, List
from langgraph.graph import StateGraph, END
import pandas as pd
from collections import Counter
import os
import json
import google.generativeai as genai
from google.colab import userdata
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer # Import SentenceTransformer


# --- LangGraph State Definition ---
# This defines the structure of the state that will be passed between nodes in the graph.
class GraphState(TypedDict):
    """
    State passed between the nodes of the LangGraph workflow.

    Attributes:
        dataset_path: Path of the CSV file read by ``load_data_node``.
        dataframe: The pandas DataFrame loaded from ``dataset_path``.
        skills_data: Cleaned skills extracted from the 'skills' column (duplicates kept).
        unique_job_market_skills: Set of distinct cleaned skills from the dataset.
        individual_skills: Skills provided by the user.
        skill_gaps: Skills identified as gaps.
        recommendations: Recommendation text generated by the LLM.
        error: Error message; nodes set it to an empty string on success.
        stored_result: Summary string written by ``store_results_node``.
    """
    dataset_path: str
    dataframe: pd.DataFrame
    skills_data: List[str]
    unique_job_market_skills: set
    individual_skills: List[str]
    skill_gaps: set
    recommendations: str
    error: str
    stored_result: str
    # Removed from the state when the category-mapping approach was dropped:
    #   job_market_categories, individual_categories, category_gaps, detailed_skill_gaps
    # The embedding dictionaries (job market / individual) are not part of the state;
    # they are passed directly to identify_skill_gaps.


# --- Node Definitions ---
# These functions represent the individual steps or nodes in the LangGraph workflow.

def load_data_node(state: GraphState):
    """
    LangGraph Node: reads the CSV at ``state['dataset_path']`` into a DataFrame.
    This is the entry node of the data processing graph.

    Returns a partial state update: ``{"dataframe": df, "error": ""}`` on success,
    otherwise ``{"error": <message>}``.
    """
    print("---LOADING DATA (Graph 1)---")
    csv_path = state.get('dataset_path')

    # Guard clauses: no path given, or path does not exist.
    if not csv_path:
        return {"error": "Dataset path not provided in state."}
    if not os.path.exists(csv_path):
        return {"error": f"Error: Dataset not found at {csv_path}"}

    try:
        loaded_frame = pd.read_csv(csv_path)
    except Exception as e:
        # Any read/parse failure is reported through the state, not raised.
        return {"error": f"Error loading dataset: {e}"}

    print("Dataset loaded successfully.")
    return {"dataframe": loaded_frame, "error": ""}


def wrangle_data_node(state: GraphState):
    """
    LangGraph Node: Processes the 'skills' column of the DataFrame.

    Parses every raw skills string into a list, flattens the lists and removes
    scraping artefacts (empty strings and fully parenthesised entries such as
    "(30.2k reviews)"). Skills that merely contain parentheses, e.g.
    "Scrum (Software Development)", are kept.

    Side effect: adds a 'parsed_skills' column to the DataFrame held in the state.

    Returns a partial state update:
        - on success: ``skills_data`` (cleaned skills, duplicates kept),
          ``unique_job_market_skills`` (set of distinct cleaned skills) and an
          empty ``error``;
        - on failure: ``error`` only. When no DataFrame is available and an earlier
          node already reported an error, that message is passed through unchanged
          instead of being replaced by a generic one.
    """
    print("---WRANGLING DATA (Graph 1)---")
    df = state.get('dataframe')

    if df is None:
        # Keep the more specific upstream message (e.g. "Dataset not found ...") if there is one.
        upstream_error = state.get('error')
        return {"error": upstream_error or "DataFrame not found in state. Cannot wrangle data."}

    if 'skills' not in df.columns:
        return {"error": "The 'skills' column was not found in the DataFrame. Cannot wrangle skills."}

    def parse_skills(skills_string):
        """Turn one raw skills cell into a list of skill names ([] if missing or unparseable)."""
        if not isinstance(skills_string, str):
            # NaN/None or any other non-text cell.
            return []
        inner = skills_string.strip('{}')
        # Try the text as-is first; converting single quotes is only a fallback because it
        # would corrupt skills that contain an apostrophe.
        candidates = [inner]
        if "'" in inner:
            candidates.append(inner.replace("'", '"'))
        for candidate in candidates:
            try:
                parsed = json.loads(f'[{candidate}]')
            except json.JSONDecodeError:
                continue
            return [item.strip().strip('"') for item in parsed if isinstance(item, str)]
        return []

    def is_scrape_artifact(skill):
        """True for entries wrapped entirely in parentheses, e.g. '(30.2k reviews)'."""
        return skill.startswith('(') and skill.endswith(')')

    df['parsed_skills'] = df['skills'].apply(parse_skills)
    all_skills = [skill for skills_list in df['parsed_skills'] for skill in skills_list]
    cleaned_skills_list = [skill for skill in all_skills if skill and not is_scrape_artifact(skill)]
    unique_cleaned_skills = set(cleaned_skills_list)

    print(f"Total unique cleaned skills extracted: {len(unique_cleaned_skills)}")

    return {"skills_data": cleaned_skills_list, "unique_job_market_skills": unique_cleaned_skills, "error": ""}


# --- Auxiliary Functions (Called Outside the Compiled Graph) ---
# These functions perform specific tasks but are invoked separately in the orchestration cell,
# not as part of the compiled `app_data_processing` graph's flow.

# Preliminary mapping kept only for potential use in LLM prompt generation, not for coverage check.
preliminary_skill_mapping = {
    "Data Analysis": [
        "Data Analysis",
        "Business Analysis",
        "Data Management",
        "Spreadsheet Software",
        "Business Intelligence",
        "Data Visualization",
    ],
    "Programming": [
        "Computer Programming",
        "Python Programming",
        "R Programming",
        "SQL",
        "Algorithms",
        "C++ Programming",
        "Java Programming",
        "Web Development",
        "Full-Stack Web Development",
        "Front-End Web Development",
        "Back-End Web Development",
    ],
    "Machine Learning & AI": [
        "Machine Learning",
        "Applied Machine Learning",
        "Deep Learning",
        "Artificial Neural Networks",
        "Statistical Machine Learning",
        "Natural Language Processing",
        "Computer Vision",
        "Dimensionality Reduction",
    ],
    "Business Strategy": [
        "Strategy",
        "Strategy and Operations",
        "Planning",
        "Business Research",
        "Decision Making",
        "Negotiation",
        "Innovation",
    ],
    "Management & Leadership": [
        "Leadership and Management",
        "Change Management",
        "People Development",
        "Human Resources",
        "Organizational Development",
    ],
    "Communication & Soft Skills": [
        "Communication",
        "Critical Thinking",
        "Problem Solving",
        "Collaboration",
        "Storytelling",
        "Writing",
    ],
    "Finance & Accounting": [
        "Finance",
        "Accounting",
        "Financial Accounting",
        "Cost Accounting",
        "Investment Management",
        "Risk Management",
        "Cash Management",
    ],
    "Marketing": [
        "Marketing",
        "Digital Marketing",
        "Marketing Management",
        "Branding",
    ],
    "Cloud & IT": [
        "Cloud Computing",
        "DevOps",
        "Computer Networking",
        "Network Security",
        "Cloud Clients",
        "Internet Of Things",
        "Software As A Service",
        "Linux",
        "Databases",
        "Cybersecurity",
    ],
    "Project Management": [
        "Project Management",
        "Agile Software Development",
        "Scrum (Software Development)",
    ],
    "Design": [
        "User Experience Design",
        "User Experience",
        "User Interface",
        "Graphic Design",
        "Marketing Design",
        "Business Design",
    ],
    "Statistics & Probability": [
        "Probability & Statistics",
        "General Statistics",
    ],
    "Mathematics": [
        "Calculus",
        "Linear Algebra",
    ],
}

# Minimum cosine similarity between two skill embeddings for them to count as a match.
EMBEDDING_SIMILARITY_THRESHOLD = 0.7


def identify_skill_gaps(unique_job_market_skills: set, individual_skills: List[str], job_market_embeddings_dict: dict, individual_embeddings_dict: dict) -> dict:
    """
    Auxiliary Function: Identifies job-market skills that the individual does not cover.

    A job-market skill counts as covered when it either matches one of the
    individual's skills exactly (case-insensitive) or has an embedding whose cosine
    similarity to any of the individual's skill embeddings is at least
    ``EMBEDDING_SIMILARITY_THRESHOLD``.

    Args:
        unique_job_market_skills: Skills extracted from the dataset; ``None`` yields no gaps.
        individual_skills: Skills supplied by the user; ``None`` makes every
            job-market skill a gap.
        job_market_embeddings_dict: Embedding per job-market skill, looked up by the
            lowercase skill name.
        individual_embeddings_dict: Embedding per individual skill, looked up by the
            lowercase skill name.
            # NOTE(review): values must support ``.reshape(1, -1)`` (e.g. numpy arrays);
            # confirm that the caller builds both dictionaries with lowercase keys.

    Returns:
        dict: ``{"skill_gaps": set_of_lowercase_skill_names}``. Every branch returns
        lowercase names so callers see one consistent format.
    """
    print("---IDENTIFYING SKILL GAPS (Using Embeddings Only for Coverage)---")
    if unique_job_market_skills is None:
        print("Warning: Job market skills not provided.")
        return {"skill_gaps": set()}

    # Lowercase once; all comparisons below are case-insensitive.
    job_market_skills_set = {skill.lower() for skill in unique_job_market_skills}

    if individual_skills is None:
        print("Warning: Individual skills not provided.")
        # Everything is a gap; return lowercase names like every other branch does.
        return {"skill_gaps": job_market_skills_set}

    individual_skills_set = {skill.lower() for skill in individual_skills}

    if not job_market_embeddings_dict or not individual_embeddings_dict:
        print("Warning: Embedding dictionaries not provided or empty. Cannot use embedding similarity.")
        # Fallback to direct string match only if embeddings are not available
        skill_gaps = job_market_skills_set - individual_skills_set
        print("Falling back to direct string match only.")
        print(f"\nIdentified {len(skill_gaps)} Specific Skill Gaps (direct match only).")
        return {"skill_gaps": skill_gaps}

    # Reshape the individual's embeddings once instead of once per job-market skill.
    individual_vectors = [
        individual_embeddings_dict[skill].reshape(1, -1)
        for skill in individual_skills_set
        if skill in individual_embeddings_dict
    ]

    def is_covered_by_embedding(job_market_skill_lower):
        """True when any individual embedding is similar enough to this skill's embedding."""
        if job_market_skill_lower not in job_market_embeddings_dict:
            # Cannot check embedding if no embedding exists
            return False
        job_market_embedding = job_market_embeddings_dict[job_market_skill_lower].reshape(1, -1)
        return any(
            cosine_similarity(job_market_embedding, individual_vector)[0][0] >= EMBEDDING_SIMILARITY_THRESHOLD
            for individual_vector in individual_vectors
        )

    # A skill is a gap when neither the direct match nor the embedding check covers it.
    skill_gaps = {
        job_market_skill
        for job_market_skill in job_market_skills_set
        if job_market_skill not in individual_skills_set
        and not is_covered_by_embedding(job_market_skill)
    }

    print("\nIndividual's skills (lowercase):")
    print(individual_skills_set)
    print(f"\nIdentified {len(skill_gaps)} Specific Skill Gaps (not covered by direct match or embedding similarity).")

    return {"skill_gaps": skill_gaps}


# Fallback function identify_skill_gaps_mapping_only is removed as per user request for embeddings only


def recommend_learning_paths(skill_gaps_info: dict) -> str:
    """
    Auxiliary Function: asks the Gemini LLM for a detailed learning pathway that
    covers the skill gaps found in ``skill_gaps_info['skill_gaps']``.

    Returns the LLM's text on success, "No skill gaps identified." when there is
    nothing to recommend, or an "Error: ..." / "Error calling LLM: ..." message
    when the model cannot be configured or called. No exception from the LLM
    setup or call is propagated.
    """
    print("---GENERATING LEARNING PATHS (Separate Function)---")
    gaps = skill_gaps_info.get('skill_gaps', set())

    # Nothing to recommend: skip the LLM setup entirely.
    if not gaps:
        print("\nNo skill gaps identified. No recommendations needed.")
        return "No skill gaps identified."

    # Configure the LLM with the API key stored in the Colab secrets.
    llm = None
    try:
        api_key = userdata.get('GOOGLE_API_KEY')
        if not api_key:
            print("GOOGLE_API_KEY not found in Colab secrets. Cannot configure LLM.")
            return "Error: LLM not configured."
        genai.configure(api_key=api_key)
        llm = genai.GenerativeModel('models/gemini-2.5-flash')
        print("Google Generative AI model configured for recommendations.")
    except Exception as e:
        print(f"Error configuring Google Generative AI model: {e}")
        return f"Error: LLM configuration failed: {e}"

    print("\nGenerating personalized learning path recommendations...")

    # All gaps go into the prompt, sorted alphabetically so the prompt is reproducible.
    gap_listing = ', '.join(sorted(gaps))
    gap_sentence = f"You have identified the following skill gaps: {gap_listing}"

    prompt = f"""Based on the following identified skill gaps:

{gap_sentence}

Please suggest a detailed learning pathway to acquire these skills. For each key skill or group of related skills, suggest:
- Specific learning objectives.
- Recommended types of resources (e.g., beginner/intermediate/advanced online courses, books, tutorials, hands-on projects).
- A possible sequence or order for learning the skills.
- Aim for a comprehensive and actionable plan."""

    if not llm:
        return "Error: LLM is not configured."

    try:
        return llm.generate_content(prompt).text
    except Exception as e:
        print(f"Error calling LLM: {e}")
        return f"Error calling LLM: {e}"


# Define a placeholder node for storing the results
def store_results_node(state: GraphState):
    """
    LangGraph Node: Simulates storing the results of the workflow.
    This node is included in the compiled graph as a final step.

    Reads ``recommendations`` and ``skill_gaps`` from the state and, when either is
    present, builds a short summary (first 100 characters of the recommendations
    plus the number of skill gaps).

    Returns a partial state update with:
        - ``stored_result``: the summary, or "No results stored." when there is
          nothing to store;
        - ``error``: the error already present in the state (empty string if none),
          so a failure reported by an earlier node is not erased by this final step.
    """
    print("---STORING RESULTS (Placeholder Node)---")
    # Normalise missing/None values so slicing and len() below cannot fail.
    recommendations = state.get('recommendations') or ""
    skill_gaps = state.get('skill_gaps') or set()
    upstream_error = state.get('error') or ""

    if recommendations or skill_gaps:
        print("Simulating storing results (e.g., recommendations, skill gaps)...")
        # In a real implementation, you would save these to a database, file, etc.
        stored_data_summary = f"Recommendations: {recommendations[:100]}...\nSkill Gaps Count: {len(skill_gaps)}"
        return {"stored_result": stored_data_summary, "error": upstream_error}

    print("No results to store.")
    return {"stored_result": "No results stored.", "error": upstream_error}


# --- LangGraph Definition and Compilation (Data Processing Graph) ---
# This section defines the structure and flow of the first part of the workflow.

# Initialize the graph builder for the data processing workflow
# Graph builder for the data processing workflow; GraphState is the shared state schema.
workflow_data_processing = StateGraph(GraphState)

# Register the node functions under the names used by the edges below.
workflow_data_processing.add_node("load_data", load_data_node)
workflow_data_processing.add_node("wrangle_data", wrangle_data_node)
workflow_data_processing.add_node("store_results", store_results_node)


# Edges define a strictly linear flow: load_data -> wrangle_data -> store_results -> END.
# NOTE(review): store_results runs directly after wrangle_data, i.e. before the separately
# called identify_skill_gaps / recommend_learning_paths have produced anything, so inside
# this graph it presumably always reports "No results stored." — confirm this is intended.
workflow_data_processing.add_edge("load_data", "wrangle_data") # load_data -> wrangle_data
workflow_data_processing.add_edge("wrangle_data", "store_results") # wrangle_data -> store_results
workflow_data_processing.add_edge("store_results", END) # store_results -> end of graph


# Execution starts at the load_data node.
workflow_data_processing.set_entry_point("load_data")

# Compile the builder into the runnable graph that the orchestration cell invokes.
app_data_processing = workflow_data_processing.compile()

print("\nFirst LangGraph (Data Processing + Placeholder Storage) compiled.")
print("The workflow is: load_data -> wrangle_data -> store_results -> END")
print("The functions `identify_skill_gaps` and `recommend_learning_paths` are called separately in the orchestration cell (8a444cd1).")


First LangGraph (Data Processing + Placeholder Storage) compiled.
The workflow is: load_data -> wrangle_data -> store_results -> END
The functions `identify_skill_gaps` and `recommend_learning_paths` are called separately in the orchestration cell (8a444cd1).


In [None]:
# Step 1: Install necessary libraries for embeddings
%pip install --quiet sentence-transformers

In [None]:
# Step 2: Load a pre-trained embedding model
# The resulting `embedding_model` is a notebook-level variable: the
# orchestration cell checks that it exists and calls `embedding_model.encode(...)`
# on the job market skills and on the user's skills.
from sentence_transformers import SentenceTransformer

print("Loading a pre-trained sentence embedding model...")
# Using a general-purpose model like 'all-MiniLM-L6-v2'
# NOTE(review): presumably the model weights are downloaded on first use and
# cached afterwards — confirm before running in an offline environment.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Embedding model loaded successfully.")

# Optional sanity check (left disabled): encode a few phrases and inspect the
# shape of the result.
# embeddings = embedding_model.encode(["Python Programming", "Data Analysis", "Cooking"])
# print("\nExample embeddings generated.")
# print(embeddings.shape) # Should be (number_of_sentences, embedding_dimension)

Loading a pre-trained sentence embedding model...
Embedding model loaded successfully.


In [None]:
# This cell orchestrates the entire workflow, in execution order:
# 1. Invoke the first LangGraph (data processing) to extract job market skills.
# 2. Generate embeddings for the job market skills.
# 3. Read the user's skills from standard input.
# 4. Generate embeddings for the user's skills.
# 5. Identify skill gaps (embeddings are used only for coverage).
# 6. Generate and display personalized learning pathways.
#
# Relies on names defined in earlier cells: app_data_processing,
# embedding_model, identify_skill_gaps and recommend_learning_paths.


def parse_skill_input(raw_skills):
    """Split a comma-separated skill string into a clean list of skill names.

    Args:
        raw_skills: The raw text typed by the user, e.g. "html, css,,js ".

    Returns:
        A list of skills in input order with surrounding whitespace removed.
        Blank entries (empty input, doubled or trailing commas) are dropped,
        so an empty or whitespace-only input yields an empty list instead of
        [''] — which would be truthy and would be embedded as a "skill".
    """
    stripped_parts = (part.strip() for part in raw_skills.split(','))
    return [part for part in stripped_parts if part]


print("Starting the Skill Gap Analysis and Recommendation Workflow...")

# Initial state for the data processing graph; only 'dataset_path' carries input.
initial_state_data = {
    'dataset_path': './multi-platform-online-courses-dataset/Coursera.csv', # Replace with your actual path
    'dataframe': None,
    'skills_data': [],
    'unique_job_market_skills': set(),
    'individual_skills': [],
    'skill_gaps': set(),
    'recommendations': "",
    'error': "",
    # Keep keys in initial state for type consistency, though not used in updated identify_skill_gaps output
    'job_market_categories': set(),
    'individual_categories': set(),
    'category_gaps': set(),
    'detailed_skill_gaps': {}
}

try:
    # Step 1: run the data processing graph compiled in an earlier cell.
    result_data = app_data_processing.invoke(initial_state_data)

    if 'unique_job_market_skills' in result_data and result_data['unique_job_market_skills']:
        unique_job_market_skills = result_data['unique_job_market_skills']

        # Step 2: embed the job market skills.
        # The cell runs at notebook top level, so globals() is the namespace
        # in which an earlier cell would have defined embedding_model.
        if 'embedding_model' in globals() and embedding_model is not None:
            print("\n--- Preparing Skill Embeddings ---")
            # encode() needs an ordered sequence, so materialise the set once
            # and reuse the same ordering when pairing skills with vectors.
            job_market_skills_list = list(unique_job_market_skills)
            job_market_skill_embeddings = embedding_model.encode(job_market_skills_list, show_progress_bar=False)
            job_market_embeddings_dict = dict(zip(job_market_skills_list, job_market_skill_embeddings))

            # Step 3: ask the user for their skills.
            print("\n--- USER INPUT ---")
            print("Please enter your skills, separated by commas:")
            individual_skills_input = input()

            # Blank entries are discarded so that empty input really means
            # "no skills" and the else-branch below is reachable.
            individual_skills = parse_skill_input(individual_skills_input)

            print("\nYour skills:")
            print(individual_skills)

            # Step 4: embed the user's skills (skipped when none were entered).
            individual_embeddings_dict = {}
            if individual_skills:
                individual_skill_embeddings = embedding_model.encode(individual_skills, show_progress_bar=False)
                individual_embeddings_dict = dict(zip(individual_skills, individual_skill_embeddings))
            else:
                print("No individual skills provided.")

            # Step 5: identify skill gaps; only the 'skill_gaps' entry of the
            # returned mapping is used from here on.
            print("\n--- IDENTIFYING SKILL GAPS ---")
            skill_gaps_info = identify_skill_gaps(unique_job_market_skills, individual_skills, job_market_embeddings_dict, individual_embeddings_dict)
            skill_gaps = skill_gaps_info.get('skill_gaps', set())

            print(f"\nIdentified {len(skill_gaps)} Skill Gaps (based on direct match and embedding similarity):")

            # Step 6: generate learning path recommendations from the gaps.
            print("\n--- PERSONALIZED LEARNING PATH RECOMMENDATIONS ---")
            recommendations = recommend_learning_paths({"skill_gaps": skill_gaps})

            # NOTE(review): this is a substring test, so a successful answer
            # that merely mentions the word "Error" is reported as a failure.
            # Tighten it once the error format of recommend_learning_paths is
            # confirmed.
            if "Error" in recommendations:
                print(f"\nRecommendation Error: {recommendations}")
            elif recommendations == "No skill gaps identified.":
                print("\nNo recommendations needed based on identified gaps.")
            else:
                print(recommendations) # Final output

        else:
            print("\nError: Embedding model not found. Please run the cell to load the embedding model (e.g., cell 0384d119).")

    else:
        print("Could not retrieve job market skills from the data processing graph result.")

except NameError:
    # Raised when a prerequisite cell (graph, model or helper functions) was not run.
    print("\nError: Required components (e.g., app_data_processing, embedding_model, auxiliary functions) are not defined.")
    print("Please ensure you have run the necessary cells to compile the graph, load the embedding model, and define functions.")
except Exception as e:
    # Deliberate best-effort: report the failure instead of aborting the notebook run.
    print(f"\nAn unexpected error occurred during the workflow: {e}")

print("\nWorkflow Finished.")

Starting the Skill Gap Analysis and Recommendation Workflow...
---LOADING DATA (Graph 1)---
Dataset loaded successfully.
---WRANGLING DATA (Graph 1)---
Total unique cleaned skills extracted: 328
---STORING RESULTS (Placeholder Node)---
No results to store.

--- Preparing Skill Embeddings ---


  return forward_call(*args, **kwargs)



--- USER INPUT ---
Please enter your skills, separated by commas:
html,css,js

Your skills:
['html', 'css', 'js']

--- IDENTIFYING SKILL GAPS ---
---IDENTIFYING SKILL GAPS (Using Embeddings Only for Coverage)---

Individual's skills (lowercase):
{'css', 'html', 'js'}

Identified 328 Specific Skill Gaps (not covered by direct match or embedding similarity).

Identified 328 Skill Gaps (based on direct match and embedding similarity):

--- PERSONALIZED LEARNING PATH RECOMMENDATIONS ---
---GENERATING LEARNING PATHS (Separate Function)---
Google Generative AI model configured for recommendations.

Generating personalized learning path recommendations...
This is an exceptionally comprehensive list of skill gaps, spanning almost every major domain in technology, business, and even specialized fields. Acquiring all of these would essentially mean becoming an expert in multiple, distinct careers.

Therefore, the first and most crucial step is **prioritization**. This learning pathway will be s