In [8]:
#!/usr/bin/env python
# coding: utf-8

# In[5]:


# pip install spacy


# In[7]:


# ============================================================================
# COURSE SYLLABI + RESUME KEYWORD EXTRACTION PIPELINE
# For Canvas-Career Bridge Matching System
# ============================================================================
import pandas as pd
import re
import json
from typing import List, Dict, Set, Tuple
import spacy
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Optional: Install if needed
# !pip install spacy yake-keyword pandas
# !python -m spacy download en_core_web_sm


class CourseResumeKeywordExtractor:
    """
    Extract and normalize keywords from course descriptions and resumes.
    Uses spaCy for NLP, with n-gram generation and abbreviation expansion.
    """
    
    def __init__(self):
        """Initialize NLP models and lookup dictionaries."""
        
        print("Initializing Course & Resume Keyword Extractor...")
        
        # Load spaCy model for NLP
        print("  Loading spaCy model...")
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except:
            print("  Downloading spaCy model...")
            import os
            os.system('python -m spacy download en_core_web_sm')
            self.nlp = spacy.load('en_core_web_sm')
        
        # ====================================================================
        # STOPWORDS
        # ====================================================================
        self.stopwords = {
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'been', 'be',
            'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
            'could', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
            'we', 'you', 'they', 'them', 'their', 'our', 'your', 'my', 'me', 'i',
            'he', 'she', 'it', 'who', 'what', 'where', 'when', 'why', 'how',
            'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other', 'some',
            'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too',
            'very', 'just', 'course', 'student', 'students', 'class', 'semester',
            'week', 'weeks', 'include', 'includes', 'including', 'also', 'well',
            'use', 'using', 'used', 'learn', 'learning', 'introduce', 'introduction'
        }
        
        # ====================================================================
        # ABBREVIATION EXPANSION DICTIONARY
        # ====================================================================
        self.abbreviation_map = {
            # Machine Learning & AI
            'ml': 'machine learning',
            'ai': 'artificial intelligence',
            'nlp': 'natural language processing',
            'cv': 'computer vision',
            'dl': 'deep learning',
            'nn': 'neural networks',
            'cnn': 'convolutional neural networks',
            'rnn': 'recurrent neural networks',
            'gan': 'generative adversarial networks',
            
            # Programming & Development
            'oop': 'object oriented programming',
            'api': 'application programming interface',
            'rest': 'representational state transfer',
            'crud': 'create read update delete',
            'mvc': 'model view controller',
            'ui': 'user interface',
            'ux': 'user experience',
            'sdk': 'software development kit',
            'ide': 'integrated development environment',
            
            # Data & Databases
            'sql': 'structured query language',
            'nosql': 'non-relational database',
            'rdbms': 'relational database management system',
            'etl': 'extract transform load',
            'olap': 'online analytical processing',
            'oltp': 'online transaction processing',
            'bi': 'business intelligence',
            'eda': 'exploratory data analysis',
            
            # Statistics & Analysis
            'anova': 'analysis of variance',
            'regression': 'regression analysis',
            'pca': 'principal component analysis',
            'svm': 'support vector machine',
            'knn': 'k nearest neighbors',
            'rf': 'random forest',
            
            # Cloud & DevOps
            'aws': 'amazon web services',
            'gcp': 'google cloud platform',
            'cicd': 'continuous integration continuous deployment',
            'ci/cd': 'continuous integration continuous deployment',
            'vm': 'virtual machine',
            
            # Business & Management
            'crm': 'customer relationship management',
            'erp': 'enterprise resource planning',
            'roi': 'return on investment',
            'kpi': 'key performance indicator',
            'b2b': 'business to business',
            'b2c': 'business to consumer',
            'saas': 'software as a service',
            'paas': 'platform as a service',
            'iaas': 'infrastructure as a service',
            
            # Academic & Research
            'apa': 'american psychological association',
            'mla': 'modern language association',
            'gpa': 'grade point average',
            'stem': 'science technology engineering mathematics',
            
            # Other
            'html': 'hypertext markup language',
            'css': 'cascading style sheets',
            'xml': 'extensible markup language',
            'json': 'javascript object notation',
            'http': 'hypertext transfer protocol',
            'https': 'hypertext transfer protocol secure',
            'url': 'uniform resource locator',
            'gui': 'graphical user interface',
            'cli': 'command line interface',
            'os': 'operating system',
            'io': 'input output',
            'ar': 'augmented reality',
            'vr': 'virtual reality',
            'iot': 'internet of things',
            'gis': 'geographic information system'
        }
        
        # ====================================================================
        # SYNONYM NORMALIZATION
        # ====================================================================
        self.synonym_map = {
            # Programming synonyms
            'coding': 'programming',
            'software development': 'programming',
            'software engineering': 'programming',
            'scripting': 'programming',
            
            # Data synonyms
            'data science': 'data analysis',
            'analytics': 'data analysis',
            'data analytics': 'data analysis',
            'statistical analysis': 'statistics',
            'statistical methods': 'statistics',
            'quantitative analysis': 'statistics',
            'quantitative methods': 'statistics',
            
            # Database synonyms
            'database management': 'database',
            'data storage': 'database',
            'data warehouse': 'database',
            
            # Modeling synonyms
            'predictive modeling': 'modeling',
            'statistical modeling': 'modeling',
            'mathematical modeling': 'modeling',
            
            # Visualization synonyms
            'data visualization': 'visualization',
            'visual analytics': 'visualization',
            'graphical analysis': 'visualization',
            
            # Research synonyms
            'research methods': 'research',
            'research design': 'research',
            'empirical research': 'research',
            
            # Analysis synonyms
            'econometric analysis': 'econometrics',
            'regression modeling': 'regression',
            'time series analysis': 'time series',
            
            # Communication synonyms
            'technical writing': 'writing',
            'business writing': 'writing',
            'oral presentation': 'presentation',
            'public speaking': 'presentation'
        }
        
        # ====================================================================
        # IMPORTANT SKILLS/CONCEPTS TO PRIORITIZE
        # ====================================================================
        self.important_terms = {
            # Technical skills
            'python', 'r', 'java', 'javascript', 'sql', 'c++', 'matlab',
            'tableau', 'excel', 'power bi', 'git', 'docker', 'kubernetes',
            
            # Methodologies
            'machine learning', 'deep learning', 'data analysis', 'statistics',
            'regression', 'hypothesis testing', 'statistical inference',
            'econometrics', 'time series', 'panel data', 'causal inference',
            
            # Domain concepts
            'optimization', 'simulation', 'modeling', 'forecasting',
            'algorithm', 'data structure', 'database', 'visualization',
            'research', 'experimentation', 'survey design', 'sampling',
            
            # Soft skills
            'communication', 'teamwork', 'leadership', 'problem solving',
            'critical thinking', 'project management', 'presentation',
            
            # Business concepts
            'supply chain', 'operations', 'finance', 'marketing', 'strategy',
            'policy analysis', 'economic analysis', 'business analysis'
        }
        
        print("‚úì Extractor initialized\n")
    
    
    def clean_text(self, text: str) -> str:
        """
        Clean and normalize text.
        
        Steps:
        1. Convert to lowercase
        2. Remove HTML tags
        3. Remove special characters but keep spaces and hyphens
        4. Remove extra whitespace
        """
        if not text or not isinstance(text, str):
            return ""
        
        # Lowercase
        text = text.lower()
        
        # Remove HTML tags
        text = re.sub(r'<[^>]+>', '', text)
        
        # Remove URLs
        text = re.sub(r'http\S+|www.\S+', '', text)
        
        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)
        
        # Remove special characters but keep spaces, hyphens, and forward slashes
        text = re.sub(r'[^\w\s/-]', ' ', text)
        
        # Remove standalone numbers (but keep numbers within words like "cs101")
        text = re.sub(r'\b\d+\b', '', text)
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)
        
        return text.strip()
    
    
    def tokenize_and_lemmatize(self, text: str) -> List[str]:
        """
        Tokenize text and lemmatize using spaCy.
        
        Returns:
        - List of lemmatized tokens (excluding stopwords and short words)
        """
        doc = self.nlp(text)
        
        tokens = []
        for token in doc:
            # Skip if it's a stopword, punctuation, or very short
            if (token.text.lower() in self.stopwords or 
                token.is_punct or 
                token.is_space or 
                len(token.text) < 2):
                continue
            
            # Use lemma (base form of word)
            lemma = token.lemma_.lower()
            
            # Skip if lemmatized form is a stopword
            if lemma not in self.stopwords:
                tokens.append(lemma)
        
        return tokens
    
    
    def generate_ngrams(self, tokens: List[str], max_n: int = 3) -> List[str]:
        """
        Generate n-grams (1-grams, 2-grams, 3-grams).
        
        Prioritizes academically meaningful phrases.
        """
        ngrams = []
        
        # Add unigrams
        ngrams.extend(tokens)
        
        # Add bigrams
        for i in range(len(tokens) - 1):
            bigram = f"{tokens[i]} {tokens[i+1]}"
            ngrams.append(bigram)
        
        # Add trigrams
        for i in range(len(tokens) - 2):
            trigram = f"{tokens[i]} {tokens[i+1]} {tokens[i+2]}"
            ngrams.append(trigram)
        
        return ngrams
    
    
    def expand_abbreviations(self, ngrams: List[str]) -> Tuple[List[str], List[str]]:
        """
        Expand abbreviations found in n-grams.
        
        Returns:
        - expanded_ngrams: n-grams with abbreviations expanded
        - expansions_found: list of (abbrev, expansion) pairs found
        """
        expanded_ngrams = []
        expansions_found = []
        
        for ngram in ngrams:
            if ngram in self.abbreviation_map:
                # Found an abbreviation
                expansion = self.abbreviation_map[ngram]
                expanded_ngrams.append(expansion)
                expansions_found.append(f"{ngram} ‚Üí {expansion}")
                # Also keep the original abbreviation
                expanded_ngrams.append(ngram)
            else:
                expanded_ngrams.append(ngram)
        
        return expanded_ngrams, expansions_found
    
    
    def normalize_synonyms(self, ngrams: List[str]) -> List[str]:
        """
        Normalize synonyms to canonical forms.
        """
        normalized = []
        
        for ngram in ngrams:
            if ngram in self.synonym_map:
                canonical = self.synonym_map[ngram]
                normalized.append(canonical)
            else:
                normalized.append(ngram)
        
        return normalized
    
    
    def extract_final_keywords(self, ngrams: List[str], top_n: int = 50) -> List[str]:
        """
        Extract final keywords by:
        1. Removing duplicates
        2. Prioritizing important terms
        3. Filtering by frequency
        4. Preferring longer phrases
        """
        # Count frequencies
        ngram_counts = Counter(ngrams)
        
        # Separate into important and other
        important_keywords = []
        other_keywords = []
        
        for ngram, count in ngram_counts.items():
            if ngram in self.important_terms:
                important_keywords.append((ngram, count, len(ngram.split())))
            else:
                other_keywords.append((ngram, count, len(ngram.split())))
        
        # Sort important keywords by: length (longer = better), then frequency
        important_keywords.sort(key=lambda x: (x[2], x[1]), reverse=True)
        
        # Sort other keywords similarly
        other_keywords.sort(key=lambda x: (x[2], x[1]), reverse=True)
        
        # Combine: prioritize important terms
        final_keywords = (
            [kw[0] for kw in important_keywords] + 
            [kw[0] for kw in other_keywords]
        )
        
        # Remove duplicates while preserving order
        seen = set()
        unique_keywords = []
        for kw in final_keywords:
            if kw not in seen:
                seen.add(kw)
                unique_keywords.append(kw)
        
        return unique_keywords[:top_n]
    
    
    def process_text(self, text: str, text_type: str = "course") -> Dict:
        """
        Complete pipeline to process text and extract keywords.
        
        Args:
            text: Input text (course description or resume)
            text_type: "course" or "resume" (for logging)
            
        Returns:
            Dictionary with all intermediate and final results
        """
        # Step 1: Clean text
        cleaned_text = self.clean_text(text)
        
        # Step 2: Tokenize and lemmatize
        tokens = self.tokenize_and_lemmatize(cleaned_text)
        
        # Step 3: Generate n-grams
        ngrams = self.generate_ngrams(tokens, max_n=3)
        
        # Step 4: Expand abbreviations
        expanded_ngrams, expansions = self.expand_abbreviations(ngrams)
        
        # Step 5: Normalize synonyms
        normalized_ngrams = self.normalize_synonyms(expanded_ngrams)
        
        # Step 6: Extract final keywords
        final_keywords = self.extract_final_keywords(normalized_ngrams, top_n=50)
        
        return {
            "cleaned_text": cleaned_text,
            "tokens": tokens,
            "ngrams": ngrams[:20],  # Sample for debugging
            "expanded_abbreviations": expansions,
            "final_keywords": final_keywords
        }
    
    
    def process_courses_csv(self, csv_path: str, 
                           course_name_col: str = 'course_name',
                           course_desc_col: str = 'course_description',
                           resume_col: str = 'resume') -> Dict:
        """
        Process CSV file containing courses and resume.
        
        Args:
            csv_path: Path to CSV file
            course_name_col: Column name for course names
            course_desc_col: Column name for course descriptions
            resume_col: Column name for resume text
            
        Returns:
            Complete structured output with all keywords
        """
        print("="*80)
        print("PROCESSING COURSES + RESUME CSV")
        print("="*80 + "\n")
        
        # Read CSV
        print(f"Reading CSV: {csv_path}")
        df = pd.read_csv(csv_path)
        print(f"  Found {len(df)} courses\n")
        
        # ====================================================================
        # PROCESS COURSES
        # ====================================================================
        print("Processing course descriptions...")
        
        courses_output = []
        all_course_keywords = []
        
        for idx, row in df.iterrows():
            course_name = row[course_name_col]
            course_desc = row[course_desc_col]
            
            print(f"  [{idx+1}/{len(df)}] {course_name}")
            
            # Process this course description
            result = self.process_text(course_desc, text_type="course")
            
            # Store course keywords
            courses_output.append({
                "course_name": course_name,
                "keywords": result["final_keywords"]
            })
            
            # Add to master list
            all_course_keywords.extend(result["final_keywords"])
        
        print(f"\n‚úì Processed {len(courses_output)} courses\n")
        
        # ====================================================================
        # PROCESS RESUME (only once since it's the same in all rows)
        # ====================================================================
        print("Processing resume...")
        
        resume_text = df[resume_col].iloc[0]  # Get from first row
        resume_result = self.process_text(resume_text, text_type="resume")
        resume_keywords = resume_result["final_keywords"]
        
        print(f"  ‚úì Extracted {len(resume_keywords)} resume keywords\n")
        
        # ====================================================================
        # CREATE UNIFIED MASTER LIST
        # ====================================================================
        print("Creating unified keyword master list...")
        
        # Combine all keywords
        all_keywords_combined = all_course_keywords + resume_keywords
        
        # Remove duplicates while preserving importance
        all_keywords_unique = self.extract_final_keywords(all_keywords_combined, top_n=100)
        
        print(f"  ‚úì Unified list contains {len(all_keywords_unique)} unique keywords\n")
        
        # ====================================================================
        # BUILD FINAL OUTPUT
        # ====================================================================
        output = {
            "courses": courses_output,
            "resume_keywords": resume_keywords,
            "all_keywords": all_keywords_unique,
            "statistics": {
                "total_courses": len(courses_output),
                "total_course_keywords": len(all_course_keywords),
                "total_resume_keywords": len(resume_keywords),
                "total_unique_keywords": len(all_keywords_unique)
            }
        }
        
        print("="*80)
        print("PROCESSING COMPLETE")
        print("="*80 + "\n")
        
        return output


# ============================================================================
# USAGE EXAMPLE
# ============================================================================

if __name__ == "__main__":
    
    # Initialize extractor
    extractor = CourseResumeKeywordExtractor()
    
    # ========================================================================
    # CREATE SAMPLE CSV FOR DEMONSTRATION
    # ========================================================================
    
    sample_data = pd.DataFrame({
        'course_name': [
            'STAT 215 - Statistical Inference',
            'CS 101 - Introduction to Programming',
            'ECON 301 - Applied Econometrics',
            'EDUC 200 - Education Policy'
        ],
        'course_description': [
            'Introduction to statistical inference including hypothesis testing, confidence intervals, and regression analysis. Students will learn to apply statistical methods to real-world data using R programming. Topics include ANOVA, linear regression, and experimental design.',
            'Fundamental programming concepts using Python. Topics include data structures, algorithms, OOP principles, and API development. Students will build projects using HTML, CSS, and JavaScript.',
            'Application of statistical and econometric methods to economic data. Focus on regression models, panel data analysis, time series, and causal inference techniques. Use of Stata and R for analysis.',
            'Examination of contemporary education policy issues including equity, access, and educational outcomes. Analysis of policy interventions using data-driven approaches and program evaluation methods.'
        ],
        'resume': [
            # Same resume in all rows
            '''John Doe
            Education: Bachelor of Science in Statistics, Minor in Computer Science
            
            Skills:
            - Programming: Python, R, SQL, Java
            - Data Analysis: Statistical Analysis, ML, Data Visualization, Tableau
            - Tools: Excel, Git, AWS
            
            Experience:
            Research Assistant | Dept of Education | 2024-Present
            - Statistical analysis using R and Python
            - Created visualizations using Tableau
            - ML models for predictive analytics
            
            Data Science Club President | 2023-Present
            - Led workshops on data analysis
            - Organized hackathons
            
            Soft Skills: Communication, Teamwork, Leadership, Problem Solving'''
        ] * 4  # Repeat same resume for all rows
    })
    
    # Save to CSV
    sample_data.to_csv('sample_courses.csv', index=False)
    print("Created sample CSV: sample_courses.csv\n")
    
    # ========================================================================
    # PROCESS THE CSV
    # ========================================================================
    
    results = extractor.process_courses_csv(
        'sample_courses.csv',
        course_name_col='course_name',
        course_desc_col='course_description',
        resume_col='resume'
    )
    
    # ========================================================================
    # SAVE TO JSON
    # ========================================================================
    
    with open('extracted_keywords.json', 'w') as f:
        json.dump(results, f, indent=2)
    
    print("Saved results to: extracted_keywords.json\n")
    
    # ========================================================================
    # DISPLAY RESULTS
    # ========================================================================
    
    print("="*80)
    print("RESULTS SUMMARY")
    print("="*80 + "\n")
    
    print(f"Statistics:")
    for key, value in results['statistics'].items():
        print(f"  ‚Ä¢ {key}: {value}")
    
    print("\n" + "-"*80 + "\n")
    
    print("Course Keywords (sample):")
    for course in results['courses'][:2]:  # Show first 2
        print(f"\n{course['course_name']}:")
        print(f"  Keywords: {', '.join(course['keywords'][:10])}...")
    
    print("\n" + "-"*80 + "\n")
    
    print("Resume Keywords (first 20):")
    print(f"  {', '.join(results['resume_keywords'][:20])}")
    
    print("\n" + "-"*80 + "\n")
    
    print("Unified Master List (first 30):")
    print(f"  {', '.join(results['all_keywords'][:30])}")
    
    print("\n" + "="*80)

# ============================================================================
# JOB LISTING SEMANTIC MATCHING PIPELINE
# ============================================================================

from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class JobKeywordExtractor(CourseResumeKeywordExtractor):
    """
    Extracts normalized keywords for each job listing, embeds them using
    SentenceTransformer, embeds the user's unified keyword list, performs
    semantic matching using cosine similarity, and returns the TOP 3 JOBS.
    """

    def __init__(self):
        super().__init__()

        print("\nüîπ Loading SentenceTransformer model...")
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
        print("   ‚úì Embedding model loaded\n")

    def embed(self, text: str) -> np.ndarray:
        """Return L2-normalized embedding for a text string"""
        return self.embedder.encode(text, normalize_embeddings=True)

    

    def process_jobs_csv(self,
                         csv_path: str,
                         job_name_col: str = "job_name",
                         job_desc_col: str = "job_description",
                         job_link_col: str = "job_link") -> dict:

        print("="*90)
        print("PROCESSING JOB LISTINGS + MATCHING")
        print("="*90 + "\n")

        try:
            df = pd.read_csv(csv_path, encoding="utf-8")
        except UnicodeDecodeError:
            print("‚ö† UTF-8 failed ‚Üí trying latin-1")
            df = pd.read_csv(csv_path, encoding="latin-1")
            print(f"Found {len(df)} job listings\n")

        job_keyword_dict = {}

        # ==========================================================
        # 1Ô∏è‚É£ Extract keywords per job
        # ==========================================================
        for i, row in df.iterrows():

            job_name = str(row[job_name_col])
            job_text = str(row[job_desc_col]) if pd.notna(row[job_desc_col]) else ""

            print(f"  ‚ñ∂ [{i+1}/{len(df)}] Extracting ‚Üí {job_name}")

            result = self.process_text(job_text, text_type="job")
            job_keyword_dict[job_name] = result["final_keywords"]

        print("\n‚úì Finished extracting job keywords\n")

        # ==========================================================
        # 2Ô∏è‚É£ Embed USER unified keyword list
        # ==========================================================
        print("Embedding USER master keyword list...")

        # Combine master list into one text block
        user_keywords = []
        for job_list in job_keyword_dict.values():
            user_keywords.extend(job_list)

        user_keywords = list(set(user_keywords))     # dedupe
        user_text = " ; ".join(user_keywords)
        user_vec = self.embed(user_text).reshape(1, -1)

        # ==========================================================
        # 3Ô∏è‚É£ Embed each JOB & score similarity
        # ==========================================================
        print("Computing semantic similarity...\n")

        scored_jobs = []

        for job_name, kw_list in job_keyword_dict.items():

            job_text = " ; ".join(kw_list)
            job_vec = self.embed(job_text).reshape(1, -1)

            score = float(cosine_similarity(user_vec, job_vec)[0][0])

            scored_jobs.append((job_name, score))

        scored_jobs.sort(key=lambda x: x[1], reverse=True)

        # ==========================================================
        # 4Ô∏è‚É£ RETURN ONLY TOP 3
        # ==========================================================
        top_matches = [
            {"job": job, "score": float(f"{score:.4f}")}
            for job, score in scored_jobs[:3]
        ]

        print("üî• TOP 3 MATCHES:\n")
        for m in top_matches:
            print(f"‚≠ê {m['job']} ‚Üí {m['score']}")

        print("\n‚úì SEMANTIC MATCHING COMPLETE\n")

        return {"top_matches": top_matches}

# ============================================================================
# SAMPLE USAGE BLOCK
# (You may keep, delete, or adjust paths as needed)
# ============================================================================

if __name__ == "__main__":

    # ---------------------------------------------
    # OPTIONAL: RUN JOB KEYWORD EXTRACTION
    # ---------------------------------------------

    try:
        print("\n" + "="*90)
        print("RUNNING JOB DESCRIPTION PIPELINE DEMO")
        print("="*90 + "\n")

        job_extractor = JobKeywordExtractor()

        results = job_extractor.process_jobs_csv(
            csv_path="sample_jobs.csv",      
            job_name_col="job_name",
            job_desc_col="job_description",
            job_link_col="job_link"
        )

        with open("job_keywords.json", "w") as f:
            json.dump(results, f, indent=2)

        print("\nSaved ‚Üí job_keywords.json")

    except Exception as e:
        print("\n‚ö† Job extraction block skipped (no job CSV found).")
        print("  Error =", e)




Initializing Course & Resume Keyword Extractor...
  Loading spaCy model...
‚úì Extractor initialized

Created sample CSV: sample_courses.csv

PROCESSING COURSES + RESUME CSV

Reading CSV: sample_courses.csv
  Found 4 courses

Processing course descriptions...
  [1/4] STAT 215 - Statistical Inference
  [2/4] CS 101 - Introduction to Programming
  [3/4] ECON 301 - Applied Econometrics
  [4/4] EDUC 200 - Education Policy

‚úì Processed 4 courses

Processing resume...
  ‚úì Extracted 50 resume keywords

Creating unified keyword master list...
  ‚úì Unified list contains 100 unique keywords

PROCESSING COMPLETE

Saved results to: extracted_keywords.json

RESULTS SUMMARY

Statistics:
  ‚Ä¢ total_courses: 4
  ‚Ä¢ total_course_keywords: 200
  ‚Ä¢ total_resume_keywords: 50
  ‚Ä¢ total_unique_keywords: 100

--------------------------------------------------------------------------------

Course Keywords (sample):

STAT 215 - Statistical Inference:
  Keywords: statistical inference, regression, a

In [2]:
        
# ============================================================================
# SAMPLE USAGE BLOCK
# ============================================================================

if __name__ == "__main__":

    import pandas as pd
    import json
    sample_jobs.csv = pd.read_csv('/Users/marikaclark/Downloads/MockJobScrape.csv')

    # -------------------------------------------------------------
    # 2Ô∏è‚É£ RUN JOB EXTRACTION
    # -------------------------------------------------------------
    try:
        print("\n" + "=" * 90)
        print("RUNNING JOB DESCRIPTION PIPELINE DEMO")
        print("=" * 90 + "\n")

        job_extractor = JobKeywordExtractor()

        results = job_extractor.process_jobs_csv(
            csv_path="sample_jobs.csv",
            job_name_col="job_name",
            job_desc_col="job_description",
            job_link_col="job_link"
        )

        # ---------------------------------------------------------
        # 3Ô∏è‚É£ SAVE OUTPUT
        # ---------------------------------------------------------
        with open("job_keywords.json", "w") as f:
            json.dump(results, f, indent=2)

        print("\nüíæ Saved extracted keywords ‚Üí job_keywords.json\n")

        # ---------------------------------------------------------
        # 4Ô∏è‚É£ PRINT KEYWORDS TO SCREEN
        # ---------------------------------------------------------
        print("üìä EXTRACTED KEYWORDS\n")
        for job in results["jobs"]:
            print(f"üîπ {job['job_name']}")
            print("   Keywords:", ", ".join(job["keywords"][:15]), "...\n")

        print("\nüî• MASTER KEYWORD LIST:")
        print(", ".join(results["all_keywords"][:40]), "...\n")

    except Exception as e:
        print("\n‚ö† Job extraction block skipped.")
        print("  Error =", e)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/marikaclark/Downloads/MockJobScrape'

In [9]:
# ============================================================================
# MASTER EXECUTION BLOCK   (FINAL VERSION WITH REAL PATHS)
# ============================================================================

if __name__ == "__main__":

    import pandas as pd
    import json

    print("\n" + "="*100)
    print("üöÄ RUNNING FULL PIPELINE: COURSE ‚Üí RESUME ‚Üí JOB MATCHING")
    print("="*100 + "\n")

    # ============================================================
    # 1Ô∏è‚É£ PROCESS COURSE + RESUME CSV
    # ============================================================
    try:
        extractor = CourseResumeKeywordExtractor()

        COURSE_PATH = "/Users/marikaclark/Downloads/canvas_complete_data (13).csv"

        print(f"\nüìÅ Reading COURSE data from:\n   {COURSE_PATH}\n")

        course_results = extractor.process_courses_csv(
            COURSE_PATH,
            course_name_col="Course Name",
            course_desc_col="Description",
            resume_col="Resume Info"
        )

        print("\nüéØ Extracted unified student keyword list!")
        print(f"   ‚Üí {len(course_results['all_keywords'])} unique keywords\n")

    except Exception as e:
        print("\n‚ùå ERROR WHILE PROCESSING COURSE CSV\n", e)
        raise SystemExit()

    # ============================================================
    # 2Ô∏è‚É£ PROCESS JOB LISTINGS + SEMANTIC MATCHING
    # ============================================================
    try:
        print("\n" + "="*90)
        print("üîé RUNNING JOB SEMANTIC MATCHING ENGINE")
        print("="*90 + "\n")

        job_extractor = JobKeywordExtractor()

        JOB_PATH = "/Users/marikaclark/Downloads/MockJobScrape.csv"
        

        print(f"üìÅ Reading JOB listings from:\n   {JOB_PATH}\n")

        results = job_extractor.process_jobs_csv(
            csv_path=JOB_PATH,
            job_name_col="job",
            job_desc_col="job description",
            job_link_col="job link"
        )

    except Exception as e:
        print("\n‚ùå ERROR WHILE PROCESSING JOB CSV\n", e)
        raise SystemExit()

    # ============================================================
    # 3Ô∏è‚É£ SAVE OUTPUT (OPTIONAL)
    # ============================================================
    with open("job_keywords.json", "w") as f:
        json.dump(results, f, indent=2)

    print("\nüíæ Saved ‚Üí job_keywords.json\n")

    print("\n=================== DONE ‚úÖ ===================")



üöÄ RUNNING FULL PIPELINE: COURSE ‚Üí RESUME ‚Üí JOB MATCHING

Initializing Course & Resume Keyword Extractor...
  Loading spaCy model...
‚úì Extractor initialized


üìÅ Reading COURSE data from:
   /Users/marikaclark/Downloads/canvas_complete_data (13).csv

PROCESSING COURSES + RESUME CSV

Reading CSV: /Users/marikaclark/Downloads/canvas_complete_data (13).csv
  Found 8 courses

Processing course descriptions...
  [1/8] 25F Chem1410 Section 600
  [2/8] 25F Engineering Foundations 1_TIAN
  [3/8] 25F Intro College Chem I Lab
  [4/8] 25F Multivariable Calculus
  [5/8] 2025F Econ 2010 - 091
  [6/8] 2025F Econ 2010 Pan ‚Äì 102
  [7/8] Canvas Hackathon (Professor Wright and Professor Lewis)
  [8/8] UVA Engineering Calculus Placement (June 16th-June 20th)

‚úì Processed 8 courses

Processing resume...
  ‚úì Extracted 50 resume keywords

Creating unified keyword master list...
  ‚úì Unified list contains 100 unique keywords

PROCESSING COMPLETE


üéØ Extracted unified student keyword list