In [1]:
# Step 11 - Production Inference with Pre-trained Model
print("\n=== Step 11: Production Inference (Load & Predict) ===")

import json
import numpy as np
import joblib
from tensorflow import keras
from pathlib import Path
from datetime import datetime

class ProductionResumeClassifier:
    """
    Lightweight inference class that loads pre-trained artifacts
    No training - just prediction on new resumes
    """
    
    def __init__(self, artifacts_dir='artifacts', models_dir='models'):
        """
        Load all pre-trained artifacts once during initialization
        """
        self.artifacts_dir = Path(artifacts_dir)
        self.models_dir = Path(models_dir)
        
        print("Loading pre-trained model and artifacts...")
        
        # Load trained model
        self.model = keras.models.load_model(
            self.models_dir / 'resume_classifier_complete.h5'
        )
        print("✓ Loaded model")
        
        # Load all preprocessing artifacts
        self.scaler = joblib.load(self.artifacts_dir / 'feature_scaler.pkl')
        print("✓ Loaded feature scaler")
        
        with open(self.artifacts_dir / 'skill_vocabulary.json', 'r') as f:
            self.skill_vocab = json.load(f)
        print(f"✓ Loaded skill vocabulary ({len(self.skill_vocab)} skills)")
        
        self.label_encoder = joblib.load(self.artifacts_dir / 'label_encoder.pkl')
        print("✓ Loaded label encoder")
        
        self.feature_builder = joblib.load(
            self.artifacts_dir / 'feature_vector_builder.pkl'
        )
        print("✓ Loaded feature vector builder")
        
        with open(self.artifacts_dir / 'domain_requirements.json', 'r') as f:
            self.domain_requirements = json.load(f)
        print("✓ Loaded domain requirements")
        
        # IMPORTANT: Load the complete classification pipeline from Step 9
        # This includes the explanation generation logic
        self.pipeline = joblib.load(self.artifacts_dir / 'classification_pipeline.pkl')
        print("✓ Loaded classification pipeline (with explanation generator)")
        
        # Load model manifest for metadata
        with open(self.artifacts_dir / 'model_manifest.json', 'r') as f:
            self.manifest = json.load(f)
        
        print(f"\nModel version: {self.manifest['version']}")
        print(f"Model created: {self.manifest['created_date']}")
        print(f"Test accuracy: {self.manifest['test_accuracy']:.3f}")
        print("\n✓ All artifacts loaded successfully!")
    
    def predict_single(self, resume_dict, return_probabilities=False, include_raw_scores=True, precision=3):
        """
        Predict classification for a single resume
        Uses the exact same pipeline as Step 9
        
        Args:
            resume_dict: Dictionary with resume data
            return_probabilities: If True, return class probabilities
            include_raw_scores: If True, include raw test score in feature summary
            precision: Decimal precision for numeric values
        
        Returns:
            Dictionary with prediction results (same format as Step 9)
        """
        # Use the loaded pipeline's classify_resume method directly
        # This ensures we use the EXACT same explanation generation logic
        result = self.pipeline.classify_resume(
            resume_dict, 
            include_raw_scores=include_raw_scores,
            precision=precision
        )
        
        # Add class probabilities if requested
        if return_probabilities and 'error' not in result:
            try:
                # Extract features and get predictions to retrieve probabilities
                resume_features = self._extract_features_minimal(resume_dict)
                scaled_numeric = self.scaler.transform([resume_features['numeric_features']])
                resume_features['scaled_numeric_features'] = scaled_numeric[0]
                
                feature_vector = self.feature_builder.build_final_vector(resume_features)
                model_inputs = self._prepare_model_inputs(feature_vector.reshape(1, -1))
                class_probs = self.model.predict(model_inputs, verbose=0)[0]
                
                result["class_probabilities"] = {
                    label: round(float(prob), precision)
                    for label, prob in zip(self.label_encoder.classes_, class_probs)
                }
            except Exception as e:
                result["probabilities_error"] = str(e)
        
        return result
    
    def predict_batch(self, resume_list, output_file=None, include_raw_scores=True, precision=3):
        """
        Predict classifications for multiple resumes
        Uses the pipeline's batch classification
        
        Args:
            resume_list: List of resume dictionaries
            output_file: Optional path to save results as JSON
            include_raw_scores: Include raw test scores
            precision: Decimal precision
        
        Returns:
            List of prediction results
        """
        results = []
        
        print(f"\nProcessing {len(resume_list)} resumes...")
        for i, resume in enumerate(resume_list, 1):
            if i % 10 == 0:
                print(f"  Processed {i}/{len(resume_list)}...")
            
            result = self.predict_single(
                resume, 
                return_probabilities=False,
                include_raw_scores=include_raw_scores,
                precision=precision
            )
            results.append(result)
        
        print(f"✓ Completed {len(results)} predictions")
        
        # Save if output file specified
        if output_file:
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            
            with open(output_path, 'w') as f:
                json.dump(results, f, indent=2)
            print(f"✓ Saved results to {output_file}")
        
        return results
    
    def _extract_features_minimal(self, resume):
        """Minimal feature extraction for probability calculation"""
        domain_key = None
        for key, req in self.domain_requirements.items():
            if req["domain"] == resume["preferred_domain"]:
                domain_key = key
                break
        
        if domain_key is None:
            raise ValueError(f"Unknown domain: {resume['preferred_domain']}")
        
        required_skills = self.domain_requirements[domain_key]["required_skills"]
        candidate_skills = resume.get('skills', [])
        
        skill_vector = self._encode_skills(candidate_skills)
        matched, missing, ratio = self._compute_skill_matches(candidate_skills, required_skills)
        
        projects = resume.get('projects', [])
        work_exp = resume.get('work_experience', [])
        test_score = resume.get('test_score', 0)
        
        project_count = len(projects)
        project_text = " ".join(projects) if projects else ""
        
        years_exp = sum(item.get('years', 0) for item in work_exp)
        max_years = max((item.get('years', 0) for item in work_exp), default=0)
        exp_text = " ".join(item.get('title', '') for item in work_exp)
        
        return {
            'skill_vector': skill_vector,
            'skill_match_ratio': ratio,
            'matched_skills': matched,
            'missing_skills': missing,
            'project_count': project_count,
            'project_text': project_text,
            'years_experience': years_exp,
            'max_years': max_years,
            'experience_text': exp_text,
            'test_score': test_score,
            'test_score_norm': test_score / 100.0,
            'numeric_features': [years_exp, max_years, project_count]
        }
    
    def _encode_skills(self, candidate_skills):
        """Binary encoding of skills"""
        skill_vector = np.zeros(len(self.skill_vocab), dtype=int)
        normalized_skills = {s.strip().lower() for s in candidate_skills}
        
        for i, vocab_skill in enumerate(self.skill_vocab):
            if vocab_skill in normalized_skills:
                skill_vector[i] = 1
        
        return skill_vector
    
    def _compute_skill_matches(self, candidate_skills, required_skills):
        """Compute matched/missing skills"""
        candidate_set = {s.strip().lower() for s in candidate_skills}
        required_set = {s.strip().lower() for s in required_skills}
        
        matched = list(candidate_set.intersection(required_set))
        missing = list(required_set - candidate_set)
        ratio = len(matched) / len(required_set) if required_set else 0.0
        
        return matched, missing, ratio
    
    def _prepare_model_inputs(self, X):
        """Split feature vector into model inputs"""
        skill_dim = len(self.skill_vocab) + 1
        numeric_dim = 4
        
        skill_features = X[:, :skill_dim]
        numeric_features = X[:, skill_dim:skill_dim + numeric_dim]
        
        inputs = [skill_features, numeric_features]
        
        if self.feature_builder.use_text_embeddings:
            text_features = X[:, skill_dim + numeric_dim:]
            inputs.append(text_features)
        
        return inputs


# ============= USAGE EXAMPLE =============

print("\n" + "="*60)
print("STEP 11: PRODUCTION INFERENCE DEMO")
print("="*60)

# Initialize production classifier (loads everything once)
classifier = ProductionResumeClassifier()

# Example 1: Predict single resume
print("\n--- Example 1: Single Resume Prediction ---")

new_resume = {
    "id": "new_candidate_001",
    "preferred_domain": "Data Science",
    "skills": ["Python", "Pandas", "NumPy", "Scikit-learn", "SQL", "Docker"],
    "projects": ["Customer Churn Model", "Sales Forecasting"],
    "work_experience": [
        {"title": "Data Analyst", "years": 2},
        {"title": "Junior Data Scientist", "years": 1}
    ],
    "test_score": 82
}

result = classifier.predict_single(new_resume, return_probabilities=True)

print(f"\nCandidate ID: {result['metadata']['candidate_id']}")
print(f"Prediction: {result['label']}")
print(f"Confidence: {result['confidence']}")
print(f"Matched skills ({len(result['matched_skills'])}): {', '.join(result['matched_skills'])}")
print(f"Missing skills ({len(result['missing_skills'])}): {', '.join(result['missing_skills'][:5])}")
print(f"\nExplanation: {result['explanation']}")

if 'class_probabilities' in result:
    print(f"\nClass probabilities:")
    for label, prob in result['class_probabilities'].items():
        print(f"  {label}: {prob:.3f}")

if 'alternative_domain_suggestions' in result:
    print(f"\nAlternative domain suggestions:")
    for alt in result['alternative_domain_suggestions']:
        print(f"  {alt['rank']}. {alt['domain']}: {alt['skill_match_ratio']:.1%} match")

# Example 2: Batch prediction
print("\n--- Example 2: Batch Prediction ---")

# Load some test resumes
with open('data/balanced_synthetic_resumes.json', 'r') as f:
    all_resumes = json.load(f)

# Take unseen resumes (simulate new data)
new_resumes = all_resumes[1800:1810]  # Last 10 resumes as "new" data

# Batch predict
batch_results = classifier.predict_batch(
    new_resumes, 
    output_file='predictions/new_predictions.json'
)

# Show summary
print("\n--- Batch Prediction Summary ---")
label_counts = {}
for result in batch_results:
    if 'label' in result:
        label = result['label']
        label_counts[label] = label_counts.get(label, 0) + 1

for label, count in sorted(label_counts.items()):
    print(f"{label}: {count} ({count/len(batch_results)*100:.1f}%)")

# Show sample predictions
print("\n--- Sample Predictions ---")
for result in batch_results[:3]:
    if 'error' not in result:
        print(f"\n{result['metadata']['candidate_id']}: {result['label']} ({result['confidence']})")
        print(f"  {result['explanation']}")

print("\n" + "="*60)
print("✓ Step 11 Complete - Ready for Production!")
print("="*60)


=== Step 11: Production Inference (Load & Predict) ===



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\pavan\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\pavan\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\pavan\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\pavan\AppData\Roaming\Python\

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\pavan\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\pavan\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\pavan\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\pavan\AppData\Roaming\Python\

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




STEP 11: PRODUCTION INFERENCE DEMO
Loading pre-trained model and artifacts...




✓ Loaded model


AttributeError: Can't get attribute 'ResumeFeatureScaler' on <module '__main__'>