## **Load the Dataset**

In [None]:
import pandas as pd
from pathlib import Path

# Robust CSV loading: probe a few likely locations and load the first one that exists.
_candidates = [
    Path('job_descriptions.csv'),
    Path('./dataset/job_descriptions.csv'),
]
# `__file__` only exists when this code runs as a script, so it is probed conditionally.
if '__file__' in globals():
    _candidates.append(Path(__file__).resolve().parent / 'job_descriptions.csv')

_df = None
for candidate in _candidates:
    if candidate.exists():
        _df = pd.read_csv(candidate)
        break
if _df is None:
    # Re-reading a path we already know is missing only yields a vague error;
    # fail with the same exception type but list every location that was tried.
    raise FileNotFoundError(
        'job_descriptions.csv not found; looked in: '
        + ', '.join(str(p.resolve()) for p in _candidates)
    )

df = _df
print('Loaded rows:', len(df))
df.head()


## **Basic Exploration**

In [None]:
# Dimensions and column names of the loaded frame.
print('Number of rows:', df.shape[0])
print('Number of columns:', df.shape[1])

print('Name of columns:', df.columns.tolist())

# DataFrame.info() writes its report directly to stdout and returns None, so passing it
# to print() would emit the report first and then a misleading "Data types: None".
print('Data types:')
df.info()

# Print the label on its own line so the summary table keeps its column alignment.
print('Summary:')
print(df.describe(include='all'))

## **Check Missing Data**

In [None]:
# Per-column count of missing entries (isna is the canonical name; isnull is its alias).
df.isna().sum()

Unnamed: 0,0
Job Id,0
Experience,0
Qualifications,0
Salary Range,0
location,0
Country,0
latitude,0
longitude,0
Work Type,0
Company Size,0


## **Handle Duplicates**

In [None]:
# Report how many fully duplicated rows exist before removing them. A bare expression
# that is not the final statement of a cell is never displayed, so print it explicitly.
n_duplicates = int(df.duplicated().sum())
print('Duplicate rows:', n_duplicates)

# Keep the first occurrence of every duplicated row.
df = df.drop_duplicates()

## **Handle Missing Data**

In [None]:
# Ensure expected columns exist to avoid KeyErrors downstream.
# Columns that are absent are created as empty strings.
for col in ['Company Profile', 'Job Posting Date', 'Country', 'Work Type', 'Job Title', 'Job Description']:
    if col not in df.columns:
        df[col] = ''

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df["Company Profile"]: the chained in-place form operates on an intermediate object,
# triggers a FutureWarning, and stops updating `df` altogether in pandas 3.0.
df["Company Profile"] = df["Company Profile"].fillna("Not Provided")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Company Profile"].fillna("Not Provided", inplace=True)


## **Data Cleaning**

In [None]:
# Parse posting dates; errors="coerce" turns unparseable values into NaT instead of raising.
df["Job Posting Date"] = pd.to_datetime(df["Job Posting Date"], errors="coerce")

## **Quick Exploratory Data Analysis (EDA)**

In [None]:
# Ten most frequent countries (value_counts orders by descending count).
df["Country"].value_counts().iloc[:10]

Unnamed: 0_level_0,count
Country,Unnamed: 1_level_1
Jordan,11
Cameroon,10
Papua New Guinea,10
Maldives,9
Solomon Islands,9
Brunei,9
Kyrgyz Republic,9
UK,9
Comoros,9
France,9


In [None]:
# Ten most frequent job titles (value_counts orders by descending count).
df["Job Title"].value_counts().iloc[:10]

Unnamed: 0_level_0,count
Job Title,Unnamed: 1_level_1
UX/UI Designer,38
Software Engineer,23
Customer Support Specialist,15
Software Tester,14
Sales Representative,14
Procurement Manager,14
Network Administrator,13
Litigation Attorney,13
Database Administrator,13
Supply Chain Manager,13


In [None]:
# Number of postings per work type, most frequent first.
df["Work Type"].value_counts()

Unnamed: 0_level_0,count
Work Type,Unnamed: 1_level_1
Part-Time,213
Contract,192
Temporary,192
Full-Time,188
Intern,181


In [None]:
# Check if Salary Range column exists before accessing it.
# An expression nested inside an if-block is not the cell's last top-level expression,
# so Jupyter would not render it; print the preview explicitly so it actually appears.
if "Salary Range" in df.columns:
    print(df["Salary Range"].head(10))
else:
    print("Salary Range column not found in dataset")


Unnamed: 0,Salary Range
0,$59K-$99K
1,$56K-$116K
2,$61K-$104K
3,$65K-$91K
4,$64K-$87K
5,$59K-$93K
6,$63K-$103K
7,$65K-$102K
8,$65K-$102K
9,$60K-$80K


## **Save Cleaned Dataset**

In [None]:
# Persist the de-duplicated, date-parsed frame; index=False omits the row index column.
df.to_csv("cleaned_jobs.csv", index=False)

## **Advanced Text Processing & Feature Engineering for JobLens**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
# Suppress every warning (all categories) for the rest of the session.
warnings.filterwarnings('ignore')

# FAST mode for quick runs
# FAST is True only when the JOBLENS_FAST environment variable is exactly '1'; later
# cells use it to shrink TF-IDF features, search iterations and the SHAP sample size.
import os
FAST = os.environ.get('JOBLENS_FAST', '0') == '1'
print('FAST mode:', FAST)

# Optional advanced libs
# Each probe sets a module-level flag that later cells check before using the library.
# Any exception raised during import (not only ImportError) marks the library unavailable.
try:
    import spacy
    from spacy.matcher import PhraseMatcher
    _spacy_available = True
except Exception:
    _spacy_available = False

try:
    from sentence_transformers import SentenceTransformer
    _st_available = True
except Exception:
    _st_available = False

try:
    import shap
    _shap_available = True
except Exception:
    _shap_available = False

# Download required NLTK data
# NOTE(review): none of these resources is referenced by the cells visible here
# (stopwords / WordNetLemmatizer are imported but not called) — confirm they are still needed.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)

print("Libraries imported successfully! spaCy:", _spacy_available, "SentenceTransformers:", _st_available, "SHAP:", _shap_available)


## **Text Preprocessing Functions**

In [None]:
def clean_text(text):
    """Normalize free text for feature extraction.

    Lower-cases the input, turns every character that is not an ASCII letter,
    digit or whitespace into a space, and collapses whitespace runs into single
    spaces (leading/trailing whitespace is dropped).

    Punctuation is replaced by a space rather than deleted so that words
    separated only by punctuation remain separate tokens
    ("python,java" -> "python java" instead of "pythonjava").

    Args:
        text: Any scalar value; non-strings are converted with ``str()``.

    Returns:
        The cleaned string, or "" when ``text`` is missing (None/NaN).
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    # The text is already lower-cased, so a-z covers every letter that is kept.
    spaced = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    # split()/join collapses the whitespace runs introduced by the substitution.
    return ' '.join(spaced.split())

# Improved skill extraction using spaCy PhraseMatcher with fallback to keyword list
# All entries are lower-case; several contain punctuation ('c++', 'c#', 'next.js', 'ci/cd').
_skill_list = [
    'python','java','javascript','c++','c#','go','rust','sql','html','css','react','angular','vue','node','django','flask','spring',
    'kotlin','swift','typescript','pandas','numpy','scikit-learn','pytorch','tensorflow','nlp','bert','transformers','machine learning',
    'deep learning','data science','aws','gcp','azure','docker','kubernetes','git','agile','scrum','linux','bash','postgresql','mysql',
    'mongodb','redis','grpc','rest','graphql','fastapi','express','next.js','nestjs','ci/cd','terraform','ansible','snowflake','airflow'
]

# Build `nlp` and `phrase_matcher` when spaCy is usable; otherwise both are None and
# `_spacy_available` is forced to False so extract_skills() takes its keyword fallback.
if _spacy_available:
    try:
        try:
            nlp = spacy.load('en_core_web_sm')
        except Exception:
            # Attempt to download model at runtime
            from spacy.cli import download as spacy_download
            spacy_download('en_core_web_sm')
            nlp = spacy.load('en_core_web_sm')
        # attr='LOWER' makes the matcher compare lower-cased token text (case-insensitive).
        phrase_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        # make_doc only tokenizes, which is all the matcher needs for its patterns.
        patterns = [nlp.make_doc(s) for s in _skill_list]
        phrase_matcher.add('SKILLS', patterns)
    except Exception:
        # Any failure (model load, download, matcher construction) disables the spaCy path.
        _spacy_available = False
        nlp = None
        phrase_matcher = None
else:
    nlp = None
    phrase_matcher = None

def extract_skills(text: str):
    """Return the sorted, de-duplicated skills from ``_skill_list`` mentioned in ``text``.

    The spaCy ``phrase_matcher`` is used when it is available. If it is not, or if it
    finds nothing, a keyword search over the lower-cased text is used instead.

    The keyword search matches whole terms only: a keyword counts when it is not
    directly preceded or followed by a letter or digit. Plain substring tests
    produce false positives such as 'go' in "django", 'java' in "javascript",
    'rest' in "interest" or 'git' in "digital".

    Args:
        text: Raw text to scan; missing values (None/NaN) yield an empty list.

    Returns:
        A sorted list of lower-case skill names.
    """
    if pd.isna(text):
        return []
    text_s = str(text)
    if _spacy_available and nlp is not None and phrase_matcher is not None:
        doc = nlp(text_s)
        skills = sorted({doc[start:end].text.lower() for _, start, end in phrase_matcher(doc)})
        if skills:
            return skills
    # Fallback: whole-term keyword presence.
    t = text_s.lower()
    return sorted(
        kw for kw in set(_skill_list)
        # re.escape keeps keywords such as 'c++' or 'next.js' literal; the lookarounds
        # reject matches embedded in a longer alphanumeric word.
        if re.search(r'(?<![a-z0-9])' + re.escape(kw) + r'(?![a-z0-9])', t)
    )


## **Data Preprocessing for JobLens Model**

In [None]:
# Create a clean copy for ML processing
df_ml = df.copy()

# Keep the raw text: skill extraction needs the original punctuation
# (e.g. 'c++', 'ci/cd', 'next.js'), which clean_text() removes.
_raw_title = df_ml['Job Title'].fillna('').astype(str)
_raw_description = df_ml['Job Description'].fillna('').astype(str)

# Apply consistent text cleaning. clean_text() already maps missing values to '' and
# stringifies everything else; calling .astype(str) first can turn NaN into the
# literal token "nan", which would then leak into the text features.
df_ml['Job Title'] = df_ml['Job Title'].apply(clean_text)
df_ml['Job Description'] = df_ml['Job Description'].apply(clean_text)

# Combine relevant text fields for job descriptions (guard optional 'Skills')
skills_col = 'Skills' if 'Skills' in df_ml.columns else None
_df_skills = df_ml[skills_col].fillna('').astype(str) if skills_col else ''

# Built from the raw fields; the cleaned variant is derived from it just below.
df_ml['combined_job_text'] = (
    _raw_title + ' ' +
    _raw_description + ' ' +
    _df_skills + ' ' +
    df_ml['Company Profile'].fillna('').astype(str)
)

# Clean the combined text
df_ml['cleaned_job_text'] = df_ml['combined_job_text'].apply(clean_text)

# Extract skills from the raw combined text, matching predict_resume_match(),
# which also extracts skills from the uncleaned job description.
df_ml['required_skills'] = df_ml['combined_job_text'].apply(extract_skills)
df_ml['num_skills_required'] = df_ml['required_skills'].apply(len)

# Synthetic training target (demo only, not a real relevance label)
def create_match_score(row):
    """Fabricate a 0-100 "match score" for one job row.

    Starts from 50, adds 2 points per required skill (capped at 30), adds 15 when
    the title contains 'senior' or 'lead' (otherwise 10 when it contains 'junior'
    or 'entry'), then adds Gaussian noise (mean 0, std 10) drawn from NumPy's
    global RNG. The result is clipped to the range [0, 100].

    Args:
        row: Mapping-like row exposing 'num_skills_required' and 'Job Title'.

    Returns:
        The clipped score.
    """
    skill_bonus = min(row['num_skills_required'] * 2, 30)

    title_lc = str(row['Job Title']).lower()
    if any(tag in title_lc for tag in ('senior', 'lead')):
        seniority_bonus = 15
    elif any(tag in title_lc for tag in ('junior', 'entry')):
        seniority_bonus = 10
    else:
        seniority_bonus = 0

    # The deterministic parts are summed first; the noise is added last.
    noisy_score = 50 + skill_bonus + seniority_bonus + np.random.normal(0, 10)
    return max(0, min(100, noisy_score))

# create_match_score() draws noise from NumPy's global RNG; seed it so the synthetic
# target (and every metric derived from it) is reproducible under Restart & Run All.
np.random.seed(42)
df_ml['match_score'] = df_ml.apply(create_match_score, axis=1)

print(f"Dataset shape after preprocessing: {df_ml.shape}")
print("Match score statistics:")
print(df_ml['match_score'].describe())


## **Feature Engineering**

In [None]:
# Encode categorical variables with consistent encoders
# (the fitted encoders are kept so they can be saved alongside the model).
le_country = LabelEncoder()
le_work_type = LabelEncoder()

# Guard missing columns then encode
if 'Country' not in df_ml.columns:
    df_ml['Country'] = 'Unknown'
if 'Work Type' not in df_ml.columns:
    df_ml['Work Type'] = 'Unknown'

df_ml['country_encoded'] = le_country.fit_transform(df_ml['Country'].fillna('Unknown'))
df_ml['work_type_encoded'] = le_work_type.fit_transform(df_ml['Work Type'].fillna('Unknown'))

# Create TF-IDF features from job descriptions
# Use fewer features in FAST mode
max_feats = 500 if FAST else 1000
tfidf = TfidfVectorizer(
    max_features=max_feats,
    stop_words='english',
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=2             # ignore terms that appear in fewer than two documents
)

tfidf_features = tfidf.fit_transform(df_ml['cleaned_job_text'])
tfidf_feature_names = tfidf.get_feature_names_out()
print(f"TF-IDF features shape: {tfidf_features.shape}")

# Optional: Sentence-BERT embeddings (if available)
X_bert = None
bert_model_name = 'all-MiniLM-L6-v2'
if _st_available:
    try:
        print('Building Sentence-BERT embeddings...')
        st_model = SentenceTransformer(bert_model_name)
        # encode() accepts a list of sentences and batches them internally, returning
        # one row per sentence; this avoids one model call per DataFrame row.
        X_bert = np.asarray(st_model.encode(df_ml['cleaned_job_text'].tolist()))
        print('Embeddings shape:', X_bert.shape)
    except Exception as e:
        # Best effort: training continues without embeddings when they cannot be built.
        print('Embedding build skipped due to error:', e)
        X_bert = None

# Additional numerical features (character lengths of the cleaned title/description)
df_ml['job_title_length'] = df_ml['Job Title'].str.len().fillna(0)
df_ml['job_desc_length'] = df_ml['Job Description'].str.len().fillna(0)


## **Prepare Training Data**

In [None]:
# Columns of df_ml that enter the model as plain numeric inputs
numerical_features = [
    'country_encoded',
    'work_type_encoded',
    'num_skills_required',
    'job_title_length',
    'job_desc_length',
]

X_numerical = df_ml[numerical_features].fillna(0).values
X_tfidf = tfidf_features.toarray()

# Lay the feature groups side by side: numerical, TF-IDF, then BERT when it was built
feature_blocks = [X_numerical, X_tfidf]
if X_bert is not None:
    feature_blocks.append(X_bert)
X = np.hstack(feature_blocks)

y = df_ml['match_score'].values

print(f"Feature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")


## **Model Training - Multiple Algorithms**

In [None]:
# Standardize features; the scaling statistics come from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline estimators keyed by display name
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=200, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, random_state=42)
}

# name -> fitted model, error metrics and test-set predictions
model_results = {}

for name, model in models.items():
    print(f"\nTraining {name} (baseline)...")

    # Only the linear model is fed standardized inputs; the tree ensembles get raw features
    if name == 'Linear Regression':
        fit_X, eval_X = X_train_scaled, X_test_scaled
    else:
        fit_X, eval_X = X_train, X_test
    model.fit(fit_X, y_train)
    y_pred = model.predict(eval_X)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    model_results[name] = dict(model=model, mse=mse, rmse=rmse, mae=mae, r2=r2, predictions=y_pred)

    print(f"{name} Results:")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  MAE: {mae:.2f}")
    print(f"  R²: {r2:.3f}")


## **Hyperparameter Tuning and Cross-Validation**

In [None]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from scipy.stats import randint as sp_randint, uniform as sp_uniform

# Reduce iterations in FAST mode
rf_iter = 5 if FAST else 20
gb_iter = 5 if FAST else 20

# Tune Random Forest
rf_param_dist = {
    'n_estimators': sp_randint(200, 600),
    'max_depth': [None] + list(range(5, 31, 5)),
    'min_samples_split': sp_randint(2, 20),
    'min_samples_leaf': sp_randint(1, 10),
    'max_features': ['sqrt', 'log2', None]
}
rf_search = RandomizedSearchCV(RandomForestRegressor(random_state=42), rf_param_dist, n_iter=rf_iter, cv=3, n_jobs=-1, random_state=42, scoring='r2')
print("\nTuning Random Forest...")
rf_search.fit(X_train, y_train)
rf_best = rf_search.best_estimator_
print("RF best params:", rf_search.best_params_)

# Tune Gradient Boosting
gb_param_dist = {
    'n_estimators': sp_randint(200, 800),
    'learning_rate': sp_uniform(0.01, 0.29),   # uniform(loc, scale): samples from [0.01, 0.30]
    'max_depth': sp_randint(2, 8),
    'min_samples_split': sp_randint(2, 20),
    'min_samples_leaf': sp_randint(1, 10),
    'subsample': sp_uniform(0.6, 0.4)          # samples from [0.6, 1.0]
}
print("Tuning Gradient Boosting...")
gb_search = RandomizedSearchCV(GradientBoostingRegressor(random_state=42), gb_param_dist, n_iter=gb_iter, cv=3, n_jobs=-1, random_state=42, scoring='r2')
gb_search.fit(X_train, y_train)
gb_best = gb_search.best_estimator_
print("GB best params:", gb_search.best_params_)

# Evaluate tuned models
# best_estimator_ has already been refit on the whole of (X_train, y_train) by the search
# (refit=True is the default), so it is scored directly instead of being fitted a second time.
for name, model in [('Random Forest (tuned)', rf_best), ('Gradient Boosting (tuned)', gb_best)]:
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    model_results[name] = {
        'model': model,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'predictions': y_pred
    }
    print(f"{name} -> RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.3f}")

# Cross-validation on best tuned model
# NOTE(review): this scores on all of X/y, i.e. including the rows held out as the test set.
best_tuned_name = max(['Random Forest (tuned)', 'Gradient Boosting (tuned)'], key=lambda x: model_results[x]['r2'])
cv_scores = cross_val_score(model_results[best_tuned_name]['model'], X, y, cv=5, scoring='r2', n_jobs=-1)
print(f"CV R² ({best_tuned_name}) - mean: {cv_scores.mean():.3f}, std: {cv_scores.std():.3f}")


## **Stacking Ensemble**

In [None]:
# Blend the two tuned ensembles through a linear meta-learner
base_learners = [('rf', rf_best), ('gb', gb_best)]
stack = StackingRegressor(estimators=base_learners, final_estimator=LinearRegression(), n_jobs=-1)

print("Training StackingRegressor...")
stack.fit(X_train, y_train)

# Score the ensemble on the held-out split with the same metrics as the other models
stack_pred = stack.predict(X_test)
stack_mse = mean_squared_error(y_test, stack_pred)
stack_rmse = np.sqrt(stack_mse)
stack_mae = mean_absolute_error(y_test, stack_pred)
stack_r2 = r2_score(y_test, stack_pred)

model_results['Stacking'] = dict(
    model=stack,
    mse=stack_mse,
    rmse=stack_rmse,
    mae=stack_mae,
    r2=stack_r2,
    predictions=stack_pred,
)
print(f"Stacking -> RMSE: {stack_rmse:.2f}, MAE: {stack_mae:.2f}, R²: {stack_r2:.3f}")


## **Model Performance Visualization**

In [None]:
# 2x2 dashboard: three metric bar charts plus an actual-vs-predicted scatter
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# One column per model, one row per metric
metrics_df = pd.DataFrame(
    {name: [results['rmse'], results['mae'], results['r2']] for name, results in model_results.items()},
    index=['RMSE', 'MAE', 'R²'],
)

# (axes, metric row, panel title, y-axis label) for each bar chart
bar_panels = [
    (axes[0, 0], 'RMSE', 'RMSE Comparison', 'RMSE'),
    (axes[0, 1], 'MAE', 'MAE Comparison', 'MAE'),
    (axes[1, 0], 'R²', 'R² Score Comparison', 'R² Score'),
]
for panel_ax, metric, panel_title, panel_ylabel in bar_panels:
    panel_ax.bar(metrics_df.columns, metrics_df.loc[metric])
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.tick_params(axis='x', rotation=45)

# Scatter for whichever model has the highest test R²
best_model_name = max(model_results, key=lambda x: model_results[x]['r2'])
best_predictions = model_results[best_model_name]['predictions']

scatter_ax = axes[1, 1]
score_range = [y_test.min(), y_test.max()]
scatter_ax.scatter(y_test, best_predictions, alpha=0.6)
# Dashed diagonal marks perfect predictions
scatter_ax.plot(score_range, score_range, 'r--', lw=2)
scatter_ax.set_xlabel('Actual Match Score')
scatter_ax.set_ylabel('Predicted Match Score')
scatter_ax.set_title(f'Actual vs Predicted ({best_model_name})')

plt.tight_layout()
plt.show()

print(f"\nBest performing model: {best_model_name}")


## **Feature Importance Analysis**

In [None]:
# Pick the tree-based entry with the highest test R² (first one wins on ties)
tree_model_names = ['Random Forest', 'Gradient Boosting', 'Random Forest (tuned)', 'Gradient Boosting (tuned)']
tree_candidates = [name for name in model_results if name in tree_model_names]
best_tree_model = max(tree_candidates, key=lambda n: model_results[n]['r2'], default=None)

if best_tree_model:
    model = model_results[best_tree_model]['model']
    if hasattr(model, 'feature_importances_'):
        feature_importance = model.feature_importances_

        # Names follow the column order used to assemble X: numerical, TF-IDF, then BERT
        bert_names = [f'bert_{i}' for i in range(X_bert.shape[1])] if X_bert is not None else []
        feature_names = numerical_features + list(tfidf_feature_names) + bert_names

        importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
        importance_df = importance_df.sort_values('importance', ascending=False).head(20)

        bar_positions = range(len(importance_df))
        plt.figure(figsize=(12, 8))
        plt.barh(bar_positions, importance_df['importance'])
        plt.yticks(bar_positions, importance_df['feature'])
        plt.xlabel('Feature Importance')
        plt.title(f'Top 20 Feature Importance ({best_tree_model})')
        # Largest importance at the top of the chart
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()


## **Explainability with SHAP (optional)**

In [None]:
# Optional SHAP summary for the best tree model; any failure is reported and skipped
if _shap_available and best_tree_model:
    try:
        explainer = shap.TreeExplainer(model_results[best_tree_model]['model'])

        # Explain a random subset of test rows (smaller in FAST mode)
        n_test_rows = len(X_test)
        sample_size = 100 if FAST else 200
        sample_idx = np.random.choice(n_test_rows, size=min(sample_size, n_test_rows), replace=False)
        X_sample = X_test[sample_idx]

        shap_vals = explainer.shap_values(X_sample)

        # Same column order as the feature matrix: numerical, TF-IDF, then BERT
        bert_names = [f'bert_{i}' for i in range(X_bert.shape[1])] if X_bert is not None else []
        shap_feature_names = numerical_features + list(tfidf_feature_names) + bert_names
        shap.summary_plot(shap_vals, features=X_sample, feature_names=shap_feature_names)
    except Exception as e:
        print('SHAP explainability skipped:', e)


## **Resume Matching Function**

In [None]:
def predict_resume_match(job_description, resume_text, model_name=None, job_title=None):
    """
    Predict how well a resume matches a job description using the trained artifacts in this notebook.
    Uses TF-IDF + numerical features and appends Sentence-BERT embeddings if available.

    Note: the numeric ``match_score`` is predicted from the job description only;
    ``resume_text`` drives just the skill-overlap fields of the result.

    Args:
        job_description: Raw job posting text.
        resume_text: Raw resume text, scanned for skills.
        model_name: Key into ``model_results``; defaults to the entry with the highest R².
        job_title: Optional job title. When omitted, the first non-empty line of
            ``job_description`` is used as the title.

    Returns:
        dict with keys 'match_score' (clipped to 0-100), 'required_skills',
        'resume_skills', 'skill_overlap', 'missing_skills' and 'recommendations'.

    Raises:
        KeyError: if ``model_name`` is not a key of ``model_results``.
        RuntimeError: if the models were trained with embeddings but the embedding
            for this text cannot be computed.
    """
    raw_description = '' if pd.isna(job_description) else str(job_description)
    combined_text = clean_text(raw_description)
    skills = extract_skills(raw_description)

    if job_title is None:
        non_empty_lines = [ln.strip() for ln in raw_description.splitlines() if ln.strip()]
        job_title = non_empty_lines[0] if non_empty_lines else ''

    # Training measured the lengths of the *cleaned* title and description, so the same
    # quantities are used here (not the raw description length for both).
    # Country and work type are unknown at inference time; 0 is used as a placeholder code.
    features = {
        'country_encoded': 0,
        'work_type_encoded': 0,
        'num_skills_required': len(skills),
        'job_title_length': len(clean_text(job_title)),
        'job_desc_length': len(combined_text)
    }

    X_num = np.array([[features[col] for col in numerical_features]])
    X_text = tfidf.transform([combined_text]).toarray()
    X_parts = [X_num, X_text]

    # Append embedding if the model was trained with embeddings
    if 'X_bert' in globals() and X_bert is not None:
        try:
            emb = np.asarray(st_model.encode(combined_text)).reshape(1, -1)
        except Exception as exc:
            # Skipping the embedding would leave the feature vector narrower than the one
            # the models were trained on, so fail here with a clear message instead.
            raise RuntimeError(
                "Models were trained with sentence embeddings, but the embedding "
                "for this job description could not be computed"
            ) from exc
        X_parts.append(emb)

    X_combined = np.hstack(X_parts)

    # Choose model
    chosen_name = model_name or max(model_results.keys(), key=lambda x: model_results[x]['r2'])
    model = model_results[chosen_name]['model']

    # Only the linear baseline was trained on standardized features.
    if isinstance(model, LinearRegression):
        X_combined = scaler.transform(X_combined)

    match_score = float(model.predict(X_combined)[0])

    resume_skills = extract_skills(resume_text)
    job_skills = set(skills)
    resume_skills_set = set(resume_skills)

    skill_overlap = len(job_skills.intersection(resume_skills_set))
    # Sorted so the output order is stable between runs.
    missing_skills = sorted(job_skills - resume_skills_set)

    if missing_skills:
        recommendations = f"Consider adding these skills: {', '.join(missing_skills[:5])}"
    else:
        recommendations = "No missing skills detected for this job description."

    return {
        'match_score': max(0.0, min(100.0, match_score)),
        'required_skills': skills,
        'resume_skills': resume_skills,
        'skill_overlap': skill_overlap,
        'missing_skills': missing_skills,
        'recommendations': recommendations
    }


## **Test the Resume Matching Function**

In [None]:
# Hand-written job posting used to exercise predict_resume_match(); the first line is the title.
sample_job = """
Senior Python Developer
We are looking for an experienced Python developer with expertise in machine learning,
web development using Django or Flask, and cloud technologies like AWS.
Requirements: 5+ years Python experience, SQL, Docker, Git, Agile methodology.
"""

# Matching resume that covers only part of the posting's skills.
sample_resume = """
Software Developer with 3 years experience in Python programming.
Proficient in web development using Django and Flask frameworks.
Experience with SQL databases and version control using Git.
Strong problem-solving skills and teamwork abilities.
"""

# No model_name is passed, so the entry with the highest R² in model_results is used.
result = predict_resume_match(sample_job, sample_resume)

# Plain-text report of every field in the returned dict.
print("JobLens Resume Matching Results:")
print("=" * 50)
print(f"Match Score: {result['match_score']:.1f}/100")
print(f"Skills Overlap: {result['skill_overlap']}/{len(result['required_skills'])}")
print(f"Required Skills: {', '.join(result['required_skills'])}")
print(f"Resume Skills: {', '.join(result['resume_skills'])}")
print(f"Missing Skills: {', '.join(result['missing_skills'])}")
print(f"Recommendations: {result['recommendations']}")


## **Save the Trained Model and Components**

In [None]:
import joblib

# Bundle the highest-R² model with every fitted preprocessing object it depends on
best_model_name = max(model_results, key=lambda x: model_results[x]['r2'])
best_entry = model_results[best_model_name]
best_model = best_entry['model']

uses_embeddings = X_bert is not None
fitted_encoders = {'country': le_country, 'work_type': le_work_type}

model_package = {
    'model': best_model,
    'tfidf_vectorizer': tfidf,
    'scaler': scaler,
    'label_encoders': fitted_encoders,
    'feature_names': numerical_features,
    'use_embeddings': uses_embeddings,
    # Only the embedding model's identifier is stored, not the model itself
    'embedding_model': ('sentence-transformers', bert_model_name) if uses_embeddings else None,
    'model_name': best_model_name,
    # Everything recorded for the best model except the estimator object
    'model_performance': {k: v for k, v in best_entry.items() if k != 'model'},
}

joblib.dump(model_package, 'joblens_model.pkl')

print("Model saved successfully!")
print(f"Best model: {best_model_name}")
print(f"Model performance: R² = {model_results[best_model_name]['r2']:.3f}")


## **Model Usage Instructions for Backend Integration**

In [None]:
# Prints a static integration guide; nothing here is computed from the trained artifacts.
# NOTE(review): step 1 says to load 'joblens_model.pkl' in the Rust backend, but a joblib
# pickle can only be loaded from Python — step 5 (ONNX or a Python service) is the workable
# route. The sample response also omits the 'resume_skills' and 'skill_overlap' keys that
# predict_resume_match() returns. Confirm and update the text.
print("""
JobLens Model Integration Guide:
================================

1. Load the model in your Rust backend:
   - Use the saved 'joblens_model.pkl' file
   - The model package contains all preprocessing components

2. Model Input Requirements:
   - Job description text
   - Resume text (optional for job-only analysis)

3. Model Output:
   - Match score (0-100)
   - Required skills list
   - Missing skills recommendations
   - Skill overlap metrics

4. API Endpoint Design:
   POST /api/analyze-match
   {
     "job_description": "string",
     "resume_text": "string"
   }

   Response:
   {
     "match_score": float,
     "required_skills": [string],
     "missing_skills": [string],
     "recommendations": string
   }

5. For Rust Integration:
   - Use onnxruntime for tree models exported to ONNX (optional)
   - Or invoke Python for inference via a subprocess/microservice
   - Implement text preprocessing in Rust to mirror this pipeline

Next Steps:
- If using embeddings at inference, load SentenceTransformer('all-MiniLM-L6-v2') in your service
- Create API endpoints in your Rust backend
- Test with real resume and job description data
""")


## **Export Model to ONNX (Optional for Rust Integration)**

In [None]:
# Optional ONNX export of the best model; skipped with a hint when skl2onnx is missing
try:
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import FloatTensorType

    # One float input of shape (batch, n_features); None leaves the batch size open
    n_features = X_train.shape[1]
    initial_type = [('float_input', FloatTensorType([None, n_features]))]

    # Only the random-forest / gradient-boosting entries are exported
    exportable = any(family in best_model_name for family in ('Random Forest', 'Gradient Boosting'))
    if not exportable:
        print("Model not exported to ONNX. Use the pickle file instead.")
    else:
        onnx_model = convert_sklearn(best_model, initial_types=initial_type)
        with open("joblens_model.onnx", "wb") as f:
            f.write(onnx_model.SerializeToString())
        print("ONNX model saved successfully!")
        print("Use this for Rust backend integration with onnxruntime-rs")

except ImportError:
    print("skl2onnx not installed. Install with: pip install skl2onnx")
    print("For now, use the pickle file for model deployment")


## **Final Model Summary**

In [None]:
# Closing report: dataset/feature sizes and the test metrics of the best model.
# The target is the synthetic match_score, so R² here measures fit to that synthetic signal.
print("JobLens AI Model Training Complete!")
print("=" * 50)
print(f"Dataset size: {df_ml.shape[0]} job postings")
print(f"Feature count: {X.shape[1]} features")
print(f"Best model: {best_model_name}")
print(f"Model accuracy (R²): {model_results[best_model_name]['r2']:.3f}")
print(f"RMSE: {model_results[best_model_name]['rmse']:.2f}")

# Static list of what the notebook provides.
print("\nModel Capabilities:")
print("- Resume-job matching with 0-100 score")
print("- Skill gap analysis")
print("- Missing skill recommendations")
print("- Feature importance analysis")
print("- Optional Sentence-BERT embeddings (if available)")

# Static list of artifacts; the .onnx file exists only if the export cell succeeded.
print("\nFiles Generated:")
print("- joblens_model.pkl (Complete model package)")
print("- joblens_model.onnx (For Rust integration, if available)")
print("- cleaned_jobs.csv (Processed dataset)")

print("\nReady for integration with JobLens backend!")
