In [1]:
import pandas as pd
import os

def export_excel_sheets_to_csv(excel_path, output_dir):
    """
    Read an Excel workbook and save each of its sheets as a separate CSV file.

    Every sheet is written to ``<output_dir>/<sheet_name>.csv`` without the
    pandas index. Progress and failures are reported with ``print``; the
    function always returns ``None``.

    Args:
        excel_path (str): The path to the input Excel file.
        output_dir (str): The directory where CSV files will be saved. It is
            created (parents included) when it does not exist yet.
    """
    # --- 1. Ensure the output directory exists ---
    try:
        os.makedirs(output_dir, exist_ok=True)
        print(f"Output directory '{output_dir}' is ready.")
    except OSError as e:
        print(f"Error: Could not create directory {output_dir}. Reason: {e}")
        return

    # --- 2. Check if the source Excel file exists ---
    # isfile (not exists) so that a directory passed by mistake is reported
    # as a missing input instead of failing later inside the Excel reader.
    if not os.path.isfile(excel_path):
        print(f"❌ Error: Input file not found at '{excel_path}'")
        print("Please ensure you have run the data consolidation step first.")
        return

    # --- 3. Load the Excel file and process each sheet ---
    try:
        # The context manager closes the workbook handle even when reading or
        # writing one of the sheets fails part-way through.
        with pd.ExcelFile(excel_path) as xls:
            sheet_names = xls.sheet_names
            print(f"Found sheets to export: {sheet_names}")

            for sheet_name in sheet_names:
                print(f"Processing sheet: '{sheet_name}'...")

                # Read the sheet into a pandas DataFrame
                df = pd.read_excel(xls, sheet_name=sheet_name)

                # Define the output path for the new CSV file
                csv_filepath = os.path.join(output_dir, f"{sheet_name}.csv")

                # Save the DataFrame to CSV, without the pandas index
                df.to_csv(csv_filepath, index=False)
                print(f"✅ Successfully exported '{sheet_name}' to '{csv_filepath}'")

        print("\nAll sheets have been exported successfully!")

    except Exception as e:
        # Deliberate best-effort: report the problem rather than abort the notebook.
        print(f"An unexpected error occurred: {e}")

# --- Define file paths ---
# Both paths are relative, so they resolve against the kernel's current
# working directory at the time this cell runs.
input_excel_file = 'Data/combined_report.xlsx'
output_csv_directory = 'Data/csv_sheets'

# --- Run the export function ---
# Writes one CSV per workbook sheet into output_csv_directory; the training
# pipeline later reads 'Data/csv_sheets/DMA.csv' from that directory.
export_excel_sheets_to_csv(input_excel_file, output_csv_directory)

Output directory 'Data/csv_sheets' is ready.
Found sheets to export: ['DMA', 'original', 'Mapping', 'Job_Roles_And_Skills']
Processing sheet: 'DMA'...
✅ Successfully exported 'DMA' to 'Data/csv_sheets/DMA.csv'
Processing sheet: 'original'...
✅ Successfully exported 'original' to 'Data/csv_sheets/original.csv'
Processing sheet: 'Mapping'...
✅ Successfully exported 'Mapping' to 'Data/csv_sheets/Mapping.csv'
Processing sheet: 'Job_Roles_And_Skills'...
✅ Successfully exported 'Job_Roles_And_Skills' to 'Data/csv_sheets/Job_Roles_And_Skills.csv'

All sheets have been exported successfully!


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including library deprecation warnings — consider narrowing it.
warnings.filterwarnings('ignore')

# Create models directory
# Done when the cell runs so that the artifact-saving step has a place to write.
os.makedirs('models', exist_ok=True)

def load_and_preprocess_data(file_path):
    """Read the CSV at ``file_path``, print its shape and column names, and return it.

    Args:
        file_path: Path (or file-like object) handed straight to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The loaded data, unmodified.
    """
    frame = pd.read_csv(file_path)

    dimensions = frame.shape
    column_names = frame.columns.tolist()
    print(f"Dataset shape: {dimensions}")
    print(f"Columns: {column_names}")

    return frame

def _encode_skill_column(df, column, prefix):
    """One-hot encode the '/'-separated tokens of ``df[column]`` in place.

    Args:
        df (pandas.DataFrame): Frame to read from and add indicator columns to.
        column (str): Name of the column holding '/'-separated skill text.
        prefix (str): Prefix for the generated columns (``<prefix>_<token>``).

    Returns:
        list[str]: Generated column names, in sorted token order.
    """
    # Missing cells become '' instead of the literal text 'nan', which a plain
    # astype(str) would produce and which would then be encoded as a "skill".
    text = df[column].map(lambda value: '' if pd.isna(value) else str(value))
    df[column] = text

    # Per-row set of normalised tokens. Whole-token membership is used rather
    # than substring/regex search, so "Java" does not match "JavaScript" and
    # tokens containing regex metacharacters such as "C++" are safe.
    row_tokens = text.map(
        lambda cell: {tok.strip().lower() for tok in cell.split('/') if tok.strip()}
    )
    vocabulary = sorted({tok.strip() for cell in text for tok in cell.split('/')} - {''})

    feature_names = []
    for skill in vocabulary:
        feature_name = f'{prefix}_{skill.replace(" ", "_")}'
        wanted = skill.lower()  # matching stays case-insensitive
        df[feature_name] = row_tokens.map(
            lambda tokens, wanted=wanted: wanted in tokens
        ).astype(int)
        feature_names.append(feature_name)
    return feature_names


def extract_skills_features(df):
    """Extract and encode skills from the dataset.

    The 'Technical Skill', 'Programming Languages' and 'Soft Skills' columns
    are read as '/'-separated lists. For every distinct token a 0/1 indicator
    column is added to ``df`` (prefixes ``Tech_``, ``Prog_`` and ``Soft_``;
    spaces in the token become underscores). The three source columns are
    rewritten as strings, with missing values stored as the empty string.

    Args:
        df (pandas.DataFrame): Must contain the three skill columns. It is
            modified in place.

    Returns:
        tuple: ``(df, tech_features, prog_features, soft_features)`` where the
        three lists hold the names of the generated indicator columns.
    """
    tech_features = _encode_skill_column(df, 'Technical Skill', 'Tech')
    prog_features = _encode_skill_column(df, 'Programming Languages', 'Prog')
    soft_features = _encode_skill_column(df, 'Soft Skills', 'Soft')

    return df, tech_features, prog_features, soft_features

def create_intelligence_features(df):
    """Create features from intelligence scores.

    Each known intelligence column present in ``df`` is coerced to numeric
    (unparseable entries become NaN) and its gaps are filled with the median
    of the parsed values. A column with no parseable value at all is left as
    NaN. When at least one column is present, the row-wise sum and mean are
    added as 'Intelligence_Total' and 'Intelligence_Average'.

    Args:
        df (pandas.DataFrame): Frame to transform; modified in place.

    Returns:
        tuple: ``(df, intelligence_features)`` where the list names the
        intelligence columns found plus the two aggregate columns.
    """
    intel_cols = ['Linguistic', 'Musical', 'Bodily', 'Logical - Mathematical',
                  'Spatial-Visualization', 'Interpersonal', 'Intrapersonal', 'Naturalist']

    intelligence_features = []
    for col in intel_cols:
        if col not in df.columns:
            continue
        # Take the median from the coerced values: the raw column may hold
        # text, for which a median is undefined.
        numeric = pd.to_numeric(df[col], errors='coerce')
        df[col] = numeric.fillna(numeric.median())
        intelligence_features.append(col)

    if intelligence_features:
        df['Intelligence_Total'] = df[intelligence_features].sum(axis=1)
        df['Intelligence_Average'] = df[intelligence_features].mean(axis=1)
        intelligence_features.extend(['Intelligence_Total', 'Intelligence_Average'])

    return df, intelligence_features

def _count_skill_tokens(series):
    """Count the non-empty '/'-separated tokens in each cell of ``series``.

    A missing value can arrive as NaN, as an empty string, or as the literal
    text 'nan' (what ``astype(str)`` makes of NaN); all three count as zero.
    """
    def count(value):
        if pd.isna(value):
            return 0
        tokens = (tok.strip() for tok in str(value).split('/'))
        return sum(1 for tok in tokens if tok and tok.lower() != 'nan')

    return series.map(count).astype(int)


def create_additional_features(df):
    """Create additional features from the dataset.

    Columns added to ``df``:
        Year_Numeric: 'Year' coerced to numeric, unparseable values as 0.
        Technical_Rating / Soft_Rating: 'Rating' / 'Rating.1' coerced the same way.
        Has_Projects: 1 when 'Projects' is "yes" (ignoring case and surrounding
            whitespace), otherwise 0.
        Overall_Score: 'Score' coerced to numeric; only when that column exists.
        Technical_Competency: 0.6 * Technical_Rating + 0.4 * Soft_Rating.
        Skill_Diversity: number of distinct-position tokens listed in
            'Technical Skill' plus 'Programming Languages'.

    Args:
        df (pandas.DataFrame): Must contain 'Year', 'Rating', 'Rating.1',
            'Projects', 'Technical Skill' and 'Programming Languages'.
            Modified in place.

    Returns:
        tuple: ``(df, additional_features)`` with the names of the added columns.
    """
    additional_features = []
    df['Year_Numeric'] = pd.to_numeric(df['Year'], errors='coerce').fillna(0)
    additional_features.append('Year_Numeric')

    df['Technical_Rating'] = pd.to_numeric(df['Rating'], errors='coerce').fillna(0)
    df['Soft_Rating'] = pd.to_numeric(df['Rating.1'], errors='coerce').fillna(0)
    additional_features.extend(['Technical_Rating', 'Soft_Rating'])

    # Normalise before mapping so "yes", " Yes " and "YES" are all recognised.
    projects = df['Projects'].map(
        lambda value: '' if pd.isna(value) else str(value).strip().lower()
    )
    df['Has_Projects'] = projects.map({'yes': 1, 'no': 0}).fillna(0).astype(int)
    additional_features.append('Has_Projects')

    if 'Score' in df.columns:
        df['Overall_Score'] = pd.to_numeric(df['Score'], errors='coerce').fillna(0)
        additional_features.append('Overall_Score')

    df['Technical_Competency'] = (df['Technical_Rating'] * 0.6 + df['Soft_Rating'] * 0.4)
    additional_features.append('Technical_Competency')

    # ''.split('/') has length 1, so a plain split-and-len would credit a row
    # with no skills at all; count only real tokens instead.
    tech_count = _count_skill_tokens(df['Technical Skill'])
    prog_count = _count_skill_tokens(df['Programming Languages'])
    df['Skill_Diversity'] = tech_count + prog_count
    additional_features.append('Skill_Diversity')

    return df, additional_features

def prepare_target_variable(df):
    """Encode 'Recommended Job' as integer class labels in a 'Job_Target' column.

    Missing job names are treated as the category 'Unknown' before encoding.

    Returns:
        tuple: ``(df, encoder)`` — the same frame with 'Job_Target' added, and
        the fitted ``LabelEncoder`` needed to map labels back to job names.
    """
    job_names = df['Recommended Job'].fillna('Unknown')

    encoder = LabelEncoder()
    encoder.fit(job_names)
    df['Job_Target'] = encoder.transform(job_names)

    n_jobs_found = len(encoder.classes_)
    print(f"Number of unique jobs: {n_jobs_found}")
    return df, encoder

def train_model(df, features, target_col, model_name="Model",
                test_size=0.2, random_state=42, n_estimators=150):
    """Train a random-forest classifier and report its hold-out accuracy.

    The data is split into stratified train/test partitions, a class-weighted
    ``RandomForestClassifier`` is fitted on the training part, and accuracy on
    the test part is printed.

    Args:
        df (pandas.DataFrame): Frame holding the feature and target columns.
        features (list[str]): Names of the feature columns; NaNs are filled with 0.
        target_col (str): Name of the target column. Because the split is
            stratified, every class needs at least two rows.
        model_name (str): Label used in the printed progress messages.
        test_size (float): Fraction of rows held out for evaluation.
        random_state (int): Seed shared by the split and the forest, for
            reproducible results.
        n_estimators (int): Number of trees in the forest.

    Returns:
        RandomForestClassifier: The fitted model.
    """
    print(f"\n=== Training {model_name} ===")
    X = df[features].fillna(0)
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        class_weight='balanced',  # compensate for unevenly represented job classes
        n_jobs=-1,                # use all available cores
    )
    model.fit(X_train, y_train)

    accuracy = model.score(X_test, y_test)
    print(f"{model_name} - Test Accuracy: {accuracy:.4f}")
    print(f"Features used: {len(features)}")
    return model

def save_artifacts(skills_model, holistic_model, skills_features, all_features, le_job,
                   model_dir='models'):
    """Persist the trained models and their supporting objects with joblib.

    Files written inside ``model_dir``:
        skills_model.pkl, holistic_model.pkl: the two fitted models.
        skills_features.pkl, holistic_features.pkl: the feature-name lists
            each model was trained on.
        career_label_encoder.pkl: the label encoder for the job target.

    Args:
        skills_model: Model trained on the skills feature set.
        holistic_model: Model trained on the full feature set.
        skills_features (list[str]): Feature names used by ``skills_model``.
        all_features (list[str]): Feature names used by ``holistic_model``.
        le_job: Fitted label encoder for the job target.
        model_dir (str): Destination directory; created if it does not exist.
    """
    print("\n=== Saving Models and Artifacts ===")

    # Create the directory here so the function does not depend on some other
    # cell having prepared it beforehand.
    os.makedirs(model_dir, exist_ok=True)

    artifacts = {
        'skills_model.pkl': skills_model,
        'holistic_model.pkl': holistic_model,
        'skills_features.pkl': skills_features,
        'holistic_features.pkl': all_features,
        'career_label_encoder.pkl': le_job,
    }
    for filename, artifact in artifacts.items():
        joblib.dump(artifact, os.path.join(model_dir, filename))

    print(f"✅ All models and artifacts saved successfully to the '{model_dir}/' directory!")

def main(data_path='Data/csv_sheets/DMA.csv'):
    """Main training pipeline.

    Loads the dataset, engineers features, encodes the job target, drops job
    classes that have a single sample, trains a skills-only and a holistic
    model, and saves both together with their feature lists and label encoder.

    Args:
        data_path (str): CSV file to train on. Defaults to the 'DMA' sheet
            exported by the Excel-to-CSV step.

    Raises:
        FileNotFoundError: If ``data_path`` does not point to an existing file.
    """
    print("🚀 Job Recommendation Model Training Pipeline 🚀")

    # 1. Load data
    print("\n📥 Loading dataset...")
    if not os.path.isfile(data_path):
        # Fail early with the resolved location: a relative path depends on
        # the kernel's working directory, which is the usual culprit.
        raise FileNotFoundError(
            f"Dataset not found at '{data_path}' "
            f"(resolved to '{os.path.abspath(data_path)}'). "
            "Run the Excel-to-CSV export step first, from the same working directory."
        )
    df = load_and_preprocess_data(data_path)

    # 2. Feature Engineering
    print("\n🔧 Engineering features...")
    df, tech_features, prog_features, soft_features = extract_skills_features(df)
    df, intelligence_features = create_intelligence_features(df)
    df, additional_features = create_additional_features(df)

    # Define feature sets
    skills_features = tech_features + prog_features + soft_features + ['Technical_Competency', 'Skill_Diversity', 'Has_Projects']
    all_features = skills_features + intelligence_features + additional_features

    # Remove duplicates from feature lists; sorting keeps the column order
    # deterministic so the saved feature lists match the trained models.
    skills_features = sorted(set(skills_features))
    all_features = sorted(set(all_features))

    # 3. Prepare target variable
    print("\n🎯 Preparing target variable...")
    df, le_job = prepare_target_variable(df)

    # 4. Identify and remove classes with only one sample: a stratified
    #    train/test split needs at least two rows per class.
    print("\n🔍 Checking for rare job categories...")
    class_counts = df['Job_Target'].value_counts()
    rare_classes = class_counts[class_counts < 2].index

    if not rare_classes.empty:
        rare_job_names = le_job.inverse_transform(rare_classes)
        print(f"⚠️ Found rare job categories with only 1 sample: {list(rare_job_names)}")

        original_rows = len(df)
        df = df[~df['Job_Target'].isin(rare_classes)]
        print(f"Removed {original_rows - len(df)} rows belonging to these categories.")
        print(f"New dataset shape for training: {df.shape}")
    else:
        print("✅ No rare categories found. All classes have enough samples for splitting.")

    # 5. Train models
    skills_model = train_model(df, skills_features, 'Job_Target', "Skills-Based Model")
    holistic_model = train_model(df, all_features, 'Job_Target', "Holistic Model")

    # 6. Save all artifacts
    save_artifacts(skills_model, holistic_model, skills_features, all_features, le_job)

    print("\n🎉 Training pipeline completed successfully! 🎉")

# Run the main function
# The guard keeps the pipeline from running if this code is imported as a
# module; when executed directly (as in this cell) it runs immediately.
if __name__ == "__main__":
    main()

🚀 Job Recommendation Model Training Pipeline 🚀

📥 Loading dataset...


FileNotFoundError: [Errno 2] No such file or directory: 'Data/csv_sheets/DMA.csv'