In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv

def load_data():
    load_dotenv()
    DB_USER = os.getenv("DB_USER")
    DB_PASSWORD = os.getenv("DB_PASSWORD")
    DB_HOST = os.getenv("DB_HOST")
    DB_PORT = os.getenv("DB_PORT")
    DB_NAME = os.getenv("DB_NAME")

    connection_string = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
    engine = create_engine(connection_string)

    df = pd.read_sql("SELECT * FROM software_salaries", engine)
    return df


In [2]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import mlflow
import mlflow.sklearn
import mlflow.xgboost

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb

from mlflow.tracking import MlflowClient
from mlflow.entities import Metric
from mlflow.utils.yaml_utils import YamlSafeDumper

import warnings
warnings.filterwarnings("ignore")

# Patch YAML error for MLflow
YamlSafeDumper.add_multi_representer(
    Metric,
    lambda dumper, metric: dumper.represent_scalar(
        'tag:yaml.org,2002:str',
        f"{metric.key}={metric.value:.6f}@{metric.timestamp}"
    )
)
YamlSafeDumper.add_multi_representer(
    object,
    lambda dumper, obj: dumper.represent_scalar('tag:yaml.org,2002:str', str(obj))
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import numpy as np

def preprocess_data(df):
    df.drop(columns=['education', 'skills'], inplace=True, errors='ignore')
    df = df[(df['base_salary'] >= 0) & (df['bonus'] >= 0) & (df['stock_options'] >= 0) & (df['adjusted_total_usd'] >= 0)]
    df.drop_duplicates(inplace=True)

    # Fill missing
    df['experience_level'].fillna(df['experience_level'].mode()[0], inplace=True)
    df['employment_type'].fillna(df['employment_type'].mode()[0], inplace=True)

    # Binary remote feature
    df['is_remote'] = df['remote_ratio'].apply(lambda x: 1 if x == 100 else 0)

    # Total salary
    df['total_salary'] = df['base_salary'] + df['bonus'] + df['stock_options']

    # Encode categoricals
    categorical_cols = ['job_title', 'experience_level','currency', 'employment_type', 'company_size', 'company_location', 'salary_currency']
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    # Scale numerics
    numeric_cols = ['years_experience', 'base_salary', 'bonus', 'stock_options', 'conversion_rate']
    scaler = MinMaxScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # Log target
    df['adjusted_total_usd_log'] = np.log1p(df['adjusted_total_usd'])

    return df, label_encoders, scaler


In [4]:
def visualize_eda(df, base_dir="eda_artifacts"):
    sns.set(style='whitegrid', palette='pastel')
    uni_dir = os.path.join(base_dir, "univariate")
    bi_dir = os.path.join(base_dir, "bivariate")
    os.makedirs(uni_dir, exist_ok=True)
    os.makedirs(bi_dir, exist_ok=True)

    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    for col in numeric_cols:
        plt.figure(figsize=(6, 4))
        sns.histplot(df[col], kde=True, bins=30, color='skyblue')
        plt.title(f'Distribution of {col}')
        plt.tight_layout()
        plt.savefig(os.path.join(uni_dir, f"{col}_hist.png"))
        plt.close()

    for col in categorical_cols:
        plt.figure(figsize=(6, 4))
        sns.countplot(y=col, data=df, order=df[col].value_counts().index)
        plt.title(f'Count of {col}')
        plt.tight_layout()
        plt.savefig(os.path.join(uni_dir, f"{col}_count.png"))
        plt.close()

    target = 'adjusted_total_usd' if 'adjusted_total_usd' in df.columns else df.select_dtypes(include='float64').columns[-1]

    for col in categorical_cols:
        if target in df.columns:
            plt.figure(figsize=(6, 4))
            sns.boxplot(x=col, y=target, data=df)
            plt.title(f'{target} by {col}')
            plt.tight_layout()
            plt.savefig(os.path.join(bi_dir, f"{target}_by_{col}.png"))
            plt.close()

    if target in numeric_cols:
        numeric_cols.remove(target)

    for col in numeric_cols:
        plt.figure(figsize=(6, 4))
        sns.scatterplot(x=col, y=target, data=df)
        plt.title(f'{target} vs {col}')
        plt.tight_layout()
        plt.savefig(os.path.join(bi_dir, f"{target}_vs_{col}.png"))
        plt.close()

    if len(numeric_cols) > 1:
        plt.figure(figsize=(10, 8))
        corr = df.select_dtypes(include=['int64', 'float64']).corr()
        sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
        plt.title("Correlation Heatmap")
        plt.tight_layout()
        plt.savefig(os.path.join(bi_dir, "correlation_heatmap.png"))
        plt.close()

    mlflow.log_artifacts(uni_dir, artifact_path="eda/univariate")
    mlflow.log_artifacts(bi_dir, artifact_path="eda/bivariate")


In [5]:
import pandas as pd
import numpy as np
import re

def add_feature_engineering(df):
    df = df.copy()

    # 🎓 Seniority extraction from job title
    def extract_seniority(title):
        title = title.lower()
        if re.search(r'(intern|trainee|junior)', title):
            return 'junior'
        elif re.search(r'(senior|sr|lead)', title):
            return 'senior'
        elif re.search(r'(manager|director|head|chief)', title):
            return 'management'
        else:
            return 'mid'

    df['seniority_level'] = df['job_title'].apply(extract_seniority)

    # 🕒 Experience binning
    def bin_experience(x):
        if x < 2:
            return '0–2'
        elif x < 5:
            return '2–5'
        elif x < 10:
            return '5–10'
        else:
            return '10+'

    df['experience_bin'] = df['years_experience'].apply(bin_experience)

    # 🏠 Remote flag from remote_ratio
    df['remote_flag'] = df['remote_ratio'].apply(lambda x: 1 if x == 100 else 0)

    # 🌍 Continent mapping from company location (you can expand this as needed)
    continent_map = {
        'US': 'North America', 'CA': 'North America',
        'IN': 'Asia', 'CN': 'Asia', 'JP': 'Asia',
        'GB': 'Europe', 'FR': 'Europe', 'DE': 'Europe', 'IT': 'Europe',
        'AU': 'Oceania', 'NZ': 'Oceania',
        'BR': 'South America', 'AR': 'South America',
        'ZA': 'Africa', 'NG': 'Africa'
    }
    df['continent'] = df['company_location'].map(continent_map).fillna('Other')

    # 💱 Currency strength category (manual — you may update rates)
    strong_currencies = ['USD', 'EUR', 'GBP', 'CHF']
    weak_currencies = ['INR', 'BRL', 'IDR', 'ZAR']

    def currency_strength(curr):
        if curr in strong_currencies:
            return 'strong'
        elif curr in weak_currencies:
            return 'weak'
        else:
            return 'mid'

    df['currency_strength'] = df['currency'].apply(currency_strength)

    return df


In [6]:
from sklearn.feature_selection import SelectFromModel

def apply_feature_selection_rf(X, y, n_features=10):
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    importances = model.feature_importances_
    indices = np.argsort(importances)[-n_features:]
    selected_columns = X.columns[indices]
    print(f"\n✅ Top {n_features} features by RF importance:\n", list(selected_columns))
    return X[selected_columns]


In [9]:
def train_and_evaluate_with_mlflow(df, parent_run_id):
    X = df.drop(columns=['adjusted_total_usd', 'adjusted_total_usd_log'])
    y = df['adjusted_total_usd_log']
    X = X.loc[:, X.std() > 1e-3]  # remove constant features

    X_selected = apply_feature_selection_rf(X, y, n_features=10)

    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.2, random_state=42
    )

    models = {
        'XGBoost': {
            'model': xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
            'params': {
                'n_estimators': [50, 100],
                'max_depth': [3, 5],
                'learning_rate': [0.05, 0.1]
            }
        },
        'RandomForest': {
            'model': RandomForestRegressor(random_state=42),
            'params': {
                'n_estimators': [50, 100],
                'max_depth': [5, 10]
            }
        },
        'LinearRegression': {
            'model': LinearRegression(),
            'params': {}
        }
    }

    best_model = None
    best_model_name = None
    best_r2 = -np.inf
    best_run_id = None
    client = MlflowClient()
    summary = []

    for name, cfg in models.items():
        with mlflow.start_run(run_name=name, nested=True) as run:
            print(f"\n🔧 Tuning {name}...")
            gs = GridSearchCV(cfg['model'], cfg['params'], scoring='r2', cv=5, n_jobs=-1)
            gs.fit(X_train, y_train)

            best_estimator = gs.best_estimator_
            best_params = gs.best_params_

            mlflow.log_params(best_params)

            y_train_pred_log = best_estimator.predict(X_train)
            y_test_pred_log = best_estimator.predict(X_test)
            y_train_pred = np.expm1(y_train_pred_log)
            y_test_pred = np.expm1(y_test_pred_log)
            y_train_true = np.expm1(y_train)
            y_test_true = np.expm1(y_test)

            train_mae = mean_absolute_error(y_train_true, y_train_pred)
            test_mae = mean_absolute_error(y_test_true, y_test_pred)
            train_r2 = r2_score(y_train_true, y_train_pred)
            test_r2 = r2_score(y_test_true, y_test_pred)

            mlflow.log_metrics({
                "Train_MAE": train_mae,
                "Train_R2": train_r2,
                "Test_MAE": test_mae,
                "Test_R2": test_r2
            })

            print(f"✅ {name} → Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}, Train MAE: {train_mae:.2f}, Test MAE: {test_mae:.2f}, Params: {best_params}")

            plt.figure(figsize=(5, 3))
            plt.bar(['Train R2', 'Test R2'], [train_r2, test_r2], color=['green', 'blue'])
            plt.title(f"{name} R2 Comparison")
            r2_plot_path = f"{name}_r2_plot.png"
            plt.savefig(r2_plot_path)
            mlflow.log_artifact(r2_plot_path)
            plt.close()
            os.remove(r2_plot_path)

            if name == "XGBoost":
                mlflow.xgboost.log_model(best_estimator, artifact_path="model")
            else:
                mlflow.sklearn.log_model(best_estimator, artifact_path="model")

            try:
                if name in ["XGBoost", "RandomForest"]:
                    print(f"📌 Generating SHAP summary plot for {name}...")
                    X_sampled = X_test[:50]
                    explainer = shap.Explainer(best_estimator, X_train)
                    shap_values = explainer(X_sampled)
                    shap.summary_plot(shap_values, X_sampled, plot_type="bar", show=False)
                    shap_path = f"{name}_shap_summary_plot.png"
                    plt.savefig(shap_path, bbox_inches="tight")
                    mlflow.log_artifact(shap_path)
                    plt.close()
                    os.remove(shap_path)
            except Exception as e:
                print(f"⚠️ SHAP failed: {e}")

            summary.append((name, train_mae, test_mae, train_r2, test_r2, best_params))

            if test_r2 > best_r2:
                best_r2 = test_r2
                best_model_name = name
                best_model = best_estimator
                best_run_id = run.info.run_id

    print("\n📈 Summary of All Models:")
    for s in summary:
        print(f"• {s[0]:15} | Train R2: {s[3]:.4f} | Test R2: {s[4]:.4f} | Test MAE: {s[2]:.2f} | Params: {s[5]}")

    print(f"\n🏆 Best Model: {best_model_name} with R2 = {best_r2:.4f}")

    if best_run_id:
        model_uri = f"runs:/{best_run_id}/model"
        print("\n📌 Registering & Promoting best model...")
        mv = mlflow.register_model(model_uri=model_uri, name="PricePredictor")

        for _ in range(10):
            info = client.get_model_version(name=mv.name, version=mv.version)
            if info.status == "READY":
                break
            time.sleep(1)

        client.transition_model_version_stage(
            name=mv.name,
            version=mv.version,
            stage="Production",
            archive_existing_versions=True
        )

        print(f"🚀 {mv.name} version {mv.version} → Production ✅")


In [10]:
def main():
    try:
        if mlflow.active_run():
            mlflow.end_run()

        df = load_data()
        df, encoders, scaler = preprocess_data(df)

        mlflow.set_experiment("USD Regression Experiment")
        with mlflow.start_run(run_name="All_Model_Comparisons") as parent_run:
            parent_run_id = parent_run.info.run_id
            train_and_evaluate_with_mlflow(df, parent_run_id)

    finally:
        if mlflow.active_run():
            mlflow.end_run()

main()



✅ Top 10 features by RF importance:
 ['bonus', 'company_location', 'stock_options', 'years_experience', 'job_title', 'salary_in_usd', 'currency', 'total_salary', 'base_salary', 'conversion_rate']

🔧 Tuning XGBoost...
✅ XGBoost → Train R2: 1.0000, Test R2: 1.0000, Train MAE: 1022.33, Test MAE: 1087.34, Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}




📌 Generating SHAP summary plot for XGBoost...

🔧 Tuning RandomForest...
✅ RandomForest → Train R2: 1.0000, Test R2: 1.0000, Train MAE: 498.86, Test MAE: 531.58, Params: {'max_depth': 10, 'n_estimators': 100}




📌 Generating SHAP summary plot for RandomForest...

🔧 Tuning LinearRegression...




✅ LinearRegression → Train R2: -1.3084, Test R2: -1.4727, Train MAE: 187156.11, Test MAE: 196646.59, Params: {}





📈 Summary of All Models:
• XGBoost         | Train R2: 1.0000 | Test R2: 1.0000 | Test MAE: 1087.34 | Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
• RandomForest    | Train R2: 1.0000 | Test R2: 1.0000 | Test MAE: 531.58 | Params: {'max_depth': 10, 'n_estimators': 100}
• LinearRegression | Train R2: -1.3084 | Test R2: -1.4727 | Test MAE: 196646.59 | Params: {}

🏆 Best Model: RandomForest with R2 = 1.0000

📌 Registering & Promoting best model...


Registered model 'PricePredictor' already exists. Creating a new version of this model...
Created version '19' of model 'PricePredictor'.


🚀 PricePredictor version 19 → Production ✅
