In [1]:
# Print a fixed greeting (presumably a quick check that the kernel executes cells; confirm or remove).
print("hello")

hello


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error
import warnings

# Silence all warnings so the cell output stays readable
warnings.filterwarnings(action='ignore')

# Fix NumPy's global seed so every run draws the same synthetic data
np.random.seed(42)

# --- 1. Generate Synthetic Data ---
# Static configuration first. The random draws further down must stay in
# this exact order (experience -> education -> skills) to reproduce results.
data_size = 200
education_levels = ['Bachelors', 'Masters', 'PhD']
skills_list = ['Python', 'Java', 'C++', 'SQL', 'Cloud']

# Years of experience, drawn uniformly between 0 and 20
experience = np.random.uniform(low=0, high=20, size=data_size)

# Education level, weighted 50% / 30% / 20% across the three levels
education = np.random.choice(education_levels, size=data_size, p=[0.5, 0.3, 0.2])

# Every professional gets between one and three distinct skills
skills_raw = [
    np.random.choice(skills_list, size=np.random.randint(1, 4), replace=False)
    for _ in range(data_size)
]

# Only the first drawn skill (the "primary" one) feeds the model; a richer
# model could multi-hot encode the complete skill set instead.
primary_skill = [drawn[0] for drawn in skills_raw]

# Define the relationship for salary
# Base salary + coef*experience + education_bonus + skill_bonus + noise
def education_bonus(e):
    """Return the salary bonus associated with an education level.

    Parameters
    ----------
    e : str
        Education level: 'Bachelors', 'Masters' or 'PhD'.

    Returns
    -------
    int
        0 for 'Bachelors', 15000 for 'Masters', 30000 for 'PhD'.
        Any other value yields 0, the same fallback skill_bonus uses,
        rather than an implicit None that would break the salary sum.
    """
    if e == 'Bachelors':
        return 0
    if e == 'Masters':
        return 15000
    if e == 'PhD':
        return 30000
    # Unrecognised level: no bonus (previously fell through and returned None)
    return 0

def skill_bonus(s):
    """Return the salary bonus tied to a primary skill.

    Known skills map to a fixed bonus; anything else earns 0.
    """
    # Checked in order with ==, first match wins
    bonus_table = (
        ('Python', 5000),
        ('Java', 3000),
        ('C++', 4000),
        ('SQL', 2000),
        ('Cloud', 10000),
    )
    for skill_name, bonus in bonus_table:
        if s == skill_name:
            return bonus
    return 0

# Salary = base + experience effect + education bonus + skill bonus + noise.
# Each term gets a name; they are summed left to right in that order.
base_salary = 50000
experience_effect = 3000 * experience
education_effect = np.array([education_bonus(level) for level in education])
skill_effect = np.array([skill_bonus(skill) for skill in primary_skill])
salary_noise = np.random.normal(0, 8000, data_size)  # Gaussian noise, std 8000

y = base_salary + experience_effect + education_effect + skill_effect + salary_noise

# Feature table: one row per professional, columns in the order listed
X = pd.DataFrame(
    dict(
        experience=experience,
        education=education,
        primary_skill=primary_skill,
    )
)

# --- 2. Setup Preprocessing and Model Pipeline ---

# Column groups, split by how each one is prepared for the regressor
numerical_features = ['experience']
categorical_features = ['education', 'primary_skill']

# Per-group preparation:
#   * numeric columns     -> StandardScaler
#   * categorical columns -> OneHotEncoder; categories not seen during fit
#                            are encoded as all zeros instead of raising
numeric_step = (StandardScaler(), numerical_features)
categorical_step = (OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = make_column_transformer(numeric_step, categorical_step)

# End-to-end estimator: column preprocessing followed by linear regression
pipeline = make_pipeline(preprocessor, LinearRegression())

# --- 3. Evaluate Performance using 5-Fold Cross-Validation ---
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# scikit-learn scorers follow "higher is better", so MSE comes back negated
mse_scores_neg = cross_val_score(
    pipeline, X, y,
    cv=kf,
    scoring='neg_mean_squared_error',
)

# Flip the sign to recover the ordinary (positive) MSE of each fold
mse_scores = -mse_scores_neg

print(f"--- Running {n_splits}-Fold Cross-Validation ---")
for fold_no, mse in enumerate(mse_scores, start=1):
    rmse = np.sqrt(mse)
    print(f"Fold {fold_no}: MSE = {mse:,.2f} (RMSE = {rmse:,.2f})")

# --- 4. Display Final Results ---
average_mse = mse_scores.mean()
std_dev_mse = mse_scores.std()

print("\n--- Cross-Validation Results ---")
print(f"Average MSE: {average_mse:,.2f}")
# This is the root of the mean MSE, not the mean of the per-fold RMSEs
print(f"Average RMSE: {np.sqrt(average_mse):,.2f}")
print(f"Standard Deviation of MSE: {std_dev_mse:,.2f}")

# --- 5. Optional: Train Final Model and Inspect Coefficients ---
# Pipeline.fit returns the pipeline itself, so `final_model` and `pipeline`
# refer to the same (now fitted) object.
final_model = pipeline.fit(X, y)

# Only the lookups that can legitimately fail (missing step name, missing
# get_feature_names_out, name/coefficient length mismatch) are guarded.
# Errors in the reporting code below are left to surface normally instead
# of being reported as a feature-name problem.
try:
    # Step and transformer keys are the lowercase class names that
    # make_pipeline / make_column_transformer assign automatically.
    ohe = final_model.named_steps['columntransformer'].named_transformers_['onehotencoder']
    ohe_feature_names = ohe.get_feature_names_out(categorical_features)

    # Output column order follows the transformer order: the scaled numeric
    # columns first, then the one-hot columns.
    all_feature_names = numerical_features + list(ohe_feature_names)

    regressor = final_model.named_steps['linearregression']
    coefficients = regressor.coef_
    intercept = regressor.intercept_

    # Pair names with coefficients (raises ValueError if the lengths differ)
    coef_df = pd.DataFrame({'Feature': all_feature_names, 'Coefficient': coefficients})
except (AttributeError, KeyError, ValueError) as e:
    print(f"\nCould not retrieve feature names for coefficients: {e}")
else:
    # Largest absolute impact first
    coef_df = coef_df.sort_values(
        by='Coefficient', key=lambda col: col.abs(), ascending=False
    )

    print("\n--- Final Model (Trained on all data) ---")
    # NOTE: 'experience' is standardized before the regression, so the
    # intercept is the fitted value at average experience (not at zero), and
    # the experience coefficient is per standard deviation, not per year.
    print(f"Intercept (Base Salary Estimate): {intercept:,.2f}")
    print("Coefficients (Impact on Salary):")

    # zip over the columns avoids iterrows() and avoids rebinding `_`,
    # which IPython uses for the last cell output.
    for feature, coefficient in zip(coef_df['Feature'], coef_df['Coefficient']):
        print(f"  {feature}: {coefficient:,.2f}")



--- Running 5-Fold Cross-Validation ---
Fold 1: MSE = 66,783,412.51 (RMSE = 8,172.11)
Fold 2: MSE = 47,737,556.76 (RMSE = 6,909.24)
Fold 3: MSE = 77,234,034.80 (RMSE = 8,788.29)
Fold 4: MSE = 45,083,014.43 (RMSE = 6,714.39)
Fold 5: MSE = 52,666,047.75 (RMSE = 7,257.14)

--- Cross-Validation Results ---
Average MSE: 57,900,813.25
Average RMSE: 7,609.26
Standard Deviation of MSE: 12,228,364.93

--- Final Model (Trained on all data) ---
Intercept (Base Salary Estimate): 98,747.54
Coefficients (Impact on Salary):
  experience: 17,451.86
  education_PhD: 17,114.65
  education_Bachelors: -15,853.47
  primary_skill_Cloud: 6,104.12
  primary_skill_SQL: -2,273.09
  primary_skill_C++: -2,258.39
  education_Masters: -1,261.18
  primary_skill_Java: -1,220.59
  primary_skill_Python: -352.05
