In [11]:
import os
import shutil
import gc
import ijson
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib


In [12]:
RAW_JSON = r"C:\Users\shaur\Downloads\hrjob\synthetic-employee-dataset.json"
PARQUET_DIR = r"C:\Users\shaur\Downloads\hrjob\employee_parquet"
MODEL_PATH = r"C:\Users\shaur\Downloads\hrjob\burnout_model.joblib"


In [9]:
if os.path.exists(PARQUET_DIR):
    shutil.rmtree(PARQUET_DIR)

os.makedirs(PARQUET_DIR, exist_ok=True)


In [13]:
def add_burnout_features(df):
    df["burnout_index"] = (
        0.4 * df["workload_score"] +
        0.3 * (1 - df["satisfaction_score"]) +
        0.2 * (1 - df["team_sentiment"]) +
        0.1 * (1 - df["performance_score"])
    )

    df["burnout_risk"] = pd.cut(
        df["burnout_index"],
        bins=[-1, 0.33, 0.66, 2],
        labels=["Low", "Medium", "High"]
    )
    return df


In [14]:
chunk_size = 50_000
buffer = []
part = 0

with open(RAW_JSON, "rb") as f:
    for i, item in enumerate(ijson.items(f, "item")):
        buffer.append(item)

        if len(buffer) == chunk_size:
            df = pd.DataFrame(buffer)

            # ✅ SAFE numeric conversion (Decimal-proof, future-proof)
            for col in df.columns:
                try:
                    df[col] = pd.to_numeric(df[col])
                except (ValueError, TypeError):
                    pass

            df = add_burnout_features(df)

            out_path = os.path.join(PARQUET_DIR, f"part_{part}.parquet")
            df.to_parquet(out_path, engine="pyarrow", index=False)

            part += 1
            buffer.clear()
            del df
            gc.collect()

            print(f"Processed rows: {i+1}")

# write remaining rows
if buffer:
    df = pd.DataFrame(buffer)
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            pass

    df = add_burnout_features(df)

    out_path = os.path.join(PARQUET_DIR, f"part_{part}.parquet")
    df.to_parquet(out_path, engine="pyarrow", index=False)

print("✅ Parquet DATASET created successfully")


Processed rows: 50000
Processed rows: 100000
Processed rows: 150000
Processed rows: 200000
Processed rows: 250000
Processed rows: 300000
Processed rows: 350000
Processed rows: 400000
Processed rows: 450000
Processed rows: 500000
Processed rows: 550000
Processed rows: 600000
Processed rows: 650000
Processed rows: 700000
Processed rows: 750000
Processed rows: 800000
✅ Parquet DATASET created successfully


In [15]:
df = pd.read_parquet(PARQUET_DIR)
print(df.shape)
df.head()


(849999, 32)


Unnamed: 0,employee_id,role,job_level,department,tenure_months,salary,performance_score,satisfaction_score,workload_score,team_sentiment,...,stress_level,burnout_risk,left_company,turnover_reason,risk_factors_summary,turnover_probability_generated,persona_name,role_complexity_score,career_progression_score,burnout_index
0,SYN_00000000,,Mid,Research & Development,169,79704.579059,0.632482,0.623746,0.758117,0.662335,...,0.908992,Medium,False,Not Applicable,Severe Burnout Risk,0.290979,ChangeResistor,0.2,1.0,0.520408
1,SYN_00000001,Customer Success Manager,Manager,Research & Development,54,29694.288831,0.538587,0.982556,0.788416,0.934661,...,0.363321,Medium,False,Not Applicable,Low Risk,0.156002,NewEnthusiast,0.2,1.0,0.379809
2,SYN_00000002,Administrative Assistant,Entry,HR,1,62208.470185,0.624656,0.7672,0.697617,0.888559,...,0.664378,Medium,True,Personal / Relocation,Low Risk,0.233897,NewEnthusiast,0.2,0.836495,0.408709
3,SYN_00000003,Senior Manager,Manager,Research & Development,31,236066.567114,0.95932,0.185888,0.493143,0.732189,...,1.0,Medium,False,Not Applicable,Severe Burnout Risk,0.351682,OverachievingSprinter,0.2,1.0,0.499121
4,SYN_00000004,Anonymous Employee,Mid,Research & Development,131,37306.328156,0.677305,0.566706,0.56723,0.817545,...,0.723049,Medium,False,Not Applicable,Low Risk,0.27291,SeasonedExpert,0.2,1.0,0.425641


In [16]:
TARGET = "burnout_risk"

X = df.drop(columns=[TARGET, "burnout_index"], errors="ignore")
y = df[TARGET]


In [21]:
import numpy as np

bad_cols = []

for col in X.columns:
    sample = X[col].dropna().iloc[0]
    if isinstance(sample, (list, tuple, np.ndarray)):
        bad_cols.append(col)

print("Columns containing arrays/lists:", bad_cols)


Columns containing arrays/lists: ['technical_skills', 'soft_skills']


In [22]:
# Convert list/array columns to string so sklearn can encode them
for col in ['technical_skills', 'soft_skills']:
    X[col] = X[col].astype(str)


In [24]:
categorical_cols = X.select_dtypes(include="object").columns.tolist()
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()


In [25]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ]
)


In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [27]:
baseline = Pipeline([
    ("prep", preprocessor),
    ("model", LogisticRegression(
        max_iter=1000,
        class_weight="balanced"
    ))
])

baseline.fit(X_train, y_train)
print("Baseline Results:")
print(classification_report(y_test, baseline.predict(X_test)))


Baseline Results:
              precision    recall  f1-score   support

        High       0.94      1.00      0.97      9627
         Low       0.98      1.00      0.99     25567
      Medium       1.00      0.99      1.00    134806

    accuracy                           0.99    170000
   macro avg       0.97      1.00      0.98    170000
weighted avg       0.99      0.99      0.99    170000



In [28]:
rf_model = Pipeline([
    ("prep", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=200,
        max_depth=12,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    ))
])

rf_model.fit(X_train, y_train)
print("Random Forest Results:")
print(classification_report(y_test, rf_model.predict(X_test)))


Random Forest Results:
              precision    recall  f1-score   support

        High       0.17      0.97      0.28      9627
         Low       0.25      0.96      0.39     25567
      Medium       0.95      0.10      0.19    134806

    accuracy                           0.28    170000
   macro avg       0.45      0.68      0.29    170000
weighted avg       0.80      0.28      0.22    170000



In [29]:
y_prob = rf_model.predict_proba(X_test)
roc = roc_auc_score(y_test, y_prob, multi_class="ovr")
print("ROC-AUC:", roc)


ROC-AUC: 0.7586950621110299


In [30]:
joblib.dump(rf_model, MODEL_PATH)
print("✅ Model saved at:", MODEL_PATH)


✅ Model saved at: C:\Users\shaur\Downloads\hrjob\burnout_model.joblib


In [31]:
sample = X.iloc[[0]]
print("Predicted Burnout Risk:", rf_model.predict(sample)[0])


Predicted Burnout Risk: Low


In [36]:
# ==============================
# STEP: Convert JSON → CSV (Raw Data Lock)
# ==============================

import pandas as pd

df = pd.read_json("synthetic-employee-dataset.json")

df.to_csv("raw_data.csv", index=False)

print("Raw data saved as raw_data.csv")
print(df.shape)
print(df.columns.tolist())

df.head()


Raw data saved as raw_data.csv
(849999, 31)
['employee_id', 'role', 'job_level', 'department', 'tenure_months', 'salary', 'performance_score', 'satisfaction_score', 'workload_score', 'team_sentiment', 'recent_feedback', 'communication_patterns', 'project_completion_rate', 'overtime_hours', 'training_participation', 'collaboration_score', 'technical_skills', 'soft_skills', 'email_sentiment', 'slack_activity', 'meeting_participation', 'goal_achievement_rate', 'stress_level', 'burnout_risk', 'left_company', 'turnover_reason', 'risk_factors_summary', 'turnover_probability_generated', 'persona_name', 'role_complexity_score', 'career_progression_score']


Unnamed: 0,employee_id,role,job_level,department,tenure_months,salary,performance_score,satisfaction_score,workload_score,team_sentiment,...,goal_achievement_rate,stress_level,burnout_risk,left_company,turnover_reason,risk_factors_summary,turnover_probability_generated,persona_name,role_complexity_score,career_progression_score
0,SYN_00000000,,Mid,Research & Development,169,79704.579059,0.632482,0.623746,0.758117,0.662335,...,0.632482,0.908992,0.866643,False,Not Applicable,Severe Burnout Risk,0.290979,ChangeResistor,0.2,1.0
1,SYN_00000001,Customer Success Manager,Manager,Research & Development,54,29694.288831,0.538587,0.982556,0.788416,0.934661,...,0.538587,0.363321,0.218996,False,Not Applicable,Low Risk,0.156002,NewEnthusiast,0.2,1.0
2,SYN_00000002,Administrative Assistant,Entry,HR,1,62208.470185,0.624656,0.7672,0.697617,0.888559,...,0.624656,0.664378,0.541531,True,Personal / Relocation,Low Risk,0.233897,NewEnthusiast,0.2,0.836495
3,SYN_00000003,Senior Manager,Manager,Research & Development,31,236066.567114,0.95932,0.185888,0.493143,0.732189,...,0.95932,1.0,1.0,False,Not Applicable,Severe Burnout Risk,0.351682,OverachievingSprinter,0.2,1.0
4,SYN_00000004,Anonymous Employee,Mid,Research & Development,131,37306.328156,0.677305,0.566706,0.56723,0.817545,...,0.677305,0.723049,0.614825,False,Not Applicable,Low Risk,0.27291,SeasonedExpert,0.2,1.0


In [39]:
# Create derived team sentiment score (1–10 scale assumed)
df["team_sentiment_score"] = (
    df["team_sentiment"] * 0.5 +
    df["collaboration_score"] * 0.5
)


In [41]:
def label_burnout(row):
    workload = row["workload_score"] * 10     # normalize to 1–10
    satisfaction = row["satisfaction_score"]  # already 1–10
    team = row["team_sentiment_score"]         # already 1–10

    score = (
        workload * 0.4 +
        (10 - satisfaction) * 0.3 +
        (10 - team) * 0.3
    )

    if score >= 7.5:
        return 2   # High risk
    elif score >= 4.5:
        return 1   # Medium risk
    else:
        return 0   # Low risk


In [42]:
df["burnout_label"] = df.apply(label_burnout, axis=1)
df["burnout_label"].value_counts()


burnout_label
2    629670
1    220329
Name: count, dtype: int64

In [43]:
df["burnout_score"] = (
    df["workload_score"] * 4 +
    (10 - df["satisfaction_score"]) * 3 +
    (10 - df["team_sentiment"]) * 3
)

df["burnout_score"].describe()


count    849999.000000
mean         58.651593
std           1.270002
min          54.258536
25%          57.767413
50%          58.652750
75%          59.542077
max          63.262017
Name: burnout_score, dtype: float64

In [44]:
low_thr = df["burnout_score"].quantile(0.33)
mid_thr = df["burnout_score"].quantile(0.66)

def label_burnout(row):
    if row["burnout_score"] >= mid_thr:
        return 2  # High
    elif row["burnout_score"] >= low_thr:
        return 1  # Medium
    else:
        return 0  # Low

df["burnout_label"] = df.apply(label_burnout, axis=1)
df["burnout_label"].value_counts()


burnout_label
2    289000
0    280500
1    280499
Name: count, dtype: int64

In [45]:
FEATURES = [
    "workload_score",
    "satisfaction_score",
    "team_sentiment",
    "tenure_months",
    "salary",
    "stress_level",
    "overtime_hours",
    "collaboration_score",
    "career_progression_score"
]

X = df[FEATURES]
y = df["burnout_label"]


In [46]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(
        n_estimators=200,
        max_depth=12,
        random_state=42,
        n_jobs=-1
    ))
])

model.fit(X_train, y_train)


0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('scaler', ...), ('clf', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"copy  copy: bool, default=True If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.",True
,"with_mean  with_mean: bool, default=True If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.",True
,"with_std  with_std: bool, default=True If True, scale the data to unit variance (or equivalently, unit standard deviation).",True

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",200
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",12
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [47]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.98      0.98     56100
           1       0.96      0.97      0.96     56100
           2       0.99      0.98      0.98     57800

    accuracy                           0.98    170000
   macro avg       0.98      0.98      0.98    170000
weighted avg       0.98      0.98      0.98    170000



In [48]:
import joblib
joblib.dump(model, "burnout_model.joblib")


['burnout_model.joblib']

In [49]:
burnout_model.joblib


NameError: name 'burnout_model' is not defined

In [54]:
FEATURE_ORDER = [
    "tenure_months",
    "stress_level",
    "career_progression_score",
    "collaboration_score",
    "workload_score",
    "satisfaction_score",
    "team_sentiment",
    "overtime_hours",
    "salary"
]


In [55]:
df_model = df[features + [target]].copy()
df_model.head()


Unnamed: 0,overtime_hours,salary,job_level,department,workload_score,satisfaction_score,team_sentiment,burnout_label
0,0.0,79704.579059,Mid,Research & Development,0.758117,0.623746,0.662335,1
1,0.0,29694.288831,Manager,Research & Development,0.788416,0.982556,0.934661,0
2,0.0,62208.470185,Entry,HR,0.697617,0.7672,0.888559,0
3,9.59168,236066.567114,Manager,Research & Development,0.493143,0.185888,0.732189,2
4,0.0,37306.328156,Mid,Research & Development,0.56723,0.566706,0.817545,1
