In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!ls /content/drive/MyDrive

'Colab Notebooks'
'Document from Rosie'
'FeeReceipt (2)_copy.pdf'
'goresume-compressed (1).pdf'
'goresume-compressed (2).pdf'
 goresume-compressed.pdf
 h1b_data
 IMG-20250306-WA0001.jpg
'rohg (1).jpeg'
 rohg.jpeg
'roja certificate (1).pdf'
'Screenshot_20250227_181303_PDF Scanner (1).jpg'
'Screenshot_20250227_181303_PDF Scanner.jpg'
 Screenshot_20250324_164842_Gallery.jpg
 Screenshot_20250326_183620_PhonePe.jpg
'Screenshot_20250411_192253_Google (1).jpg'
'Screenshot_20250411_192253_Google (2).jpg'
 Screenshot_20250411_192253_Google.jpg
 Screenshot_20250521_202516_PhonePe.jpg
'Screenshot 2025-08-17 130547.png'
 Screenshot_20251003_202117_Gallery.jpg
'Weather App - PM Accelerator - Google Chrome 2025-11-16 00-11-06.mp4'


In [3]:
!ls /content/drive/MyDrive/h1b_data

 LCA_Disclosure_Data_FY2024_Q4.xlsx  'LCA_Disclosure_Data_FY2026_Q1 (1).xlsx'
 LCA_Disclosure_Data_FY2025_Q4.xlsx


In [None]:
import pandas as pd

file1 = "/content/drive/MyDrive/h1b_data/LCA_Disclosure_Data_FY2024_Q4.xlsx"
file2 = "/content/drive/MyDrive/h1b_data/LCA_Disclosure_Data_FY2025_Q4.xlsx"
file3 = "/content/drive/MyDrive/h1b_data/LCA_Disclosure_Data_FY2026_Q1 (1).xlsx"

df1 = pd.read_excel(file1)
df2 = pd.read_excel(file2)
df3 = pd.read_excel(file3)

combined_df = pd.concat([df1, df2, df3], ignore_index=True)

print("Total rows:", combined_df.shape[0])
combined_df.head()

In [None]:
combined_df['VISA_CLASS'].value_counts()

Unnamed: 0_level_0,count
VISA_CLASS,Unnamed: 1_level_1
H-1B,312017
E-3 Australian,7827
H-1B1 Chile,1582
H-1B1 Singapore,1171


In [None]:
h1b_df = combined_df[combined_df['VISA_CLASS'] == 'H-1B']

print("H1B rows:", h1b_df.shape[0])

H1B rows: 312017


In [None]:
h1b_df['CASE_STATUS'].value_counts()

Unnamed: 0_level_0,count
CASE_STATUS,Unnamed: 1_level_1
Certified,282098
Certified - Withdrawn,23355
Withdrawn,4931
Denied,1633


In [None]:
useful_columns = [
    'CASE_STATUS',
    'JOB_TITLE',
    'SOC_CODE',
    'SOC_TITLE',
    'FULL_TIME_POSITION',
    'PREVAILING_WAGE',
    'WAGE_UNIT_OF_PAY',
    'WORKSITE_STATE',
    'EMPLOYER_NAME',
    'SUPPORT_H1B'
]

h1b_df = h1b_df[useful_columns]

print("Columns kept:", h1b_df.columns)
print("New shape:", h1b_df.shape)

Columns kept: Index(['CASE_STATUS', 'JOB_TITLE', 'SOC_CODE', 'SOC_TITLE',
       'FULL_TIME_POSITION', 'PREVAILING_WAGE', 'WAGE_UNIT_OF_PAY',
       'WORKSITE_STATE', 'EMPLOYER_NAME', 'SUPPORT_H1B'],
      dtype='object')
New shape: (312017, 10)


In [None]:
h1b_df = h1b_df[h1b_df['CASE_STATUS'].isin(['Certified', 'Denied'])]

print(h1b_df['CASE_STATUS'].value_counts())
print("New shape:", h1b_df.shape)

CASE_STATUS
Certified    282098
Denied         1633
Name: count, dtype: int64
New shape: (283731, 10)


In [None]:
h1b_df['TARGET'] = h1b_df['CASE_STATUS'].map({
    'Certified': 1,
    'Denied': 0
})

print(h1b_df[['CASE_STATUS', 'TARGET']].head())

  CASE_STATUS  TARGET
0   Certified       1
1   Certified       1
2   Certified       1
3   Certified       1
4   Certified       1


In [None]:
from sklearn.utils import resample

# Separate classes
certified = h1b_df[h1b_df['TARGET'] == 1]
denied = h1b_df[h1b_df['TARGET'] == 0]

# Downsample Certified
certified_downsampled = resample(
    certified,
    replace=False,
    n_samples=len(denied),
    random_state=42
)

# Combine
balanced_df = pd.concat([certified_downsampled, denied])

print(balanced_df['TARGET'].value_counts())
print("New shape:", balanced_df.shape)

TARGET
1    1633
0    1633
Name: count, dtype: int64
New shape: (3266, 11)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Separate features and target
X = balanced_df.drop(['CASE_STATUS', 'TARGET'], axis=1)
y = balanced_df['TARGET']

# Combine text columns
X['TEXT'] = X['JOB_TITLE'].astype(str) + " " + X['SOC_TITLE'].astype(str)

# Define columns
text_col = 'TEXT'
categorical_cols = ['WORKSITE_STATE', 'SUPPORT_H1B', 'FULL_TIME_POSITION']
numeric_cols = ['PREVAILING_WAGE']

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=300), text_col),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)

# Full pipeline
model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model.fit(X_train, y_train)

print("Model trained successfully üöÄ")

Model trained successfully üöÄ


In [None]:
!pip install xgboost



In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Separate features & target
X = balanced_df.drop(['CASE_STATUS', 'TARGET'], axis=1)
y = balanced_df['TARGET']

# Combine text columns
X['TEXT'] = X['JOB_TITLE'].astype(str) + " " + X['SOC_TITLE'].astype(str)

# Define columns
text_col = 'TEXT'
categorical_cols = ['WORKSITE_STATE', 'SUPPORT_H1B', 'FULL_TIME_POSITION']
numeric_cols = ['PREVAILING_WAGE']

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=300), text_col),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)

# Professional XGBoost model
xgb_model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss'
    ))
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
xgb_model.fit(X_train, y_train)

print("Professional XGBoost model trained successfully üöÄ")

Professional XGBoost model trained successfully üöÄ


In [None]:
sample = X_test.iloc[[0]]
prob = xgb_model.predict_proba(sample)

print("Probability of Certified:", prob[0][1])

Probability of Certified: 0.57284665


In [None]:
X = h1b_df.drop(['CASE_STATUS', 'TARGET'], axis=1)
y = h1b_df['TARGET']

In [None]:
neg = sum(y == 0)
pos = sum(y == 1)

scale_weight = neg / pos
print("Scale pos weight:", scale_weight)

Scale pos weight: 0.005788768442172579


In [None]:
!pip install xgboost



In [None]:
import pandas as pd
import joblib

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score
# Use full filtered dataset (Certified + Denied only)
X = h1b_df.drop(['CASE_STATUS', 'TARGET'], axis=1)
y = h1b_df['TARGET']

# Create combined text column
X['TEXT'] = X['JOB_TITLE'].astype(str) + " " + X['SOC_TITLE'].astype(str)

# Calculate imbalance ratio
neg = sum(y == 0)
pos = sum(y == 1)

scale_weight = neg / pos
print("Scale_pos_weight:", scale_weight)
text_col = 'TEXT'
categorical_cols = ['WORKSITE_STATE', 'SUPPORT_H1B', 'FULL_TIME_POSITION']
numeric_cols = ['PREVAILING_WAGE']
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=500), text_col),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)
xgb = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_weight,
    random_state=42,
    eval_metric='logloss'
)
model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', xgb)
])
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
model.fit(X_train, y_train)

print("Base model trained successfully üöÄ")
calibrated_model = CalibratedClassifierCV(model, method='sigmoid', cv=3)

calibrated_model.fit(X_train, y_train)

print("Calibrated model trained successfully üî•")
y_pred = calibrated_model.predict(X_test)
y_prob = calibrated_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))
joblib.dump(calibrated_model, "h1b_approval_model.pkl")

print("Model saved successfully ‚úÖ")

Scale_pos_weight: 0.005788768442172579
Base model trained successfully üöÄ
Calibrated model trained successfully üî•
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       327
           1       0.99      1.00      1.00     56420

    accuracy                           0.99     56747
   macro avg       0.50      0.50      0.50     56747
weighted avg       0.99      0.99      0.99     56747

ROC-AUC Score: 0.886982840578579
Model saved successfully ‚úÖ


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
y_prob = calibrated_model.predict_proba(X_test)[:, 1]

In [None]:
import numpy as np
from sklearn.metrics import f1_score

thresholds = np.arange(0.1, 0.95, 0.01)

best_threshold = 0
best_f1 = 0

for t in thresholds:
    y_pred_temp = (y_prob >= t).astype(int)
    score = f1_score(y_test, y_pred_temp)

    if score > best_f1:
        best_f1 = score
        best_threshold = t

print("Best Threshold:", best_threshold)
print("Best F1 Score:", best_f1)

Best Threshold: 0.8099999999999996
Best F1 Score: 0.9977974152800998


In [None]:
y_pred_final = (y_prob >= best_threshold).astype(int)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_final))

              precision    recall  f1-score   support

           0       0.83      0.30      0.44       327
           1       1.00      1.00      1.00     56420

    accuracy                           1.00     56747
   macro avg       0.91      0.65      0.72     56747
weighted avg       1.00      1.00      0.99     56747



In [None]:
# ======================================================
# FINAL PROFESSIONAL H1B MODEL
# Encoding + Column Cleaning + Imbalance + Threshold
# ======================================================

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, recall_score
import joblib

# ------------------------------------------------------
# 1Ô∏è‚É£ Prepare Features & Target
# ------------------------------------------------------

X = balanced_df.drop(columns=["TARGET"])
y = balanced_df["TARGET"]

# ------------------------------------------------------
# 2Ô∏è‚É£ Encode Categorical Features
# ------------------------------------------------------

categorical_cols = X.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols)

X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print("Encoding completed ‚úÖ")
print("Shape after encoding:", X.shape)

# ------------------------------------------------------
# 3Ô∏è‚É£ Clean Column Names (VERY IMPORTANT FIX)
# ------------------------------------------------------

X.columns = (
    X.columns
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.replace('<', '', regex=False)
    .str.replace('>', '', regex=False)
    .str.replace(' ', '_', regex=False)
)

print("Column names cleaned ‚úÖ")

# ------------------------------------------------------
# 4Ô∏è‚É£ Train-Test Split
# ------------------------------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ------------------------------------------------------
# 5Ô∏è‚É£ Handle Imbalance
# ------------------------------------------------------

neg = sum(y_train == 0)
pos = sum(y_train == 1)

scale_pos_weight = neg / pos
print("Scale_pos_weight:", scale_pos_weight)

# ------------------------------------------------------
# 6Ô∏è‚É£ Train XGBoost
# ------------------------------------------------------

model = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=42
)

model.fit(X_train, y_train)

print("Model trained successfully üöÄ")

# ------------------------------------------------------
# 7Ô∏è‚É£ Probabilities & ROC
# ------------------------------------------------------

y_prob = model.predict_proba(X_test)[:,1]
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))

# ------------------------------------------------------
# 8Ô∏è‚É£ Threshold Optimization (Denied Recall Focus)
# ------------------------------------------------------

thresholds = np.arange(0.1, 0.95, 0.01)

best_threshold = 0
best_recall = 0

for t in thresholds:
    y_pred_temp = (y_prob >= t).astype(int)
    recall_denied = recall_score(y_test, y_pred_temp, pos_label=0)

    if recall_denied > best_recall:
        best_recall = recall_denied
        best_threshold = t

print("Best Threshold:", best_threshold)
print("Best Denied Recall:", best_recall)

# ------------------------------------------------------
# 9Ô∏è‚É£ Final Evaluation
# ------------------------------------------------------

y_pred_final = (y_prob >= best_threshold).astype(int)

print("\nFinal Classification Report:\n")
print(classification_report(y_test, y_pred_final))

# ------------------------------------------------------
# üîü Save Model
# ------------------------------------------------------

joblib.dump(model, "h1b_professional_model.pkl")
print("Model saved successfully ‚úÖ")

# ------------------------------------------------------
# 1Ô∏è‚É£1Ô∏è‚É£ Personalized Prediction Function
# ------------------------------------------------------

def predict_h1b(input_df):
    input_df = pd.get_dummies(input_df)
    input_df = input_df.reindex(columns=X.columns, fill_value=0)

    prob = model.predict_proba(input_df)[:,1][0]
    prediction = 1 if prob >= best_threshold else 0

    return {
        "Probability_Certified": round(float(prob), 4),
        "Prediction": "Certified" if prediction == 1 else "Denied"
    }

print("\nPipeline ready for deployment üéØ")

Categorical columns: Index(['CASE_STATUS', 'JOB_TITLE', 'SOC_CODE', 'SOC_TITLE',
       'FULL_TIME_POSITION', 'WAGE_UNIT_OF_PAY', 'WORKSITE_STATE',
       'EMPLOYER_NAME', 'SUPPORT_H1B'],
      dtype='object')
Encoding completed ‚úÖ
Shape after encoding: (3266, 4266)
Column names cleaned ‚úÖ
Scale_pos_weight: 1.0
Model trained successfully üöÄ
ROC-AUC Score: 1.0
Best Threshold: 0.1
Best Denied Recall: 1.0

Final Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       327
           1       1.00      1.00      1.00       327

    accuracy                           1.00       654
   macro avg       1.00      1.00      1.00       654
weighted avg       1.00      1.00      1.00       654

Model saved successfully ‚úÖ

Pipeline ready for deployment üéØ


In [None]:
[1]
34s
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

[2]
0s
!ls /content/drive/MyDrive
'Colab Notebooks'
'Document from Rosie'
'FeeReceipt (2)_copy.pdf'
'goresume-compressed (1).pdf'
'goresume-compressed (2).pdf'
 goresume-compressed.pdf
 h1b_data
 IMG-20250306-WA0001.jpg
'rohg (1).jpeg'
 rohg.jpeg
'roja certificate (1).pdf'
'Screenshot_20250227_181303_PDF Scanner (1).jpg'
'Screenshot_20250227_181303_PDF Scanner.jpg'
 Screenshot_20250324_164842_Gallery.jpg
 Screenshot_20250326_183620_PhonePe.jpg
'Screenshot_20250411_192253_Google (1).jpg'
'Screenshot_20250411_192253_Google (2).jpg'
 Screenshot_20250411_192253_Google.jpg
 Screenshot_20250521_202516_PhonePe.jpg
'Screenshot 2025-08-17 130547.png'
 Screenshot_20251003_202117_Gallery.jpg
'Weather App - PM Accelerator - Google Chrome 2025-11-16 00-11-06.mp4'

[3]
0s
!ls /content/drive/MyDrive/h1b_data
 LCA_Disclosure_Data_FY2024_Q4.xlsx  'LCA_Disclosure_Data_FY2026_Q1 (1).xlsx'
 LCA_Disclosure_Data_FY2025_Q4.xlsx

[5]
import pandas as pd

file1 = "/content/drive/MyDrive/h1b_data/LCA_Disclosure_Data_FY2024_Q4.xlsx"
file2 = "/content/drive/MyDrive/h1b_data/LCA_Disclosure_Data_FY2025_Q4.xlsx"
file3 = "/content/drive/MyDrive/h1b_data/LCA_Disclosure_Data_FY2026_Q1 (1).xlsx"

df1 = pd.read_excel(file1)
df2 = pd.read_excel(file2)
df3 = pd.read_excel(file3)

combined_df = pd.concat([df1, df2, df3], ignore_index=True)

print("Total rows:", combined_df.shape[0])
combined_df.head()


[6]
0s
combined_df['VISA_CLASS'].value_counts()


[7]
1s
h1b_df = combined_df[combined_df['VISA_CLASS'] == 'H-1B']

print("H1B rows:", h1b_df.shape[0])
H1B rows: 312017

[8]
0s
h1b_df['CASE_STATUS'].value_counts()


[9]
0s
useful_columns = [
    'CASE_STATUS',
    'JOB_TITLE',
    'SOC_CODE',
    'SOC_TITLE',
    'FULL_TIME_POSITION',
    'PREVAILING_WAGE',
    'WAGE_UNIT_OF_PAY',
    'WORKSITE_STATE',
    'EMPLOYER_NAME',
    'SUPPORT_H1B'
]

h1b_df = h1b_df[useful_columns]

print("Columns kept:", h1b_df.columns)
print("New shape:", h1b_df.shape)
Columns kept: Index(['CASE_STATUS', 'JOB_TITLE', 'SOC_CODE', 'SOC_TITLE',
       'FULL_TIME_POSITION', 'PREVAILING_WAGE', 'WAGE_UNIT_OF_PAY',
       'WORKSITE_STATE', 'EMPLOYER_NAME', 'SUPPORT_H1B'],
      dtype='object')
New shape: (312017, 10)

[10]
0s
h1b_df = h1b_df[h1b_df['CASE_STATUS'].isin(['Certified', 'Denied'])]

print(h1b_df['CASE_STATUS'].value_counts())
print("New shape:", h1b_df.shape)
CASE_STATUS
Certified    282098
Denied         1633
Name: count, dtype: int64
New shape: (283731, 10)

[11]
0s
h1b_df['TARGET'] = h1b_df['CASE_STATUS'].map({
    'Certified': 1,
    'Denied': 0
})

print(h1b_df[['CASE_STATUS', 'TARGET']].head())
  CASE_STATUS  TARGET
0   Certified       1
1   Certified       1
2   Certified       1
3   Certified       1
4   Certified       1

[12]
2s
from sklearn.utils import resample

# Separate classes
certified = h1b_df[h1b_df['TARGET'] == 1]
denied = h1b_df[h1b_df['TARGET'] == 0]

# Downsample Certified
certified_downsampled = resample(
    certified,
    replace=False,
    n_samples=len(denied),
    random_state=42
)

# Combine
balanced_df = pd.concat([certified_downsampled, denied])

print(balanced_df['TARGET'].value_counts())
print("New shape:", balanced_df.shape)
TARGET
1    1633
0    1633
Name: count, dtype: int64
New shape: (3266, 11)

[13]
7s
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Separate features and target
X = balanced_df.drop(['CASE_STATUS', 'TARGET'], axis=1)
y = balanced_df['TARGET']

# Combine text columns
X['TEXT'] = X['JOB_TITLE'].astype(str) + " " + X['SOC_TITLE'].astype(str)

# Define columns
text_col = 'TEXT'
categorical_cols = ['WORKSITE_STATE', 'SUPPORT_H1B', 'FULL_TIME_POSITION']
numeric_cols = ['PREVAILING_WAGE']

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=300), text_col),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)

# Full pipeline
model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model.fit(X_train, y_train)

print("Model trained successfully üöÄ")
Model trained successfully üöÄ

[14]
8s
!pip install xgboost
Requirement already satisfied: xgboost in /usr/local/lib/python3.12/dist-packages (3.2.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from xgboost) (2.0.2)
Requirement already satisfied: nvidia-nccl-cu12 in /usr/local/lib/python3.12/dist-packages (from xgboost) (2.29.3)
Requirement already satisfied: scipy in /usr/local/lib/python3.12/dist-packages (from xgboost) (1.16.3)

[15]
7s
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Separate features & target
X = balanced_df.drop(['CASE_STATUS', 'TARGET'], axis=1)
y = balanced_df['TARGET']

# Combine text columns
X['TEXT'] = X['JOB_TITLE'].astype(str) + " " + X['SOC_TITLE'].astype(str)

# Define columns
text_col = 'TEXT'
categorical_cols = ['WORKSITE_STATE', 'SUPPORT_H1B', 'FULL_TIME_POSITION']
numeric_cols = ['PREVAILING_WAGE']

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=300), text_col),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)

# Professional XGBoost model
xgb_model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss'
    ))
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
xgb_model.fit(X_train, y_train)

print("Professional XGBoost model trained successfully üöÄ")
Professional XGBoost model trained successfully üöÄ

[16]
0s
sample = X_test.iloc[[0]]
prob = xgb_model.predict_proba(sample)

print("Probability of Certified:", prob[0][1])
Probability of Certified: 0.57284665

[17]
0s
X = h1b_df.drop(['CASE_STATUS', 'TARGET'], axis=1)
y = h1b_df['TARGET']

[18]
0s
neg = sum(y == 0)
pos = sum(y == 1)

scale_weight = neg / pos
print("Scale pos weight:", scale_weight)
Scale pos weight: 0.005788768442172579

[20]
8s
!pip install xgboost
Requirement already satisfied: xgboost in /usr/local/lib/python3.12/dist-packages (3.2.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from xgboost) (2.0.2)
Requirement already satisfied: nvidia-nccl-cu12 in /usr/local/lib/python3.12/dist-packages (from xgboost) (2.29.3)
Requirement already satisfied: scipy in /usr/local/lib/python3.12/dist-packages (from xgboost) (1.16.3)

[21]
1m
import pandas as pd
import joblib

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score
# Use full filtered dataset (Certified + Denied only)
X = h1b_df.drop(['CASE_STATUS', 'TARGET'], axis=1)
y = h1b_df['TARGET']

# Create combined text column
X['TEXT'] = X['JOB_TITLE'].astype(str) + " " + X['SOC_TITLE'].astype(str)

# Calculate imbalance ratio
neg = sum(y == 0)
pos = sum(y == 1)

scale_weight = neg / pos
print("Scale_pos_weight:", scale_weight)
text_col = 'TEXT'
categorical_cols = ['WORKSITE_STATE', 'SUPPORT_H1B', 'FULL_TIME_POSITION']
numeric_cols = ['PREVAILING_WAGE']
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=500), text_col),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ]
)
xgb = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_weight,
    random_state=42,
    eval_metric='logloss'
)
model = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', xgb)
])
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
model.fit(X_train, y_train)

print("Base model trained successfully üöÄ")
calibrated_model = CalibratedClassifierCV(model, method='sigmoid', cv=3)

calibrated_model.fit(X_train, y_train)

print("Calibrated model trained successfully üî•")
y_pred = calibrated_model.predict(X_test)
y_prob = calibrated_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))
joblib.dump(calibrated_model, "h1b_approval_model.pkl")

print("Model saved successfully ‚úÖ")
Scale_pos_weight: 0.005788768442172579
Base model trained successfully üöÄ
Calibrated model trained successfully üî•
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       327
           1       0.99      1.00      1.00     56420

    accuracy                           0.99     56747
   macro avg       0.50      0.50      0.50     56747
weighted avg       0.99      0.99      0.99     56747

ROC-AUC Score: 0.886982840578579
Model saved successfully ‚úÖ
/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

[22]
9s
y_prob = calibrated_model.predict_proba(X_test)[:, 1]

[23]
0s
import numpy as np
from sklearn.metrics import f1_score

thresholds = np.arange(0.1, 0.95, 0.01)

best_threshold = 0
best_f1 = 0

for t in thresholds:
    y_pred_temp = (y_prob >= t).astype(int)
    score = f1_score(y_test, y_pred_temp)

    if score > best_f1:
        best_f1 = score
        best_threshold = t

print("Best Threshold:", best_threshold)
print("Best F1 Score:", best_f1)
Best Threshold: 0.8099999999999996
Best F1 Score: 0.9977974152800998

[24]
0s
y_pred_final = (y_prob >= best_threshold).astype(int)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_final))
              precision    recall  f1-score   support

           0       0.83      0.30      0.44       327
           1       1.00      1.00      1.00     56420

    accuracy                           1.00     56747
   macro avg       0.91      0.65      0.72     56747
weighted avg       1.00      1.00      0.99     56747


[25]
0s
# ================================
# PROFESSIONAL H1B APPROVAL MODEL
# Imbalance Handling + Threshold Tuning + Personalized Prediction
# ================================

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, recall_score
import joblib

# -------------------------------
# 1Ô∏è‚É£ Prepare Features & Target
# -------------------------------

# TARGET already created as:
# Certified = 1
# Denied = 0

X = balanced_df.drop(columns=["TARGET"])
y = balanced_df["TARGET"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------
# 2Ô∏è‚É£ Handle Class Imbalance
# -------------------------------

neg = sum(y_train == 0)   # Denied
pos = sum(y_train == 1)   # Certified

scale_pos_weight = neg / pos

print("Scale_pos_weight:", scale_pos_weight)

# -------------------------------
# 3Ô∏è‚É£ Train XGBoost Model
# -------------------------------

model = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=42
)

model.fit(X_train, y_train)

print("Model trained successfully üöÄ")

# -------------------------------
# 4Ô∏è‚É£ Get Probabilities
# -------------------------------

y_prob = model.predict_proba(X_test)[:,1]

print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))

# -------------------------------
# 5Ô∏è‚É£ Threshold Optimization (Denied Recall Focus)
# -------------------------------

thresholds = np.arange(0.1, 0.95, 0.01)

best_threshold = 0
best_recall = 0

for t in thresholds:
    y_pred_temp = (y_prob >= t).astype(int)
    recall_denied = recall_score(y_test, y_pred_temp, pos_label=0)

    if recall_denied > best_recall:
        best_recall = recall_denied
        best_threshold = t

print("Best Threshold:", best_threshold)
print("Best Denied Recall:", best_recall)

# -------------------------------
# 6Ô∏è‚É£ Final Evaluation
# -------------------------------

y_pred_final = (y_prob >= best_threshold).astype(int)

print("\nFinal Classification Report:\n")
print(classification_report(y_test, y_pred_final))

# -------------------------------
# 7Ô∏è‚É£ Save Model
# -------------------------------

joblib.dump(model, "h1b_professional_model.pkl")

print("Model saved successfully ‚úÖ")

# -------------------------------
# 8Ô∏è‚É£ Personalized Prediction Function
# -------------------------------

def predict_h1b(input_df):
    prob = model.predict_proba(input_df)[:,1][0]
    prediction = 1 if prob >= best_threshold else 0

    return {
        "Probability_Certified": round(float(prob), 4),
        "Prediction": "Certified" if prediction == 1 else "Denied"
    }

print("\nPipeline ready for deployment üéØ")

Next steps:

[26]
2s
# ======================================================
# PROFESSIONAL H1B MODEL (WITH PROPER ENCODING FIX)
# ======================================================

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, recall_score
import joblib

# ------------------------------------------------------
# 1Ô∏è‚É£ Prepare Features & Target
# ------------------------------------------------------

X = balanced_df.drop(columns=["TARGET"])
y = balanced_df["TARGET"]

# ------------------------------------------------------
# 2Ô∏è‚É£ ENCODE CATEGORICAL FEATURES  üî• (FIXED PART)
# ------------------------------------------------------

categorical_cols = X.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols)

X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print("Encoding completed ‚úÖ")
print("New shape after encoding:", X.shape)

# ------------------------------------------------------
# 3Ô∏è‚É£ Train-Test Split
# ------------------------------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ------------------------------------------------------
# 4Ô∏è‚É£ Handle Class Imbalance
# ------------------------------------------------------

neg = sum(y_train == 0)
pos = sum(y_train == 1)

scale_pos_weight = neg / pos
print("Scale_pos_weight:", scale_pos_weight)

# ------------------------------------------------------
# 5Ô∏è‚É£ Train XGBoost Model
# ------------------------------------------------------

model = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=42
)

model.fit(X_train, y_train)

print("Model trained successfully üöÄ")

# ------------------------------------------------------
# 6Ô∏è‚É£ Get Probabilities
# ------------------------------------------------------

y_prob = model.predict_proba(X_test)[:,1]

print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))

# ------------------------------------------------------
# 7Ô∏è‚É£ Threshold Optimization (Denied Recall Focus)
# ------------------------------------------------------

thresholds = np.arange(0.1, 0.95, 0.01)

best_threshold = 0
best_recall = 0

for t in thresholds:
    y_pred_temp = (y_prob >= t).astype(int)
    recall_denied = recall_score(y_test, y_pred_temp, pos_label=0)

    if recall_denied > best_recall:
        best_recall = recall_denied
        best_threshold = t

print("Best Threshold:", best_threshold)
print("Best Denied Recall:", best_recall)

# ------------------------------------------------------
# 8Ô∏è‚É£ Final Evaluation
# ------------------------------------------------------

y_pred_final = (y_prob >= best_threshold).astype(int)

print("\nFinal Classification Report:\n")
print(classification_report(y_test, y_pred_final))

# ------------------------------------------------------
# 9Ô∏è‚É£ Save Model
# ------------------------------------------------------

joblib.dump(model, "h1b_professional_model.pkl")

print("Model saved successfully ‚úÖ")

# ------------------------------------------------------
# üîü Personalized Prediction Function
# ------------------------------------------------------

def predict_h1b(input_df):
    input_df = pd.get_dummies(input_df)
    input_df = input_df.reindex(columns=X.columns, fill_value=0)

    prob = model.predict_proba(input_df)[:,1][0]
    prediction = 1 if prob >= best_threshold else 0

    return {
        "Probability_Certified": round(float(prob), 4),
        "Prediction": "Certified" if prediction == 1 else "Denied"
    }

print("\nPipeline ready for deployment üéØ")

Next steps:

[27]
25s
    }

print("\nPipeline ready for deployment üéØ")
Categorical columns: Index(['CASE_STATUS', 'JOB_TITLE', 'SOC_CODE', 'SOC_TITLE',
       'FULL_TIME_POSITION', 'WAGE_UNIT_OF_PAY', 'WORKSITE_STATE',
       'EMPLOYER_NAME', 'SUPPORT_H1B'],
      dtype='object')
Encoding completed ‚úÖ
Shape after encoding: (3266, 4266)
Column names cleaned ‚úÖ
Scale_pos_weight: 1.0
Model trained successfully üöÄ
ROC-AUC Score: 1.0
Best Threshold: 0.1
Best Denied Recall: 1.0

Final Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       327
           1       1.00      1.00      1.00       327

    accuracy                           1.00       654
   macro avg       1.00      1.00      1.00       654
weighted avg       1.00      1.00      1.00       654

Model saved successfully ‚úÖ

Pipeline ready for deployment üéØ

[ ]


In [None]:
# ======================================================
# FINAL PROFESSIONAL H1B MODEL (LEAKAGE FIXED)
# Encoding + Column Cleaning + Imbalance + Threshold
# ======================================================

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, recall_score
import joblib

# ------------------------------------------------------
# 1Ô∏è‚É£ Prepare Features & Target (REMOVE LEAKAGE)
# ------------------------------------------------------

# IMPORTANT: Remove TARGET and CASE_STATUS
X = balanced_df.drop(columns=["TARGET", "CASE_STATUS"])
y = balanced_df["TARGET"]

print("Leakage check ‚Üí CASE_STATUS in X?:", "CASE_STATUS" in X.columns)

# ------------------------------------------------------
# 2Ô∏è‚É£ Encode Categorical Features
# ------------------------------------------------------

categorical_cols = X.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols)

X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print("Encoding completed ‚úÖ")
print("Shape after encoding:", X.shape)

# ------------------------------------------------------
# 3Ô∏è‚É£ Clean Column Names (XGBoost Safe)
# ------------------------------------------------------

X.columns = (
    X.columns
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.replace('<', '', regex=False)
    .str.replace('>', '', regex=False)
    .str.replace(' ', '_', regex=False)
)

print("Column names cleaned ‚úÖ")

# ------------------------------------------------------
# 4Ô∏è‚É£ Train-Test Split
# ------------------------------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ------------------------------------------------------
# 5Ô∏è‚É£ Handle Imbalance
# ------------------------------------------------------

neg = sum(y_train == 0)
pos = sum(y_train == 1)

scale_pos_weight = neg / pos
print("Scale_pos_weight:", scale_pos_weight)

# ------------------------------------------------------
# 6Ô∏è‚É£ Train XGBoost Model
# ------------------------------------------------------

model = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=42
)

model.fit(X_train, y_train)

print("Model trained successfully üöÄ")

# ------------------------------------------------------
# 7Ô∏è‚É£ Evaluate ROC
# ------------------------------------------------------

y_prob = model.predict_proba(X_test)[:,1]
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))

# ------------------------------------------------------
# 8Ô∏è‚É£ Threshold Optimization (Focus: Denied Recall)
# ------------------------------------------------------

thresholds = np.arange(0.1, 0.95, 0.01)

best_threshold = 0
best_recall = 0

for t in thresholds:
    y_pred_temp = (y_prob >= t).astype(int)
    recall_denied = recall_score(y_test, y_pred_temp, pos_label=0)

    if recall_denied > best_recall:
        best_recall = recall_denied
        best_threshold = t

print("Best Threshold:", best_threshold)
print("Best Denied Recall:", best_recall)

# ------------------------------------------------------
# 9Ô∏è‚É£ Final Evaluation
# ------------------------------------------------------

y_pred_final = (y_prob >= best_threshold).astype(int)

print("\nFinal Classification Report:\n")
print(classification_report(y_test, y_pred_final))

# ------------------------------------------------------
# üîü Save Model
# ------------------------------------------------------

joblib.dump(model, "h1b_professional_model.pkl")
print("Model saved successfully ‚úÖ")

# ------------------------------------------------------
# 1Ô∏è‚É£1Ô∏è‚É£ Personalized Prediction Function
# ------------------------------------------------------

def predict_h1b(input_df):
    input_df = pd.get_dummies(input_df)
    input_df = input_df.reindex(columns=X.columns, fill_value=0)

    prob = model.predict_proba(input_df)[:,1][0]
    prediction = 1 if prob >= best_threshold else 0

    return {
        "Probability_Certified": round(float(prob), 4),
        "Prediction": "Certified" if prediction == 1 else "Denied"
    }

print("\nPipeline ready for deployment üéØ")

In [None]:
import numpy as np
from sklearn.metrics import f1_score, classification_report, roc_auc_score

# ================================
# GET PROBABILITIES
# ================================

y_prob = model.predict_proba(X_test)[:, 1]

print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))

# ================================
# FIND BEST BALANCED THRESHOLD
# ================================

thresholds = np.arange(0.05, 0.95, 0.01)

best_threshold = 0
best_f1 = 0

for t in thresholds:
    y_pred_temp = (y_prob >= t).astype(int)
    score = f1_score(y_test, y_pred_temp, average='macro')  # balanced F1

    if score > best_f1:
        best_f1 = score
        best_threshold = t

print("\nBest Threshold (Balanced):", best_threshold)
print("Best Macro F1:", best_f1)

# ================================
# FINAL PREDICTIONS USING BEST THRESHOLD
# ================================

y_pred_final = (y_prob >= best_threshold).astype(int)

print("\nFinal Classification Report:")
print(classification_report(y_test, y_pred_final))

ROC-AUC Score: 0.8940278128477774

Best Threshold (Balanced): 0.4000000000000001
Best Macro F1: 0.8021462856870616

Final Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.67      0.78       327
           1       0.74      0.94      0.83       327

    accuracy                           0.81       654
   macro avg       0.83      0.81      0.80       654
weighted avg       0.83      0.81      0.80       654



In [None]:
# ============================================================
# H1B APPROVAL PREDICTION MODEL ‚Äì FINAL VERSION
# ============================================================

import pandas as pd
import numpy as np
import joblib
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# ============================================================
# 1Ô∏è‚É£ PREPARE FEATURES & TARGET
# ============================================================

# Target already created as TARGET (1 = Certified, 0 = Denied)

# Drop leakage column
X = balanced_df.drop(columns=['TARGET', 'CASE_STATUS'], errors='ignore')
y = balanced_df['TARGET']

print("Leakage check ‚Üí CASE_STATUS in X?:", 'CASE_STATUS' in X.columns)

# ============================================================
# 2Ô∏è‚É£ ENCODE CATEGORICAL FEATURES
# ============================================================

categorical_cols = X.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols)

X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Clean column names (XGBoost safe)
X.columns = X.columns.str.replace('[', '', regex=False)
X.columns = X.columns.str.replace(']', '', regex=False)
X.columns = X.columns.str.replace('<', '', regex=False)

print("Encoding completed ‚úÖ")
print("Shape after encoding:", X.shape)

# ============================================================
# 3Ô∏è‚É£ TRAIN TEST SPLIT
# ============================================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ============================================================
# 4Ô∏è‚É£ TRAIN XGBOOST MODEL
# ============================================================

model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)

model.fit(X_train, y_train)

print("Model trained successfully üöÄ")

# ============================================================
# 5Ô∏è‚É£ EVALUATE ROC-AUC
# ============================================================

y_prob = model.predict_proba(X_test)[:, 1]
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))

# ============================================================
# 6Ô∏è‚É£ USE YOUR OPTIMIZED THRESHOLD
# ============================================================

FINAL_THRESHOLD = 0.40

y_pred_final = (y_prob >= FINAL_THRESHOLD).astype(int)

print("\nFinal Classification Report (Threshold = 0.40):")
print(classification_report(y_test, y_pred_final))

# ============================================================
# 7Ô∏è‚É£ SAVE MODEL
# ============================================================

joblib.dump(model, "h1b_model_final.pkl")
print("Model saved successfully ‚úÖ")

# ============================================================
# 8Ô∏è‚É£ USER PREDICTION FUNCTION (PERSONALIZED)
# ============================================================

def predict_user(user_input_dict):

    user_df = pd.DataFrame([user_input_dict])

    # Encode same way
    user_df = pd.get_dummies(user_df)

    # Align columns with training data
    user_df = user_df.reindex(columns=X.columns, fill_value=0)

    prob = model.predict_proba(user_df)[:, 1][0]
    prediction = 1 if prob >= FINAL_THRESHOLD else 0

    return {
        "Probability_Certified": round(float(prob), 4),
        "Prediction": "Certified" if prediction == 1 else "Denied"
    }

print("\nPipeline ready for deployment üéØ")

Leakage check ‚Üí CASE_STATUS in X?: False
Categorical columns: Index(['JOB_TITLE', 'SOC_CODE', 'SOC_TITLE', 'FULL_TIME_POSITION',
       'WAGE_UNIT_OF_PAY', 'WORKSITE_STATE', 'EMPLOYER_NAME', 'SUPPORT_H1B'],
      dtype='object')
Encoding completed ‚úÖ
Shape after encoding: (3266, 4265)
Model trained successfully üöÄ
ROC-AUC Score: 0.8925689008594488

Final Classification Report (Threshold = 0.40):
              precision    recall  f1-score   support

           0       0.93      0.64      0.76       327
           1       0.73      0.95      0.82       327

    accuracy                           0.80       654
   macro avg       0.83      0.80      0.79       654
weighted avg       0.83      0.80      0.79       654

Model saved successfully ‚úÖ

Pipeline ready for deployment üéØ
