In [1]:
import pandas as pd
combined_df = pd.read_excel('/content/drive/MyDrive/ai_recruitment_dataset/combined_data.xlsx', nrows=1500)
print(combined_df.head())

          ID          Name               Role  \
0  brenbr359   brent brown    product manager   
1  jameay305   james ayala  software engineer   
2  scotri565  scott rivera      data engineer   
3  emilke232   emily kelly        ui engineer   
4  ashlra638    ashley ray     data scientist   

                                          Transcript  \
0  product manager interview transcript\n\ninterv...   
1  software engineer interview transcript\n\ninte...   
2  here is a simulated interview for scott rivera...   
3  interview transcript: emily kelly for ui engin...   
4  data scientist interview transcript\n\ncompany...   

                                              Resume  Decision  \
0  here's a sample resume for brent brown applyin...         1   
1  here's a sample resume for james ayala applyin...         1   
2  here's a sample resume for scott rivera applyi...         0   
3  here's a sample resume for emily kelly:\n\nemi...         1   
4  here's a sample resume for ashley r

In [2]:
from transformers import BertTokenizer, BertModel
import torch

In [3]:
from transformers import DistilBertTokenizer, DistilBertModel

In [4]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
def get_bert_embeddings_batch(texts, tokenizer, model, batch_size=32):
    embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size  # Total number of batches
    print(f"Total Batches: {total_batches}")

    for i in range(total_batches):
        # Print progress
        print(f"Processing batch {i + 1}/{total_batches}...")

        # Get the current batch
        batch = texts[i * batch_size:(i + 1) * batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=512)

        with torch.no_grad():
            outputs = model(**inputs)

        # Use the [CLS] token representation for each text in the batch
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)

    return embeddings

In [6]:
from tqdm import tqdm
def get_bert_embeddings_batch(texts, tokenizer, model, batch_size=32):
    embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size  # Total number of batches

    # Use tqdm for a progress bar
    for i in tqdm(range(total_batches), desc="Processing Batches"):
        batch = texts[i * batch_size:(i + 1) * batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=512)

        with torch.no_grad():
            outputs = model(**inputs)

        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)

    return embeddings

#Get Embeddings

In [7]:
# Apply batch processing
texts = combined_df['Transcript'].tolist()

In [8]:
batch_embeddings = get_bert_embeddings_batch(texts, tokenizer, model, batch_size=96)

Processing Batches: 100%|██████████| 16/16 [30:27<00:00, 114.25s/it]


In [9]:
combined_df['bert_embeddings_trans'] = batch_embeddings

#Repeat above for resume and JD

In [10]:
# Apply batch processing for Resume data
resume_texts = combined_df['Resume'].tolist() # Assuming 'Resume' is the column containing resume text
batch_embeddings_resume = get_bert_embeddings_batch(resume_texts, tokenizer, model, batch_size=96)

# Add the embeddings to the DataFrame


Processing Batches: 100%|██████████| 16/16 [27:21<00:00, 102.61s/it]


In [11]:
combined_df['bert_embeddings_resume'] = batch_embeddings_resume

In [12]:
# Apply batch processing for JD data
jd_texts = combined_df['Job Description'].tolist()
batch_embeddings_jd = get_bert_embeddings_batch(jd_texts, tokenizer, model, batch_size=96)

# Add the embeddings to the DataFrame
combined_df['bert_embeddings_jd'] = batch_embeddings_jd

Processing Batches: 100%|██████████| 16/16 [01:00<00:00,  3.77s/it]


In [13]:
resume_expanded = pd.DataFrame(combined_df['bert_embeddings_resume'].tolist(), index=combined_df.index)
resume_expanded.columns = [f'resume_emb_{i}' for i in range(resume_expanded.shape[1])]

In [14]:
trans_expanded = pd.DataFrame(combined_df['bert_embeddings_trans'].tolist(), index=combined_df.index)
trans_expanded.columns = [f'trans_emb_{i}' for i in range(trans_expanded.shape[1])]

In [15]:
jd_expanded = pd.DataFrame(combined_df['bert_embeddings_jd'].tolist(), index=combined_df.index)
jd_expanded.columns = [f'jd_emb_{i}' for i in range(resume_expanded.shape[1])]

In [16]:
df_expanded = pd.concat([combined_df, trans_expanded, resume_expanded], axis=1)

In [17]:
df_expanded = df_expanded.drop(columns=['bert_embeddings_trans', 'bert_embeddings_resume','bert_embeddings_jd'])

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [20]:
label_encoder = LabelEncoder()
df_expanded['decision_encoded'] = label_encoder.fit_transform(df_expanded['Decision'])

# Step 2: Split features and target
# Assuming embeddings columns are named like 'trans_emb_0', 'resume_emb_0', etc.
embedding_columns = [col for col in df_expanded.columns if col.startswith('trans_emb_') or col.startswith('resume_emb_')]
X = df_expanded[embedding_columns]
y = df_expanded['decision_encoded']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [32]:
!pip uninstall xgboost -y
!pip uninstall scikit-learn -y

Found existing installation: xgboost 2.1.3
Uninstalling xgboost-2.1.3:
  Successfully uninstalled xgboost-2.1.3
Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2


In [33]:
!pip install scikit-learn==1.0.2
!pip install xgboost

Collecting scikit-learn==1.0.2
  Downloading scikit-learn-1.0.2.tar.gz (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mPreparing metadata [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (pyproject.toml) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Co

In [35]:
!pip install scikit-learn==1.0.2

Collecting scikit-learn==1.0.2
  Using cached scikit-learn-1.0.2.tar.gz (6.7 MB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mPreparing metadata [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (pyproject.toml) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [37]:
model = XGBClassifier(max_depth=5,n_estimators=200,learning_rate=0.1, eval_metric='logloss')
model.fit(X_train, y_train)
try:
    feature_names = model.get_feature_names_out()  # For scikit-learn version 1.0 and above
except AttributeError:
    # If get_feature_names_out is not available (older versions)
    feature_names = X_train.columns  # Fallback to using column names

In [38]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8266666666666667


In [39]:
print("roc_auc:", roc_auc_score(y_test, y_pred))

roc_auc: 0.8261217948717948


#Train model with other features combined with embeddings

In [43]:
label_encoder = LabelEncoder()
df_expanded['decision_encoded'] = label_encoder.fit_transform(df_expanded['Decision'])

# Step 2: Split features and target
label_encoder = LabelEncoder()
df_expanded['decision_encoded'] = label_encoder.fit_transform(df_expanded['Decision'])

# Step 2: Split features and target
# Assuming embeddings columns are named like 'trans_emb_0', 'resume_emb_0', etc.
# Assuming embeddings columns are named like 'trans_emb_0', 'resume_emb_0', etc.
embedding_columns = [col for col in df_expanded.columns if col.startswith('trans_emb_') or col.startswith('resume_emb_')]

# Check if 'feature_1' and 'feature_2' are in the DataFrame columns
features_to_add = ['feature_1', 'feature_2']
available_features = [f for f in features_to_add if f in df_expanded.columns]

# Select embedding columns and available features
X = df_expanded[embedding_columns + available_features]
y = df_expanded['decision_encoded']

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [47]:
model = XGBClassifier(max_depth=7,n_estimators=200,learning_rate=0.05, eval_metric='logloss')
model.fit(X_train, y_train)
try:
    feature_names = model.get_feature_names_out()  # For scikit-learn version 1.0 and above
except AttributeError:
    # If get_feature_names_out is not available (older versions)
    feature_names = X_train.columns  # Fallback to using column names

In [48]:
y_test_pred_proba_xgb = model.predict_proba(X_test)

In [49]:
y_test_pred_xgb = model.predict(X_test)

In [51]:
y_test_pred_proba_xgb = model.predict_proba(X_test)
y_test_pred_proba_xgb = [i[1] for i in y_test_pred_proba_xgb] # Now this line should work correctly

In [52]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_test_pred_xgb))

Accuracy: 0.81


In [53]:
print("roc_auc:", roc_auc_score(y_test, y_test_pred_proba_xgb))

roc_auc: 0.9285523504273504


In [54]:
pip install tensorflow



In [55]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

#Play Around with ANN features

In [56]:
model = Sequential([
    # Input layer
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),  # Normalize inputs for faster convergence
    Dropout(0.1),  # Dropout to prevent overfitting

    # Hidden layers
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),

    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),

    # Output layer
    Dense(1, activation='sigmoid')  # For binary classification
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [57]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',  # For binary classification
    metrics=['accuracy']
)

# Step 3: Set up early stopping

In [58]:
history = model.fit(
    X_train,
    y_train,
    epochs=50,  # Increase max epochs for deeper models
    batch_size=8,
    validation_split=0.2,
    verbose=1
)

Epoch 1/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.7460 - loss: 0.5277 - val_accuracy: 0.8208 - val_loss: 0.3167
Epoch 2/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8028 - loss: 0.3972 - val_accuracy: 0.8458 - val_loss: 0.2760
Epoch 3/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8010 - loss: 0.3573 - val_accuracy: 0.7875 - val_loss: 0.2899
Epoch 4/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.7997 - loss: 0.3818 - val_accuracy: 0.8208 - val_loss: 0.2797
Epoch 5/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.8085 - loss: 0.3365 - val_accuracy: 0.8333 - val_loss: 0.2769
Epoch 6/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8182 - loss: 0.3458 - val_accuracy: 0.8042 - val_loss: 0.3015
Epoch 7/50
[1m120/120[0m

In [59]:
y_test_pred_nn = model.predict(X_test)
y_test_pred_nn = [i[0] for i in y_test_pred_nn]

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


In [60]:
test_df = pd.DataFrame()
test_df['actuals'] = y_test
test_df['xgb_pred'] = y_test_pred_proba_xgb
test_df['nn_pred'] = y_test_pred_nn

In [61]:
test_df

Unnamed: 0,actuals,xgb_pred,nn_pred
885,0,0.001208,0.004031
1200,1,0.999440,0.998738
10,1,0.484018,0.994207
1141,0,0.000934,0.001140
195,0,0.935048,0.998491
...,...,...,...
910,0,0.000846,0.000669
1038,1,0.999370,0.997739
969,1,0.998749,0.997523
499,1,0.997427,0.999336


In [62]:
test_df['mean_prob'] = (test_df['xgb_pred'] + test_df['nn_pred'])/2

In [63]:
test_df['new_pred'] = test_df['mean_prob'].round()

In [64]:
test_df

Unnamed: 0,actuals,xgb_pred,nn_pred,mean_prob,new_pred
885,0,0.001208,0.004031,0.002620,0.0
1200,1,0.999440,0.998738,0.999089,1.0
10,1,0.484018,0.994207,0.739113,1.0
1141,0,0.000934,0.001140,0.001037,0.0
195,0,0.935048,0.998491,0.966769,1.0
...,...,...,...,...,...
910,0,0.000846,0.000669,0.000757,0.0
1038,1,0.999370,0.997739,0.998554,1.0
969,1,0.998749,0.997523,0.998136,1.0
499,1,0.997427,0.999336,0.998381,1.0


In [65]:
accuracy_score(test_df['actuals'],test_df['new_pred'])

0.81

In [66]:
roc_auc_score(test_df['actuals'],test_df['xgb_pred'])

0.9285523504273504

In [67]:
roc_auc_score(test_df['actuals'],test_df['nn_pred'])

0.8867966524216524

In [68]:
roc_auc_score(test_df['actuals'],test_df['mean_prob'])

0.9290420227920227