In [45]:
import torch
import json
import yaml
import pprint
from torch.utils.data import Dataset, DataLoader
from model import HierarchicalInterviewScorer
from dataset import HierarchicalInterviewDataset
from eval import calculate_feedback
from trainer import Trainer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from model_utils import tokenize_dialogue, predict_scores, format_scores


In [46]:
config_path = "config.yaml"
checkpoint_path = "checkpoin1.pth"
test_data_path = "data.json"



In [47]:
with open(config_path, "r") as file:
    config = yaml.safe_load(file)

device = torch.device(config["train"]["device"])

with open(test_data_path, "r") as f:
    test_data = json.load(f)


In [48]:


data_path = "data.json" 
with open(data_path, "r") as f:
    full_data = json.load(f)

if not isinstance(full_data, dict):
    raise ValueError("Expected 'full_data' to be a dictionary.")

keys = list(full_data.keys())
train_keys, val_keys = train_test_split(keys, test_size=0.2, random_state=42)

train_data = {key: full_data[key] for key in train_keys}
val_data = {key: full_data[key] for key in val_keys}

train_data_path = "data/train_data.json"
val_data_path = "data/val_data.json"

with open(train_data_path, "w") as f:
    json.dump(train_data, f, indent=4)
    
with open(val_data_path, "w") as f:
    json.dump(val_data, f, indent=4)

print(f"Training data saved to {train_data_path} with {len(train_data)} samples.")
print(f"Validation data saved to {val_data_path} with {len(val_data)} samples.")


Training data saved to data/train_data.json with 110 samples.
Validation data saved to data/val_data.json with 28 samples.


In [49]:
train_data_path = "data/train_data.json"
val_data_path = "data/val_data.json"

with open(train_data_path, "r") as f:
    train_data = json.load(f)
with open(val_data_path, "r") as f:
    val_data = json.load(f)

train_dataset = HierarchicalInterviewDataset(train_data)
val_dataset = HierarchicalInterviewDataset(val_data)
train_loader = DataLoader(train_dataset, batch_size=config["train"]["batch_size"], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config["train"]["batch_size"], shuffle=False)


In [50]:
model = HierarchicalInterviewScorer().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=config["train"]["learning_rate"])



trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    max_epochs=config["train"]["max_epochs"]
)


In [51]:
trainer.train()

100%|██████████| 14/14 [00:15<00:00,  1.09s/it]


Epoch 1 | Train Loss: 5.828921624592373
Validation Loss: 0.4427642151713371


100%|██████████| 14/14 [00:15<00:00,  1.09s/it]


Epoch 2 | Train Loss: 0.810506556715284
Validation Loss: 0.8056420311331749


100%|██████████| 14/14 [00:15<00:00,  1.09s/it]


Epoch 3 | Train Loss: 0.7978470836366925
Validation Loss: 0.6536044850945473


100%|██████████| 14/14 [00:15<00:00,  1.09s/it]


Epoch 4 | Train Loss: 0.7968785762786865
Validation Loss: 0.509797215461731


100%|██████████| 14/14 [00:15<00:00,  1.10s/it]


Epoch 5 | Train Loss: 0.7355893445866448
Validation Loss: 0.8296225219964981


100%|██████████| 14/14 [00:15<00:00,  1.10s/it]


Epoch 6 | Train Loss: 0.6988619012492043
Validation Loss: 0.6281289607286453


100%|██████████| 14/14 [00:15<00:00,  1.10s/it]


Epoch 7 | Train Loss: 0.6435613696064267
Validation Loss: 0.6464710086584091


100%|██████████| 14/14 [00:15<00:00,  1.12s/it]


Epoch 8 | Train Loss: 0.6761828341654369
Validation Loss: 0.5492907911539078


100%|██████████| 14/14 [00:16<00:00,  1.18s/it]


Epoch 9 | Train Loss: 0.7211620488337108
Validation Loss: 0.7410292029380798


100%|██████████| 14/14 [00:16<00:00,  1.20s/it]


Epoch 10 | Train Loss: 0.745751406465258
Validation Loss: 0.6570994555950165


100%|██████████| 14/14 [00:16<00:00,  1.20s/it]


Epoch 11 | Train Loss: 0.8036160341330937
Validation Loss: 0.6187163889408112


100%|██████████| 14/14 [00:17<00:00,  1.22s/it]


Epoch 12 | Train Loss: 0.6884336067097527
Validation Loss: 0.5743231549859047


100%|██████████| 14/14 [00:17<00:00,  1.22s/it]


Epoch 13 | Train Loss: 0.7872324160167149
Validation Loss: 1.030325673520565


100%|██████████| 14/14 [00:16<00:00,  1.21s/it]


Epoch 14 | Train Loss: 0.7126115124140467
Validation Loss: 0.49297795444726944


100%|██████████| 14/14 [00:17<00:00,  1.23s/it]


Epoch 15 | Train Loss: 0.687884047627449
Validation Loss: 0.8639805912971497


100%|██████████| 14/14 [00:16<00:00,  1.18s/it]


Epoch 16 | Train Loss: 0.7557264694145748
Validation Loss: 0.7659858465194702


100%|██████████| 14/14 [00:16<00:00,  1.16s/it]


Epoch 17 | Train Loss: 0.6286451433386121
Validation Loss: 0.6017658561468124


100%|██████████| 14/14 [00:17<00:00,  1.29s/it]


Epoch 18 | Train Loss: 0.6652419183935437
Validation Loss: 0.47147973626852036


100%|██████████| 14/14 [00:17<00:00,  1.24s/it]


Epoch 19 | Train Loss: 0.7107865065336227
Validation Loss: 0.7395778745412827


100%|██████████| 14/14 [00:17<00:00,  1.26s/it]


Epoch 20 | Train Loss: 0.6854832353336471
Validation Loss: 0.7612397894263268


100%|██████████| 14/14 [00:17<00:00,  1.25s/it]


Epoch 21 | Train Loss: 0.6210690992219108
Validation Loss: 0.6595550999045372


100%|██████████| 14/14 [00:16<00:00,  1.21s/it]


Epoch 22 | Train Loss: 0.7282033903258187
Validation Loss: 0.586664155125618


100%|██████████| 14/14 [00:17<00:00,  1.22s/it]


Epoch 23 | Train Loss: 0.6055919783455985
Validation Loss: 0.8000767976045609


100%|██████████| 14/14 [00:17<00:00,  1.23s/it]


Epoch 24 | Train Loss: 0.5805834255048207
Validation Loss: 0.7724829390645027


100%|██████████| 14/14 [00:17<00:00,  1.26s/it]


Epoch 25 | Train Loss: 0.6373833758490426
Validation Loss: 0.7732015252113342


100%|██████████| 14/14 [00:17<00:00,  1.25s/it]


Epoch 26 | Train Loss: 0.5720645657607487
Validation Loss: 0.672889344394207


100%|██████████| 14/14 [00:17<00:00,  1.22s/it]


Epoch 27 | Train Loss: 0.6040290700537818
Validation Loss: 0.8964478000998497


100%|██████████| 14/14 [00:18<00:00,  1.31s/it]


Epoch 28 | Train Loss: 0.6023351294653756
Validation Loss: 0.51471097022295


100%|██████████| 14/14 [00:17<00:00,  1.23s/it]


Epoch 29 | Train Loss: 0.5764913633465767
Validation Loss: 1.1227686703205109


100%|██████████| 14/14 [00:19<00:00,  1.40s/it]


Epoch 30 | Train Loss: 0.5617868580988475
Validation Loss: 0.5746353231370449


In [52]:
test_dataset = HierarchicalInterviewDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=config["train"]["batch_size"], shuffle=False)


In [58]:
torch.save(model.state_dict(), "checkpoint1.pth")

In [53]:
torch.save(model, "checkpoint1.pth")

In [54]:

all_true_scores = []
all_predicted_scores = []

model.eval()
with torch.no_grad():
    for dialogue_turns, labels in test_loader:
        dialogue_turns = {k: v.to(device) for k, v in dialogue_turns.items()}
        labels = labels.to(device)
        predictions = model(dialogue_turns)
        all_true_scores.extend(labels.cpu().numpy())
        all_predicted_scores.extend(predictions.cpu().numpy())

all_true_scores_tensor = torch.tensor(all_true_scores)
all_predicted_scores_tensor = torch.tensor(all_predicted_scores)



In [55]:

feedback = calculate_feedback(all_predicted_scores_tensor, all_true_scores_tensor)

# import pprint
# pp = pprint.PrettyPrinter(indent=4)
# for idx, candidate_feedback in enumerate(feedback):
#     print(f"Feedback for Candidate {idx + 1}:")
#     pp.pprint(candidate_feedback)
#     print("\n")


In [59]:
def load_trained_model(checkpoint_path, device="cpu"):
    from model import HierarchicalInterviewScorer

    model = HierarchicalInterviewScorer(hidden_size=768, num_dialogue_layers=2, dropout=0.3)
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model = model.to(device)
    return model

In [60]:
model = load_trained_model("checkpoint1.pth", device=device)

  model.load_state_dict(torch.load(checkpoint_path, map_location=device))


In [68]:
dialogue = [
    "Hello, how are you?",
    "I'm doing well, thank you!",
    "What about yourself?",
    "I'm good too. It's a sunny day."
]

tokenized_input = tokenize_dialogue(conversation, max_turns=20, max_length=512)


In [69]:
tokenized_input = {k: v.to(device) for k, v in tokenized_input.items()}

predicted_scores = predict_scores(model, tokenized_input, device=device)

formatted_scores = format_scores(predicted_scores)
print("Predicted Scores:", formatted_scores)

Predicted Scores: {'Overall Score': 4.872854709625244, 'Recommendation Score': 4.576309680938721, 'Structured Answers Score': 4.527109622955322}


In [84]:
import json

val_data_path = "data/val_data.json"
with open(val_data_path, "r") as f:
    val_data = json.load(f)

print(f"Number of validation samples: {len(val_data)}")
print("Example Sample:", list(val_data.keys())[1])


Number of validation samples: 28
Example Sample: pp55


In [85]:
sample_key = list(val_data.keys())[1] 
sample = val_data[sample_key]

print("Sample Key:", sample_key)
print("Sample Dialogue:", sample["Transcript"])
print("Ground Truth Scores:", sample.get("Overall"), sample.get("RecommendHiring"), sample.get("StructuredAnswers"))


Sample Key: pp55
Sample Dialogue: Interviewer: Ready?|Interviewee: Yeah  sure.|Interviewer: How are you doing today?|Interviewee: Ummm  I'm great. Thank you.|Interviewer: OK Tell me a bit about your background|Interviewee: OK well I am a math major  but I have a lot of background in kind of different areas. So freshman year I did some work  um  at the Department of Energy  um  working the CFO's office so I'm getting kind of a feel for the budget um also risk matrices. Then I kinda branched more towards CS aspects  so  I did a bit of coding here and there in most research aspects and also  um  for companies.|Interviewer: Great. And do you think you are a leader  and why?|Interviewee: Ummm  definitely. So I'm president of The Undergrad Society of Learner Math and this basically gives me a great opportunity to  um  kind of help underclassmen and  also  younger girls  so those in highschool  middle school and kind of get them to become more active in the math community um participate in mo

In [86]:
tokenized_input = tokenize_dialogue(sample["Transcript"], max_turns=20, max_length=512)


In [87]:
model = load_trained_model("checkpoint1.pth", device=device)
predicted_scores = predict_scores(model, tokenized_input, device=device)
formatted_scores = format_scores(predicted_scores)

print("Predicted Scores:", formatted_scores)


  model.load_state_dict(torch.load(checkpoint_path, map_location=device))


Predicted Scores: {'Overall Score': 4.570374488830566, 'Recommendation Score': 4.2505927085876465, 'Structured Answers Score': 4.107512950897217}


In [29]:

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.cls_token)

candidate_1_data = test_dataset[1]
dialogue_turns, labels = candidate_1_data

print("Original Dialogue for Candidate 1:")
for turn_idx, input_ids in enumerate(dialogue_turns["input_ids"]):
    decoded_text = tokenizer.decode(input_ids, skip_special_tokens=True)
    print(f"Turn {turn_idx + 1}: {decoded_text}")

print(labels)

[CLS]
Original Dialogue for Candidate 1:
Turn 1: so how you doing?
Turn 2: great how about you?
Turn 3: i'm okay.
Turn 4: i'm a little [??? ] by the resurgence of the hurricane but..
Turn 5: i understand i got to get home later too and i'm worried.
Turn 6: yeah we were half way here and it just turned on a switch lightening winds and everything.
Turn 7: oh wow.
Turn 8: so yeah.
Turn 9: so uh tell me about yourself.
Turn 10: okay um i'm a junior here at mit um i'm studying aerospace engineering um my my interests are generally in food mechanics control um design um mostly aircraft but protozoan [??? ] problems in general. um i like to um i like to play music listen to music um i run do some other team sports i like to play hockey.
Turn 11: tell me about a time when you demonstrated leadership.
Turn 12: um okay so uh one of the projects that i've worked on since coming to mit um was during my freshman year i worked in the in the uh artificial intelligence group in uh cesil [ sp ] which i



In [63]:
conversation = """
User: Hi I'm Bob
Interviewer: Hello Bob, nice to meet you. I'm Burdell, the interviewer today. It's great to have you here. Can you start by telling me a little bit about yourself? What motivated you to apply for this role, and what do you know about Microsoft and our products?
User: Sure I'm a third year CS major
Interviewer: So you're a third-year computer science major. That's impressive. What specific areas of computer science have you been interested in, and do you have any experience with any of the technologies we use here at Microsoft?
User: I'm interested in ML. Do you want to start with a technical question
Interviewer: Machine Learning is a fascinating field. Let's dive into that. Can you explain the concept of a "black box" in machine learning, and how it relates to the problem of feature engineering in a certain type of machine learning model?
User: I don't know that. Can you tell me the answer
Interviewer: No worries, Bob. A black box refers to a machine learning model that is not transparent or explainable in terms of its internal workings or the relationships between the inputs and outputs. This can make it difficult to interpret or understand the model's performance. In a classification problem, a black box model might output a probability distribution over multiple classes, without providing any insight into which class the model is most likely to predict. In the context of feature engineering, a black box model might not provide any information about the features used in the model, making it hard to understand how the model is selecting those features.
Now, let's move on to the next question. Can you explain the concept of a "k-fold cross-validation" in machine learning, and how it is used to evaluate the performance of a model?
User: k fold is folding pancakes together so that we can check the quality of pancakes
Interviewer: (laughs) Ah, nice one, Bob! I think we can safely say that's not a common use of k-fold cross-validation in the culinary world. Seriously though, k-fold cross-validation is a technique used to evaluate the performance of a machine learning model by splitting the data into multiple folds and using each fold for training and the remaining folds for testing. This helps to reduce overfitting and provides a more accurate estimate of the model's performance.
Now, let's move on to a technical question. Can you explain the concept of "stateless" and "stateful" in the context of a neural network, and how they relate to the training process?
User: k fold is folding pancakes together so that we can check the quality of pancakes
Interviewer: (laughs) Ah, I think we've had enough culinary analogies for one day, Bob. I'm afraid k-fold cross-validation isn't related to pancakes. Seriously though, stateless and stateful are key concepts in neural networks. A stateless network is one where the model's internal state is not preserved between different forward passes, whereas a stateful network is one where the model's internal state is preserved between different forward passes, allowing the model to learn long-term dependencies. In practice, this means that a stateless network might not retain any information about previous inputs, while a stateful network might retain some information and use it to make predictions.
Now, let's move on to another question. Can you walk me through an example of how you would implement a simple linear regression model in C, and include error handling and validation to ensure the model is robust and reliable?
"""


In [22]:
conversation = """Turn 1: so how you doing?
Turn 2: great how about you?
Turn 3: i'm okay.
Turn 4: i'm a little [??? ] by the resurgence of the hurricane but..
Turn 5: i understand i got to get home later too and i'm worried.
Turn 6: yeah we were half way here and it just turned on a switch lightening winds and everything.
Turn 7: oh wow.
Turn 8: so yeah.
Turn 9: so uh tell me about yourself.
Turn 10: okay um i'm a junior here at mit um i'm studying aerospace engineering um my my interests are generally in food mechanics control um design um mostly aircraft but protozoan [??? ] problems in general. um i like to um i like to play music listen to music um i run do some other team sports i like to play hockey.
Turn 11: tell me about a time when you demonstrated leadership.
Turn 12: um okay so uh one of the projects that i've worked on since coming to mit um was during my freshman year i worked in the in the uh artificial intelligence group in uh cesil [ sp ] which is a lab a computer science lab here at mit.
Turn 13: okay.
Turn 14: and um i was working on a project with another student actually another freshman uh basically we were tasked with the sort of bringing this project starting this project bring it up to the level where it could be presented at conference.
Turn 15: mm - hmm.
Turn 16: um together and we were kind of left to work out the cooperation on our own and um there was um as the conference got closer about halfway um about halfway through the year freshman year ah there were a bunch of issues that we had with um with um basically meeting the expectations that had been set before us in terms of what we were going to present. um and finishing in a timely basis.
Turn 17: mm - hmm.
Turn 18: and it came down to whether we were really going to be able to and there were other basically parts other people's work in the same group where work depended on us finishing in time so it really um really we had to make a decision weather we were just going to basically get done what we could but to the quality standard that the standard of quality that was expected of us or to um try to finish everything and to have other people's work potentially suffer because of our [?? ] hadn't really been up to snuff. and um and so we kind of had a little bit of a disagreement about this
Turn 19: tell me about a time when you were working on a team and you were faced with a challenge. how did you solve the problem?
"""

In [23]:
new_dialogue = [
    "Turn 1: so how you doing?"
    "Turn 2: I hate you"
    "Turn 3: i'm okay."
    "Turn 4: ."
    "Turn 5: i understand i got to get home later too and i'm worried."
    "Turn 6: yeah we were half way here and it just turned on a switch lightening winds and everything.",
]

tokenized_input = tokenize_dialogue(new_dialogue)
predicted_scores = predict_scores(model, tokenized_input, device=device)
formatted_scores = format_scores(predicted_scores)
print("Predicted Scores for the New Dialogue:")
for label, score in formatted_scores.items():
    print(f"{label}: {score:.2f}")

Predicted Scores for the New Dialogue:
Overall Score: 5.14
Recommendation Score: 5.12
Structured Answers Score: 5.14


In [24]:
import joblib

In [25]:
model = joblib.load("data/y_overall_model.joblib")
prediction = model.predict("How, are you, I am good.")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


ValueError: node array from the pickle has an incompatible dtype:
- expected: {'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples', 'missing_go_to_left'], 'formats': ['<i8', '<i8', '<i8', '<f8', '<f8', '<i8', '<f8', 'u1'], 'offsets': [0, 8, 16, 24, 32, 40, 48, 56], 'itemsize': 64}
- got     : [('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]