In [1]:
# Imports
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd
import torch

  from .autonotebook import tqdm as notebook_tqdm





In [35]:
import pandas as pd
import chardet

# Detect encoding first
with open("Career_Decision_Dataset.csv", 'rb') as f:
    result = chardet.detect(f.read())

# load the dataset using the detected encoding
df = pd.read_csv("Career_Decision_Dataset.csv", encoding=result['encoding'])

# Check the loaded data
df.head()


Unnamed: 0,Input Prompt,Output Scenario
0,How does life look as a product manager?,Product management involves leadership and cro...
1,Tell me about the challenges of being a academia.,"Academia allows deep intellectual engagement, ..."
2,What are the pros and cons of choosing Should ...,"Finance offers high earning potential, but it ..."
3,Is AI and machine learning the right career fo...,AI and ML are cutting-edge fields with strong ...
4,Is pursuing Should I join the armed forces wor...,"The armed forces offer honor and discipline, b..."


In [28]:
print(df.columns)


Index(['Input Prompt', 'Output Scenario'], dtype='object')


In [36]:
df['prompt'] = "Input Prompt: " + df['Input Prompt'] + "\nOutput Scenario: " + df['Output Scenario']


In [37]:
# Create Hugging Face Dataset
dataset = Dataset.from_pandas(df[['prompt']])

In [46]:
dataset[0]

{'prompt': 'Input Prompt: How does life look as a product manager?\nOutput Scenario: Product management involves leadership and cross-functional skills, but it can be stressful and requires balancing many priorities. Success in this field depends heavily on networking, practical experience, and staying updated with emerging trends.'}

In [38]:
# Load GPT-Neo and tokenizer
model_name = "EleutherAI/gpt-neo-125m"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)

In [39]:
# Set padding token
tokenizer.pad_token = tokenizer.eos_token


In [40]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['prompt'], truncation=True, padding="max_length", max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["prompt"])

Map: 100%|██████████| 210/210 [00:00<00:00, 2368.36 examples/s]


In [41]:
# Data Collator for Language Modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [42]:
training_args = TrainingArguments(
    output_dir=r"D:\Career_Decision-bot\career_GPT_advisor_chatbot_125m_model",
    overwrite_output_dir=True,
    num_train_epochs=5,  
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    fp16=False,
    report_to="none"
)


In [43]:
import torch
torch.cuda.empty_cache()


In [44]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [45]:
# Start Fine-Tuning
trainer.train()

Step,Training Loss
100,1.0263
200,0.2563
300,0.1982
400,0.1819
500,0.1489


TrainOutput(global_step=525, training_loss=0.3520356205531529, metrics={'train_runtime': 410.2033, 'train_samples_per_second': 2.56, 'train_steps_per_second': 1.28, 'total_flos': 137133726105600.0, 'train_loss': 0.3520356205531529, 'epoch': 5.0})

In [52]:
# Define the directory path on your D: drive
output_dir = r"D:\Career_Decision-bot\career_GPT_advisor_chatbot_125m_model"

# Save the fine-tuned model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)



('D:\\Career_Decision-bot\\career_GPT_advisor_chatbot_125m_model\\tokenizer_config.json',
 'D:\\Career_Decision-bot\\career_GPT_advisor_chatbot_125m_model\\special_tokens_map.json',
 'D:\\Career_Decision-bot\\career_GPT_advisor_chatbot_125m_model\\vocab.json',
 'D:\\Career_Decision-bot\\career_GPT_advisor_chatbot_125m_model\\merges.txt',
 'D:\\Career_Decision-bot\\career_GPT_advisor_chatbot_125m_model\\added_tokens.json')

In [51]:
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Load the trained model and tokenizer
#model = GPTNeoForCausalLM.from_pretrained(r"E:\InterviewGenie\interview_genie_125m_model")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125m")

# Check if GPU is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device (GPU or CPU)
model = model.to(device)

# Example prompt
input_prompt = input("Enter your question: ")
# Tokenize the input prompt and move the input tensors to the same device as the model
inputs = tokenizer(input_prompt, return_tensors="pt").to(device)

# Generate text based on the prompt
outputs = model.generate(
    inputs['input_ids'],            # Input token IDs
    max_length=200,                 # Maximum length of the generated sequence (increase if needed)
    num_beams=5,                    # Number of beams for beam search (higher gives better results)
    no_repeat_ngram_size=2,         # Prevent repetition of n-grams
    temperature=0.5,                 # Lower temperature for more deterministic results
    top_p=0.9,                      # Top-p sampling for more controlled randomness
    pad_token_id=tokenizer.eos_token_id  # Ensure padding uses EOS token
)

# Print the generated token IDs before decoding (for debugging)
#print(f"Generated Token IDs: {outputs}")

# Decode the generated tokens back into text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
#answer_start = generated_text.lower().find("Output Scenario:")





I want to become a Artist or a ML engineer?
Output Scenario: Art and ML are cutting-edge fields with strong future prospects, but they require deep knowledge in math and computer science. Success in this field depends heavily on networking, practical experience, and staying updated with emerging trends. Additionally, this career often involves working in fast-paced, high-pressure environments. It's a path that can be immensely rewarding for those with a genuine passion and resilience.
