In [None]:
# Imports
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd
import torch

In [None]:
import pandas as pd
import chardet

# Sniff the CSV's character encoding from its raw bytes before parsing it
with open("Career_Decision_Dataset.csv", 'rb') as csv_file:
    raw_bytes = csv_file.read()
result = chardet.detect(raw_bytes)

# Parse the CSV using the encoding that chardet reported
df = pd.read_csv("Career_Decision_Dataset.csv", encoding=result['encoding'])

# Preview the first rows as the cell's output
df.head()


In [None]:
# List the column labels so the names used when building prompts can be checked
print(df.columns)


In [None]:
# Build one training string per row in the template
#   "Input Prompt: <question>\nOutput Scenario: <answer>"
# Rows missing either field are dropped first: concatenating a str with NaN
# yields NaN, which would later make the tokenizer fail on a non-string entry.
# The index is reset so the frame keeps a clean 0..n-1 index after dropping rows.
df = df.dropna(subset=['Input Prompt', 'Output Scenario']).reset_index(drop=True)
df['prompt'] = (
    "Input Prompt: " + df['Input Prompt'].astype(str)
    + "\nOutput Scenario: " + df['Output Scenario'].astype(str)
)


In [None]:
# Create the Hugging Face Dataset from the single text column.
# preserve_index=False keeps the pandas index out of the dataset: if rows are
# ever filtered upstream, a non-default index would otherwise be stored as an
# extra `__index_level_0__` column next to `prompt`.
dataset = Dataset.from_pandas(df[['prompt']], preserve_index=False)

In [None]:
# Show the first record to sanity-check the assembled prompt text
dataset[0]

In [None]:
# Checkpoint name shared by the model weights and the tokenizer vocabulary
model_name = "EleutherAI/gpt-neo-125m"

# Pretrained causal language model to be fine-tuned
model = GPTNeoForCausalLM.from_pretrained(model_name)

# Matching tokenizer for that checkpoint
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [None]:
# Set padding token: reuse the end-of-sequence token as the pad token so that
# padded tokenization (padding="max_length") has a pad token available.
# NOTE(review): with pad == eos, a collator that masks pad positions out of the
# labels will presumably mask genuine EOS tokens as well — confirm whether the
# model is expected to learn where to stop generating.
tokenizer.pad_token = tokenizer.eos_token


In [None]:
# Batch tokenizer used by `Dataset.map`
def tokenize_function(examples):
    """Tokenize a batch of prompt strings into fixed-length encodings.

    Args:
        examples: Mapping with a 'prompt' entry holding the texts of one batch.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts when
        asked to truncate and pad every sequence to 256 tokens.
    """
    prompt_texts = examples['prompt']
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    return tokenizer(prompt_texts, **encoding_options)

# Tokenize the whole dataset in batches and drop the raw text column afterwards
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["prompt"],
)

In [None]:
# Batch collator for language modeling; mlm=False turns off masked-language-model masking
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

In [None]:
# Fine-tuning configuration
training_args = TrainingArguments(
    # Where checkpoints are written; existing contents may be overwritten
    output_dir=r"D:\Career_Decision-bot\career_GPT_advisor_chatbot_125m_model",
    overwrite_output_dir=True,
    # Schedule: 5 epochs, one example per device per step,
    # gradients accumulated over 2 steps before each optimizer update
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    # Mixed-precision (fp16) training is switched off
    fp16=False,
    # Checkpoint every 500 steps, keeping at most 2 checkpoints
    save_steps=500,
    save_total_limit=2,
    # Log every 100 steps; no external experiment tracker
    logging_steps=100,
    report_to="none",
)


In [None]:
import torch
# Release cached, currently unused GPU memory held by PyTorch before training starts
torch.cuda.empty_cache()


In [None]:
# Wire the model, data, and configuration together for fine-tuning
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

In [None]:
# Start Fine-Tuning: runs the training loop configured by `training_args`
# on `tokenized_datasets`; the returned training summary is the cell's output.
trainer.train()

In [None]:
# Destination folder for the final fine-tuned artifacts (on the D: drive)
output_dir = r"D:\Career_Decision-bot\career_GPT_advisor_chatbot_125m_model"

# Persist the fine-tuned weights, then the tokenizer files, into that folder
# so both can be reloaded together later
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)



In [None]:
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# This cell reuses the `model` fine-tuned earlier in the session. To run
# inference from a fresh kernel instead, load the weights written by the save step:
#model = GPTNeoForCausalLM.from_pretrained(r"D:\Career_Decision-bot\career_GPT_advisor_chatbot_125m_model")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125m")

# Check if GPU is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device and switch it to inference mode so
# training-only behavior (e.g. dropout) is disabled during generation
model = model.to(device)
model.eval()

# Read the user's question and wrap it in the same
# "Input Prompt: ...\nOutput Scenario:" template used to build the training
# text, so the model is asked to continue with a scenario rather than with
# more question text
input_prompt = input("Enter your question: ")
formatted_prompt = "Input Prompt: " + input_prompt + "\nOutput Scenario:"

# Tokenize the prompt and move the tensors to the same device as the model
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

# Generate text with deterministic beam search.
# `temperature` / `top_p` were dropped: they only take effect when
# do_sample=True, so with plain beam search they were ignored (and produced
# warnings). To sample instead, add do_sample=True together with those options.
outputs = model.generate(
    **inputs,                       # input_ids plus the matching attention_mask
    max_length=200,                 # Maximum total length, prompt tokens included
    num_beams=5,                    # Number of beams for beam search
    no_repeat_ngram_size=2,         # Prevent repetition of 2-grams
    pad_token_id=tokenizer.eos_token_id  # Ensure padding uses EOS token
)

# Decode the generated tokens back into text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)



