In [1]:
%%capture
! pip install langchain_huggingface langchain_groq

In [2]:
# System persona text for the recruiter assistant.
# NOTE(review): no other cell visible in this notebook reads `system`.
system = "You are a senior recruiter with 10 years of experience. You always hire the best talents for companies."

# Question-generation template with `{category}` and `{topic}` placeholders.
# The name `prompt` is rebound by later cells before any chain is built, so this
# particular value is never sent to a model in the visible notebook.
prompt = """

Generate 5 questions for the category {category} Topic: {topic}

Please keep the questions varied and don't maintain the same meaning.

Please conform with the following structure:
- Question: question content
- Difficulty: question difficulty. Easy/Medium/Hard
- Category: question category

"""

In [None]:
import requests
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate

# from dotenv import load_dotenv
# from langchain_google_genai import ChatGoogleGenerativeAI

from google.colab import userdata

# Google Gemini
# model = ChatGoogleGenerativeAI(model="gemini-pro", temperature = 0.3)

# Identifier of the Groq-hosted model to call.
# NOTE(review): confirm this model id is still served by Groq before running.
llm = "llama-3.1-70b-versatile"

# Groq chat model used by the chains below. temperature=0 asks for the least
# random output; the API key is read from Colab's secret store, not hard-coded.
model = ChatGroq(
    model_name=llm,
    temperature=0,
    groq_api_key=userdata.get('GROQ_API_KEY'),
)


In [None]:
# Sample job posting used as input for topic extraction. The adjacent string
# literals are concatenated into one string; "\n\n" separates the role summary
# from the requirements paragraph.
job_description = (
    "As a Data Scientist at 3D-Factory, you will be responsible for analyzing complex datasets to extract actionable insights. "
    "Your role will involve developing and deploying predictive models and machine learning algorithms to tackle business challenges. "
    "You will create clear visualizations and reports to effectively communicate findings to non-technical stakeholders, identify opportunities "
    "for process improvements based on data, and collaborate with cross-functional teams to deliver data-driven solutions.\n\n"
    "To succeed in this role, you should have a Bachelor’s or Master’s degree in Data Science, Computer Science, Statistics, Mathematics, "
    "or a related field, with advanced degrees being a plus. You need to be proficient in programming languages such as Python or R and have "
    "experience with machine learning and data analysis libraries like scikit-learn, TensorFlow, Keras, or PyTorch. Additionally, you should be "
    "skilled in SQL and adept at managing both relational and non-relational databases. A solid knowledge of data visualization tools such as Tableau, "
    "Power BI, or matplotlib is essential, along with strong analytical skills and expertise in statistical analysis and predictive modeling. The ability "
    "to clearly communicate complex results to non-technical audiences is crucial. Previous experience as a Data Scientist or in a similar role is preferred, "
    "with relevant internships or projects being appreciated. Experience with Big Data tools like Hadoop or Spark and familiarity with cloud environments "
    "such as AWS, Azure, or Google Cloud are desirable."
)
# Topic-extraction template: `{job_description}` is the only placeholder, and the
# model is asked to answer with a single bracketed, comma-separated list, which
# the parsing cell further down relies on.
prompt = """Identify topics that can be used for interview questions based on this job description:

{job_description}

Please respect the following structure:
[TOPIC_1, TOPIC_2, TOPIC_3]"""

In [None]:
# Wrap the raw template text in a PromptTemplate under its own name. Rebinding
# `prompt` itself made this cell fail when run a second time, because the
# already-built PromptTemplate object was passed back in as `template`.
topics_prompt = PromptTemplate(input_variables=["job_description"], template=prompt)
chain = topics_prompt | model

# Ask the chat model for the topic list; `.content` is the text of its reply.
content = chain.invoke({'job_description': job_description}).content.strip()
content

In [None]:
# Extract the comma-separated topic names from the first "[...]" span of the reply.
# A reply without a bracketed list raises ValueError with the reply text in the
# message, and blank entries (from "[]" or a trailing comma) are dropped.
list_start = content.find('[')
list_end = content.find(']', list_start + 1)
if list_start == -1 or list_end == -1:
  raise ValueError(f"Expected a bracketed topic list in the model reply, got: {content!r}")
topics = [topic.strip() for topic in content[list_start + 1:list_end].split(',') if topic.strip()]

In [None]:
def generate_topics_from_job_description(job_description, llm=None):
  """Ask a chat model for interview topics that match a job description.

  Args:
    job_description: Plain-text job description to analyse.
    llm: Object exposing `invoke(text)` whose result has a `.content` string.
      Defaults to the notebook-level `model`, which must be a chat model at
      call time.

  Returns:
    List of topic names in the order the model gave them, stripped of
    surrounding whitespace and with blank entries removed.

  Raises:
    ValueError: If the reply does not contain a "[...]" list.
  """
  chat = model if llm is None else llm
  request = (
    "Identify topics that can be used for interview questions based on this job description:\n\n"
    f"{job_description}\n\n"
    "Please respect the following structure:\n"
    "[TOPIC_1, TOPIC_2, TOPIC_3]"
  )
  reply = chat.invoke(request).content.strip()

  # Only the first bracketed span is treated as the topic list.
  list_start = reply.find('[')
  list_end = reply.find(']', list_start + 1)
  if list_start == -1 or list_end == -1:
    raise ValueError(f"Expected a bracketed topic list in the model reply, got: {reply!r}")
  return [name.strip() for name in reply[list_start + 1:list_end].split(',') if name.strip()]


def generate_questions_from_topic(category, topic, llm=None):
  """Ask a chat model for interview questions about one topic.

  Args:
    category: Broad category the questions belong to (e.g. "Data Science").
    topic: Specific topic inside that category.
    llm: Object exposing `invoke(text)` whose result has a `.content` string.
      Defaults to the notebook-level `model`, which must be a chat model at
      call time.

  Returns:
    List of `[question, difficulty]` lists, one per reply line that contains
    a "[...]" span. A line whose bracketed text has no comma yields a
    one-element list. Lines without brackets are ignored.
  """
  chat = model if llm is None else llm
  request = (
    f"Generate 5 questions for the category {category} Topic: {topic}\n\n"
    "Please keep the questions varied and don't maintain the same meaning.\n\n"
    "Please conform with the following structure:\n"
    "[question_content, difficulty_level]\n"
    "[question_content, difficulty_level]\n"
    "[question_content, difficulty_level]"
  )
  reply = chat.invoke(request).content.strip()

  parsed = []
  for line in reply.split('\n'):
    opening = line.find('[')
    closing = line.rfind(']')
    if opening == -1 or closing <= opening:
      continue  # preamble, blank line, or otherwise unstructured output
    # Split on the last comma only, so commas inside the question text survive.
    fields = line[opening + 1:closing].replace('"', '').rsplit(',', 1)
    parsed.append([field.strip() for field in fields])
  return parsed


In [None]:
# Display the parsed topic list.
topics

In [None]:
category = "Data Science"

# Question-generation template; `{category}` and `{topic}` are filled in per
# topic. The model is asked for one "[question, difficulty]" pair per line.
prompt = """

Generate 5 questions for the category {category} Topic: {topic}

Please keep the questions varied and don't maintain the same meaning.

Please conform with the following structure:
[question_content, difficulty_level]
[question_content, difficulty_level]
[question_content, difficulty_level]

"""

questions = []
prompt_temp = PromptTemplate(input_variables=["category", "topic"], template = prompt)
chain = prompt_temp | model

for topic in topics:
  # Raw reply text for this topic (a str).
  questions_raw = chain.invoke({'topic': topic, 'category': category }).content.strip()

  # The reply structure is not stable: the model may add a preamble or blank
  # lines, so only lines that actually contain a "[...]" span are parsed.
  loc_questions = []
  for line in questions_raw.split('\n'):
    opening = line.find('[')
    closing = line.rfind(']')
    if opening == -1 or closing <= opening:
      continue
    # Split on the last comma only, so commas inside the question text survive,
    # and trim the whitespace left around each field.
    fields = line[opening + 1:closing].replace('"', '').rsplit(',', 1)
    loc_questions.append([field.strip() for field in fields])

  questions += loc_questions

In [None]:
# Display every parsed question entry collected across all topics.
questions

In [None]:
# Number of parsed question entries collected across all topics.
len(questions)

In [None]:
import ast
# Re-parse the most recent raw reply. Lines without a "[...]" span are skipped
# instead of raising, the split happens on the last comma only so commas inside
# a question survive, and each field is trimmed.
loc_questions = [
    [field.strip() for field in line[line.find('[') + 1:line.rfind(']')].replace('"', '').rsplit(',', 1)]
    for line in questions_raw.split('\n')
    if -1 < line.find('[') < line.rfind(']')
]

In [None]:
%%capture
!pip install flash_attn==2.5.8 torch==2.3.1 accelerate==0.31.0 transformers==4.43.0

In [None]:
# Open Source
#
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


model_id = "microsoft/phi-2"

# Keep the local Hugging Face objects under their own names. Binding the causal
# LM to `model` would replace the Groq chat model that the chain-building cells
# above combine with their prompt templates, breaking them on re-run.
phi_tokenizer = AutoTokenizer.from_pretrained(model_id)
phi_model = AutoModelForCausalLM.from_pretrained(model_id)

# Text-generation pipeline limited to 256 newly generated tokens per call.
pipe = pipeline(
    "text-generation", model=phi_model, tokenizer=phi_tokenizer, max_new_tokens=256
)

# LangChain wrapper so the local pipeline can be called with `.invoke(...)`.
hf = HuggingFacePipeline(pipeline=pipe)


In [None]:
# `prompt` is a template with `{category}` and `{topic}` placeholders; fill them
# in so the model does not receive the literal placeholder text.
hf.invoke(prompt.format(category=category, topic=topic)) # cpu

In [None]:
# Completion-style prompt for the local pipeline: the text stops right where the
# first question should begin. `topic` is whatever the last loop iteration left.
hf.invoke(
    'You are a recruiter. You will be given a test to a job applicant. '
    f'Question for an interview about the topic {topic}: The first question is'
)

In [None]:
from torch import cuda

# Ask PyTorch to release the CUDA memory its caching allocator holds but is not
# using, ahead of the next model load.
cuda.empty_cache()

In [None]:
%%capture
!pip install accelerate bitsandbytes xformers adjustText transformers

In [None]:
from torch import cuda, bfloat16
import transformers

model_id = 'meta-llama/Llama-2-7b-chat-hf'
# NOTE(review): `device` is not used below; `device_map='auto'` is what is passed
# to the model loader.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Hugging Face access token passed to both `from_pretrained` calls. It was used
# without ever being defined (NameError on a fresh kernel); read it from Colab
# secrets the same way the Groq key is read.
# Assumes the secret is stored under the name 'HF_TOKEN' - adjust if yours differs.
hf_token = userdata.get('HF_TOKEN')

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation types
)

# Llama 2 Tokenizer
# Bound to its own name (as is the model below) so this cell does not replace
# the `model` object that the Groq chain cells rely on.
llama_tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, token = hf_token)

# Llama 2 Model
# NOTE(review): `trust_remote_code=True` allows code shipped in the model repo to
# run locally; confirm it is actually required for this checkpoint.
llama_model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
    token = hf_token,
  )

# Inference only: switch the module to evaluation mode.
llama_model.eval()

# Our text generator
# NOTE(review): transformers only applies `temperature` when sampling is enabled
# (`do_sample=True`); confirm whether greedy decoding is what is wanted here.
generator = transformers.pipeline(
    model=llama_model, tokenizer=llama_tokenizer,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1 #
)

# Separate name so the question template held in `prompt` is left intact.
llama_prompt = f"Generate an interview question for the topic: {topic}"
res = generator(llama_prompt)
res[0]['generated_text']