In [5]:
import fitz


def extract_pdf_text(filename):

    doc = fitz.open(filename=filename)
    text= ""
    for page in doc:
        text += page.get_text("text")

    return text


Extracted_text = extract_pdf_text(filename="dataset/Resume.pdf")

In [None]:
from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize

model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

# Tokenize the document into sentences
tokenized_setences = sent_tokenize(Extracted_text)


# Converting the sentences to vector encoding

embeddings = model.encode(tokenized_setences)

#### Creating the Retriever (FAISS)

In [7]:
import faiss
import numpy as np

embedding_np = np.array(embeddings)

index = faiss.IndexFlatL2(embedding_np.shape[1])

index.add(embedding_np)

#### Query processor

In [18]:
def process_query(user_query , top_k = 3):

    user_query_embeddings = model.encode([user_query])

    distance ,indices = index.search(user_query_embeddings , top_k)

    return indices[0] , distance[0]


User_Query = "What skills does the candidate have ?"
indices ,distances = process_query(user_query=User_Query)

print("Indices" , indices)
print("Distances " , distances)

Indices [3 4 1]
Distances  [1.1575606 1.3573604 1.5754138]


#### Create generator

In [None]:
from transformers import pipeline


generator = pipeline(task="text-generation" , model="gpt2")

relevant_text = '\n'.join([tokenized_setences[index] for index in indices])

response = generator(f"Context :{relevant_text} Question : {User_Query}" , max_length = 200)

print("Generated Resposne :- \n\n" , response[0]['generated_text'])

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Resposne :- 

 Context :AI skills :-
Finetuning(llama3.1 , GPT 3.5),Hugging Face,Scikit-learn,TensorFlow,Keras.
Other-Skills :-
Flask, Requests,Socketio,Threading, Azure service bus,NLTK OpenPyXL.
Data-Skills: :-
Pandas,NumPy,Seaborn,Matplotlib,Scapy,BeautifulSoup. Question : What skills does the candidate have ?
Answer : I am a Computer Science student. I use many 3rd party frameworks, some are very useful. When you are working on the 3rd party language you must use different kind of approach that are not only in your hands but also have different learning experience. I use a large variety of frameworks in multiple languages to develop and test your skills. I have to believe that this tool also has a lot of practical aspects which are not


In [23]:
print(response[0])

{'generated_text': 'Context :AI skills :-\nFinetuning(llama3.1 , GPT 3.5),Hugging Face,Scikit-learn,TensorFlow,Keras.\nOther-Skills :-\nFlask, Requests,Socketio,Threading, Azure service bus,NLTK OpenPyXL.\nData-Skills: :-\nPandas,NumPy,Seaborn,Matplotlib,Scapy,BeautifulSoup. Question : What skills does the candidate have ?\nAnswer : I am a Computer Science student. I use many 3rd party frameworks, some are very useful. When you are working on the 3rd party language you must use different kind of approach that are not only in your hands but also have different learning experience. I use a large variety of frameworks in multiple languages to develop and test your skills. I have to believe that this tool also has a lot of practical aspects which are not'}
