In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.chains.summarize import load_summarize_chain
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
import torch
import base64

#model and tokenizer loading
checkpoint = "LaMini-Flan-T5-248M"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
base_model = T5ForConditionalGeneration.from_pretrained(checkpoint, device_map='auto', torch_dtype=torch.float32)

#file loader and preprocessing
def file_preprocessing(file):
    loader =  PyPDFLoader(file)
    pages = loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    texts = text_splitter.split_documents(pages)
    final_texts = ""
    for text in texts:
        print(text)
        final_texts = final_texts + text.page_content
    return final_texts

#LLM pipeline
def llm_pipeline(filepath):
    pipe_sum = pipeline(
        'summarization',
        model = base_model,
        tokenizer = tokenizer,
        max_length = 500, 
        min_length = 50)
    input_text = file_preprocessing(filepath)
    result = pipe_sum(input_text)
    result = result[0]['summary_text']
    return result



You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  return self.fget.__get__(instance, owner)()


In [5]:
print(llm_pipeline('TanveerSingh_Resume.pdf'))

page_content='Tanveer  Singh  Gupta  \n MSc.  Statistics  and Data  Science  \n  Email : - tanveersingh182764@gmail.com  Contact :- +91 9082289392     Tanveer  Singh  Gupta       Github   \nEDUCATION' metadata={'source': 'TanveerSingh_Resume.pdf', 'page': 0}
page_content='EDUCATION  \nQualification  Institute  Board  / University  Year  % / CGPA  \nMSc.  Statistics  and DS NSoMASA,  NMIMS,  Mumbai   NMIMS  2022 -2024  3.93/4' metadata={'source': 'TanveerSingh_Resume.pdf', 'page': 0}
page_content='BS (DS and Applications)  IIT Madras,  Chennai   IIT Madras  2020 -2024  8.5/10  \nBSc. (Statistics)  SIES  College,  Mumbai   Mumbai  University  2019 -2022  9.87/10' metadata={'source': 'TanveerSingh_Resume.pdf', 'page': 0}
page_content='HSC  Christ  Academy,  Navi  Mumbai   HSC  2019  79% \nSSC Sacred  Heart  High  School,  Mumbai   SSC 2017  86% \n \nINTERNSHIPS  20 Months  \n \nHDFC Bank,' metadata={'source': 'TanveerSingh_Resume.pdf', 'page': 0}
page_content='INTERNSHIPS  20 Months  \n \

Tanveer Singh Gupta, MSc. Statistics and Data Science, is a graduate of the University of Mumbai and a professor at the National Institute of Business Administration (NIBA). He has developed a state-of-the-art recommendation system using deep neural network-based recommendation systems for 23+ bank products on a base of 6.5+ crore customers, leveraging capabilities of multiple recommendation models such as YouTube-Ranking, SIM, Two-tower, and n ID-Face verification model for face detection and mtcnn for face verification. The study aims to evaluate and improve the accuracy of computed tomography image reconstruction through various techniques such as Least Square Method, Filtered Back Projection with different filters, Kaczmarz iterative method, and Expectation-Maximization Algorithm. He has also developed predictive models for real-time inventory management, strategizing SKUplanning, and pinpointing high-revenue products and emerging market trends thereby enhancing overall business pe

In [15]:
#file loader and preprocessing
def file_preprocessing(file):
    loader =  PyPDFLoader(file)
    pages = loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = text_splitter.split_documents(pages)
    final_texts = ""
    for text in texts:
        print(text)
        final_texts = final_texts + text.page_content
    return final_texts

#LLM pipeline
def llm_pipeline(filepath):
    pipe_sum = pipeline(
        'summarization',
        model = base_model,
        tokenizer = tokenizer,
        max_length = 500, 
        min_length = 50)
    input_text = file_preprocessing(filepath)
    result = pipe_sum(input_text)
    result = result[0]['summary_text']
    return result

In [16]:
llm_pipeline('TanveerSingh_Resume.pdf')

page_content='Tanveer  Singh  Gupta  \n MSc.  Statistics  and Data  Science  \n  Email : - tanveersingh182764@gmail.com  Contact :- +91 9082289392     Tanveer  Singh  Gupta       Github   \nEDUCATION  \nQualification  Institute  Board  / University  Year  % / CGPA  \nMSc.  Statistics  and DS NSoMASA,  NMIMS,  Mumbai   NMIMS  2022 -2024  3.93/4  \nBS (DS and Applications)  IIT Madras,  Chennai   IIT Madras  2020 -2024  8.5/10  \nBSc. (Statistics)  SIES  College,  Mumbai   Mumbai  University  2019 -2022  9.87/10  \nHSC  Christ  Academy,  Navi  Mumbai   HSC  2019  79% \nSSC Sacred  Heart  High  School,  Mumbai   SSC 2017  86% \n \nINTERNSHIPS  20 Months  \n \nHDFC Bank,  \nMumbai  Position: - Data Science Intern (Marketing Analytics)  \n• Developing a state -of-the-art recommendation system using deep neural network -based recommendation \nsystems for 23+ bank products on a base of 6.5+ crore customers, leveraging capabilities of multiple  \nrecommendation models such as ( YouTube -Rankin

'Tanveer Singh Gupta MSc. Statistics and Data Science (MSc. DS. NSoMASA, NMIMS 2022 -2024 3.93/4 BS (DS and Applications) - Developed a state-of-the-art recommendation system using deep neural network-based recommendation systems for 23+ bank products on a base of 6.5+ crore customers. Leveraged S-bert encodings for grouping documents of similar context, reducing the overall redundant documents and consolidating them.'