# Experimenting

In [17]:
import os

from langchain_cohere.embeddings import CohereEmbeddings
from langchain_experimental.text_splitter import SemanticChunker

from Chunking import combine_text
from DocumentLoader import load_and_read_pdf

In [18]:
# Read the PDF via the project loader, then merge the returned pages into a
# single value that the splitters below receive as their only input text.
pages = load_and_read_pdf(file_path = "VisionTransformers.pdf")
combined_text = combine_text(pages)

In [19]:
# Take the Cohere API key from the environment so the secret never lives in the
# notebook; a missing COHERE_API_KEY fails right here with a KeyError.
# NOTE(review): any key that was ever committed in this notebook should be rotated.
cohere_embeddings = CohereEmbeddings(
    cohere_api_key=os.environ["COHERE_API_KEY"],
    model="embed-english-light-v3.0",
)

In [20]:
# Semantic splitter driven by the Cohere embeddings; no breakpoint_threshold_type
# is passed here, so the library's default threshold type applies.
text_splitter = SemanticChunker(cohere_embeddings)

In [21]:
# Split the combined text into documents and print the first chunk's content
# as a quick sanity check.
docs = text_splitter.create_documents([combined_text])
print(docs[0].page_content)

Published as a conference paper at ICLR 2021
AN IMAGE IS WORTH 16X16 W ORDS :
TRANSFORMERS FOR IMAGE RECOGNITION AT SCALE
Alexey Dosovitskiy∗,†, Lucas Beyer∗, Alexander Kolesnikov∗, Dirk Weissenborn∗,
Xiaohua Zhai∗, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer,
Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby∗,†
∗equal technical contribution, †equal advising
Google Research, Brain Team
{adosovitskiy, neilhoulsby}@google.com
ABSTRACT
While the Transformer architecture has become the de-facto standard for natural
language processing tasks, its applications to computer vision remain limited. In
vision, attention is either applied in conjunction with convolutional networks, or
used to replace certain components of convolutional networks while keeping their
overall structure in place. We show that this reliance on CNNs is not necessary
and a pure transformer applied directly to sequences of image patches can perform
very well on image classiﬁcation tasks. When pre-trai

In [22]:
# Print the character count of every chunk, one per line.
for doc in docs:
    print(len(doc.page_content))

14735
432
2731
2923
10161
427
1001
2590
127
1272
86
301
589
335
533
318
61
180
19
149
1117
1076
2965
2742
754
582
1986
1348
3809
4
163
8083
3737


In [24]:
# Inspect the content of the second chunk.
print(docs[1].page_content)

the test sets of the
downstream tasks following Kolesnikov et al. (2020). We transfer the models trained on these
dataset to several benchmark tasks: ImageNet on the original validation labels and the cleaned-up
ReaL labels (Beyer et al., 2020), CIFAR-10/100 (Krizhevsky, 2009), Oxford-IIIT Pets (Parkhi et al.,
2012), and Oxford Flowers-102 (Nilsback & Zisserman, 2008). For these datasets, pre-processing
follows Kolesnikov et al.


In [18]:
#Breakpoints: https://python.langchain.com/docs/how_to/semantic-chunker/#install-dependencies

In [5]:
# Re-split the same text, this time with the 'gradient' breakpoint threshold type.
# This rebinds both text_splitter and docs, so the cells below see the new chunks.
text_splitter = SemanticChunker(cohere_embeddings,breakpoint_threshold_type = 'gradient')
docs = text_splitter.create_documents([combined_text])
# The first chunk's content is intentionally not printed in this cell.

In [6]:
# Print the character count of every chunk, one per line.
for doc in docs:
    print(len(doc.page_content))

617
9656
4411
420
2783
2873
10190
416
3618
127
739
631
298
381
536
368
165
206
383
130
956
170
3979
2310
1763
2013
1354
1593
2199
63
163
8011
3814


In [33]:
# Inspect the content of the fourth chunk.
print(docs[3].page_content)

We report results on downstream datasets either through few-shot or ﬁne-tuning accuracy. Fine-tuning accuracies capture the performance of each model after ﬁne-tuning it on the respective
dataset. Few-shot accuracies are obtained by solving a regularized least-squares regression problem
that maps the (frozen) representation of a subset of training images to{−1,1}K target vectors. This
formulation allows us to recover the exact solution in closed form. Though we mainly focus on
ﬁne-tuning performance, we sometimes use linear few-shot accuracies for fast on-the-ﬂy evaluation
where ﬁne-tuning would be too costly. 4.2 C OMPARISON TO STATE OF THE ART
We ﬁrst compare our largest models – ViT-H/14 and ViT-L/16 – to state-of-the-art CNNs from
the literature. The ﬁrst comparison point is Big Transfer (BiT) (Kolesnikov et al., 2020), which
performs supervised transfer learning with large ResNets. The second is Noisy Student (Xie et al.,
2020), which is a large EfﬁcientNet trained using semi-supe

# Creating code

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [10]:
def calculate_num_splits(cur_text_len, max_chunk_size):
    """Return how many pieces a text must be cut into to respect a size limit.

    The result is the smallest integer ``n >= 2`` such that
    ``cur_text_len / n <= max_chunk_size``.

    Args:
        cur_text_len: Length of the text to split, in characters.
        max_chunk_size: Largest allowed length of one piece; must be positive.

    Returns:
        int: The number of pieces, never fewer than 2.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive, because no number
            of pieces could ever satisfy such a limit.
    """
    if max_chunk_size <= 0:
        raise ValueError(f"max_chunk_size must be positive, got {max_chunk_size!r}")

    # Ceiling division expressed through floor division, so no import is needed
    # and integer inputs are handled exactly.
    required_splits = int(-(-cur_text_len // max_chunk_size))

    # Always cut at least once, even when the text already fits the limit.
    return max(2, required_splits)

def recursive_split_big_chunks(cur_text, cur_text_len, max_chunk_size):
    """Split an oversized text into roughly equal pieces with a small overlap.

    The number of pieces comes from ``calculate_num_splits``; the actual cutting
    is delegated to ``RecursiveCharacterTextSplitter``.

    Args:
        cur_text: The text to split.
        cur_text_len: Length of ``cur_text`` in characters.
        max_chunk_size: Size limit used to decide how many pieces are needed.

    Returns:
        list[str]: The page contents of the documents produced by the splitter.
    """
    num_splits = calculate_num_splits(cur_text_len, max_chunk_size)

    # The splitter takes integer sizes. Round up so that num_splits pieces can
    # cover the whole text, and never go below one character.
    target_chunk_size = max(1, -(-cur_text_len // num_splits))

    # Overlap is 10% of ONE piece. Using 10% of the whole text makes the overlap
    # exceed the chunk size as soon as more than ten pieces are needed, which
    # the splitter rejects with a ValueError.
    chunk_overlap = int(0.1 * target_chunk_size)

    recursive_text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=target_chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=True,
    )

    chunks_obj = recursive_text_splitter.create_documents([cur_text])

    return [cur_obj.page_content for cur_obj in chunks_obj]

def post_process_chunks_of_text(chunks_of_text, min_chunk_size, max_chunk_size):
    """Normalise chunk sizes by splitting oversized chunks and merging tiny ones.

    Chunks are processed in order:

    * longer than ``max_chunk_size``: split via ``recursive_split_big_chunks``;
    * shorter than ``min_chunk_size``: appended to the previous output chunk,
      or, when there is no previous chunk yet, held back and prepended to the
      next input chunk;
    * otherwise: kept unchanged.

    Text held back from the start is emitted as its own chunk if the input ends
    before anything could absorb it, so no input text is ever dropped.

    Note: merging a small chunk into the previous one is not re-checked against
    ``max_chunk_size``, so a merged chunk may slightly exceed that limit.

    Args:
        chunks_of_text: Iterable of chunk strings, in document order.
        min_chunk_size: Chunks shorter than this are merged into a neighbour.
        max_chunk_size: Chunks longer than this are split further.

    Returns:
        list[str]: The post-processed chunks, in document order.
    """
    final_chunks = []
    pending_prefix = ""  # leading text too small to stand alone, awaiting a neighbour

    for chunk in chunks_of_text:
        # Prepend (not append) the held-back text to keep document order, and
        # clear it right away so it is consumed exactly once.
        cur_text = pending_prefix + chunk
        pending_prefix = ""

        cur_text_len = len(cur_text)

        if cur_text_len > max_chunk_size:
            final_chunks += recursive_split_big_chunks(cur_text, cur_text_len, max_chunk_size)

        elif cur_text_len < min_chunk_size:
            if final_chunks:
                final_chunks[-1] += cur_text
            else:
                pending_prefix = cur_text

        else:
            final_chunks.append(cur_text)

    # Every chunk was too small to stand alone: keep the text rather than lose it.
    if pending_prefix:
        final_chunks.append(pending_prefix)

    return final_chunks
            


In [11]:
def semantic_chunking(text, min_chunk_size, max_chunk_size):
    """Split ``text`` into semantic chunks and normalise their sizes.

    The text is cut by ``SemanticChunker`` (Cohere embeddings, 'gradient'
    breakpoint threshold type) and the resulting chunk strings are passed
    through ``post_process_chunks_of_text``.

    Args:
        text: The text to chunk.
        min_chunk_size: Forwarded to ``post_process_chunks_of_text``.
        max_chunk_size: Forwarded to ``post_process_chunks_of_text``.

    Returns:
        Whatever ``post_process_chunks_of_text`` returns for the semantic chunks.

    Raises:
        RuntimeError: If the ``COHERE_API_KEY`` environment variable is unset or empty.
    """
    # The API key is a secret: read it from the environment, never from source.
    cohere_api_key = os.environ.get("COHERE_API_KEY")
    if not cohere_api_key:
        raise RuntimeError("COHERE_API_KEY environment variable is not set")

    cohere_embeddings = CohereEmbeddings(cohere_api_key=cohere_api_key, model="embed-english-light-v3.0")

    text_splitter = SemanticChunker(cohere_embeddings, breakpoint_threshold_type='gradient')

    # Chunk the text passed by the caller, not a notebook-level global.
    chunks_obj = text_splitter.create_documents([text])

    chunks_of_text = [cur_obj.page_content for cur_obj in chunks_obj]

    return post_process_chunks_of_text(chunks_of_text, min_chunk_size, max_chunk_size)


In [None]:
# Scratch cell (never executed): evaluating the bare name only displays the
# function object; it does not call the function.
recursive_split_big_chunks