# Loaders

In [1]:
from langchain.document_loaders import TextLoader

In [2]:
# Read the plain-text news article into LangChain Document objects.
loader = TextLoader(file_path="stock_news.txt")
data = loader.load()

In [3]:
# Echo the loaded documents as the cell output.
data

[Document(page_content='RBC Capital upgraded shares of GoDaddy (NYSE:GDDY) to Outperform from Sector Perform in a note Wednesday, raising the price target to $124 per share, up from $90.\n\nAnalysts told investors that the company is "structurally differentiated" and under-owned.\n\n"GDDY\'s an under-owned, durably growing cash machine with a dash of AI," wrote the analysts. "We like its structurally hedged customer acquisition model, have rising confidence in margin expansion coming out of our conference and lastly, like management\'s openness to increasingly managing top-line expectations."\n\nThe analysts also noted that the company\'s generative AI-focused dinner next week is timely, while the mid-Q1 analyst day should improve visibility into the company\'s strong long-term earnings and free cash flow generation.\n\n"The stock could be viewed as trading at mid-single digit multiple on very reachable longer-term assumptions," added the analysts.', metadata={'source': 'stock_news.txt'})]

In [4]:
from langchain.document_loaders.csv_loader import CSVLoader

In [5]:
# Load the housing CSV; the loader yields one Document per CSV row.
loader = CSVLoader(
    file_path="C:\\Users\\priya\\Downloads\\Bengaluru_House_Data.csv",
)
data = loader.load()

In [6]:
# Count how many documents the CSV loader produced.
len(data)

13320

In [9]:
# Inspect the first loaded document (page content plus metadata).
data[0]

Document(page_content='area_type: Super built-up  Area\navailability: 19-Dec\nlocation: Electronic City Phase II\nsize: 2 BHK\nsociety: Coomee\ntotal_sqft: 1056\nbath: 2\nbalcony: 1\nprice: 39.07', metadata={'source': 'C:\\Users\\priya\\Downloads\\Bengaluru_House_Data.csv', 'row': 0})

In [8]:
# Check which class the loader uses to represent a single row.
type(data[0])

langchain.schema.document.Document

In [10]:
# Show the metadata attached to the first document.
data[0].metadata

{'source': 'C:\\Users\\priya\\Downloads\\Bengaluru_House_Data.csv', 'row': 0}

In [12]:
# Reload the same CSV, this time taking each Document's 'source' metadata
# from the row's "location" column rather than from the file path.
loader = CSVLoader(
    file_path="C:\\Users\\priya\\Downloads\\Bengaluru_House_Data.csv",
    source_column="location",
)
data = loader.load()

In [13]:
# Show the first document's metadata after reloading with source_column.
data[0].metadata

{'source': 'Electronic City Phase II', 'row': 0}

# Splitters

In [53]:
# Sample passage (six paragraphs separated by blank lines) that the text
# splitters in the following cells take as input.
text = """Picture this – you’re working on a really cool data science project and have applied the latest state-of-the-art library to get a pretty good result. And boom! A few days later, there’s a new state-of-the-art framework in town that has the potential to further improve your model.

That is not a hypothetical scenario – it’s the reality (and thrill) of working in the field of Natural Language Processing (NLP)! The last two years have been mind-blowing in terms of breakthroughs. I get to grips with one framework and another one, potentially even better, comes along.

Google’s BERT is one such NLP framework. I’d stick my neck out and say it’s perhaps the most influential one in recent times (and we’ll see why pretty soon).

It’s not an exaggeration to say that BERT has significantly altered the NLP landscape. Imagine using a single model that is trained on a large unlabelled dataset to achieve State-of-the-Art results on 11 individual NLP tasks. And all of this with little fine-tuning. That’s BERT! It’s a tectonic shift in how we design NLP models.

BERT has inspired many recent NLP architectures, training approaches and language models, such as Google’s TransformerXL, OpenAI’s GPT-2, XLNet, ERNIE2.0, RoBERTa, etc.

I aim to give you a comprehensive guide to not only BERT but also what impact it has had and how this is going to affect the future of NLP research. And yes, there’s a lot of Python code to work on, too!"""

In [14]:
from langchain.text_splitter import CharacterTextSplitter

In [54]:
# Split the passage on "." only; chunk_size=200 is the target maximum
# chunk length and chunk_overlap=0 means neighbouring chunks share no text.
splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=200, separator=".")
chunks = splitter.split_text(text)

In [55]:
# Count the chunks produced by the character splitter.
len(chunks)

11

In [56]:
# Print the character count of every chunk, one per line.
for chunk in chunks:
    chunk_length = len(chunk)
    print(chunk_length)

148
129
197
129
115
85
178
62
167
147
54


In [57]:
# Echo the chunk list to inspect where the splitter cut the text.
chunks

['Picture this – you’re working on a really cool data science project and have applied the latest state-of-the-art library to get a pretty good result',
 'And boom! A few days later, there’s a new state-of-the-art framework in town that has the potential to further improve your model',
 'That is not a hypothetical scenario – it’s the reality (and thrill) of working in the field of Natural Language Processing (NLP)! The last two years have been mind-blowing in terms of breakthroughs',
 'I get to grips with one framework and another one, potentially even better, comes along.\n\nGoogle’s BERT is one such NLP framework',
 'I’d stick my neck out and say it’s perhaps the most influential one in recent times (and we’ll see why pretty soon)',
 'It’s not an exaggeration to say that BERT has significantly altered the NLP landscape',
 'Imagine using a single model that is trained on a large unlabelled dataset to achieve State-of-the-Art results on 11 individual NLP tasks. And all of this with lit

In [36]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [101]:
# Recursive splitter: separators are listed from coarsest to finest
# (paragraph break, line break, period); chunk_size=200 is the target
# maximum chunk length and chunks do not overlap.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=200,
    separators=["\n\n", "\n", "."],
)
chunks = r_splitter.split_text(text)

In [102]:
# Count the chunks produced by the recursive splitter.
len(chunks)

11

In [103]:
# Print the character count of every recursive-splitter chunk, one per line.
for chunk in chunks:
    chunk_length = len(chunk)
    print(chunk_length)

148
132
197
90
157
85
180
65
168
147
56


In [104]:
# Echo the recursive splitter's chunks to inspect where the text was cut.
chunks

['Picture this – you’re working on a really cool data science project and have applied the latest state-of-the-art library to get a pretty good result',
 '. And boom! A few days later, there’s a new state-of-the-art framework in town that has the potential to further improve your model.',
 'That is not a hypothetical scenario – it’s the reality (and thrill) of working in the field of Natural Language Processing (NLP)! The last two years have been mind-blowing in terms of breakthroughs',
 '. I get to grips with one framework and another one, potentially even better, comes along.',
 'Google’s BERT is one such NLP framework. I’d stick my neck out and say it’s perhaps the most influential one in recent times (and we’ll see why pretty soon).',
 'It’s not an exaggeration to say that BERT has significantly altered the NLP landscape',
 '. Imagine using a single model that is trained on a large unlabelled dataset to achieve State-of-the-Art results on 11 individual NLP tasks. And all of this wi