In [None]:
# To install all available features
!pip3 install "crawl4ai[all]"

# After installation, download the necessary models for better performance:
!crawl4ai-download-models

# Lastly, install Playwright dependencies
!playwright install 

In [2]:
# Read a locally saved copy of the handbook page into a single string.
# - The raw-string prefix keeps the Windows backslashes literal; without it
#   sequences such as "\D" and "\c" are invalid escapes that newer Python
#   versions warn about.
# - The explicit encoding avoids depending on the platform default codec.
#   NOTE(review): assumes the saved page is UTF-8 -- confirm against the file.
with open(r'D:\Desktop\chatbot4group\data\Employee Handbook – Nexcel Info Site.html', 'r', encoding='utf-8') as file:
    html_as_string = file.read()

In [None]:
# Show the raw HTML held in html_as_string.
print(html_as_string)

In [1]:
import requests

url = 'https://nexcel.info/employee-handbook-2/'  # Replace with your desired URL

# Without a timeout the request can block the kernel indefinitely if the
# server never answers.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    html_content = response.text
else:
    # Stop here instead of only printing: html_content would otherwise stay
    # undefined and the failure would resurface later as a confusing NameError.
    raise RuntimeError(
        f'Failed to retrieve the document (HTTP {response.status_code})'
    )

In [None]:
from bs4 import BeautifulSoup

# Build the parse tree from the downloaded markup using the 'html.parser' backend.
soup = BeautifulSoup(markup=html_content, features='html.parser')

# Display the parsed tree re-indented for readability.
print(soup.prettify())

In [4]:
# soup.title is None when the page has no <title> element; fall back to None
# instead of failing with AttributeError on `.string`.
title = soup.title.string if soup.title is not None else None
print(f'Page Title: {title}')

Page Title: Employee Handbook – Nexcel Info Site


In [None]:
# Pull the text content out of the parsed page via get_text() and print it.
all_text = soup.get_text()
print(all_text)

In [None]:
# Collect the href value of every <a> tag that actually carries one.
links = [
    anchor['href']
    for anchor in soup.find_all('a', href=True)
]
print('Links Found:', links)

In [None]:
# Bundle the scraped title, text and links into a single dictionary.
data = dict(title=title, text=all_text, links=links)

print('Structured Data:', data)

In [None]:
from langchain.schema import Document

# Wrap the scraped text in a Document; the title and links travel along as
# metadata.
document = Document(
    page_content=data['text'],
    metadata={field: data[field] for field in ('title', 'links')},
)

print('LangChain Document Created:', document)

In [31]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitting the document into manageable chunks.
# The chunk size and overlap are passed to the constructor rather than being
# assigned to private attributes afterwards, so the splitter is configured
# through its public interface.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
documents = text_splitter.split_documents([document])

print(f'Number of Chunks Created: {len(documents)}')

Number of Chunks Created: 164


In [26]:
import re


def format_text(text):
    """Render ``text`` as plain-text sections with dash-underlined titles.

    Runs of newlines are collapsed to one, then the text is split wherever a
    line made only of ``-`` / ``=`` characters occurs. For each resulting
    section the first line becomes the title, underlined with as many dashes
    as the title has characters, and the remaining lines are passed through
    ``format_content``.

    Args:
        text: The raw text to format.

    Returns:
        The formatted sections joined together, with leading and trailing
        whitespace stripped.
    """
    # Collapse consecutive newlines so every section is a compact run of lines.
    collapsed = re.sub(r'\n+', '\n', text.strip())

    rendered = []
    # A line consisting solely of '-' or '=' characters separates two sections.
    for chunk in re.split(r'\n[-=]+\n', collapsed):
        heading, *body_lines = chunk.strip().split('\n')
        underline = '-' * len(heading)
        body = format_content('\n'.join(body_lines))
        rendered.append(f'{heading}\n{underline}\n{body}\n\n')

    return ''.join(rendered).strip()



def format_content(content):
    """Indent bullet points beneath the heading they follow.

    The previous implementation never reset the indentation and bumped it
    after almost every line (its heading test matched any line that merely
    ended in a letter or whitespace), so each line was indented deeper than
    the one before. The rules are now:

    * A blank (empty or whitespace-only) line is emitted as an empty line and
      leaves the current indentation untouched.
    * A bullet line (first non-blank character is ``-``) is prefixed with two
      spaces per indentation level; any leading whitespace it already had is
      kept, so nested bullets stay nested.
    * Any other line resets the indentation and is emitted unchanged. If it
      consists only of letters and whitespace it is treated as a heading, and
      the bullets that follow are indented one level.

    Args:
        content: Newline-separated text of one section body.

    Returns:
        The re-indented lines joined with newlines.
    """
    formatted_lines = []
    indent_level = 0

    for line in content.split('\n'):
        stripped = line.strip()

        if not stripped:
            # Blank separator: keep the line break, do not disturb nesting.
            formatted_lines.append('')
        elif stripped.startswith('-'):
            # Bullet point: indent it under the most recent heading.
            formatted_lines.append('  ' * indent_level + line)
        else:
            # Non-bullet line: reset indentation before emitting it.
            indent_level = 0
            formatted_lines.append(line)
            # A line made only of letters/whitespace is a heading; bullets
            # that follow it belong one level deeper.
            if re.fullmatch(r'[A-Za-z\s]+', stripped):
                indent_level = 1

    return '\n'.join(formatted_lines)



def format_content_for_markdown(content):
    """Indent the lines of ``content`` for use in the Markdown output.

    The indentation rules are exactly those of ``format_content``; this
    function used to carry a line-for-line copy of that loop and now simply
    forwards to it so the two cannot drift apart.

    Args:
        content: Newline-separated text of one section body.

    Returns:
        Whatever ``format_content`` returns for ``content``.
    """
    return format_content(content)



def format_to_markdown(text):
    """Render ``text`` as Markdown with one ``#`` heading per section.

    Runs of newlines are collapsed to one, then the text is split wherever a
    line made only of ``-`` / ``=`` characters occurs. For each resulting
    section the first line becomes a level-1 heading and the remaining lines
    are passed through ``format_content_for_markdown``.

    Args:
        text: The raw text to convert.

    Returns:
        The Markdown sections joined together, with leading and trailing
        whitespace stripped.
    """
    # Collapse consecutive newlines so every section is a compact run of lines.
    collapsed = re.sub(r'\n+', '\n', text.strip())

    pieces = []
    # A line consisting solely of '-' or '=' characters separates two sections.
    for chunk in re.split(r'\n[-=]+\n', collapsed):
        heading, *body_lines = chunk.strip().split('\n')
        pieces.append(f'# {heading}\n\n')
        pieces.append(format_content_for_markdown('\n'.join(body_lines)) + '\n\n')

    return ''.join(pieces).strip()

In [27]:
# Try format_content on the text of the first chunk; the result is kept in `t`.
t = format_content(documents[0].page_content)

In [1]:
from langchain.prompts import PromptTemplate

# Template with two placeholders, {dish} and {flavor}.
prompt_template = PromptTemplate.from_template(
    template="Write a delicious recipe for {dish} with a {flavor} twist."
)

# Substitute concrete values for both placeholders.
formatted_prompt = prompt_template.format(flavor="spicy", dish="pasta")

print(formatted_prompt)

Write a delicious recipe for pasta with a spicy twist.


In [None]:
import time

def nextSquare(delay=4):
    """Yield the perfect squares 1, 4, 9, ... without ever finishing.

    Args:
        delay: Seconds to pause each time the generator is resumed after a
            yield. Defaults to 4, the pause that used to be hard-coded; pass
            0 to iterate without waiting.

    Yields:
        The square of the next positive integer, starting with 1.
    """
    i = 1

    # An infinite loop to generate squares; the caller decides when to stop.
    while True:
        yield i * i
        # Execution resumes here when the next value is requested.
        i += 1
        time.sleep(delay)
 
 
# Driver for the generator above: print every square up to and including 100,
# then stop consuming.
for num in nextSquare():
    if num <= 100:
        print(num)
    else:
        break

In [2]:
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate

# Question/answer pairs that are rendered ahead of the real question.
examples = [
    {"question": question, "answer": answer}
    for question, answer in (
        ("What is the tallest mountain in the world?", "Mount Everest"),
        ("What is the largest ocean on Earth?", "Pacific Ocean"),
        ("In which year did the first airplane fly?", "1903"),
    )
]

# Layout of a single example: the question line followed by its answer.
example_prompt = PromptTemplate(
    template="Question: {question}\n{answer}",
    input_variables=["question", "answer"],
)

# The examples come first; the new question is appended via the suffix.
prompt_template = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    suffix="Question: {input}",
    input_variables=["input"],
)

print(prompt_template.format(input="What is the name of the famous clock tower in London?"))

Question: What is the tallest mountain in the world?
Mount Everest

Question: What is the largest ocean on Earth?
Pacific Ocean

Question: In which year did the first airplane fly?
1903

Question: What is the name of the famous clock tower in London?
