<table align="center">
  <td align="center"><a target="_blank" href="https://colab.research.google.com/github/ogirimah/generative-ai-workshop/blob/main/workshop.ipynb">
        <img src="https://i.ibb.co/2P3SLwK/colab.png"  style="padding-bottom:5px;" />Run in Google Colab</a></td>
  <td align="center"><a target="_blank" href="https://github.com/ogirimah/generative-ai-workshop/blob/main/workshop.ipynb">
        <img src="https://i.ibb.co/xfJbPmL/github.png"  height="70px" style="padding-bottom:5px;"  />View Source on GitHub</a></td>
</table>

**Dependencies**


*   OpenAI
*   Langchain
*   Tiktoken
*   Ruby
*   Wayback_machine_downloader
*   Unstructured

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -Uq \
  openai \
  langchain \
  tiktoken \
  unstructured

In [None]:
!sudo apt install -q ruby-full
!gem install -q wayback_machine_downloader

In [None]:
# Download the archived copy of ask.herts.ac.uk with wayback_machine_downloader.
# NOTE(review): the loader cell further down reads from ./websites, so the tool
# presumably writes its output there by default — confirm.
!wayback_machine_downloader https://ask.herts.ac.uk

In [None]:
# Copy a helper script from the mounted Drive into the working directory, make
# it executable, and run it.
# NOTE(review): the script's contents are not part of this notebook; presumably
# it removes unwanted files from the download — confirm before running.
!cp ./drive/MyDrive/delete_script delete_script
!chmod +x delete_script
!./delete_script

In [None]:
from langchain.document_loaders import DirectoryLoader

# Read the files under the local `websites` directory into document objects;
# later cells use each document's `page_content` and `metadata['source']`.
loader = DirectoryLoader('websites')
docs = loader.load()
# Number of documents that were loaded.
len(docs)

In [None]:
# Inspect the first loaded document object.
docs[0]

In [None]:
# Show the text content of the first document.
print(docs[0].page_content)

In [None]:
# Show the text content of the document at index 5.
print(docs[5].page_content)

In [None]:
# Rebuild the page URL from the local file path recorded in the metadata.
# Only the first 'websites/' (the download directory) is swapped for 'https://'
# (count=1), so a 'websites/' segment later in the page path is left untouched.
docs[6].metadata['source'].replace('websites/', 'https://', 1)

In [None]:
import tiktoken

# Look up the name of the tiktoken encoding used by the gpt-3.5-turbo model.
tiktoken.encoding_name_for_model('gpt-3.5-turbo')

In [None]:
# Load the cl100k_base encoding (presumably the name reported by the previous
# cell — confirm); token_len uses this object to count tokens.
tokenizer = tiktoken.get_encoding('cl100k_base')

In [None]:
# Create the token length function and test it
def token_len(text: str) -> int:
    """Return the number of tokens `text` encodes to with the module-level `tokenizer`.

    `disallowed_special=()` tells the encoder to treat special-token markers
    that happen to occur in the text as ordinary text instead of raising an
    error, which matters for arbitrary scraped page content.

    Args:
        text: The string to measure.

    Returns:
        The length of the token sequence produced by `tokenizer.encode`.
    """
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

# Adjacent string literals are concatenated with no separator, so the first
# literal ends with a space to keep "function" and "The" from fusing together.
token_len('This is just a sample text to test the token_len function '
          'The token length of this function is found below')

In [None]:
# Token count of every loaded page, in the same order as `docs`.
token_counts = [token_len(page.page_content) for page in docs]

In [None]:
# Summarise the per-document token counts: smallest, mean (truncated to an
# integer by int()) and largest.
print(f"""Min: {min(token_counts)}
Avg: {int(sum(token_counts) / len(token_counts))}
Max: {max(token_counts)}""")

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Lengths are measured with token_len, so chunk_size and chunk_overlap are
# expressed in tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,  # maximum chunk length, as measured by length_function
    chunk_overlap=20,  # number of tokens overlap between chunks
    length_function=token_len,
    separators=['\n\n', '\n', ' ', '']  # coarsest first: paragraph, line, word, character
)

In [None]:
# Split one sample document (index 6) and show how many chunks it yields.
chunks = text_splitter.split_text(docs[6].page_content)
len(chunks)

In [None]:
import hashlib
# Used to derive a short ID from a URL. A hash object is stateful: every
# update() call appends to the data already fed in, so one instance should be
# used for a single URL only.
hasher = hashlib.md5()

# Use the same sample document (index 6) that was split into `chunks`, so the
# preview records pair each chunk with the page it actually came from.
# count=1 swaps only the leading download directory for the URL scheme.
url = docs[6].metadata['source'].replace('websites/', 'https://', 1)
print(url)

# convert URL to unique ID (first 12 hex characters of its MD5 digest)
hasher.update(url.encode('utf-8'))
unique_id = hasher.hexdigest()[:12]
print(unique_id)

In [None]:
# Preview of the record layout: one dict per chunk of the sample document,
# with the chunk's position appended to the page ID.
data = [
    {'id': f'{unique_id}-{position}', 'text': piece, 'source': url}
    for position, piece in enumerate(chunks)
]
data

In [None]:
import hashlib

from tqdm.auto import tqdm

# One record per chunk of every loaded page.
documents = []

for doc in tqdm(docs):
    # Rebuild the page URL from the local path; count=1 swaps only the leading
    # download directory, leaving any later 'websites/' in the path alone.
    url = doc.metadata['source'].replace('websites/', 'https://', 1)
    # Hash each URL with a fresh md5 object. Reusing one hash object would make
    # every ID the digest of all URLs fed in so far, so an ID would depend on
    # processing order and earlier cells instead of on the URL alone.
    unique_id = hashlib.md5(url.encode('utf-8')).hexdigest()[:12]
    chunks = text_splitter.split_text(doc.page_content)
    # '<page id>-<chunk index>' keeps IDs unique within a page.
    documents.extend(
        {
            'id': f'{unique_id}-{i}',
            'text': chunk,
            'source': url
        }
        for i, chunk in enumerate(chunks)
    )

len(documents)

Save the chunked documents to a `.jsonl` (JSON Lines) file, one record per line.

In [None]:
import json

# Serialise each record as one JSON object per line (JSON Lines layout).
with open('ask_herts.jsonl', 'w') as f:
    f.writelines(json.dumps(record) + '\n' for record in documents)