In [None]:
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os

# Browser-style User-Agent header. get_hyperlinks passes this dict to
# urllib.request.Request so the page request carries it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
    """Download ``url`` and return the raw ``href`` values of its anchor tags.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The list of ``href`` values collected by ``HyperlinkParser``. An empty
        list is returned when the request fails or when the response is not
        ``text/html``; request errors are printed, not raised.
    """
    try:
        # Send the module-level headers so the request carries a User-Agent.
        req = urllib.request.Request(url, headers=headers)
        # The timeout keeps one unresponsive server from stalling the caller.
        with urllib.request.urlopen(req, timeout=30) as response:
            # A response without a Content-Type header is treated as "not
            # HTML" instead of failing on None.startswith().
            content_type = response.info().get('Content-Type') or ""
            if not content_type.lower().startswith("text/html"):
                return []
            raw = response.read()
            # Prefer the charset the server declared; fall back to UTF-8.
            charset = response.info().get_content_charset() or "utf-8"
    except Exception as e:
        print(e)
        return []

    # errors="replace" keeps the links of a page that contains a few
    # undecodable bytes instead of discarding the whole page.
    try:
        html = raw.decode(charset, errors="replace")
    except LookupError:
        # The server declared a charset name Python does not know.
        html = raw.decode("utf-8", errors="replace")

    parser = HyperlinkParser()
    parser.feed(html)
    return parser.hyperlinks


# Regex pattern for absolute links: "http", zero or more "s", "://", then at
# least one more character. get_domain_hyperlinks applies it with re.search.
HTTP_URL_PATTERN = r'^http[s]*://.+'

# Domain of the site; the post-processing code uses it to locate the
# text/<domain>/ directory.
domain = "ci.hartford.wi.us"
# Full URL including the specific path to start crawling from (passed to crawl()).
full_url = "https://ci.hartford.wi.us/440/Experience-Downtown"


# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    """HTML parser that collects the ``href`` value of every ``<a>`` tag.

    After ``feed()`` has been called, ``hyperlinks`` holds the collected
    values in the order the tags were encountered.
    """

    def __init__(self):
        super().__init__()
        # href strings gathered so far.
        self.hyperlinks = []

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` of an anchor start tag.

        Args:
            tag: Tag name as passed in by ``HTMLParser``.
            attrs: List of ``(name, value)`` attribute pairs for the tag.
        """
        if tag != "a":
            return

        href = dict(attrs).get("href")
        # A valueless attribute (``<a href>``) is reported with value None;
        # storing it would hand a non-string to every consumer of the list.
        if href is not None:
            self.hyperlinks.append(href)


# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    """Return the links found on ``url`` that belong to ``local_domain``.

    Absolute links (those matching ``HTTP_URL_PATTERN``) are kept only when
    their host equals ``local_domain``. Every other link is treated as
    relative and rebuilt as ``https://<local_domain>/<link>``. Fragment-only
    links and ``mailto:``, ``tel:`` and ``javascript:`` links are skipped. A
    single trailing slash is removed from each kept link.

    Args:
        local_domain: Host name (``netloc``) the links must belong to.
        url: Page whose hyperlinks are inspected.

    Returns:
        A list of unique cleaned links, in no particular order.
    """
    clean_links = set()

    for link in set(get_hyperlinks(url)):
        # An anchor without a usable href value has nothing to crawl.
        if not isinstance(link, str):
            continue

        if re.search(HTTP_URL_PATTERN, link):
            # Absolute link: keep it only when it stays on the same host.
            # This must be a plain if/else. Attaching the relative-link
            # branch to a ``try`` as its ``else`` clause makes it run for
            # absolute links as well and prefixes them with the domain.
            try:
                netloc = urlparse(link).netloc
            except ValueError as e:
                # urlparse rejects some malformed URLs.
                print(e)
                continue
            if netloc != local_domain:
                continue
            clean_link = link
        else:
            # Links that can never resolve to a page on the site.
            if link.lower().startswith(("#", "mailto:", "tel:", "javascript:")):
                continue
            # Drop one leading slash so the join below does not double it.
            if link.startswith("/"):
                link = link[1:]
            clean_link = "https://" + local_domain + "/" + link

        if clean_link.endswith("/"):
            clean_link = clean_link[:-1]
        clean_links.add(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(clean_links)


def crawl(url):
    """Crawl the pages reachable from ``url`` on the same host and save their text.

    The text of each visited page is written to
    ``text/<host>/<url without scheme, "/" replaced by "_">.txt``. The
    ``text/<host>`` and ``processed`` directories are created if missing.
    Errors while fetching, saving or extracting links are printed and the
    crawl continues with the next URL.

    Args:
        url: Absolute URL where the crawl starts; its host limits the crawl.
    """
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # URLs still to visit, and every URL ever queued (prevents revisits).
    queue = deque([url])
    seen = {url}

    # Output directories: page text, and the CSV files written later.
    os.makedirs(os.path.join("text", local_domain), exist_ok=True)
    os.makedirs("processed", exist_ok=True)

    while queue:
        # pop() takes from the right end, so the most recently queued URL
        # is visited first.
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        try:
            # Fetch before opening the output file so a failed request does
            # not leave an empty .txt behind. Send the same headers as
            # get_hyperlinks, and bound the wait with a timeout.
            response = requests.get(url, headers=headers, timeout=30)

            # Get the text but remove the tags
            text = BeautifulSoup(response.text, "html.parser").get_text()

            if "You need to enable JavaScript to run this app." in text:
                # The page body is only the JavaScript placeholder; skip
                # saving it, but still follow the page's links below.
                print("Unable to parse page " + url + " due to JavaScript being required")
            else:
                # Strip the scheme whatever its length ("http://" or
                # "https://") so the file name always starts with the host.
                page_name = url.split("://", 1)[-1].replace("/", "_")
                with open('text/' + local_domain + '/' + page_name + ".txt", "w") as f:
                    f.write(text)
        except Exception as e:
            print(e)

        try:
            # Queue every same-domain link that has not been seen yet.
            for link in get_domain_hyperlinks(local_domain, url):
                if link not in seen:
                    queue.append(link)
                    seen.add(link)
        except Exception as e:
            print(e)

# Run the crawl, starting from the configured full_url.
crawl(full_url)

In [None]:
def remove_newlines(serie):
    """Replace newlines with spaces and collapse short runs of spaces.

    Both real newline characters and the literal two-character sequence
    backslash + ``n`` are replaced by a single space. Double spaces are then
    collapsed in two passes, which reduces runs of up to four spaces to one.

    Args:
        serie: pandas Series of strings.

    Returns:
        A new Series with the replacements applied; the input is not modified.
    """
    # regex=False is passed explicitly: the default of ``regex`` differs
    # between pandas versions, and as a regex '\\n' would mean "newline"
    # rather than a literal backslash followed by "n".
    serie = serie.str.replace('\n', ' ', regex=False)
    serie = serie.str.replace('\\n', ' ', regex=False)
    serie = serie.str.replace('  ', ' ', regex=False)
    serie = serie.str.replace('  ', ' ', regex=False)
    return serie

In [None]:
import pandas as pd
import os

# Collect one (title, text) pair per crawled page.
texts = []

domain_dir = os.path.join("text", domain)
abs_domain_dir = os.path.abspath(domain_dir)

if os.path.exists(abs_domain_dir):
    # Get all the text files in the text directory
    for file in os.listdir(domain_dir):

        try:
            # Open the file and read the text
            with open(os.path.join(domain_dir, file), "r") as f:
                text = f.read()
        except Exception as e:
            # Report the file name. ``text`` may not be bound yet when the
            # very first read fails, and it is not the file name anyway.
            print(f"Exception occurred during reading file '{file}': {e}")
            continue

        # Build a readable title from the file name: drop the extension and
        # the leading "<domain>_" prefix (whatever the domain's length), then
        # replace -, _ with spaces and remove "#update".
        title = os.path.splitext(file)[0]
        if title.startswith(domain):
            title = title[len(domain):].lstrip('_')
        title = title.replace('-', ' ').replace('_', ' ').replace('#update', '')

        texts.append((title, text))
else:
    print(f"Directory '{abs_domain_dir}' does not exist.")

# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns=['fname', 'text'])

# Set the text column to be the title followed by the raw text with the
# newlines removed
df['text'] = df.fname + ". " + remove_newlines(df.text)

# Make sure the output directory exists even when this cell runs on its own.
os.makedirs('processed', exist_ok=True)
df.to_csv('processed/scraped.csv')
df.head()

In [None]:
import tiktoken

# Tokenizer: the cl100k_base encoding, which is designed to work with the
# ada-002 model.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Reload the scraped pages and give the two columns their working names.
df = pd.read_csv('processed/scraped.csv', index_col=0)
df.columns = ['title', 'text']

# Store the token count of every page in a new column.
df['n_tokens'] = df['text'].apply(
    lambda page_text: len(tokenizer.encode(page_text))
)

# Histogram of the number of tokens per row.
df['n_tokens'].hist()

In [None]:
max_tokens = 500


def _join_sentences(sentences):
    """Join sentences with ". " and make sure the result ends with one period."""
    joined = ". ".join(sentences)
    return joined if joined.endswith(".") else joined + "."


# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens=max_tokens, encode=None):
    """Split ``text`` into chunks that each stay within ``max_tokens`` tokens.

    The text is split into sentences on ``". "`` and the sentences are packed
    greedily into chunks. A sentence that on its own exceeds ``max_tokens``
    is skipped.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk.
        encode: Optional callable that maps a string to a sequence of tokens.
            Defaults to ``tokenizer.encode`` of the module-level tokenizer.

    Returns:
        A list of chunk strings, each ending with a period. Empty or
        non-string input yields an empty list.
    """
    if encode is None:
        encode = tokenizer.encode

    # Nothing to split; avoids returning a lone "." for empty input.
    if not isinstance(text, str) or not text.strip():
        return []

    # Split the text into sentences
    sentences = text.split('. ')

    # Token count per sentence; the leading space stands in for the
    # separator that precedes the sentence inside a chunk.
    n_tokens = [len(encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    for sentence, token in zip(sentences, n_tokens):

        # The sentence does not fit into the current chunk: close the chunk
        # and start a new one. An empty chunk is not emitted, otherwise an
        # over-long sentence would produce a chunk consisting of just ".".
        if tokens_so_far + token > max_tokens:
            if chunk:
                chunks.append(_join_sentences(chunk))
            chunk = []
            tokens_so_far = 0

        # A sentence larger than the whole budget cannot be placed; skip it.
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk; the extra 1 accounts for
        # the separator between sentences.
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Add the last chunk to the list of chunks
    if chunk:
        chunks.append(_join_sentences(chunk))

    return chunks
    

# Texts that fit the token budget, either as-is or after splitting.
shortened = []

# Loop through the dataframe
for _index, row in df.iterrows():

    # Skip rows without text. Missing values loaded from CSV are NaN, not
    # None, so pd.isna() is needed to catch them.
    if pd.isna(row['text']):
        continue

    # If the number of tokens is greater than the max number of tokens,
    # split the text into chunks
    if row['n_tokens'] > max_tokens:
        shortened += split_into_many(row['text'])

    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append(row['text'])

In [None]:
# One row per shortened text, with its token count, plus a histogram of
# the counts.
df = pd.DataFrame(shortened, columns=['text'])
df['n_tokens'] = df['text'].apply(
    lambda chunk_text: len(tokenizer.encode(chunk_text))
)
df['n_tokens'].hist()

In [None]:
import openai
# Take the API key from the OPENAI_API_KEY environment variable.
api_key = os.getenv('OPENAI_API_KEY')
openai.api_key = api_key

# Request one embedding per row (one API call each) and store the vector
# found under ['data'][0]['embedding'] of the response.
df['embeddings'] = df['text'].apply(
    lambda chunk_text: openai.Embedding.create(
        input=chunk_text,
        engine='text-embedding-ada-002',
    )['data'][0]['embedding']
)

# Save the texts together with their embeddings.
df.to_csv('processed/embeddings.csv')
df.head()

In [None]:
import pandas as pd
import numpy as np
import pickle
from ast import literal_eval
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity

# Reload the saved embeddings. Each cell of the embeddings column is parsed
# with literal_eval and converted to a numpy array.
df = pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(
    lambda raw: np.array(literal_eval(raw))
)

df.head()

# SAVE: pickle the dataframe to df.pkl.
with open('df.pkl', 'wb') as f:
    pickle.dump(df, f)


In [2]:
def get_random_port(min_port=1024, max_port=49151):
    """Return a random port in ``[min_port, max_port]`` that could be bound.

    A candidate port is drawn at random and bound on a temporary TCP socket.
    The first candidate that binds successfully is returned; the probe socket
    is closed before returning. Candidates that raise ``OSError`` are
    discarded and another one is drawn, so the function keeps trying until
    one succeeds.
    """
    while True:
        try:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
                candidate = random.randint(min_port, max_port)
                probe.bind(('', candidate))
                # Confirm the socket really ended up on the requested port.
                if probe.getsockname()[1] != candidate:
                    continue
                return candidate
        except OSError:
            # Binding (or creating the socket) failed; draw again.
            pass

In [5]:
import streamlit as st
import subprocess
import tempfile
import os
import random
import socket

# Source code of a minimal Streamlit app, written to a temporary file below.
streamlit_app = """
import streamlit as st

def main():
    st.title("Simple Streamlit App")

    # Display a text input box
    user_input = st.text_input("Enter something:")

    # Display a button
    if st.button("Submit"):
        # Display the input text
        st.write("You entered:", user_input)

if __name__ == "__main__":
    main()
"""

# Temporary file to hold the Streamlit app. delete=False keeps the file on
# disk after the ``with`` block so the Streamlit process can read it.
with tempfile.NamedTemporaryFile('w', delete=False, suffix='.py') as temp_file:
    temp_file.write(streamlit_app)
    temp_file_name = temp_file.name

random_port = get_random_port()

# Streamlit command as an argument list. Passing a list (without a shell)
# keeps a temp-file path that contains spaces or shell metacharacters intact.
streamlit_command = [
    "streamlit", "run", temp_file_name,
    "--server.port", str(random_port),
    "--server.address", "0.0.0.0",
]

# Print the URL for the user to visit
print(f"Visit the Streamlit app at http://localhost:{random_port}")

# Start Streamlit in the background; the process handle is kept in ``process``.
process = subprocess.Popen(streamlit_command)



In [14]:
import streamlit as st
import pandas as pd
import pickle
import openai
from openai.embeddings_utils import distances_from_embeddings
from openai import OpenAI
import logging

random_port = get_random_port()
print(f"Random port: {random_port}")

# Argument list used to start Streamlit, and the equivalent command line
# for display.
streamlit_args = [
    "streamlit", "run", "web_page.py",
    "--server.port", str(random_port),
    "--server.address", "0.0.0.0",
]
streamlit_command = " ".join(streamlit_args)
print(streamlit_command)
print(f"Visit the site at http://localhost:{random_port}")

try:
    # Start the Streamlit app. stderr is merged into stdout so one loop can
    # read everything. Alternating blocking readline() calls on two separate
    # pipes can stall while one stream is silent, and can deadlock once the
    # other pipe's buffer fills up.
    process = subprocess.Popen(
        streamlit_args,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )

    # Echo the app's output line by line until it closes its output stream.
    for line in process.stdout:
        print(line.strip())

    # Collect the exit status once the output is exhausted.
    process.wait()
except Exception as e:
    print(e)


In [11]:
from IPython.display import IFrame

# Embed the page served on random_port at localhost in a 1000x600 IFrame.
IFrame(src=f"http://localhost:{random_port}", width=1000, height=600)
