In [None]:
!pip install requests
!pip install beautifulsoup4
!pip install lxml


In [None]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
from urllib.parse import urljoin, urlparse
from collections import deque
import re

def is_valid_url(url):
    """Return True when ``url`` carries both a scheme and a network location.

    Relative paths, bare host names and scheme-only URIs such as ``mailto:``
    all yield False.
    """
    parts = urlparse(url)
    if not parts.scheme:
        return False
    return bool(parts.netloc)

def get_all_website_links(url, timeout=10):
    """Collect the same-site links found in the ``<a>`` tags of a page.

    Args:
        url: Absolute URL of the page to download and scan.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the caller forever.

    Returns:
        A set of absolute ``scheme://host/path`` strings (query string and
        fragment removed) whose host is the host of ``url`` or one of its
        subdomains.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    links = set()
    domain_name = urlparse(url).netloc.lower()
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            continue
        parsed_href = urlparse(urljoin(url, href))
        # Only web links are crawlable. Checking the scheme before rebuilding
        # the URL keeps "mailto:"/"tel:"/"javascript:" targets from being
        # turned into fake "scheme://..." URLs that look valid.
        if parsed_href.scheme not in ("http", "https"):
            continue
        # Compare hosts, not substrings of the whole URL: a substring test
        # accepts foreign pages whose path merely mentions our domain.
        host = parsed_href.netloc.lower()
        if host != domain_name and not host.endswith("." + domain_name):
            continue
        # Drop params, query and fragment so one page is not queued many times.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if is_valid_url(href):
            links.add(href)
    return links

def extract_text_from_url(url, timeout=10):
    """Download a page and return its human-visible text as one string.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up, so that one
            unresponsive host cannot hang the caller indefinitely.

    Returns:
        The stripped text nodes accepted by ``tag_visible``, joined with
        single spaces.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    # `find_all(string=True)` is the current spelling of the deprecated
    # `findAll(text=True)`; it yields every text node in the document.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return " ".join(t.strip() for t in visible_texts)

def tag_visible(element):
    """Tell whether a parsed text node should count as visible page text.

    A node is rejected when its parent tag is one of the non-rendered
    containers listed below, or when the node itself is an HTML comment.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    parent_is_hidden = element.parent.name in hidden_parents
    return not (parent_is_hidden or isinstance(element, Comment))

def crawl_and_extract(starting_url, max_links=100, output_path="website_words.txt"):
    """Breadth-first crawl from ``starting_url``, saving each page's text.

    Every visited page contributes one line (its visible text) to the output
    file. Pages that fail to download or parse are reported with ``print``
    and skipped; they still count towards ``max_links``.

    Args:
        starting_url: URL the crawl begins at.
        max_links: Maximum number of URLs to attempt.
        output_path: File the extracted text is written to (overwritten).
            Defaults to the file name the later cells read.

    Returns:
        None.
    """
    visited = set()
    # URLs already placed on the queue. Checking this at enqueue time keeps
    # the queue free of duplicates without changing the visiting order.
    enqueued = {starting_url}
    queue = deque([starting_url])

    # The context manager closes the file even if the crawl is interrupted
    # (e.g. KeyboardInterrupt), which `except Exception` does not catch.
    with open(output_path, "w", encoding="utf-8") as word_file:
        while queue and len(visited) < max_links:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            try:
                text = extract_text_from_url(url)
                word_file.write(text + "\n")
                for link in get_all_website_links(url):
                    if link not in enqueued:
                        enqueued.add(link)
                        queue.append(link)
            except Exception as e:
                # Best-effort crawl: one bad page must not abort the whole run.
                print(f"Failed to process {url}: {e}")

# Run the crawl against the target site, attempting at most 100 URLs.
crawl_and_extract(
    starting_url="https://www.joshtechnologygroup.com/",
    max_links=100,
)


In [None]:
!pip install nltk
import nltk
nltk.download('punkt')

In [None]:
import pandas as pd

# Read back the text file produced by crawl_and_extract.
with open('website_words.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

# Whitespace tokenisation: punctuation stays attached to the neighbouring word.
tokens = text_data.split()

# One token per row; `tokens_df` is read again by the cleaning cell below.
tokens_df = pd.DataFrame(tokens, columns=['Tokens'])

# Preview the first ten tokens as the cell output.
tokens_df.head(10)


In [None]:
import pandas as pd
import string
from nltk import ngrams

# Rebind `tokens` to the 'Tokens' column of the DataFrame built in the previous cell.
tokens = tokens_df['Tokens']

def clean_token(token):
    """Normalise one token: lower-case it and trim surrounding whitespace."""
    return token.lower().strip()

# Tokens come from str.split(), so punctuation is still glued to the words
# ("services.", "(the"). Strip it from both ends and drop tokens that were
# nothing but punctuation. The previous `token not in string.punctuation`
# check was a substring test against the punctuation string, so it missed runs
# such as "..." and never cleaned words with attached punctuation.
# Punctuation inside a token (e.g. "node.js", "don't") is kept.
cleaned_tokens = [
    cleaned
    for cleaned in (clean_token(token).strip(string.punctuation) for token in tokens)
    if cleaned
]

# English stopwords that are removed from the token stream.
stopwords = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before',
    'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
    'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
    'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
}

# Keep only the tokens that are not stopwords, preserving their order.
cleaned_tokens = list(filter(lambda tok: tok not in stopwords, cleaned_tokens))

# Size of the n-grams to build (2 = bigrams).
n = 2
ngram_list = [*ngrams(cleaned_tokens, n)]

# One cleaned token per line.
with open('cleaned_website_words.txt', 'w', encoding='utf-8') as f:
    f.writelines(token + "\n" for token in cleaned_tokens)

# One n-gram per line, its tokens separated by single spaces.
with open('ngrams.txt', 'w', encoding='utf-8') as f:
    f.writelines(' '.join(ngram) + "\n" for ngram in ngram_list)


In [None]:
!pip install langchain

In [None]:
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
import getpass
import sys
import os

# Never keep an API key in the notebook source. Use the key already present in
# the environment, and only ask for one (without echoing it) when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# ngrams.txt was written as UTF-8 above; read it back with the same encoding
# instead of the platform default.
data = TextLoader("ngrams.txt", encoding="utf-8")

# Inside a Jupyter kernel sys.argv[1] is a launcher option such as "-f", not a
# question. Accept a command-line argument only when it does not look like an
# option (script usage); otherwise ask interactively.
cli_args = sys.argv[1:]
if cli_args and not cli_args[0].startswith("-"):
    prompt = cli_args[0]
else:
    prompt = input("Question about the crawled site: ")

# Build the vector index over the n-gram file and answer the question from it.
index = VectorstoreIndexCreator().from_loaders([data])
response = index.query(prompt)
print(response)