# Imports

In [60]:
import os
from llama_index import GPTVectorStoreIndex, VectorStoreIndex
from llama_index import SimpleDirectoryReader
from llama_index import load_index_from_storage, StorageContext
from llama_index import download_loader
from llama_index.indices.keyword_table import SimpleKeywordTableIndex
from llama_index.query_engine import RetrieverQueryEngine
import openai
import pandas as pd
import numpy as np
import nltk
import sys
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Args

In [None]:
sys.path.append(r'C:\Users\sagar.panwar\AppData\Roaming\nltk_data')

In [None]:
os.environ['OPENAI_API_KEY']= "random"

# Paths

In [8]:
data_dir = r'C:\Users\sagar.panwar\Documents\projects\web_crawler\data'

# Create Embeddings

## load Data and clean data

In [5]:
df = pd.read_csv(os.path.join(data_dir, 'result_20230703-193332.csv'))

In [6]:
df.shape

(71, 4)

In [7]:
df = df.drop_duplicates(subset=["content"], keep="first")

In [8]:
df.shape

(61, 4)

In [9]:
df["content"].replace("", np.nan, inplace=True)
df

Unnamed: 0.1,Unnamed: 0,link,path,content
0,0,https://www.geminisolutions.com/clubs/technolo...,C:\Users\sagar.panwar\Documents\projects\web_c...,Technology ClubThe Technology Club of Gemini f...
1,1,https://www.geminisolutions.com/about-us,C:\Users\sagar.panwar\Documents\projects\web_c...,Building leaders in the digital domain and bey...
2,2,https://www.geminisolutions.com/services/data-...,C:\Users\sagar.panwar\Documents\projects\web_c...,Data EngineeringUnleash the power of your data...
3,3,https://www.geminisolutions.com/case-studies?p...,C:\Users\sagar.panwar\Documents\projects\web_c...,Our Case StudiesTake a dive into how our solut...
4,4,https://www.geminisolutions.com/contact,C:\Users\sagar.panwar\Documents\projects\web_c...,Our LocationsIndiaUSACanada000000All Locations...
...,...,...,...,...
64,64,https://www.geminisolutions.com/case-studies/d...,C:\Users\sagar.panwar\Documents\projects\web_c...,Case Study > Technology > Database Administrat...
65,65,https://www.geminisolutions.com/case-studies/c...,C:\Users\sagar.panwar\Documents\projects\web_c...,Case Study > Technology > Creating a Common Da...
66,66,https://www.geminisolutions.com/case-studies/p...,C:\Users\sagar.panwar\Documents\projects\web_c...,Case Study > Design > Portfolio Optimization A...
67,67,https://www.geminisolutions.com/case-studies/w...,C:\Users\sagar.panwar\Documents\projects\web_c...,Case Study > Technology > Web Based App to Ana...


In [10]:
df.dropna(subset=["content"], inplace=True)
# df

In [11]:
df.shape

(60, 4)

In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,link,path,content
0,0,https://www.geminisolutions.com/clubs/technolo...,C:\Users\sagar.panwar\Documents\projects\web_c...,Technology ClubThe Technology Club of Gemini f...
1,1,https://www.geminisolutions.com/about-us,C:\Users\sagar.panwar\Documents\projects\web_c...,Building leaders in the digital domain and bey...
2,2,https://www.geminisolutions.com/services/data-...,C:\Users\sagar.panwar\Documents\projects\web_c...,Data EngineeringUnleash the power of your data...
3,3,https://www.geminisolutions.com/case-studies?p...,C:\Users\sagar.panwar\Documents\projects\web_c...,Our Case StudiesTake a dive into how our solut...
4,4,https://www.geminisolutions.com/contact,C:\Users\sagar.panwar\Documents\projects\web_c...,Our LocationsIndiaUSACanada000000All Locations...


In [13]:
df.content =df.content.replace(to_replace='[!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~]',value=' ',regex=True)#remove punctuation except
df.content =df.content.replace(to_replace='  ',value='',regex=True) 

In [17]:
df.content[0], df.shape

('Technology ClubThe Technology Club of Gemini focuses on reviving dormant tech gears and keeping up with the latest happenings Technology ClubThe Technology Club serves as a hub for technology lovers to share innovative ideasstay updated on the latest industry developmentsand connect with fellow tech enthusiasts. From organizing exciting events to publishing informative monthly newslettersthe Tech Club is dedicated to fostering a vibrant community of tech-savvy individuals and encouraging inter-team interaction.Club HighlightsEventsMembersNewslettersHackerrankTech ExpoTech QuizCode-a-thonKon Banega Tech HeroHackathonClub StoriesTech QuizThe Technology Club organized an online technical quiz agnostic of a particular language which consisted of 30 questions to be done in 30 mins.Tech ExpoThe Tech Expo is a platform for the teams to showcase all the amazing work they do on a day-to day basis and increase transparency. The teams summarize their work and design posters which are put on dis

## Create Clean Keywords

In [44]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sagar.panwar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sagar.panwar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sagar.panwar\AppData\Roaming\nltk_data...


In [45]:
# stop_words

In [60]:
def create_clean_keywords(text):
    text =  text.lower()
    token = word_tokenize(text)
    
    token = [w for w in token if not w.lower() in stop_words]
    
    token = [lemmatizer.lemmatize(val) for val in token]
    
    return ",".join(token)

In [61]:
df['clean_keywords'] = df['content'].apply(create_clean_keywords)

In [62]:
df['clean_keywords']

0     technology,clubthe,technology,club,gemini,focu...
1     building,leader,digital,domain,beyondgemini,fa...
2     data,engineeringunleash,power,data,data,engine...
3     case,studiestake,dive,solution,helped,business...
4     locationsindiausacanada000000all,locationsview...
5     gemini,solution,gemini.solutions•,instagram,ph...
6     case,study,technology,client,major,focus,facil...
7     ussolutionstechnologiesgemucareerscontact,404o...
8     csr,clubleading,way,corporate,social,responsib...
9     quality,engineeringensure,quality,every,step,s...
10    overview,dbtabout,ussolutionstechnologiesgemuc...
11    databasesunlock,potential,data,expert,database...
12    case,study,technology,setting,aws,serverless,l...
13    gemini,wheel,clubthe,adventure,club,gemini,uni...
14    cloud,servicesexperience,unparalleled,scalabil...
15    blog,infrastructure,mou,signed,iit,roparmou,si...
16    blogscommunication,reloadedcommunication,reloa...
17    technical,infrastructurefrom,cloud,infrast

## Embeddings

In [63]:
## Create Vocabulary
vocabulary = set()
for doc in df.clean_keywords:
    vocabulary.update(doc.split(','))

In [65]:
# vocabulary

In [66]:
# df.clean_keywords.apply(lambda x: ",".join(x))

In [67]:
# Intializating the tfIdf model
tfidf = TfidfVectorizer(vocabulary=vocabulary)
# Fit the TfIdf model
tfidf.fit(df.clean_keywords)

In [83]:
# Transform the TfIdf model
df['embedding'] = [val for val in tfidf.transform(df.clean_keywords).toarray()]

In [63]:
df.embedding

NameError: name 'df' is not defined

In [86]:
def gen_vector_T(tokens):
    Q = np.zeros((len(vocabulary)))    
    x= tfidf.transform(tokens)
    #print(tokens[0].split(','))
    for token in tokens[0].split(','):
        #print(token)
        try:
            ind = vocabulary.index(token)
            Q[ind]  = x[0, tfidf.vocabulary_[token]]
        except:
            pass
    return Q

In [87]:
gen_vector_T(['my', 'name'])

array([0., 0., 0., ..., 0., 0., 0.])

In [89]:
df.to_csv(os.path.join(data_dir, r'chatbot_data\embedding\tfidf\embedding.csv'))

In [92]:
import json

In [94]:
# vocabulary
with open(os.path.join(data_dir, r'chatbot_data\embedding\tfidf\vocabulary.txt'), 'w') as f:
    f.write(str(vocabulary))

# ChatBot

## Data Loading

In [61]:
# load documents
StringIterableReader = download_loader("StringIterableReader")

loader = StringIterableReader()

In [67]:
df = pd.read_csv(os.path.join(data_dir, r'chatbot_data\embedding\tfidf\embedding.csv'))

In [68]:
documents = loader.load_data(texts=df.content.tolist())

In [70]:
len(documents)

60

## Indexing

In [89]:
# Create Index
index = SimpleKeywordTableIndex(documents, max_keywords_per_chunk=20)

In [90]:
# GPTVectorStoreIndex(documents)

In [92]:
# index.save_to_disk("7_custom_opt.json")

In [79]:
index.index_struct

<llama_index.indices.keyword_table.simple_base.SimpleKeywordTableIndex at 0x1c7de7cf1f0>

In [80]:
query_engine = index.as_query_engine()

In [81]:
query_engine.query("data science", response_mode=='no_text')

NameError: name 'response_mode' is not defined

In [82]:
retriever = index.as_retriever()

In [83]:
query_engine = RetrieverQueryEngine.from_args(retriever, response_mode="no_text")

In [84]:
index.index_struct

KeywordTable(index_id='cc900d27-a0e0-4c3f-ad44-7c9aa8b22d53', summary=None, table={'technical': {'53dc9cb0-44be-42a2-972f-947b568e7f59', 'dca4a08b-a04b-44e1-8245-b7992101dc97'}, 'organized': {'dca4a08b-a04b-44e1-8245-b7992101dc97'}, 'topics': {'dca4a08b-a04b-44e1-8245-b7992101dc97'}, 'also': {'4ca868c1-4a62-4447-b804-ccbca69c9681', '539be537-04bd-4d96-b8ca-39af1f69832a', 'dca4a08b-a04b-44e1-8245-b7992101dc97', 'f30003bd-97ea-4d22-8433-88e8364bb474', '8333dba7-c5f1-4320-a5c0-534f7152fe67', 'c4f7e769-c302-4380-8ef9-40aded9c8269', '83ad0c71-6ad3-4752-b7d9-60fc54f68827'}, 'activities': {'c5c2a9c2-2e35-4184-8ef3-f583daa06aac', '1606f17b-29c1-42a1-bcda-2da0bd9d7f05', 'c8b8babf-e1dc-43dc-8f55-882951d1f462', 'dca4a08b-a04b-44e1-8245-b7992101dc97', 'e2c01673-01e4-456d-a9b9-c0e6193a58c1'}, 'improve': {'aa2e4a91-547b-4215-8b62-98a57f154589', '2dfb8f90-e4b6-4678-82fb-2809c7a1c60a', 'dca4a08b-a04b-44e1-8245-b7992101dc97', 'f30003bd-97ea-4d22-8433-88e8364bb474'}, 'skillsand': {'dca4a08b-a04b-44e1-82

In [28]:
# query_engine.query("What did the author do growing up?")

AuthenticationError: Incorrect API key provided: random. You can find your API key at https://platform.openai.com/account/api-keys.