# Ollama server

In [2]:
%pip -q install llama-index llama-index-readers-web llama-index-llms-ollama llama-index-embeddings-huggingface llama-index-readers-file unstructured

In [3]:
! curl https://ollama.ai/install.sh | sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0>>> Downloading ollama...
100 10406    0 10406    0     0  40309      0 --:--:-- --:--:-- --:--:-- 40333
############################################################################################# 100.0%
>>> Installing ollama to /usr/local/bin...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [4]:
import os
import threading
import subprocess
import requests
import json

def ollama():
    """Start the Ollama server as a background child process.

    Writes ``OLLAMA_HOST`` and ``OLLAMA_ORIGINS`` into this process's
    environment, then spawns ``ollama serve`` without waiting for it to exit.
    """
    server_settings = {
        'OLLAMA_HOST': '0.0.0.0:11434',
        'OLLAMA_ORIGINS': '*',
    }
    # Set the variables before spawning: the child is started with the
    # current process environment.
    os.environ.update(server_settings)
    subprocess.Popen(["ollama", "serve"])


In [5]:
# Run ollama() on a separate thread; the thread is started here and never joined in this cell.
ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()

In [7]:
!ollama pull llama3

[?25lpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest 
pulling 6a0746a1ec1a...   0% ▕▏    0 B/4.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 6a0746a1ec1a...   0% ▕▏    0 B/4.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 6a0746a1ec1a...   0% ▕▏    0 B/4.7 GB     

In [8]:
!ollama list

NAME         	ID          	SIZE  	MODIFIED      
llama3:latest	365c0bd3c000	4.7 GB	4 seconds ago	


# Libraries

In [9]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.readers.web import SimpleWebPageReader
from llama_index.readers.file import (
    DocxReader,
    HWPReader,
    PDFReader,
    EpubReader,
    FlatReader,
    HTMLTagReader,
    ImageCaptionReader,
    ImageReader,
    ImageVisionLLMReader,
    IPYNBReader,
    MarkdownReader,
    MboxReader,
    PptxReader,
    PandasCSVReader,
    VideoAudioReader,
    UnstructuredReader,
    PyMuPDFReader,
    ImageTabularChartReader,
    XMLReader,
    PagedCSVReader,
    CSVReader,
    RTFReader,
)
from llama_index.core import (
    load_index_from_storage,
    StorageContext,
)


# Data loading and storing indexes

# 1. Tabular data from webpage




*   Detecting the tables on each page
*   Saving each table as a CSV file
*   Mapping each CSV file to its source URL
*   Indexing the CSV data



In [51]:
# CSV file name -> URL of the page the table was scraped from; filled in by table_to_csv_converter.
file_to_url_mapping={}

In [52]:
def table_to_csv_converter(url,file_to_url_mapping):
  """Save every HTML table found at ``url`` as a CSV file and record its origin.

  Each ``<table>`` on the page is written to ``directory_csv/<caption>.csv`` and
  ``file_to_url_mapping['<caption>.csv']`` is set to ``url``. Tables that share
  a caption overwrite one another, both on disk and in the mapping.

  Args:
    url: Address of the page to download.
    file_to_url_mapping: Dict updated in place with ``{csv file name: url}``.

  Raises:
    requests.HTTPError: If the server answers with an error status.
    requests.RequestException: On connection problems or a timeout.
  """
  from bs4 import BeautifulSoup
  import requests
  import pandas as pd
  import os

  # Bound the wait and fail loudly on HTTP errors instead of scraping an
  # error page for tables.
  page = requests.get(url, timeout=30)
  page.raise_for_status()
  # NOTE(review): 'html' names a markup type, so BeautifulSoup chooses among
  # the installed parsers; name one (e.g. 'html.parser') if results must be
  # identical across environments - confirm.
  soup = BeautifulSoup(page.text, 'html')

  # Detect all tables on the webpage
  all_tables = soup.find_all('table')
  print(f"Total number of tables: {len(all_tables)}")

  # Create a directory to save the CSV files
  directory = 'directory_csv'
  os.makedirs(directory, exist_ok=True)

  for i, table in enumerate(all_tables):
    # A table without a <caption> (or with an empty one) gets a positional name.
    caption_tag = table.find('caption')
    table_caption = caption_tag.text.strip() if caption_tag is not None else ''
    # The caption becomes a file name, so path separators must not survive.
    table_caption = table_caption.replace('/', '_').replace('\\', '_')
    if not table_caption:
      table_caption = f'table_{i + 1}'

    world_table_titles = [title.text.strip() for title in table.find_all('th')]

    # Keep only rows that actually hold data cells; header-only rows have no <td>.
    rows = []
    for row in table.find_all('tr'):
      individual_row_data = [data.text.strip() for data in row.find_all('td')]
      if individual_row_data:
        rows.append(individual_row_data)

    # Ragged tables: pad short rows with empty strings and give surplus
    # columns generated names so every row fits the frame.
    width = max([len(world_table_titles)] + [len(cells) for cells in rows])
    columns = world_table_titles + [
        f'column_{n + 1}' for n in range(len(world_table_titles), width)
    ]
    padded_rows = [cells + [''] * (width - len(cells)) for cells in rows]

    # Build the frame in one step rather than appending row by row.
    df = pd.DataFrame(padded_rows, columns=columns)

    # Save the DataFrame as a CSV file with the table caption as the file name
    csv_name = f'{table_caption}.csv'
    csv_file = os.path.join(directory, csv_name)
    df.to_csv(csv_file, index=False)
    print(f"CSV file saved: {csv_file}")
    file_to_url_mapping[csv_name] = url

In [53]:
# Pages passed to table_to_csv_converter so their tables are exported as CSV files.
url_with_table = ['https://maharashtra.nic.in/directory/','https://maharashtra.nic.in/rti/','https://maharashtra.nic.in/district-centres/']

In [54]:
# Export the tables of every listed page; file_to_url_mapping accumulates entries across pages.
for url in url_with_table:
  table_to_csv_converter(url,file_to_url_mapping)

Total number of tables: 37
CSV file saved: directory_csv/SIO.csv
CSV file saved: directory_csv/Maharashtra State Centre, Mumbai.csv
CSV file saved: directory_csv/Ahmadnagar.csv
CSV file saved: directory_csv/Akola.csv
CSV file saved: directory_csv/Amravati.csv
CSV file saved: directory_csv/Aurangabad.csv
CSV file saved: directory_csv/Beed.csv
CSV file saved: directory_csv/Bhandara.csv
CSV file saved: directory_csv/Buldana.csv
CSV file saved: directory_csv/Chandrapur.csv
CSV file saved: directory_csv/Dhule.csv
CSV file saved: directory_csv/Gadchiroli.csv
CSV file saved: directory_csv/Gondia.csv
CSV file saved: directory_csv/Hingoli.csv
CSV file saved: directory_csv/Jalgaon.csv
CSV file saved: directory_csv/Jalna.csv
CSV file saved: directory_csv/Kolhapur.csv
CSV file saved: directory_csv/Mumbai.csv
CSV file saved: directory_csv/Mumbai Suburban (Bandra).csv
CSV file saved: directory_csv/Nagpur.csv
CSV file saved: directory_csv/Nanded.csv
CSV file saved: directory_csv/Nandurbar.csv
CSV fil

In [55]:
file_to_url_mapping

{'SIO.csv': 'https://maharashtra.nic.in/directory/',
 'Maharashtra State Centre, Mumbai.csv': 'https://maharashtra.nic.in/directory/',
 'Ahmadnagar.csv': 'https://maharashtra.nic.in/directory/',
 'Akola.csv': 'https://maharashtra.nic.in/directory/',
 'Amravati.csv': 'https://maharashtra.nic.in/directory/',
 'Aurangabad.csv': 'https://maharashtra.nic.in/directory/',
 'Beed.csv': 'https://maharashtra.nic.in/directory/',
 'Bhandara.csv': 'https://maharashtra.nic.in/directory/',
 'Buldana.csv': 'https://maharashtra.nic.in/directory/',
 'Chandrapur.csv': 'https://maharashtra.nic.in/directory/',
 'Dhule.csv': 'https://maharashtra.nic.in/directory/',
 'Gadchiroli.csv': 'https://maharashtra.nic.in/directory/',
 'Gondia.csv': 'https://maharashtra.nic.in/directory/',
 'Hingoli.csv': 'https://maharashtra.nic.in/directory/',
 'Jalgaon.csv': 'https://maharashtra.nic.in/directory/',
 'Jalna.csv': 'https://maharashtra.nic.in/directory/',
 'Kolhapur.csv': 'https://maharashtra.nic.in/directory/',
 'Mum

Indexing of the CSV file

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.readers.web import SimpleWebPageReader
import os
from llama_index.core import (
    load_index_from_storage,
    StorageContext,
)

# Build a vector index over the CSV files in ./directory_csv and persist it to ./Directory_index.

# Set the LLM and the embedding model on llama-index's Settings object.
Settings.llm = Ollama(model="llama3", temperature=0, request_timeout=500.0)
# trust_remote_code=True allows Python code shipped with the model repository to run on load.
Settings.embed_model = HuggingFaceEmbedding(model_name="Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)


# Use PagedCSVReader for every .csv file that SimpleDirectoryReader picks up.
parser = PagedCSVReader()

file_extractor = {".csv": parser}  # Add other CSV formats as needed
documents = SimpleDirectoryReader(
    "./directory_csv", file_extractor=file_extractor
).load_data()
# Bare expression in the middle of a cell: nothing is displayed, so this line has no effect.
documents

# Create the VectorStoreIndex for the loaded data
directory_index = VectorStoreIndex.from_documents(documents, llm=Settings.llm, embed_model=Settings.embed_model)
# Give the index a fixed id, then save it to disk
directory_index.set_index_id("vector_index")
directory_index.storage_context.persist("./Directory_index")






# 2. Text data from the webpages

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.readers.web import SimpleWebPageReader
import os
from llama_index.core import (
    SimpleDirectoryReader,
    load_index_from_storage,
    StorageContext,
)
from llama_index.core import download_loader

# Download each page once, keep a plain-text copy of it, and build one
# persisted vector index per page.
Settings.llm = Ollama(model="llama3", temperature=0, request_timeout=500.0)
# trust_remote_code=True allows Python code shipped with the model repository to run on load.
Settings.embed_model = HuggingFaceEmbedding(model_name="Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)

# Page URL -> name used for both the text file and the index folder
urls = {
    "https://maharashtra.nic.in/": "Home_index",
    "https://maharashtra.nic.in/services/": "Services_index",
    "https://maharashtra.nic.in/profile/": "profile_index",
    "https://servicedesk.nic.in/": "helpdisk_index",
    "https://www.nic.in/servicecontents/nicnet/": "nicnet_index",
    "https://www.nic.in/servicecontents/data-centre/": "data-centre_index",
    "https://www.nic.in/servicecontents/national-cloud/": "national-cloud_index",
    "https://www.nic.in/servicecontents/messaging/": "messaging_index",
    "https://www.nic.in/servicecontents/remote-sensing-gis/": "remote-sensing-gis_index",
    "https://www.nic.in/servicecontents/webcast/": "webcast_index",
    "https://www.nic.in/servicecontents/domain-registration/": "domain-registration_index",
    "https://www.nic.in/servicecontents/nkn/": "nkn_index",
    "https://www.nic.in/servicecontents/command-and-control-centre/": "command-and-control-centre_index",
    "https://www.nic.in/servicecontents/government-local-area-networks-lans/": "government-local-area-networks-lans_index",
    "https://www.nic.in/servicecontents/video-conferencing/": "video-conferencing_index",
    "https://www.nic.in/servicecontents/security/": "security_index",
    "https://www.nic.in/servicecontents/centralised-aadhaar-vault/": "centralised-aadhaar-vault_index",
    "https://maharashtra.nic.in/infrastructure/": "infrastructure_index",
    "https://maharashtra.nic.in/news-update/": "news-update_index",
    "https://maharashtra.nic.in/events/": "events_index",
    "https://maharashtra.nic.in/awards/": "awards_index",
    "https://maharashtra.s3waas.gov.in/": "district_website_index",
    "https://eforms.nic.in/OnlineForms/": "eforms_index",
    "https://igod.gov.in/": "igod_index"
}

# NOTE(review): download_loader is reported as deprecated in recent llama-index
# releases; BeautifulSoupWebReader may be importable from
# llama_index.readers.web instead - confirm against the installed version.
BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
loader = BeautifulSoupWebReader()

# Define the directory path to save the documents and indexes
save_directory = "/content/docs"
index_directory = "/content/index"
os.makedirs(save_directory, exist_ok=True)

# `documents` collects the documents of all pages. Each page's own documents
# live in `page_documents`, so the combined list is no longer overwritten and
# no URL has to be downloaded a second time.
documents = []
for url, link_name in urls.items():
    page_documents = loader.load_data(urls=[url])
    documents.extend(page_documents)

    # Keep a plain-text copy of the page under its link name
    file_path = os.path.join(save_directory, f"{link_name}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        for doc in page_documents:
            file.write(doc.text + "\n")

    # Create the VectorStoreIndex for this page only
    index = VectorStoreIndex.from_documents(page_documents, llm=Settings.llm, embed_model=Settings.embed_model)

    # Create a unique directory for each index
    index_folder = os.path.join(index_directory, link_name)
    os.makedirs(index_folder, exist_ok=True)

    # Save the index to disk
    index.set_index_id("vector_index")
    index.storage_context.persist(index_folder)

    print(f"Data from {url} saved successfully to: {file_path}")
    # Report the folder that was actually written, not just the parent directory.
    print(f"Index saved successfully to: {index_folder}")

# 3. Unstructured Webpage

In [None]:
from llama_index.readers.web import UnstructuredURLLoader

# Index a single page loaded through UnstructuredURLLoader and persist the index
# to ./organization-structure_index.

# Initialize settings for LlamaIndex
Settings.llm = Ollama(model="llama3", temperature=0, request_timeout=500.0)
# trust_remote_code=True allows Python code shipped with the model repository to run on load.
Settings.embed_model = HuggingFaceEmbedding(model_name="Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)

urls = [
    "https://maharashtra.nic.in/organization-structure/",
]

# NOTE(review): "value" looks like a placeholder User-Agent - confirm the intended header.
# NOTE(review): continue_on_failure=False presumably makes a failing URL raise
# instead of being skipped - confirm against the loader's documentation.
loader = UnstructuredURLLoader(
    urls=urls, continue_on_failure=False, headers={"User-Agent": "value"}
)
documents = loader.load_data()
# Create a VectorStoreIndex from the documents
index = VectorStoreIndex.from_documents(documents, llm=Settings.llm, embed_model=Settings.embed_model)

# Give the index a fixed id, then save it to disk
index.set_index_id("vector_index")
index.storage_context.persist("./organization-structure_index")

# Model and Embeddings

In [38]:
# Initialize settings for LlamaIndex: answer with the llama3 model through Ollama
# (this notebook pulls llama3 with `ollama pull llama3`).
Settings.llm = Ollama(model="llama3", temperature=0, request_timeout=500.0)

In [11]:

# Embedding model used for indexing and retrieval.
# trust_remote_code=True allows Python code shipped with the model repository to run on load;
# the download log of this cell recommends pinning a revision to avoid picking up new code files.
Settings.embed_model = HuggingFaceEmbedding(model_name="Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/71.2k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/57.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

# Loading the index from disk

In [85]:
# Directory holding a previously persisted index.
# NOTE(review): no cell visible in this notebook persists an index to this path -
# confirm where /content/combine_index is created.
persist_dir = "/content/combine_index"

# Rebuild the storage context from that directory and load the index stored in it.
storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
index = load_index_from_storage(storage_context)

print(index)

<llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x7a407d3cfd90>


# Chat_engine

In [94]:
from llama_index.core.storage.chat_store import SimpleChatStore
from llama_index.core.memory import ChatMemoryBuffer

# Load the conversation saved in chat_store.json and use that same store object
# as the backing store of the chat memory. Previously the memory was attached to
# `loaded_chat_store` while the "Chat_store" cell persists `chat_store`, a
# separate empty store, so the conversation was never written to disk.
chat_store = SimpleChatStore.from_persist_path(
    persist_path="chat_store.json"
)
# Alias kept so code that refers to `loaded_chat_store` keeps working.
loaded_chat_store = chat_store

# Memory for the "user1" conversation, backed by the store above.
chat_memory = ChatMemoryBuffer.from_defaults(
    token_limit=3000,
    chat_store=chat_store,
    chat_store_key="user1",
)


# Context chat engine over the loaded index, using the memory above.
chat_engine = index.as_chat_engine(
    chat_mode="context",
    system_prompt="You are a helpful AI assistant. You are expert in retrieving the answer for the user input based on the provided context. You are also able to give the source link(url) to each user input.",
    memory=chat_memory, verbose = True
)



In [98]:
# Ask the chat engine a question; the reply is stored in val_query_response and printed.
val_query_response = chat_engine.chat("What are the services offered at NIC?")

# Disabled alternative: ask a similar question through a plain query engine.
# query_engine = index.as_query_engine()

# query = "what are the services offered by NIc?"
# val_query_response = query_engine.query(query)
# print(f"RESPONSE:\n{response}")
print(val_query_response)

According to the context information provided, NIC (National Informatics Centre) offers a range of services, including:

1. **Data Centres**: NIC provides Data Centers Services from National Data Centres at Delhi, Hyderabad, Pune, and Bhubaneswar.
2. **Cloud Infrastructure**: NIC launched National Cloud Services in 2014 under MeghRaj Government of India Cloud Initiative, providing cloud-based services such as Application Programme Monitoring (APM), Data Analytics (DA), Resource Monitoring (RM), and Container Service.
3. **Mini-Clouds**: NIC has established Mini Clouds in various state units to provide cloud-based services to government departments.
4. **Command and Control Centre**: NIC has set up a Command and Control Centre (CCC) to monitor the availability of all data centres, cloud services, and applications hosted by these centres.
5. **Cyber Security**: NIC provides cyber security services, including Network Security, Application Security, and Vulnerability Assessment, to ensure 

In [67]:
val_query_response

Response(response='NIC provides various services including Data Centers Services from National Data Centres at Delhi, Hyderabad, Pune, and Bhubaneswar. These services include Cloud-enabled data centre, co-location services, high-speed network backbone, enterprise-class storage, Network Load Balancers, Intrusion Prevention Systems, Backup as a Service & Storage as a Service.\n\nNIC also offers National Cloud Services from multiple locations of National Data Centres at Bhubaneswar, Delhi, Hyderabad, and Pune. The cloud services include Application Programme Monitoring (APM) Service, Data Analytics (DA) Service, Resource Monitoring (RM) Service, and Container Service.\n\nAdditionally, NIC provides Command and Control Centre services to monitor the availability of all data centres and cloud services. It also offers Cyber Security services including Network Security, Application Security, and 24×7 Security Monitoring Centre to ensure real-time monitoring, detection, prevention, analysis, an

In [57]:
# Disabled alternative: the chat-engine version of a similar lookup.
# response = chat_engine.chat("what is the published date of Implementation of Aadhar Enabled Biometric Attendance System (AEBAS) in NIC Maharashtra?")

# Query engine created from the index with default arguments.
query_engine = index.as_query_engine()

# NOTE(review): the misspellings in the query may be deliberate (checking typo tolerance) - confirm before correcting.
query = "who isz ther RTI Public Info Officer?"
# The result overwrites val_query_response.
val_query_response = query_engine.query(query)
# print(f"RESPONSE:\n{response}")
# print(response)

In [58]:
val_query_response

Response(response='Smt. Ireni Akoijam is the RTI Public Information Officer.', source_nodes=[NodeWithScore(node=TextNode(id_='de440ef6-2ec9-4dfc-8e2c-5d2fbb4022fb', embedding=None, metadata={'file_path': '/content/RTI_csv/PUBLIC INFORMATION OFFICERS ( PIO ).csv', 'file_name': 'PUBLIC INFORMATION OFFICERS ( PIO ).csv', 'file_type': 'text/csv', 'file_size': 240, 'creation_date': '2024-05-24', 'last_modified_date': '2024-05-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='896ad005-3f53-4edd-8b2c-9f5102715735', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/content/RTI_csv/PUBLIC INFORMATION OFFICERS ( PIO ).csv', 'file_name': 'PUBLIC INFORMATION OFFICERS ( PIO ).csv', 'file_type': 'text/cs

In [99]:

# Package the latest answer with the score and source URL of its first source node.
#print("Jigyasa Response: " + str(val_query_response.response))
response_json = {}
response_json['response'] = val_query_response.response

# An answer can come back without source nodes; indexing [0] would raise IndexError.
source_nodes = val_query_response.source_nodes
node = source_nodes[0] if source_nodes else None

if node is None:
  response_json['search_Score'] = None
  response_json['url'] = None
else:
  response_json['search_Score'] = node.score
  metadata = node.metadata
  if 'URL' in metadata:
    # The node's metadata names its source address directly.
    response_json['url'] = str(metadata['URL'])
  else:
    # Otherwise resolve the CSV file name through file_to_url_mapping;
    # an unknown or missing file name yields None instead of a KeyError.
    source_url = file_to_url_mapping.get(metadata.get('file_name'))
    response_json['url'] = str(source_url) if source_url is not None else None







In [100]:
response_json

{'response': 'According to the context information provided, NIC (National Informatics Centre) offers a range of services, including:\n\n1. **Data Centres**: NIC provides Data Centers Services from National Data Centres at Delhi, Hyderabad, Pune, and Bhubaneswar.\n2. **Cloud Infrastructure**: NIC launched National Cloud Services in 2014 under MeghRaj Government of India Cloud Initiative, providing cloud-based services such as Application Programme Monitoring (APM), Data Analytics (DA), Resource Monitoring (RM), and Container Service.\n3. **Mini-Clouds**: NIC has established Mini Clouds in various state units to provide cloud-based services to government departments.\n4. **Command and Control Centre**: NIC has set up a Command and Control Centre (CCC) to monitor the availability of all data centres, cloud services, and applications hosted by these centres.\n5. **Cyber Security**: NIC provides cyber security services, including Network Security, Application Security, and Vulnerability As

# Chat_store

In [93]:
# Write chat_store to chat_store.json, then read the file back into loaded_chat_store.
chat_store.persist(persist_path="chat_store.json")
loaded_chat_store = SimpleChatStore.from_persist_path(
    persist_path="chat_store.json"
)

# Reranking

In [101]:
from llama_index.core.postprocessor import LLMRerank

In [102]:
from IPython.display import Markdown, display