In [None]:
!CUDACXX=/usr/local/cuda-12.2/bin/nvcc CMAKE_ARGS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=native" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir --force-reinstall --upgrade

In [None]:
# Install the LangChain core packages and the Chroma / Milvus / Hugging Face
# integrations that the import cell relies on. Only jedi is version-pinned here.
!pip install -q langchain-core langchain langchain-community langchain-chroma langchain-text-splitters langchain-huggingface langchain_milvus jedi==0.17

In [None]:
# Force-reinstall exact versions of the numeric, Milvus, protobuf/grpc and NLTK packages.
# NOTE(review): presumably this runs after the unpinned installs to undo any version
# drift they introduce — confirm the ordering is intentional.
!pip install -q --force-reinstall numpy==1.26.4 pandas==2.2.2 pymilvus==2.4.6 pymilvus[model]==2.4.6 protobuf==3.20.3 grpcio==1.63.0 nltk==3.9.1

In [None]:
# Mount Google Drive so the dataset under /content/drive can be read.
# This import only exists on Google Colab runtimes.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import io
import json
import tempfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

from langchain_community.llms import LlamaCpp
from langchain_community.document_loaders.dataframe import DataFrameLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_milvus.retrievers import MilvusCollectionHybridSearchRetriever
from langchain_milvus.utils.sparse import BM25SparseEmbedding

import nltk
# Fetch the NLTK 'punkt' and 'punkt_tab' tokenizer resources.
# NOTE(review): presumably required by the BM25 sparse embedding's tokenization — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from pymilvus import MilvusClient
# Open a Milvus client on a file path relative to the current working directory.
# NOTE(review): a local ".db" path presumably selects the embedded Milvus Lite backend — confirm.
client = MilvusClient("./table_to_insights.db")

DEBUG:pymilvus.milvus_client.milvus_client:Created new connection using: fd8bab3084b7422fa7a0562f140bcaf2


In [None]:
# Labeled dataset stored on the mounted Google Drive; requires the Drive mount cell to have run.
data_path = "/content/drive/MyDrive/Datasets/Table Insights/table_insights_labeled_data.xlsx"
# Read the workbook with pandas' default read_excel settings.
df = pd.read_excel(data_path)

In [None]:
# Label the first column "Device"; every other column keeps the name it was loaded with.
df.columns = ["Device", *df.columns[1:]]
df.head()

Unnamed: 0,Device,date_,cpu_usage,ram_usage,diskio_usage,question_0,insight_0,question_1,insight_1,question_2,insight_2,serialnumber_org
0,Device_1,2022-08-02,7.845169,52.662081,,What does the data contains and say about. Res...,The data contains information about a computer...,What are the top 3 most important insights,1. The highest CPU usage was recorded on 2022-...,What are the top 3 abberations present in the ...,1. The diskio_usage column has no data (NaN) f...,1BBZ4Y2
1,Device_1,2022-08-03,5.029416,53.519746,,0,0,0,0,0,0,1BBZ4Y2
2,Device_1,2022-08-04,4.855019,53.656122,,0,0,0,0,0,0,1BBZ4Y2
3,Device_1,2022-08-05,8.014844,55.245124,,0,0,0,0,0,0,1BBZ4Y2
4,Device_1,2022-08-07,16.909919,54.216115,,0,0,0,0,0,0,1BBZ4Y2


In [None]:
def _non_placeholder(column_name):
    """Return the values of ``df[column_name]`` that are not the placeholder ``0``."""
    return [value for value in df[column_name].tolist() if value != 0]


# Only some rows carry real question/insight text; the remaining rows hold 0.
# NOTE(review): each column is filtered on its own, so the question*/insight* lists
# line up by index only if the 0 placeholders sit in the same rows — confirm.
question0 = _non_placeholder("question_0")
question1 = _non_placeholder("question_1")
question2 = _non_placeholder("question_2")

insight0 = _non_placeholder("insight_0")
insight1 = _non_placeholder("insight_1")
insight2 = _non_placeholder("insight_2")

In [None]:
# Show the three question/insight pairs found at position `ix` of the filtered lists,
# with a blank line between consecutive pairs.
ix = 1
qa_pairs = [(question0, insight0), (question1, insight1), (question2, insight2)]
for position, (questions, insights) in enumerate(qa_pairs):
    if position > 0:
        print()
    print(questions[ix])
    print(insights[ix])

What does the data contains and say about. Respond within 2 Lines
The data contains CPU usage information for a single user (serial number 0095043aa0) over a period of 15 days (from 1/9/2022 to 1/23/2022). The CPU usage shows an overall increasing trend, with the highest usage at 94.81% on 1/20/2022.

What are the top 3 most important insights
1. CPU usage has been consistently increasing over the given period, with the lowest usage of 27.04% on 1/9/2022 and the highest usage of 94.81% on 1/20/2022.
2. The top 3 highest CPU usage days were 1/20/2022 (94.81%), 1/23/2022 (90.99%), and 1/16/2022 (83.23%).
3. The average CPU usage across the given period is approximately 68.87%, indicating a generally high level of computer usage during this time.

What are the top 3 abberations present in the dataset
Based on the provided data, the top 3 aberrations in CPU usage for user 0095043aa0 are:

1. On 1/20/2022, the CPU usage reached its highest point at 94.81%.
2. On 1/23/2022, the CPU usage was

In [None]:
# Keep only the first five columns: device, date and the three usage metrics.
stat_columns = list(df.columns[:5])
df_stats = df[stat_columns]
df_stats.head()

Unnamed: 0,Device,date_,cpu_usage,ram_usage,diskio_usage
0,Device_1,2022-08-02,7.845169,52.662081,
1,Device_1,2022-08-03,5.029416,53.519746,
2,Device_1,2022-08-04,4.855019,53.656122,
3,Device_1,2022-08-05,8.014844,55.245124,
4,Device_1,2022-08-07,16.909919,54.216115,


In [None]:
# One markdown table per device, in order of first appearance in df["Device"].
device_ids = df["Device"].unique().tolist()
table_str = [
    df_stats[df_stats["Device"] == device_id].to_markdown(index=False)
    for device_id in device_ids
]

In [None]:
# Print the first device's markdown table as raw text to inspect the exact string.
print(table_str[0])

| Device   | date_               |   cpu_usage |   ram_usage |   diskio_usage |
|:---------|:--------------------|------------:|------------:|---------------:|
| Device_1 | 2022-08-02 00:00:00 |     7.84517 |     52.6621 |            nan |
| Device_1 | 2022-08-03 00:00:00 |     5.02942 |     53.5197 |            nan |
| Device_1 | 2022-08-04 00:00:00 |     4.85502 |     53.6561 |            nan |
| Device_1 | 2022-08-05 00:00:00 |     8.01484 |     55.2451 |            nan |
| Device_1 | 2022-08-07 00:00:00 |    16.9099  |     54.2161 |            nan |
| Device_1 | 2022-08-08 00:00:00 |     6.96914 |     59.4595 |            nan |
| Device_1 | 2022-08-09 00:00:00 |     6.94636 |     62.0098 |            nan |
| Device_1 | 2022-08-10 00:00:00 |     6.91032 |     62.3287 |            nan |
| Device_1 | 2022-08-11 00:00:00 |     6.8136  |     61.5232 |            nan |
| Device_1 | 2022-08-12 00:00:00 |     7.50065 |     63.4021 |            nan |
| Device_1 | 2022-08-15 00:00:00 |     5

In [None]:
import tiktoken
from transformers import AutoTokenizer

# Tokenizer loaded by repository id; estimate_tokens_llama reads it as a module-level global.
# NOTE(review): presumably downloaded from the Hugging Face Hub on first use — needs network access.
tokenizer = AutoTokenizer.from_pretrained("baseten/Meta-Llama-3-tokenizer")

def estimate_tokens_llama(text):
  """Return how many token ids the module-level ``tokenizer`` yields for ``text``.

  ``tokenizer.encode`` is called with its default arguments.
  NOTE(review): the defaults presumably add special tokens (e.g. BOS), so the
  count may be slightly higher than the raw text alone — confirm.
  """
  token_ids = tokenizer.encode(text)
  return len(token_ids)

def estimate_tokens_tiktoken(text):
  """Return how many tokens ``text`` has under tiktoken's ``cl100k_base`` encoding."""
  return len(tiktoken.get_encoding("cl100k_base").encode(text))

# Token count of every per-device markdown table, once per tokenizer.
llama_token_count = [estimate_tokens_llama(markdown) for markdown in table_str]
tiktoken_token_count = [estimate_tokens_tiktoken(markdown) for markdown in table_str]

In [None]:
# Largest per-device table size in tokens, as (llama, tiktoken).
max(llama_token_count), max(tiktoken_token_count)

(2738, 2741)