In [None]:
!pip install langchain langchain_core langchain_community langserve sentence_transformers chromadb

In [None]:
!pip install langchain-huggingface
!pip install bitsandbytes huggingface_hub

In [None]:
from langchain_huggingface import HuggingFaceEndpoint

In [None]:
from google.colab import userdata
# Read the Hugging Face access token from the Colab user-data entry named
# 'HF_TOKEN' rather than pasting the token into the notebook.
key = userdata.get('HF_TOKEN')

In [None]:
import os
# Publish the token as the HUGGINGFACEHUB_API_TOKEN environment variable so
# later code can read it from the environment instead of from a literal.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = key

In [None]:
# Hosted model used for text generation through the Hugging Face endpoint.
repo_id = 'mistralai/Mistral-7B-Instruct-v0.3'

# One keyword per line so the generation settings are easy to scan and tune.
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    huggingfacehub_api_token=key,
    max_new_tokens=1200,
    temperature=0.9,
)

In [None]:
# Show only the non-secret settings. Echoing the bare `llm` object prints its
# repr, which includes the Hugging Face API token in plain text.
{
    "repo_id": llm.repo_id,
    "max_new_tokens": llm.max_new_tokens,
    "temperature": llm.temperature,
}

HuggingFaceEndpoint(repo_id='mistralai/Mistral-7B-Instruct-v0.3', huggingfacehub_api_token='<REDACTED>', max_new_tokens=1200, temperature=0.9, stop_sequences=[], server_kwargs={}, model_kwargs={}, model='mistralai/Mistral-7B-Instruct-v0.3', client=<InferenceClient(model='mistralai/Mistral-7B-Instruct-v0.3', timeout=120)>, async_client=<InferenceClient(model='mistralai/Mistral-7B-Instruct-v0.3', timeout=120)>)

In [None]:
import json

# Parse the raw fine-tuning records. The encoding is stated explicitly because
# the default used by open() depends on the platform, while JSON files are
# normally UTF-8.
with open('fine_tune_data_original.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
!pip install faiss-gpu
!pip install jq



In [None]:
import jq
from langchain_community.document_loaders import JSONLoader

# Turn every record of the JSON array into one Document whose content is the
# JSON object {instruction, input, output}. text_content=False is required
# because the jq schema yields objects rather than plain strings.
loader = JSONLoader(
    file_path='fine_tune_data_original.json',
    jq_schema='.[] | {instruction: .instruction, input: .input, output: .output}',  # Combine content
    text_content=False
)
data = loader.load()

# Show the size and a short sample instead of dumping the whole corpus into
# the notebook output.
print(f"Loaded {len(data)} documents")
data[:3]

[Document(metadata={'source': '/content/fine_tune_data_original.json', 'seq_num': 1}, page_content='{"instruction": "Summarize the configuration log.", "input": "Timestamp: 2015-07-29 17:41:41,536\\nLog Level: INFO\\nMessage: Reading configuration from: /etc/zookeeper/conf/zoo.cfg", "output": "Configuration update: Reading configuration from: /etc/zookeeper/conf/zoo.cfg."}'),
 Document(metadata={'source': '/content/fine_tune_data_original.json', 'seq_num': 2}, page_content='{"instruction": "Summarize the configuration log.", "input": "Timestamp: 2015-07-29 17:41:41,544\\nLog Level: INFO\\nMessage: Defaulting to majority quorums", "output": "Configuration update: Defaulting to majority quorums."}'),
 Document(metadata={'source': '/content/fine_tune_data_original.json', 'seq_num': 3}, page_content='{"instruction": "Summarize the configuration log.", "input": "Timestamp: 2015-07-29 17:41:41,555\\nLog Level: INFO\\nMessage: autopurge.snapRetainCount set to 3", "output": "Configuration upda

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by every search cell in this notebook.
st_model = SentenceTransformer('all-MiniLM-L6-v2')

# Keep only the text of each loaded Document, then embed the texts in one call.
documents = list(map(lambda loaded_doc: loaded_doc.page_content, data))
embeddings = st_model.encode(documents)

In [None]:
import faiss
import numpy as np

# Copy the embeddings into a float32 matrix before handing them to FAISS.
embedding_array = np.array(embeddings, dtype='float32')

# The index width is the length of a single embedding vector.
dimension = len(embeddings[0])

# Flat L2 index populated with all document vectors.
index = faiss.IndexFlatL2(dimension)
index.add(embedding_array)

In [None]:
def _build_cosine_index(vectors):
    """Build an inner-product FAISS index over unit-length copies of ``vectors``."""
    # Work on a copy so the caller's array is never modified in place.
    unit_vectors = np.array(vectors, dtype='float32', order='C')
    faiss.normalize_L2(unit_vectors)
    cosine_index = faiss.IndexFlatIP(unit_vectors.shape[1])
    cosine_index.add(unit_vectors)
    return cosine_index


def dynamic_search(query_text, top_k=3, metric="L2"):
    """Return the ``top_k`` documents closest to ``query_text``.

    Uses the notebook globals ``st_model`` (encoder), ``documents`` (texts),
    ``index`` (L2 index) and ``embedding_array`` (the vectors in ``index``).

    Args:
        query_text: Free-text query to embed and search for.
        top_k: Maximum number of hits to return; values <= 0 yield ``[]``.
        metric: ``"cosine"`` ranks by cosine similarity and reports
            ``1 - cosine_similarity`` as the distance. Any other value
            searches the L2 index and reports the distance FAISS returns.

    Returns:
        A list of ``{"document": str, "distance": float}`` dicts, best match
        first. It may be shorter than ``top_k`` when the corpus is smaller.
    """
    if top_k <= 0:
        return []

    # FAISS expects a contiguous float32 matrix of shape (n_queries, dim).
    query_matrix = np.ascontiguousarray(st_model.encode([query_text]), dtype='float32')

    if metric == "cosine":
        # Normalising vectors after they were added to an index has no effect
        # on that index, so cosine search needs its own index built from
        # normalised copies. It is cached on the function and rebuilt only
        # when `embedding_array` is rebound to a different array.
        cached = getattr(dynamic_search, "_cosine_cache", None)
        if cached is None or cached[0] is not embedding_array:
            cached = (embedding_array, _build_cosine_index(embedding_array))
            dynamic_search._cosine_cache = cached
        # Normalise the query matrix that is actually searched with.
        faiss.normalize_L2(query_matrix)
        similarities, indices = cached[1].search(query_matrix, top_k)
        distances = 1.0 - similarities
    else:
        distances, indices = index.search(query_matrix, top_k)

    # FAISS pads missing hits with id -1; without the filter those would be
    # reported as documents[-1], i.e. the last document.
    return [
        {"document": documents[int(idx)], "distance": float(distances[0][rank])}
        for rank, idx in enumerate(indices[0])
        if idx != -1
    ]

In [None]:
# Position 0 is a placeholder so menu choices 1 and 2 map directly to list indices.
parameter = ["NULL", "WARN", "INFO"]
x = int(input("Select what to return: 1. WARN, 2. INFO\n"))
# Reject anything outside the menu: 0 would query for "NULL", negative numbers
# would wrap around, and larger numbers would raise an IndexError.
if x not in (1, 2):
    raise ValueError(f"Invalid selection {x}: enter 1 for WARN or 2 for INFO.")
query = f"""Find logs with {parameter[x]} levels"""

top_k_results = int(input("How many results do you want?\n"))
if top_k_results < 1:
    raise ValueError(f"Number of results must be at least 1, got {top_k_results}.")
distance_metric = "cosine"
search_results = dynamic_search(query, top_k=top_k_results, metric=distance_metric)

print("Query Results:")
for result in search_results:
    print(f"- Document: {result['document']} (Distance: {result['distance']:.2f})")

Select what to return: 1. WARN, 2. INFO
2
How many results do you want?
10
Query Results:
- Document: {"instruction": "Summarize the configuration log.", "input": "Timestamp: 2015-08-18 16:09:30,002\nLog Level: INFO\nMessage: Creating new log file: log.b00000001", "output": "Configuration update: Creating new log file: log.b00000001."} (Distance: 1.08)
- Document: {"instruction": "Summarize the configuration log.", "input": "Timestamp: 2015-08-18 16:09:30,007\nLog Level: INFO\nMessage: Creating new log file: log.b00000001", "output": "Configuration update: Creating new log file: log.b00000001."} (Distance: 1.08)
- Document: {"instruction": "Summarize the configuration log.", "input": "Timestamp: 2015-08-10 18:25:32,002\nLog Level: INFO\nMessage: Creating new log file: log.a00000001", "output": "Configuration update: Creating new log file: log.a00000001."} (Distance: 1.09)
- Document: {"instruction": "Summarize the configuration log.", "input": "Timestamp: 2015-08-10 18:24:00,003\nLog L

In [None]:
import pandas as pd

# Structured Hadoop log sample (CSV).
file_path = 'Hadoop_2k.log_structured.csv'
log_data = pd.read_csv(file_path)

# Leave the preview as the last expression so Jupyter renders it as a table
# instead of wrapped plain text.
log_data.head()


   LineId        Date          Time Level Process  \
0       1  2015-10-18  18:01:47,978  INFO    main   
1       2  2015-10-18  18:01:48,963  INFO    main   
2       3  2015-10-18  18:01:48,963  INFO    main   
3       4  2015-10-18  18:01:49,228  INFO    main   
4       5  2015-10-18  18:01:50,353  INFO    main   

                                        Component  \
0  org.apache.hadoop.mapreduce.v2.app.MRAppMaster   
1  org.apache.hadoop.mapreduce.v2.app.MRAppMaster   
2  org.apache.hadoop.mapreduce.v2.app.MRAppMaster   
3  org.apache.hadoop.mapreduce.v2.app.MRAppMaster   
4  org.apache.hadoop.mapreduce.v2.app.MRAppMaster   

                                             Content EventId  \
0  Created MRAppMaster for application appattempt...     E29   
1                             Executing with tokens:     E42   
2  Kind: YARN_AM_RM_TOKEN, Service: , Ident: (app...     E61   
3                      Using mapred newApiCommitter.    E111   
4                 OutputCommitter set in c

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
log_data.info()
log_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   LineId         2000 non-null   int64 
 1   Date           2000 non-null   object
 2   Time           2000 non-null   object
 3   Level          2000 non-null   object
 4   Process        2000 non-null   object
 5   Component      2000 non-null   object
 6   Content        2000 non-null   object
 7   EventId        2000 non-null   object
 8   EventTemplate  2000 non-null   object
dtypes: int64(1), object(8)
memory usage: 140.8+ KB
None
   LineId        Date          Time Level Process  \
0       1  2015-10-18  18:01:47,978  INFO    main   
1       2  2015-10-18  18:01:48,963  INFO    main   
2       3  2015-10-18  18:01:48,963  INFO    main   
3       4  2015-10-18  18:01:49,228  INFO    main   
4       5  2015-10-18  18:01:50,353  INFO    main   

                                        Compo

In [None]:
# Build one searchable line per log row. Column-wise string concatenation
# produces the same text as a row-by-row apply() without a Python-level loop
# over every row.
log_data['search_text'] = (
    "Timestamp: " + log_data['Date'].astype(str)
    + " | Log Level: " + log_data['Level'].astype(str)
    + " | Message: " + log_data['Content'].astype(str)
)


print(log_data[['search_text']].head())

# Texts and embeddings of the Hadoop corpus, in row order.
documentshadoop = log_data['search_text'].tolist()
embeddings = st_model.encode(documentshadoop)

                                         search_text
0  Timestamp: 2015-10-18 | Log Level: INFO | Mess...
1  Timestamp: 2015-10-18 | Log Level: INFO | Mess...
2  Timestamp: 2015-10-18 | Log Level: INFO | Mess...
3  Timestamp: 2015-10-18 | Log Level: INFO | Mess...
4  Timestamp: 2015-10-18 | Log Level: INFO | Mess...


In [None]:
# `dynamic_search` reads the ZooKeeper index built earlier, so calling it here
# returns ZooKeeper records. Build a dedicated cosine index for the Hadoop
# corpus prepared in the previous cell and search that instead.
assert len(embeddings) == len(documentshadoop), (
    "`embeddings` does not match `documentshadoop`; re-run the cell that encodes the Hadoop logs."
)
hadoop_vectors = np.array(embeddings, dtype='float32', order='C')  # copy; normalised in place below
faiss.normalize_L2(hadoop_vectors)
hadoop_index = faiss.IndexFlatIP(hadoop_vectors.shape[1])
hadoop_index.add(hadoop_vectors)

query_text = "Find logs with INFO levels."
query_vector = np.ascontiguousarray(st_model.encode([query_text]), dtype='float32')
faiss.normalize_L2(query_vector)
similarities, positions = hadoop_index.search(query_vector, 5)

# Same result shape as dynamic_search; the distance is 1 - cosine similarity.
# FAISS pads missing hits with id -1, which are skipped.
search_results = [
    {"document": documentshadoop[int(pos)], "distance": float(1.0 - similarities[0][rank])}
    for rank, pos in enumerate(positions[0])
    if pos != -1
]

for result in search_results:
    print(result['document'])

{"instruction": "Summarize the configuration log.", "input": "Timestamp: 2015-08-25 11:23:57,387\nLog Level: INFO\nMessage: Getting a diff from the leader 0xd0000001b", "output": "Configuration update: Getting a diff from the leader 0xd0000001b."}
{"instruction": "Summarize the configuration log.", "input": "Timestamp: 2015-08-10 18:25:26,697\nLog Level: INFO\nMessage: Getting a diff from the leader 0x900000007", "output": "Configuration update: Getting a diff from the leader 0x900000007."}
{"instruction": "Summarize the configuration log.", "input": "Timestamp: 2015-08-18 16:09:30,002\nLog Level: INFO\nMessage: Creating new log file: log.b00000001", "output": "Configuration update: Creating new log file: log.b00000001."}
{"instruction": "Summarize the configuration log.", "input": "Timestamp: 2015-08-25 11:19:48,622\nLog Level: INFO\nMessage: Getting a diff from the leader 0xd0000001b", "output": "Configuration update: Getting a diff from the leader 0xd0000001b."}
{"instruction": "Sum

In [None]:
import asyncio
import html
import os
import threading

import language_tool_python
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.memory import BaseMemory
from langchain_core.runnables import RunnableWithMessageHistory
from langchain_huggingface import HuggingFaceEndpoint
from models.prompts import new_prompt

# Initialize the LLM.
# The token is read from the environment variable set earlier in the notebook;
# access tokens must never be committed as string literals.
KEY = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if not KEY:
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; export the Hugging Face token before running this cell."
    )
repo_id = 'mistralai/Mistral-7B-Instruct-v0.3'
llm = HuggingFaceEndpoint(repo_id=repo_id, huggingfacehub_api_token=KEY, max_new_tokens=2000, temperature=0.9)

# Initialize memory.
# BaseMemory is abstract and cannot be instantiated, so a concrete in-memory
# chat history is used to hold the conversation instead.
memory = InMemoryChatMessageHistory()


def get_session_history(session_id):
    """Return the chat history for ``session_id`` (one shared history for all sessions)."""
    return memory


# Initialize conversation chain with RunnableWithMessageHistory.
# It takes the runnable to wrap and a session-history factory. Invoke it with
# config={"configurable": {"session_id": "<id>"}}.
conversation_chain = RunnableWithMessageHistory(llm, get_session_history)

# Initialize grammar tool
grammar_tool = language_tool_python.LanguageTool('en-US')


def preprocess_query(user_query):
    """Correct grammar in the user's query and strip surrounding whitespace."""
    corrected_query = grammar_tool.correct(user_query)
    return corrected_query.strip()


def postprocess_response(raw_response):
    """Wrap the response text in a minimal HTML page.

    The text is HTML-escaped first so that markup produced by the model is
    displayed literally instead of being interpreted by the browser.
    """
    return f"<html><body><p>{html.escape(str(raw_response))}</p></body></html>"


def analyze_dataset(user_query):
    """Stream the LLM's answer to ``user_query`` token by token.

    The query is appended to ``new_prompt``. Any exception raised while
    streaming is reported as a final ``"Error: ..."`` chunk instead of
    propagating to the caller.
    """
    # The original template contained a literal " + " left over from string
    # concatenation; it is removed so the model does not see it.
    interactive_prompt = f"{new_prompt}\nUser  Query: {user_query}\nYour Response:"

    try:
        for token in llm.stream(interactive_prompt):
            yield token
    except Exception as e:
        yield f"Error: {e}"


print(log_data.head())

def _build_fcaps_prompt(row):
    """Render one log row as an FCAPS-classification prompt for the LLM."""
    # Adjacent literals keep the prompt free of source indentation, stray
    # quote characters and doubled newlines.
    return (
        "Analyze the following log entry and classify it into one or more FCAPS categories "
        "(Fault, Configuration, Accounting, Performance, Security):\n"
        f"Timestamp: {row['Date']} {row['Time']}\n"
        f"Log Level: {row['Level']}\n"
        f"Process: {row['Process']}\n"
        f"Component: {row['Component']}\n"
        f"Message: {row['Content']}\n"
        f"Event Template: {row['EventTemplate']}"
    )


log_data['llm_prompt'] = log_data.apply(_build_fcaps_prompt, axis=1)

# Select the entry by its LineId column. The previous `iloc[t+1]` lookup
# returned a different row from the one requested and failed near the end of
# the table.
# NOTE(review): assumes the user enters a LineId (not a 0-based position) - confirm.
t = int(input("\nEnter the LineId of the log entry you wish to classify and analyse: "))
matching_prompts = log_data.loc[log_data['LineId'] == t, 'llm_prompt']
if matching_prompts.empty:
    raise ValueError(f"No log entry with LineId {t}.")
log_prompt = matching_prompts.iloc[0]

# Use the Runnable interface; calling the LLM object directly is deprecated.
response = llm.invoke(log_prompt)

print("LLM Response:")
print(response)

   LineId        Date          Time Level Process  \
0       1  2015-10-18  18:01:47,978  INFO    main   
1       2  2015-10-18  18:01:48,963  INFO    main   
2       3  2015-10-18  18:01:48,963  INFO    main   
3       4  2015-10-18  18:01:49,228  INFO    main   
4       5  2015-10-18  18:01:50,353  INFO    main   

                                        Component  \
0  org.apache.hadoop.mapreduce.v2.app.MRAppMaster   
1  org.apache.hadoop.mapreduce.v2.app.MRAppMaster   
2  org.apache.hadoop.mapreduce.v2.app.MRAppMaster   
3  org.apache.hadoop.mapreduce.v2.app.MRAppMaster   
4  org.apache.hadoop.mapreduce.v2.app.MRAppMaster   

                                             Content EventId  \
0  Created MRAppMaster for application appattempt...     E29   
1                             Executing with tokens:     E42   
2  Kind: YARN_AM_RM_TOKEN, Service: , Ident: (app...     E61   
3                      Using mapred newApiCommitter.    E111   
4                 OutputCommitter set in c