In [None]:
!pip install langchain unstructured openai tiktoken python-magic chromadb tqdm

In [None]:
import os
import csv
import math
import nltk
import glob
import json
import magic
import random
import datetime
import pandas as pd
from tqdm import tqdm
from datetime import timedelta
from langchain.chains import LLMChain
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from IPython.display import Markdown, display
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.question_answering import load_qa_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)

# Placeholder credential: either replace the text below or export
# OPENAI_API_KEY before starting the notebook. setdefault keeps a key that is
# already present in the environment instead of overwriting it with the
# placeholder. Do not commit a real key to source control.
os.environ.setdefault("OPENAI_API_KEY", "HERE IS THE API KEY")

def customDisplay(text, color="blue"):
  """Show `text` in the notebook as bold Markdown wrapped in a colored font tag.

  Args:
    text: Message to show; interpolated into the markup as-is.
    color: Value placed in the font tag's color attribute (default "blue").
  """
  markup = f"<font color='{color}'><b>{text}</b></font>"
  display(Markdown(markup))

def writeResponse(response, idx):
  """Save `response` as UTF-8 text in results/result_<idx>.json.

  Args:
    response: Text to store; it is written verbatim, not re-serialized.
    idx: Value interpolated into the file name.
  """
  # Create the output folder on demand; without it, open() raises
  # FileNotFoundError on a fresh working directory.
  os.makedirs("results", exist_ok=True)

  with open(f"results/result_{idx}.json", "w", encoding="utf-8") as f:
    f.write(response)

def prepareData():
  """Turn every CSV in the working directory into chunked documents.

  For each "*.csv" file, the rows whose "Comment" value is a string are written
  to data/file_<n>.jsonl together with their "Date" value. The "data" folder is
  then loaded with DirectoryLoader and split into chunks of at most 2000
  characters with no overlap.

  A first line containing "sep=<delimiter>" is treated as a separator hint:
  that line is skipped and the delimiter is used when reading the file.

  Returns:
    The result of RecursiveCharacterTextSplitter.split_documents on the
    loaded documents.
  """
  # Local import keeps this block self-contained; shutil replaces the
  # IPython-only "!rm -rf" so the function also runs outside a notebook and on
  # systems without "rm".
  import shutil

  customDisplay("Preparing documents...")

  # Always start from an empty "data" folder (a missing folder is fine).
  shutil.rmtree("data", ignore_errors=True)
  os.makedirs("data")

  for idx, cf in enumerate(glob.glob("*.csv")):
    sep = ","
    skiprows = []

    with open(cf, "r", encoding="utf-8") as f:
      first_line = f.readline()

    _, marker, hinted_sep = first_line.partition("sep=")
    if marker:
      skiprows.append(0)
      # Strip only the line terminator: blindly dropping the last character
      # would eat the separator itself when the line has no trailing newline,
      # and a plain strip() would erase a tab separator.
      sep = hinted_sep.rstrip("\r\n") or ","

    data = pd.read_csv(cf, sep=sep, skiprows=skiprows)

    # prepare intermediate documents
    with open(f"data/file_{idx}.jsonl", "w", encoding="utf-8") as f:
      for _, row in data.iterrows():
        comment = row["Comment"]

        # Non-string comments (for example missing cells) are skipped.
        if not isinstance(comment, str):
          continue

        # Each line is a JSON string holding the repr of the record dict.
        # NOTE(review): kept as-is on purpose; switching to real JSON objects
        # may change how the loader detects and parses these files - confirm
        # before changing.
        record = str({"date": row["Date"], "comment": comment})
        f.write(json.dumps(record) + "\n")

  # load entire documents
  loader = DirectoryLoader("data", glob="**/*.jsonl")
  documents = loader.load()

  # split the documents into chunks
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=2000,
      chunk_overlap=0,
      length_function=len,
  )

  return text_splitter.split_documents(documents)

''' ALL ACTIONS '''
def summarization(texts):
  """Summarize the document chunks with a "refine" chain and display the result.

  Args:
    texts: Document chunks handed to the chain's run() call.
  """
  customDisplay("Summarizing...")

  # build the summary chain on top of a default ChatOpenAI model
  llm = ChatOpenAI()
  summarizer = load_summarize_chain(llm=llm, chain_type="refine")

  summary = summarizer.run(texts)
  display(Markdown(f"Summary: <b>{summary}</b>"))

def qaGeneral(texts):
  """Run an interactive question loop backed by a Chroma retriever.

  The chunks are embedded with OpenAIEmbeddings and stored in a Chroma vector
  store; each question is answered by a RetrievalQA chain built on ChatOpenAI.
  The loop ends when the user types exactly "exit".

  Args:
    texts: Document chunks to embed and index.
  """
  customDisplay("Preparing for QA(General)...")

  # embed the chunks and index them in the vector store
  api_key = os.environ["OPENAI_API_KEY"]
  embeddings = OpenAIEmbeddings(openai_api_key=api_key)
  docsearch = Chroma.from_documents(texts, embeddings)

  # build the retrieval QA chain; source documents are requested but only the
  # "result" entry of each response is displayed below
  llm = ChatOpenAI()
  retriever = docsearch.as_retriever()
  qa = RetrievalQA.from_llm(llm=llm, retriever=retriever, return_source_documents=True)

  customDisplay("Type 'exit' to quit.\n")

  # keep prompting until the sentinel "exit" is entered
  for query in iter(lambda: input("Ques: "), "exit"):
    response = qa({"query": query})
    answer = response["result"]
    display(Markdown(f"Ans: <b>{answer}</b>"))

def qaPrecise(texts):
  """Run an interactive question loop using a "map_rerank" QA chain.

  Every question is sent to the chain together with all of `texts` as
  input documents. The loop ends when the user types exactly "exit".

  Args:
    texts: Document chunks passed as "input_documents" on every question.
  """
  customDisplay("Preparing for QA(Precise)...")

  # build the QA chain on top of a default ChatOpenAI model
  llm = ChatOpenAI()
  qa_chain = load_qa_chain(llm=llm, chain_type="map_rerank")

  customDisplay("Type 'exit' to quit.\n")

  # keep prompting until the sentinel "exit" is entered
  for query in iter(lambda: input("Ques: "), "exit"):
    payload = {"input_documents": texts, "question": query}
    response = qa_chain(payload, return_only_outputs=True)
    answer = response["output_text"]
    display(Markdown(f"Ans: <b>{answer}</b>"))

def classification():
  """Label every comment of every CSV as technical or non-technical.

  Each "*.csv" file in the working directory is read, the "Comment" value of
  every row is sent to a ChatOpenAI chain (temperature 0), and the answers are
  stored in a new "Classification" column. Rows whose comment is not a string
  get an empty classification without calling the model. The result is written
  to classifications/<original file name>.

  A first line containing "sep=<delimiter>" is treated as a separator hint:
  that line is skipped and the delimiter is used when reading the file.
  """
  # Local import keeps this block self-contained; shutil replaces the
  # IPython-only "!rm -rf" so the function also runs outside a notebook and on
  # systems without "rm".
  import shutil

  customDisplay("Classifying comments between Technical and Non-Technical one by one...")

  human_message_prompt = HumanMessagePromptTemplate(
      prompt=PromptTemplate(
          input_variables=["comment"],
          template='''
Classify this comment into 'technical' and 'non_technical'. Ans should be either 'technical' and 'non_technical'. Don't try to describe it:

Example:
Makes it hard to find info
technical

{comment}
          ''',
      )
  )

  chat = ChatOpenAI(temperature=0)
  chat_prompt_template = ChatPromptTemplate.from_messages([human_message_prompt])

  # initialize the classification model
  chain = LLMChain(llm=chat, prompt=chat_prompt_template, verbose=False)

  # Always start from an empty output folder (a missing folder is fine).
  shutil.rmtree("classifications", ignore_errors=True)
  os.makedirs("classifications")

  # classification loop
  for csv_name in glob.glob("*.csv"):
    sep = ","
    skiprows = []

    with open(csv_name, "r", encoding="utf-8") as f:
      first_line = f.readline()

    _, marker, hinted_sep = first_line.partition("sep=")
    if marker:
      skiprows.append(0)
      # Strip only the line terminator: blindly dropping the last character
      # would eat the separator itself when the line has no trailing newline,
      # and a plain strip() would erase a tab separator.
      sep = hinted_sep.rstrip("\r\n") or ","

    data = pd.read_csv(csv_name, sep=sep, skiprows=skiprows)

    # One model call per string comment; tqdm wraps the row iterator so the
    # progress bar advances once per row without manual bookkeeping.
    data["Classification"] = [
      chain.run(row["Comment"]) if isinstance(row["Comment"], str) else ""
      for _, row in tqdm(data.iterrows(), total=len(data))
    ]
    data.to_csv(f"classifications/{csv_name}", sep=",")

  customDisplay("Done, Open 'classifications' folder to see the results.")

def askAI():
  """Interactive entry point: prepare the documents, then serve a menu loop.

  Returns immediately with a red message when the working directory holds no
  "*.csv" file. Otherwise the documents are prepared once and the user is
  asked repeatedly which action to run until 0 (quit) is chosen.
  """
  if not glob.glob("*.csv"):
    customDisplay("!!! No CSV file found !!!", "red")
    return

  texts = prepareData()

  menu = '''
      0. Quit
      1. Summarization
      2. Question-Answering (General)
      3. Question-Answering (Precise)
      4. Classification (Technical/Non-Technical)

      Which action would you like to perform? (Enter between 0-4)

      '''

  while True:
    # getting action type; non-numeric input must not crash the menu loop
    try:
      at = int(input(menu))
    except ValueError:
      customDisplay("!!! Please enter a number between 0 and 4 !!!", "red")
      continue

    if not 0 <= at <= 4:
      customDisplay("!!! Please enter a number between 0 and 4 !!!", "red")
      continue

    if at == 0:
      customDisplay("Quitting...")
      break

    try:
      if at == 1:
        summarization(texts)
      elif at == 2:
        qaGeneral(texts)
      elif at == 3:
        qaPrecise(texts)
      else:
        classification()
    except Exception as err:
      # Catch Exception rather than everything so Ctrl-C / kernel interrupt
      # still stops the cell, and show the cause instead of hiding it.
      customDisplay("!!! Something has gone wrong, try again... !!!", "red")
      customDisplay(f"{type(err).__name__}: {err}", "red")

# Finally, run the script: start the interactive menu.
askAI()


<font color='blue'><b>Preparing documents...</b></font>


      0. Quit
      1. Summarization
      2. Question-Answering (General)
      3. Question-Answering (Precise)
      4. Classification (Technical/Non-Technical)

      Which action would you like to perform? (Enter between 0-4)

      1


<font color='blue'><b>Summarizing...</b></font>

<font color='red'><b>!!! Something has gone wrong, try again... !!!</b></font>