Importing Necessary Libraries

In [2]:
import random
import re
from transformers import pipeline


Defining Models

In [3]:
# Zero-shot classifier: used by the intent router to score a message against
# the candidate intent labels.
model = pipeline(
    "zero-shot-classification",
    model="typeform/distilbert-base-uncased-mnli",
)

# Named-entity tagger. With aggregation enabled each result is a grouped span
# (the code below reads its "entity_group" and "word" fields).
ner = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
)

Device set to use cpu
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Common patterns to identify intents

In [9]:
# Candidate labels handed to the zero-shot classifier.
label =["social conversation","factual information lookup"]

# Keyword shortcuts that decide the intent before the classifier is consulted.
chat_content = ["hi","hello","thanks","thank you"]
search_content = ["who", "what", "when", "where", "how does","tell me about"]

# Social act -> trigger phrases. Dict order matters: the first act whose
# phrases match wins.
SOCIAL_ACT_PATTERNS = {
  "GREETING": ["hi", "hello", "hey"],
  "GRATITUDE": ["thank", "thanks"],
  "PERMISSION_REQUEST": ["can i ask", "may i ask"],
  "WELLBEING_QUERY": ["how are you", "how do you feel"],
}

# Social act -> canned replies (one is picked at random per turn).
# The emoji and em dash were previously stored as mis-decoded UTF-8 bytes
# and are written here as the intended characters.
SOCIAL_ACT_RESPONSES = {
  "GREETING": [
      "Hey 🙂 What can I help you with?",
      "Hi there! How can I assist?"
  ],
  "GRATITUDE": [
      "You're welcome!",
      "Glad I could help 🙂"
  ],
  "PERMISSION_REQUEST": [
      "Of course. Go ahead.",
      "Sure, what do you want to ask?"
  ],
  "WELLBEING_QUERY": [
      "I'm doing well — how can I help you today?",
      "I'm just here to help you 🙂"
  ]
}

# Verbs that mark a message as an information request regardless of the
# classifier's opinion.
INFO_VERBS = ["explain", "tell", "describe", "define"]

In [30]:
def tokenize(text):
    """Return the lower-cased word tokens of ``text`` in order of appearance.

    A token is a maximal run of word characters; punctuation and whitespace
    act as separators and are dropped.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\b\w+\b", lowered)]

def looks_like_info_request(text, verbs=None):
    """Return True when ``text`` contains a word that starts with an info verb.

    Matching is anchored at the start of a word, so inflected forms such as
    "explains" or "telling" still count, while a verb buried inside an
    unrelated word ("tell" in "intelligence", "define" in "undefined") no
    longer triggers a false positive.

    Args:
        text: Message to inspect. The comparison is case-sensitive, so pass
            lower-cased text when the verbs are lower-case.
        verbs: Optional iterable of verb stems to look for. Defaults to the
            module-level ``INFO_VERBS``.

    Returns:
        bool: True if any verb stem begins a word in ``text``.
    """
    if verbs is None:
        verbs = INFO_VERBS
    # re.escape keeps a stem containing regex metacharacters literal.
    return any(re.search(r"\b" + re.escape(v), text) for v in verbs)

def route_intent(text):
  """Decide whether ``text`` is social chit-chat or a factual lookup.

  Keyword rules run first and match whole words/phrases only, so "hi" no
  longer fires on "this" or "which", and "who" no longer fires on "whole".
  The zero-shot classifier is consulted only when no rule matches, which
  also avoids a model call for messages the rules already decide.

  Args:
    text: The user message. The comparison is case-sensitive, so pass
      lower-cased text (the keyword lists are lower-case).

  Returns:
    str: "social conversation", "factual information lookup", or, when the
    classifier decides, the label it ranked first.
  """
  def mentions_any(phrases):
    # \b on both sides restricts the match to whole words / whole phrases.
    return any(re.search(r"\b" + re.escape(p) + r"\b", text) for p in phrases)

  # Chat keywords take precedence over search keywords, as before.
  if mentions_any(chat_content):
    return "social conversation"
  if mentions_any(search_content):
    return "factual information lookup"

  # No rule fired: fall back to the classifier. Index 0 of "labels"/"scores"
  # is treated as the top-ranked candidate.
  result = model(text, label)
  top_label, top_score = result["labels"][0], result["scores"][0]

  # A weak winner is not trusted; default to a factual lookup instead.
  min_confidence = .55
  if top_score <= min_confidence:
    return "factual information lookup"
  return top_label

def handle_social_conversation(text, patterns=None):
  """Return the first social act whose trigger phrase occurs in ``text``.

  Phrases match whole words only. With plain substring matching, a message
  like "thanks, this was great" was classified as GREETING because "hi"
  occurs inside "this"; word boundaries remove that class of false hit.

  Args:
    text: The user message. The comparison is case-sensitive, so pass
      lower-cased text when the phrases are lower-case.
    patterns: Optional mapping of act name -> list of trigger phrases.
      Defaults to the module-level ``SOCIAL_ACT_PATTERNS``. Acts are tried
      in the mapping's iteration order.

  Returns:
    The matching act name, or None when no phrase matches.
  """
  if patterns is None:
    patterns = SOCIAL_ACT_PATTERNS
  for act, phrases in patterns.items():
    # re.escape keeps multi-word phrases and any metacharacters literal.
    if any(re.search(r"\b" + re.escape(p) + r"\b", text) for p in phrases):
      return act
  return None

def handle_factual_information_lookup(text):
  """Announce a lookup on stdout and return a placeholder result string.

  ``text`` is accepted but not used yet; no search is performed here.
  """
  placeholder = "(Search results will appear here)"
  print("Let me look that up for you.")
  return placeholder

def extract_entities(text):
  """Group the entities the ``ner`` pipeline finds in ``text`` by type.

  Each pipeline result is expected to expose "entity_group" and "word".

  Returns:
    dict mapping entity type -> list of surface strings, in the order the
    pipeline reported them. Empty dict when nothing was found.
  """
  grouped = {}
  for span in ner(text):
    kind, surface = span["entity_group"], span["word"]
    if kind not in grouped:
      grouped[kind] = []
    grouped[kind].append(surface)
  return grouped

def debug_decision(raw_text, intent, entities, response=None):
    """Bundle one turn's input, routing decision, reply and entities.

    Returns a dict with the keys "input", "intent", "response" and
    "entities" (in that order) for printing or inspection.
    """
    snapshot = {"input": raw_text, "intent": intent}
    snapshot["response"] = response
    snapshot["entities"] = entities
    return snapshot
def clean_text(text):
  """Strip bracketed markers, normalise line endings, and trim the result.

  Steps, in order:
    1. remove every ``[...]`` span (shortest match, never across a "\\n");
    2. convert "\\r\\n" and lone "\\r" to "\\n";
    3. strip leading/trailing whitespace.
  """
  without_brackets = re.sub(r'\[.*?\]', '', text)
  unix_newlines = without_brackets.replace("\r\n","\n").replace("\r","\n")
  return unix_newlines.strip()

def chunk_document(text):
  """Split ``text`` into paragraph chunks and attach per-chunk metadata.

  Paragraphs are separated by two or more consecutive newlines. Pieces
  shorter than 50 characters after stripping are dropped.

  Returns:
    list of dicts with keys "chunk_id", "chunk", "chunk_length",
    "num_sentences" (count of runs of ., ! or ?), "position" (index of the
    piece among all split pieces, including dropped ones) and "entities"
    (result of ``extract_entities`` on the chunk).
  """
  paragraphs = re.split(r'\n{2,}',text)
  records = []

  for position, paragraph in enumerate(paragraphs):
    body = paragraph.strip()

    # Keep only pieces long enough to be meaningful.
    if len(body) >= 50:
      records.append({
        "chunk_id":f"Document_overview_{position}",
        "chunk":body,
        "chunk_length":len(body),
        "num_sentences":len(re.findall(r'[.!?]+', body)),
        "position":position,
        "entities":extract_entities(body),
      })

  return records

def normalize_query(query):
  """Lower-case ``query`` and trim surrounding whitespace."""
  lowered = query.lower()
  return lowered.strip()

def extract_keywords(query):
  """Return the set of tokens in ``query`` that are at least 3 characters long."""
  keywords = set()
  for token in tokenize(query):
    if len(token) >= 3:
      keywords.add(token)
  return keywords

def score_chunk(chunk_dict,chunk_text,query_keywords,query_entities):
  """Score how well a chunk matches a query.

  The score is the number of query keywords present among the chunk's
  tokens, plus 2 for every entity string the query and the chunk share
  under the same entity type.

  Args:
    chunk_dict: Chunk record; its optional "entities" entry maps entity
      type -> list of strings.
    chunk_text: The chunk's text.
    query_keywords: Iterable of keyword tokens from the query.
    query_entities: Mapping of entity type -> list of strings from the query.

  Returns:
    int: ``keyword_hits + 2 * entity_hits``.
  """
  # A set makes each keyword lookup constant-time; the original scanned the
  # whole token list once per keyword.
  chunk_tokens = set(tokenize(chunk_text.lower()))
  keyword_score = sum(1 for k in query_keywords if k in chunk_tokens)

  chunk_entities = chunk_dict.get("entities",{})
  # Entity strings are compared exactly (case-sensitive) within each type.
  entity_score = sum(
    len(set(ents) & set(chunk_entities.get(ent_typ,[])))
    for ent_typ, ents in query_entities.items()
  )

  return keyword_score + (2*entity_score)

def retrieve_best_chunk(chunks,query,top_k=1):
  """Return up to ``top_k`` chunks that best match ``query``.

  Keywords come from the normalised query; entities are extracted from the
  query as typed. Chunks are ranked by ``score_chunk`` in descending order
  (ties keep their input order) and chunks scoring 0 are never returned.

  Returns:
    list of chunk dicts, possibly empty.
  """
  keywords = extract_keywords(normalize_query(query))
  entities = extract_entities(query)

  ranked = sorted(
    [(score_chunk(c, c['chunk'], keywords, entities), c) for c in chunks],
    key=lambda pair: pair[0],
    reverse=True,
  )

  matches = [c for points, c in ranked if points > 0]
  return matches[:top_k]
  

In [None]:
# Small demo document for testing the chunking + retrieval pipeline.
# It holds three paragraphs separated by blank lines; chunk_document splits
# on those blank lines.
RAW_DOC = """
Artificial Intelligence is a field of computer science focused on building systems that can perform tasks requiring human-like intelligence. 
These tasks include reasoning, learning, perception, and language understanding.

Machine Learning is a subfield of AI that enables systems to learn patterns from data instead of being explicitly programmed. 
It is widely used in recommendation systems, fraud detection, and speech recognition.

Natural Language Processing allows machines to understand and generate human language. 
Applications include chatbots, search engines, and automated customer support.
"""

# Clean first (drop bracketed markers, normalise newlines, trim), then chunk.
cleaned_text = clean_text(RAW_DOC)
chunks=chunk_document(cleaned_text)
# Summary record for the demo document; it is built here but not read again
# in the cells shown.
document = {
    "doc_id": "AI_ML_NLP_overview",
    "source": "wikipedia",
    "raw_text": RAW_DOC,
    "clean_text": cleaned_text,
    "chunks": chunks,
    "No. of chunks":len(chunks)

    }
# Retrieval smoke test: top_k defaults to 1, so at most one chunk is printed.
query = "Explain the concept of Machine Learning."
best_chunk = retrieve_best_chunk(chunks,query)
print(best_chunk)

<class 'dict'>
[{'chunk_id': 'Document_overview_1', 'chunk': 'Machine Learning is a subfield of AI that enables systems to learn patterns from data instead of being explicitly programmed. \nIt is widely used in recommendation systems, fraud detection, and speech recognition.', 'chunk_length': 212, 'num_sentences': 2, 'position': 1, 'entities': {'MISC': ['Machine Learning', 'AI']}}]


In [None]:


# Interactive loop: read a message, route it, reply, and show a debug record.
while True:
    raw_text = input("what do you have in mind today? ")
    normalized_text = raw_text.lower()

    # An explicit info verb settles the intent without consulting the router.
    intent = (
        "factual information lookup"
        if looks_like_info_request(normalized_text)
        else route_intent(normalized_text)
    )
    print(intent)

    if intent != "social conversation":
        response = handle_factual_information_lookup(raw_text)
    else:
        social_act = handle_social_conversation(normalized_text)
        # Fall back to a greeting when no specific social act was recognised.
        reply_pool = SOCIAL_ACT_RESPONSES[social_act if social_act else "GREETING"]
        response = random.choice(reply_pool)

    # Entities come from the text as typed, not the lower-cased copy.
    entities = extract_entities(raw_text)
    debug = debug_decision(raw_text, intent, entities, response)
    print(debug)

    # A second prompt after every turn decides whether to stop.
    print("press 1 to exit")
    if input() == "1":
        break

