In [2]:
# --- Imports ---
import pandas as pd
import re
import torch
import difflib
from PyPDF2 import PdfReader
from collections import defaultdict
from nltk.tokenize import word_tokenize
import nltk; nltk.download("punkt")

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer, util

# --- Load Data ---
df = pd.read_excel("C:/Users/visma/Downloads/ITCHS_2012.xls", dtype=str)

# --- Term Mapping ---
term_mapping = defaultdict(list)
corpus = []

for _, row in df.iterrows():
    hs = str(row["ITC(HS)"]).strip()
    desc = str(row["Description"]).strip()
    chapter = int(hs[:2])
    policy = row.get("Policy", "Unknown")

    entry = {
        "hs_code": hs,
        "desc": desc,
        "chapter": chapter,
        "policy": policy,
        "duty": "Refer to schedule",
        "note": "Auto-mapped"
    }

    # full phrase
    term_mapping[desc.lower()].append(entry)
    # tokenized words
    for word in word_tokenize(desc.lower()):
        if word.isalpha() and len(word) > 2:
            term_mapping[word].append(entry)

    corpus.append(desc)

# --- Chapter Dictionary ---
chapter_dict = {
    "chapter 1": "Live animals",
    "chapter 2": "Meat and edible meat offal",
    "chapter 3": "Fish and crustaceans, molluscs and other aquatic invertebrates",
    "chapter 4": "Dairy produce; birds' eggs; natural honey; edible products of animal origin",
    "chapter 5": "Products of animal origin, not elsewhere specified or included",
    "chapter 6": "Live trees and other plants; bulbs, roots and the like; cut flowers and ornamental foliage",
    "chapter 7": "Edible vegetables and certain roots and tubers",
    "chapter 8": "Edible fruit and nuts; peel of citrus fruit or melons",
    "chapter 9": "Coffee, tea, mate and spices",
    "chapter 10": "Cereals",
    "chapter 11": "Products of the milling industry; malt; starches; inulin; wheat gluten",
    "chapter 12": "Oil seeds and oleaginous fruits; miscellaneous grains, seeds and fruit",
    "chapter 13": "Lac; gums, resins and other vegetable saps and extracts",
    "chapter 14": "Vegetable plaiting materials; vegetable products not elsewhere specified or included",
    "chapter 15": "Animal or vegetable fats and oils and their cleavage products",
    "chapter 16": "Preparations of meat, of fish or of crustaceans, molluscs or other aquatic invertebrates",
    "chapter 17": "Sugars and sugar confectionery",
    "chapter 18": "Cocoa and cocoa preparations",
    "chapter 19": "Preparations of cereals, flour, starch or milk",
    "chapter 20": "Preparations of vegetables, fruit, nuts or other parts of plants",
    "chapter 21": "Miscellaneous edible preparations",
    "chapter 22": "Beverages, spirits and vinegar",
    "chapter 23": "Residues and waste from the food industries; prepared animal fodder",
    "chapter 24": "Tobacco and manufactured tobacco substitutes",
    "chapter 25": "Salt; sulphur; earths and stone; plastering materials, lime and cement",
    "chapter 26": "Ores, slag and ash",
    "chapter 27": "Mineral fuels, mineral oils and products of their distillation",
    "chapter 28": "Inorganic chemicals; organic or inorganic compounds of precious metals",
    "chapter 29": "Organic chemicals",
    "chapter 30": "Pharmaceutical products",
    "chapter 31": "Fertilizers",
    "chapter 32": "Tanning or dyeing extracts; tannins and their derivatives",
    "chapter 33": "Essential oils and resinoids; perfumery, cosmetic or toilet preparations",
    "chapter 34": "Soap, organic surface-active agents, washing preparations",
    "chapter 35": "Albuminoidal substances; modified starches; glues; enzymes",
    "chapter 36": "Explosives; pyrotechnic products; matches; pyrophoric alloys",
    "chapter 37": "Photographic or cinematographic goods",
    "chapter 38": "Miscellaneous chemical products",
    "chapter 39": "Plastics and articles thereof",
    "chapter 40": "Rubber and articles thereof",
    "chapter 41": "Raw hides and skins (other than furskins) and leather",
    "chapter 42": "Articles of leather; saddlery and harness",
    "chapter 43": "Furskins and artificial fur; manufactures thereof",
    "chapter 44": "Wood and articles of wood",
    "chapter 45": "Cork and articles of cork",
    "chapter 46": "Manufactures of straw, of esparto or of other plaiting materials",
    "chapter 47": "Pulp of wood or of other fibrous cellulosic material",
    "chapter 48": "Paper and paperboard; articles of paper pulp",
    "chapter 49": "Printed books, newspapers, pictures and other products",
    "chapter 50": "Silk",
    "chapter 51": "Wool, fine or coarse animal hair",
    "chapter 52": "Cotton",
    "chapter 53": "Other vegetable textile fibres; paper yarn and woven fabrics",
    "chapter 54": "Man-made filaments",
    "chapter 55": "Man-made staple fibres",
    "chapter 56": "Wadding, felt and nonwovens",
    "chapter 57": "Carpets and other textile floor coverings",
    "chapter 58": "Special woven fabrics; tufted textile fabrics",
    "chapter 59": "Impregnated, coated, covered or laminated textile fabrics",
    "chapter 60": "Knitted or crocheted fabrics",
    "chapter 61": "Articles of apparel and clothing accessories, knitted or crocheted",
    "chapter 62": "Articles of apparel and clothing accessories, not knitted or crocheted",
    "chapter 63": "Other made up textile articles",
    "chapter 64": "Footwear, gaiters and the like",
    "chapter 65": "Headgear and parts thereof",
    "chapter 66": "Umbrellas, sun umbrellas, walking sticks",
    "chapter 67": "Prepared feathers and down",
    "chapter 68": "Articles of stone, plaster, cement, asbestos, mica",
    "chapter 69": "Ceramic products",
    "chapter 70": "Glass and glassware",
    "chapter 71": "Natural or cultured pearls, precious or semi-precious stones",
    "chapter 72": "Iron and steel",
    "chapter 73": "Articles of iron or steel",
    "chapter 74": "Copper and articles thereof",
    "chapter 75": "Nickel and articles thereof",
    "chapter 76": "Aluminium and articles thereof",
    "chapter 77": "Reserved for future use",
    "chapter 78": "Lead and articles thereof",
    "chapter 79": "Zinc and articles thereof",
    "chapter 80": "Tin and articles thereof",
    "chapter 81": "Other base metals; cermets",
    "chapter 82": "Tools, implements, cutlery, spoons and forks",
    "chapter 83": "Miscellaneous articles of base metal",
    "chapter 84": "Nuclear reactors, boilers, machinery and mechanical appliances",
    "chapter 85": "Electrical machinery and equipment",
    "chapter 86": "Railway or tramway locomotives, rolling stock and parts",
    "chapter 87": "Vehicles other than railway or tramway",
    "chapter 88": "Aircraft, spacecraft, and parts thereof",
    "chapter 89": "Ships, boats and floating structures",
    "chapter 90": "Optical, photographic, cinematographic, measuring, medical instruments",
    "chapter 91": "Clocks and watches and parts thereof",
    "chapter 92": "Musical instruments",
    "chapter 93": "Arms and ammunition; parts and accessories thereof",
    "chapter 94": "Furniture; bedding, mattresses, cushions",
    "chapter 95": "Toys, games and sports requisites",
    "chapter 96": "Miscellaneous manufactured articles",
    "chapter 97": "Works of art, collectors’ pieces and antiques",
    "chapter 98": "Project imports, special classification provisions"
}

# --- Sentence Embeddings ---
model = SentenceTransformer("all-MiniLM-L6-v2")
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

# --- FAISS Fallback ---
def extract_text_from_pdf(path):
    reader = PdfReader(path)
    return "\n".join([p.extract_text() for p in reader.pages if p.extract_text()])

def build_faiss_index(text):
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_text(text)
    docs = [Document(page_content=c) for c in chunks]
    embeddings = HuggingFaceEmbeddings()
    return FAISS.from_documents(docs, embeddings)

# --- Semantic Handler ---
def semantic_search(query):
    query_embed = model.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embed, corpus_embeddings, top_k=3)[0]
    results = []

    for h in hits:
        data = term_mapping[corpus[h["corpus_id"]].lower()][0]
        chapter_name = chapter_dict.get(f"chapter {data['chapter']}", "Unknown")
        results.append(
            f"{data['desc']} → CHAPTER {data['chapter']} – {chapter_name}\n"
            f"HS Code: {data['hs_code']} | Policy: {data['policy']} | Duty: {data['duty']}"
        )

    return "\n".join(results)

# --- Synonym Dictionary (expandable) ---
synonyms = {
    "gold": ["precious metal", "bullion"],
    "missile": ["rocket", "projectile"],
    "drone": ["uav", "quadcopter", "suicide drone"],
}

# --- Hybrid Handler ---
def handle_query(query, faiss_index):
    q = query.lower()
    q_tokens = set(word_tokenize(q))
    
    # 1. Synonym Expansion
    for token in list(q_tokens):
        if token in synonyms:
            q_tokens.update(synonyms[token])

    # 2. Multi-keyword combination match 
combined_matches = []
for token in q_tokens:
    if token in term_mapping:
        combined_matches.extend(term_mapping[token])

# Filter for items that match multiple tokens
if len(q_tokens) > 1:
    filtered = []
    for entry in combined_matches:
        matched_words = [word for word in q_tokens if word in entry["desc"].lower()]
        if len(matched_words) >= 2:  # Match at least 2 keywords
            filtered.append(entry)

    if filtered:
        seen = set()
        results = []
        for item in sorted(filtered, key=lambda x: (x["chapter"], x["hs_code"])):
            if item["hs_code"] in seen:
                continue
            seen.add(item["hs_code"])
            chapter_name = chapter_dict.get(f"chapter {item['chapter']}", "Unknown")
            results.append(
                f"{item['desc']} → CHAPTER {item['chapter']} – {chapter_name}\n"
                f"HS Code: {item['hs_code']} | Policy: {item['policy']} | Duty: {item['duty']}"
            )
        return "\n".join(results[:3]) + "\n📘 Refer to relevant chapters."


    # 3. Fuzzy Match
    all_terms = list(term_mapping.keys())
    fuzz = difflib.get_close_matches(q, all_terms, n=1, cutoff=0.7)
    if fuzz:
        result = term_mapping[fuzz[0]][0]
        chapter_name = chapter_dict.get(f"chapter {result['chapter']}", "Unknown")
        return (
            f"{fuzz[0]} → CHAPTER {result['chapter']} – {chapter_name}\n"
            f"HS Code: {result['hs_code']} | Policy: {result['policy']} | Duty: {result['duty']}"
        )

    # 4. Semantic Match
    return "🔍 " + semantic_search(query)

# --- Local Run ---
if __name__ == "__main__":
    print("Extracting legal text...")
    full_text = extract_text_from_pdf("C:/Users/visma/OneDrive/Desktop/project cahtbot/a197551.pdf" )
    print("Building FAISS index...")
    faiss_index = build_faiss_index(full_text)

    print("🚀 Customs Hybrid Chatbot Ready. Type 'exit' to stop.")
    while True:
        q = input("🗨️ You: ")
        if q.lower() == "exit":
            break
        answer = handle_query(q, faiss_index)
        print(f"🤖 Bot: {answer}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\visma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Extracting legal text...
Building FAISS index...


  embeddings = HuggingFaceEmbeddings()
  embeddings = HuggingFaceEmbeddings()


🚀 Customs Hybrid Chatbot Ready. Type 'exit' to stop.


🗨️ You:  cotton and silk


🤖 Bot: Mules and hinnies as livestock → CHAPTER 1 – Live animals
HS Code: 01019030 | Policy: Restricted           | Duty: Refer to schedule
Whales, dolphins and porpoises (mammals of the order Cetacea); manatees and dugongs (mammals of the order Sirenia); seals, sea lions and walruses (mammals of the suborder Pinnipedia) → CHAPTER 1 – Live animals
HS Code: 01061200 | Policy: Restricted           | Duty: Refer to schedule
Camels and other camelids (Camelidae) → CHAPTER 1 – Live animals
HS Code: 01061300 | Policy: Restricted           | Duty: Refer to schedule
📘 Refer to relevant chapters.


🗨️ You:  exit
