In [None]:
from bs4 import BeautifulSoup
import json
from data import html_code_list  # Import the array from data.py

def clean_text(text):
    """Return *text* with byte-order marks removed and outer whitespace trimmed.

    Every U+FEFF character is dropped wherever it occurs in the string; the
    result is then stripped of leading and trailing whitespace.
    """
    without_bom = "".join(text.split('\ufeff'))
    return without_bom.strip()

def _section_texts(soup, section_id, tag_names, unique=False):
    """Return cleaned, non-empty texts of *tag_names* inside ``<div id=section_id>``.

    Returns an empty list when the section div is missing. With
    ``unique=True`` repeated texts are dropped, keeping the first occurrence.
    """
    section = soup.find("div", id=section_id)
    if section is None:
        return []

    texts = []
    seen = set()
    for tag in section.find_all(tag_names):
        text = clean_text(tag.get_text())
        if not text or (unique and text in seen):
            continue
        seen.add(text)
        texts.append(text)
    return texts


def _extract_application_process(soup):
    """Map each application-mode label to the list of its step texts."""
    section = soup.find("div", id="application-process")
    if section is None:
        return {}

    steps_by_label = {}
    for tab in section.find_all("div", class_="rounded"):
        # The label is read from the closest preceding "overflow-x-auto" div.
        label_holder = tab.find_previous("div", class_="overflow-x-auto")
        label = clean_text(label_holder.get_text()) if label_holder else "Unknown"

        markdown = tab.find("div", class_="markdown-options")
        if markdown is None:
            continue

        steps = []
        for div in markdown.find_all("div", class_="mb-2"):
            text = clean_text(div.get_text(separator=" "))
            if text:
                steps.append(text)

        # Tabs without a label or without any step text are skipped.
        if label and steps:
            steps_by_label[label] = steps
    return steps_by_label


def _extract_faqs(soup):
    """Map each FAQ question to its answer text; incomplete pairs are skipped."""
    section = soup.find("div", id="faqs")
    if section is None:
        return {}

    faqs = {}
    for item in section.find_all("div", class_="py-4"):
        question_tag = item.find("p", class_="font-bold")
        answer_tag = item.find("div", class_="rounded-b")
        question = clean_text(question_tag.get_text()) if question_tag else ""
        answer = clean_text(answer_tag.get_text(separator=" ")) if answer_tag else ""
        if question and answer:
            faqs[question] = answer
    return faqs


def _extract_sources(soup):
    """Return ``{"text", "url"}`` dicts for every link in the sources section."""
    section = soup.find("div", id="sources")
    if section is None:
        return []

    sources = []
    for link in section.find_all("a", href=True):
        # Prefer the text of a nested <p>; fall back to the anchor's own text.
        text_tag = link.find("p")
        link_text = clean_text((text_tag if text_tag else link).get_text())
        url = link['href']
        if link_text and url:
            sources.append({"text": link_text, "url": url})
    return sources


def extract_info(html):
    """Extract the fields of one scheme page into a dictionary.

    Args:
        html: Markup of a single scheme page, parsed with ``html.parser``.

    Returns:
        A dict with the keys, in this order: ``Tags``, ``place``,
        ``Scheme Name``, ``Details``, ``Benefits``, ``Eligibility``,
        ``Application Process``, ``Documents Required``, ``Exclusions``,
        ``FAQs`` and ``Sources and References``. A section that is absent
        from the page yields an empty value (``""``, ``[]`` or ``{}``)
        instead of raising.
    """
    soup = BeautifulSoup(html, 'html.parser')
    data = {}

    # Tags: the container may be missing, in which case find() returns None;
    # treat that as "no tags" like every other optional section.
    container = soup.find('div', class_='mb-2 md:mb-0 w-full')
    if container is None:
        data['Tags'] = []
    else:
        data['Tags'] = [
            tag.get_text(strip=True)
            for tag in container.find_all('div', title=True)
        ]

    # Place
    place_tag = soup.find("h3", class_="text-raven dark:text-indigo-100 text-base mt-2 md:mt-0 hover:underline cursor-pointer")
    data['place'] = clean_text(place_tag.get_text()) if place_tag else ""

    # Scheme name
    title_tag = soup.find("h1", class_="font-bold text-xl sm:text-2xl text-[#24262B] dark:text-white mt-1")
    data['Scheme Name'] = clean_text(title_tag.get_text()) if title_tag else ""

    # Details / Benefits / Eligibility share one shape: texts of <li> and
    # <span> elements, de-duplicated (presumably because nested elements
    # repeat the same text).
    data['Details'] = _section_texts(soup, "details", ['li', 'span'], unique=True)
    data['Benefits'] = _section_texts(soup, "benefits", ['li', 'span'], unique=True)
    data['Eligibility'] = _section_texts(soup, "eligibility", ['li', 'span'], unique=True)

    data['Application Process'] = _extract_application_process(soup)

    # These two keep repeated entries, exactly as they appear on the page.
    data['Documents Required'] = _section_texts(soup, "documents-required", "li")
    data['Exclusions'] = _section_texts(soup, "exclusions", "blockquote")

    data['FAQs'] = _extract_faqs(soup)
    data['Sources and References'] = _extract_sources(soup)

    return data


# ==== Main Execution Starts Here ====

# Parse every HTML document, keeping the results in input order.
all_data = list(map(extract_info, html_code_list))

# Persist all schemes as one pretty-printed UTF-8 JSON array.
with open("extracted_schemes.json", "w", encoding='utf-8') as out_file:
    json.dump(all_data, out_file, indent=2, ensure_ascii=False)

print("✅ Extraction complete. Data saved to extracted_schemes.json.")



SyntaxError: unterminated triple-quoted string literal (detected at line 868) (data.py, line 822)

In [None]:
# Load the schemes produced by the extraction step.
# NOTE(review): the absolute /content/ prefix presumably reflects a Colab
# working directory — adjust the path when running elsewhere.
with open('/content/extracted_schemes.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Collect every distinct tag in first-seen order. dict.fromkeys de-duplicates
# in a single pass instead of rescanning a growing list for each tag.
tagsList = list(dict.fromkeys(
    tag for scheme in data for tag in scheme['Tags']
))
print(tagsList)

# Save the tag vocabulary for later inspection / mapping work.
with open("tags.json", "w", encoding='utf-8') as f:
    json.dump(tagsList, f, indent=2, ensure_ascii=False)



['Accidental Case', 'Diseases', 'Financial Assistance', 'Hospital', 'Treatment', 'Medical Treatment', 'Serious Illness', 'BPL', 'Differently Abled', 'Distress', 'Handicap', 'PwD', 'Basic Services', 'Drainage', 'Green Space', 'Non-Motorized', 'Parks', 'Septage Management', 'Sewerage', 'Storm Water', 'Urban Transport', 'Health', 'Sanitation', 'Toilet', 'Child', 'Financial Asisstance', 'Infant', 'Pregnant', 'Scheduled Caste', 'Woman', 'Mentally Disabled', 'Money Assistance', 'Physically Disabled', 'Death Benefits', 'Disability Benefits', 'Education', 'Protection', 'Reintegration', 'Sex Worker', 'Shelter', 'Skill', 'Vocational Education', 'Heath', 'Maternity Assistance', 'Arogya', 'Nidhi', 'Patient', 'BPL Category', 'Girl Marriage', 'Group Marriages', 'Artist', 'Disability', 'Legal Assistance', 'Medical Assistance', 'Non Resident Indian', 'Overseas Indian', 'Student', 'Welfare', 'Checkup', 'Labour', 'Medical', 'Organized Worker', 'Shramyogi', 'Building Worker', 'Construction Worker', 'Mate

In [None]:
!pip install rapidfuzz


Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [None]:
import pandas as pd
import re
from rapidfuzz import process, fuzz

# Category name -> list of raw tag spellings (synonyms, plurals and a few
# misspelled variants) that should be folded into that category.
# NOTE(review): several spellings are listed under more than one category,
# e.g. "infant", "newborn" and "baby" under both "pregnancy" and "children";
# "student" under "education" and "youth"; "students" under "education" and
# "children"; "free diet" under "health" and "food". The inversion loop that
# builds tag_to_category assigns keys in iteration order, so for those
# spellings the later category overwrites the earlier one — confirm that this
# is the intended outcome.
tag_mapping = {
    "disability": [
        "pwd", "pwds", "handicap", "handicapped", "disabled", "disable", "differently abled",
        "divyang", "divyangjan", "special needs", "persons with disability",
        "persons with disabilities", "physically challenged", "disability", "differently-abled person",
        "differently abled person", "permanent disablement", "permanent disablity", "astha card",
        "mentally disabled", "physically disabled", "hearing impaired", "cochlear", "sruthitharangam",
        "visually challenged person", "mental retardation", "autism", "cerebral palsy",
        "person with disability", "aids appliances", "hearing aid", "spectacles", "assistive devices",
        "mentally retarded", "partial disability", "permanent partial disability"
    ],
    "health": [
        "health", "heath", "healthcare", "health treatment", "medical", "medicine",
        "medical care", "primary medical treatment", "hospital", "doctor", "treatment",
        "covid", "corona", "mental health", "health support", "illness", "disease",
        "medical treatment", "serious illness", "health & wellness", "wellness", "checkup",
        "medical assistance", "medical appliance", "health coverage", "health insurance",
        "mediclaim", "health check-up", "check-up", "arogya", "nidhi", "medical institution",
        "medical expenses", "ailments", "surgery", "hospitalization", "hospitalisation",
        "medical benefit", "medical facility", "medical help", "critical diseases",
        "critical illnesses", "life threatening disorders & diseases", "serious disease treatment",
        "common diseases", "incurable diseases", "free treatment", "free medical treatment",
        "health and wellness", "healthcare", "dental care", "denture", "jaw implants", "teeth",
        "medical support", "operation", "chemotherapy", "sanitation", "toilet", "ct scan",
        "laboratory services", "quality diagnosis", "drug distribution counter", "generic drugs",
        "pharmaceutical", "janaushadhi", "primary healthcare services", "national health mission",
        "patient", "patients", "smart health card", "free medicines", "free medicines distribution",
        "cashless health insurance", "cashless medical treatment", "cashless treatment", "cashless",
        "cashless service", "affordable health insurance", "family health", "health and nutrition",
        "arogya kosh", "free diet", "free services", "immunization", "immunization of children",
        "vitamin", "major ailments", "accident", "accidental injuries", "accidental support",
        "accidental case", "tb", "tuberculosis", "cancer", "cancer treatment", "cancer patient",
        "silicosis diseases", "silicosis disease", "silicosis victim", "aids", "chronic diseases",
        "leprosy", "lepers", "occupational diseases", "physiotherapy", "covid-19", "health workers"
    ],
    "pregnancy": [
        "pregnancy", "pregnant", "pregnant women", "pregnat women", "expecting women",
        "maternity", "mother care", "maternal", "antenatal", "prenatal", "newborn",
        "maternity assistance", "maternal health", "pregnancy care", "child delivery",
        "lactating", "lactating mothers", "lactating women", "birth", "maternity benefit",
        "maternity benefit programme", "delivery", "delivery assistance", "abortion",
        "miscarriage", "infant", "new born", "newborn child", "pregnant mothers",
        "pregnancy care kit", "anganwadi", "anganwadi centers", "baby", "fertility treatment"
    ],
    "workers": [
        "worker", "workers", "labour", "labourer", "laborer", "construction worker",
        "daily wage", "unorganised worker", "organized worker", "migrant worker",
        "skilled worker", "unskilled worker", "gig worker", "delivery boy", "factory worker",
        "building worker", "transport worker", "construction workers", "unorganized worker",
        "craftsmen", "shramyogi", "constrution worker", "labor workers", "labor", "wage loss"
    ],
    "education": [
        "student", "students", "education", "school", "schools", "college", "colleges",
        "scholarship", "studying", "tuition", "fees", "exam", "online class", "literacy",
        "vocational education", "skill", "vocational training", "training", "vocational",
        "workshop", "early education"
    ],
    "senior citizen": [
        "senior citizen", "old age", "elderly", "aged", "pensioner", "retired",
        "retiree", "senior", "old", "60 plus", "senior citizens", "daycare"
    ],
    "women": [
        "women", "woman", "female", "girl", "ladies", "gender", "female empowerment",
        "females", "women rights", "girl child", "women safety", "ladki", "mahila",
        "women and child", "women wellness", "women empowerment", "women health",
        "daughter", "girls", "girl marriage", "taruni", "mamta", "sanitary napkin",
        "sanitary pad", "menstrual hygiene kit"
    ],
    "children": [
        "child", "children", "kids", "baby", "infant", "toddler", "newborn",
        "minor", "child care", "child health", "students", "child development",
        "child development scheme", "icds", "integrated child development services",
        "child protection", "child nutrition", "atal sneh", "bal sakha", "kishori"
    ],
    "youth": [
        "youth", "young", "teen", "teenager", "adolescent", "youngster",
        "student", "college student", "young adult", "adolescent girls",
        "samriddhi"
    ],
    "tribal": [
        "tribal", "tribe", "indigenous", "adivasi", "scheduled tribe", "st",
        "forest dweller", "tribal welfare", "primitive tribal"
    ],
    "farmer": [
        "farmer", "farmers", "agriculture", "cultivator", "kisan", "agrarian",
        "farm labour", "agriculture and infrastructure"
    ],
    "financial aid": [
        "money", "financial", "loan", "grant", "subsidy", "financial support",
        "cash", "insurance", "pension", "bank", "atm", "monetary", "financial assistance",
        "financial asisstance", "money assistance", "expenses", "cash incentive",
        "cash assistance", "premium", "ex-gratia", "relief fund", "monthly assistance",
        "monthly allowance", "rehabilitation allowance", "grant", "maternity loan",
        "bpl", "bpl category", "below poverty line", "non bpl", "ews", "ews scheme",
        "economically-weaker sections", "poor", "distress", "welfare benefits",
        "death benefit", "death benefits", "funeral", "death assistance"
    ],
    "employment": [
        "employment", "job", "jobs", "work", "livelihood", "self employed",
        "employment scheme", "jobless", "unemployed", "startup", "entrepreneur",
        "opportunity", "self help groups", "shgs", "social security", "social empowerment",
        "empowerment", "rehabilitation services", "rehabilitation", "reintegration",
        "promoting registration"
    ],
    "housing": [
        "house", "housing", "shelter", "homeless", "flat", "home", "room", "rent", "accommodation"
    ],
    "food": [
        "food", "ration", "nutrition", "malnutrition", "grocery", "meal", "midday meal",
        "food distribution", "hunger", "free food", "free meal", "rice", "free diet",
        "nutritional support", "good nutrition", "nutritional status", "nutrition kit",
        "providing nutritious food"
    ],
    "disaster": [
        "flood", "earthquake", "cyclone", "disaster", "natural disaster", "calamity",
        "fire", "relief", "disaster aid", "emergency", "injuries"
    ],
    "minority": [
        "minority", "muslim", "christian", "jain", "buddhist", "sikh",
        "minority religion", "religious minority", "religion", "parsi community",
        "parsi couples", "zoroastrians", "scheduled caste", "sc/st", "backward class"
    ],
    "legal": [
        "legal assistance", "protection", "legal", "law", "court", "lawyer",
        "advocate", "legal aid", "legal help", "legal support"
    ],
    "urban_development": [
        "basic services", "drainage", "green space", "non-motorized", "parks",
        "septage management", "sewerage", "storm water", "urban transport",
        "swachh bharat", "abhiyan", "hygiene", "grama jyothi", "mana ooru mana pranalika"
    ],
    "government_employees": [
        "government employees", "government employee", "pensioners", "public and private sector hospitals"
    ],
    "rural_development": [
        "rural", "grameen", "rural development", "natural resources management",
        "water and sanitation"
    ],
    "ex_servicemen": [
        "ex-servicemen", "sainik", "esm", "widow of ex-servicemen"
    ],
    "social_welfare": [
        "welfare", "social welfare", "social welfare scheme", "social service",
        "good citizens", "national", "purashkar", "dbt", "dbt scheme", "dayalu",
        "rajasthan scheme", "sahara yojana", "sishu saathi scheme", "nagaland",
        "nagaland government health insurance", "goa", "gujarat", "himachal pradesh",
        "delhi", "anishi", "family welfare", "family", "families", "social security and poverty eradication"
    ],
    "artists_media": [
        "artist", "media", "press", "journalist", "non resident indian", "overseas indian",
        "sex worker"
    ],
    "animal_husbandry": [
        "animal", "buffalo", "gene", "goat", "semen", "sheep", "liverstock"
    ],
    "psychological_support": [
        "counselling", "psychological", "recreation", "sports", "development"
    ],
    "special_events": [
        "group marriages"
    ],
    "financial_protection": [
        "financial protection", "life insurance"
    ]
}

In [None]:

# Build the reverse lookup used by the matcher: each lowercased, stripped tag
# spelling points at its category, and flat_tags lists every spelling as a
# candidate for fuzzy matching.
tag_to_category = {}
flat_tags = []

for category_name, spellings in tag_mapping.items():
    normalized = [spelling.lower().strip() for spelling in spellings]
    flat_tags += normalized
    # A spelling listed under several categories ends up mapped to the last one.
    tag_to_category.update((spelling, category_name) for spelling in normalized)

# Cleaning function
def clean_text(text):
    """Lowercase and trim *text*, then delete every character that is not
    ``a-z``, ``0-9`` or a space.

    NOTE: this rebinds the module-level name ``clean_text`` that
    ``extract_info`` also calls; once this definition has run, that function
    uses this version as well.
    """
    lowered = text.lower().strip()
    disallowed = re.compile(r'[^a-z0-9 ]+')
    return disallowed.sub('', lowered)

# Fuzzy match function
def match_tag_fuzzy(raw_tag, scorer=fuzz.token_sort_ratio, threshold=80):
    """Map a raw tag string to its category name, or ``"unknown"``.

    Lookup order:
      1. exact match of the lowercased/stripped tag against
         ``tag_to_category`` (its keys are stored in that form, punctuation
         included, so "Check-Up" or "SC/ST" can only match here);
      2. exact match of the punctuation-free form produced by ``clean_text``;
      3. best fuzzy match over ``flat_tags``, accepted only when its score is
         at least *threshold*.

    Args:
        raw_tag: Tag text as scraped. Non-string or blank values give
            ``"unknown"``.
        scorer: Scoring function handed to ``process.extractOne``.
        threshold: Minimum score for a fuzzy match to be accepted.

    Returns:
        The matched category name, or ``"unknown"`` when nothing qualifies.
    """
    if not isinstance(raw_tag, str):
        return "unknown"

    # Keys keep hyphens, slashes and ampersands, so try the un-cleaned form
    # first; clean_text would strip exactly those characters.
    normalized = raw_tag.lower().strip()
    if normalized in tag_to_category:
        return tag_to_category[normalized]

    cleaned = clean_text(raw_tag)
    if not cleaned:
        # Nothing left to compare; avoid fuzzy-matching an empty query.
        return "unknown"
    if cleaned in tag_to_category:
        return tag_to_category[cleaned]

    # Fall back to the single best-scoring known spelling.
    best_match = process.extractOne(
        cleaned,
        flat_tags,
        scorer=scorer
    )
    if best_match and best_match[1] >= threshold:
        return tag_to_category[best_match[0]]

    return "unknown"

In [None]:
# Raw tag strings to categorize, held in a one-column DataFrame ("raw_tag").
# NOTE(review): this literal looks like a pasted copy of the tagsList printed
# by an earlier cell — consider building the frame from tagsList / tags.json
# so it cannot drift from the extracted data. Confirm before changing.
df = pd.DataFrame({
    'raw_tag' : [
  "Accidental Case",
  "Diseases",
  "Financial Assistance",
  "Hospital",
  "Treatment",
  "Medical Treatment",
  "Serious Illness",
  "BPL",
  "Differently Abled",
  "Distress",
  "Handicap",
  "PwD",
  "Basic Services",
  "Drainage",
  "Green Space",
  "Non-Motorized",
  "Parks",
  "Septage Management",
  "Sewerage",
  "Storm Water",
  "Urban Transport",
  "Health",
  "Sanitation",
  "Toilet",
  "Child",
  "Financial Asisstance",
  "Infant",
  "Pregnant",
  "Scheduled Caste",
  "Woman",
  "Mentally Disabled",
  "Money Assistance",
  "Physically Disabled",
  "Death Benefits",
  "Disability Benefits",
  "Education",
  "Protection",
  "Reintegration",
  "Sex Worker",
  "Shelter",
  "Skill",
  "Vocational Education",
  "Heath",
  "Maternity Assistance",
  "Arogya",
  "Nidhi",
  "Patient",
  "BPL Category",
  "Girl Marriage",
  "Group Marriages",
  "Artist",
  "Disability",
  "Legal Assistance",
  "Medical Assistance",
  "Non Resident Indian",
  "Overseas Indian",
  "Student",
  "Welfare",
  "Checkup",
  "Labour",
  "Medical",
  "Organized Worker",
  "Shramyogi",
  "Building Worker",
  "Construction Worker",
  "Maternity",
  "Pregnancy Care",
  "Women",
  "Life Threatening Disorders & Diseases",
  "Patients",
  "Silicosis Diseases",
  "Cashless",
  "Goa",
  "Health Coverage",
  "Insurance",
  "Accident",
  "Death",
  "Differently-abled Person",
  "Cancer Treatment",
  "Construction",
  "Worker",
  "Death Assistance",
  "Monthly Assistance",
  "Permanent Disablement",
  "Social Empowerment",
  "Transport Worker",
  "Accidental Injuries",
  "Craftsmen",
  "Social Security",
  "Unorganised Worker",
  "Health & Wellness",
  "Medical Appliance",
  "Immunization",
  "Maternal Health",
  "Pregnancy",
  "Women And Child",
  "Building",
  "Expenses",
  "Girl",
  "Kishori",
  "Samriddhi",
  "Affordable Health Insurance",
  "Cashless Health Insurance",
  "Health Insurance",
  "Nagaland",
  "Nagaland Government Health Insurance",
  "Amrutam",
  "Wellness",
  "Free Diet",
  "Medicine",
  "Newborn",
  "Mediclaim",
  "Astha Card",
  "Persons With Disabilities",
  "Ration",
  "Welfare Benefits",
  "Grameen",
  "Rural",
  "Swachh Bharat",
  "Bank",
  "Buffalo",
  "Gene",
  "Goat",
  "Semen",
  "Sheep",
  "Delivery Assistance",
  "Empowerment",
  "Child Development",
  "Child Development Scheme",
  "ICDS",
  "Immunization Of Children",
  "Lactating Mothers",
  "Journalist",
  "Birth",
  "Girl Child",
  "Medical Institution",
  "Mother",
  "Integrated Child Development Services",
  "Abhiyan",
  "Healthcare",
  "Mamta",
  "Taruni",
  "Family Benefits",
  "Silicosis Disease",
  "Unorganized Worker",
  "Cashless Medical Treatment",
  "Critical Illnesses",
  "Economically-weaker Sections",
  "Public And Private Sector Hospitals",
  "Cashless Treatment",
  "Government Employees",
  "Hospitalisation",
  "Pensioners",
  "Cancer",
  "Free Treatment",
  "Hospitalization",
  "Tuberculosis",
  "Construction Workers",
  "Clothes",
  "Food",
  "Free Medical Treatment",
  "Injuries",
  "Occupational Diseases",
  "Disease",
  "Autism",
  "Cerebral Palsy",
  "Mental Retardation",
  "Person With Disability",
  "AIDS",
  "Chronic Diseases",
  "TB",
  "Dental Care",
  "Denture",
  "Jaw Implants",
  "Teeth",
  "Child Care",
  "Women Wellness",
  "Ailments",
  "Medical Expenses",
  "Surgery",
  "Child Delivery",
  "Illness",
  "Opportunity",
  "SHGs",
  "Training",
  "Adolescent Girls",
  "Child Health",
  "Early Education",
  "Malnutrition",
  "Nutrition",
  "Vocational Training",
  "Family",
  "Premium",
  "Pension",
  "Animal",
  "Employment",
  "Liverstock",
  "DAYALU",
  "Permanent Disablity",
  "Ex-Servicemen",
  "Serious Disease Treatment",
  "Widow Of Ex-Servicemen",
  "Maternity Benefit",
  "Medical Benefit",
  "Pregnant Women",
  "Scheduled Tribe",
  "Dependent Family",
  "Labor Workers",
  "Medical Facility",
  "Dependent Family Member",
  "Government Employee",
  "Medical Help",
  "Non BPL",
  "Rajasthan Scheme",
  "Relief Fund",
  "Children",
  "Cochlear",
  "Hearing Impaired",
  "Sruthitharangam",
  "Social Welfare",
  "Agriculture And Infrastructure",
  "Grama Jyothi",
  "Health And Nutrition",
  "Mana Ooru Mana Pranalika",
  "Natural Resources Management",
  "Rural Development",
  "Social Security And Poverty Eradication",
  "Water And Sanitation",
  "Critical Diseases",
  "Partial Disability",
  "Doctor",
  "Free Clothes",
  "Free Food",
  "Visually Challenged Person",
  "Daycare",
  "Physiotherapy",
  "Recreation",
  "Senior Citizen",
  "New Born",
  "Scholarship",
  "Constrution Worker",
  "Workers",
  "Silicosis Victim",
  "Abortion",
  "Aids Appliances",
  "Handicapped",
  "Leprosy",
  "Vitamin",
  "Accidental Support",
  "Medical Support",
  "Miscarriage",
  "NGO",
  "Senior Citizens",
  "Operation",
  "Hearing Aid",
  "Spectacles",
  "Death Benefit",
  "Funeral",
  "EWS",
  "Illnesses",
  "Poor",
  "Child Healthcare",
  "Labor",
  "Daughter",
  "Families",
  "Medical Treatments",
  "COVID-19",
  "Health Workers",
  "Life Insurance",
  "Fertility Treatment",
  "Parsi Community",
  "Parsi Couples",
  "Zoroastrians",
  "Hygiene",
  "Sanitary Napkin",
  "Delivery",
  "Atal Sneh",
  "Gujarat",
  "Newborn Child",
  "Anganwadi Centers",
  "Improve The Nutritional Status Of Pregnant",
  "Pregnant Women Care Kit",
  "Providing Nutritious Food To Pregnant Women And Children",
  "Arogya Lakshmi",
  "Lactating Women",
  "Pregnat Women",
  "Health Checkup",
  "Health Treatment",
  "CT Scan",
  "Free Services",
  "Laboratory Services",
  "Quality Diagnosis",
  "DBT Scheme",
  "EWS Scheme",
  "Sahara Yojana",
  "Family Welfare",
  "Check-Up",
  "Health Check-up",
  "Common Diseases",
  "Free Medicines",
  "Primary Medical Treatment",
  "Girls",
  "Sanitary Pad",
  "Cash Incentive",
  "Good Nutrition",
  "Maternity Benefit Programme",
  "Below Poverty Line",
  "Incurable Diseases",
  "Nutritional Support",
  "Anganwadi",
  "Development",
  "Vocational",
  "Workshop",
  "Differently Abled Person",
  "Female",
  "Women Health",
  "Cancer Patient",
  "Grant",
  "SC/ST",
  "Assistive Devices",
  "Rehabilitation Services",
  "DBT",
  "Free Meal",
  "Youth",
  "Cash Assistance",
  "School",
  "Sishu Saathi Scheme",
  "Social Welfare Scheme",
  "Chemotherapy",
  "Arogya Kosh",
  "Delhi",
  "Family Health",
  "Himachal Pradesh",
  "Lepers",
  "Monthly Allowance",
  "Rehabilitation Allowance",
  "Entrepreneur",
  "Generic Drugs",
  "Janaushadhi",
  "Pharmaceutical",
  "Meal",
  "Rice",
  "Home",
  "Sports",
  "Counselling",
  "Psychological",
  "Rehabilitation",
  "Good Citizens",
  "National",
  "Purashkar",
  "Social Service",
  "Media",
  "Press",
  "ANISHI",
  "Financial Protection",
  "Smart Health Card",
  "Drug Distribution Counter",
  "Free Medicines Distribution",
  "Ex-Gratia",
  "Bal Sakha",
  "Child Protection",
  "National Health Mission",
  "Primary Healthcare Services",
  "Health And Wellness",
  "Maternity Loan",
  "Self Help Groups",
  "Women Empowerment",
  "Major Ailments",
  "Permanent Partial Disability",
  "Backward Class",
  "Nutrition Kit",
  "Child Nutrition",
  "Pregnant Mothers",
  "Tribal Welfare",
  "Primitive Tribal",
  "Cashless Service",
  "ESM",
  "Sainik",
  "Mentally Retarded",
  "Menstrual Hygiene Kit",
  "Promoting Registration",
  "Wage Loss"
]})

# Categorize every raw tag, store the pairs as JSON Lines, and show the table.
df['matched_category'] = df['raw_tag'].map(match_tag_fuzzy)
df.to_json('temp.json', orient='records', lines=True)

print(df)

                    raw_tag matched_category
0           Accidental Case           health
1                  Diseases           health
2      Financial Assistance    financial aid
3                  Hospital           health
4                 Treatment           health
..                      ...              ...
377                  Sainik    ex_servicemen
378       Mentally Retarded       disability
379   Menstrual Hygiene Kit            women
380  Promoting Registration       employment
381               Wage Loss          workers

[382 rows x 2 columns]


In [None]:

# Write the collected tag vocabulary to tags.json (pretty-printed, UTF-8,
# non-ASCII characters kept as-is).
with open("tags.json", mode="w", encoding='utf-8') as tags_file:
    json.dump(tagsList, tags_file, indent=2, ensure_ascii=False)

In [None]:
with open("extracted_schemes.json", encoding='utf-8') as f:
    test_data = json.load(f)

# Replace each scheme's raw tags with their distinct categories, keeping the
# order in which each category first appears.
for scheme in test_data:
    scheme["Tags"] = list(
        dict.fromkeys(match_tag_fuzzy(tag) for tag in scheme["Tags"])
    )

# Write the result once, after every scheme has been updated. Writing inside
# the loop rewrote the whole file per scheme and produced no file at all when
# the input list was empty.
with open("Extract.json", "w", encoding="utf-8") as f:
    json.dump(test_data, f, indent=2, ensure_ascii=False)

In [None]:
# Distinct place names across all schemes, in ascending order.
placeList = sorted({scheme["place"] for scheme in test_data})

with open("place.json", "w", encoding="utf-8") as place_file:
    json.dump(placeList, place_file, indent=2, ensure_ascii=False)


In [None]:
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
import torch

# Checkpoint identifier handed to both from_pretrained calls below.
model_name = 'deepset/roberta-base-squad2'
tokenizer, model = (
    RobertaTokenizer.from_pretrained(model_name),
    RobertaForQuestionAnswering.from_pretrained(model_name),
)

# Demo input: one short context string and the questions asked about it.
context = "car accident, assam"
questions = [
    "What place did they mention?",
    "Where is India located?",
    "How many states does India have?",
    "Who is the President of the United States?",
]

def get_answers(questions, context, model, tokenizer, threshold=0.5):
    """Extract one answer span per question from a shared context.

    Each question is paired with *context*, the batch is run through *model*,
    and for every pair the highest-scoring start token and highest-scoring
    end token are chosen independently.

    Args:
        questions: Sequence of question strings.
        context: Text the answers are extracted from; reused for every question.
        model: Callable returning an object with ``start_logits`` and
            ``end_logits`` (one row per question).
        tokenizer: Callable producing the model inputs (including
            ``input_ids``) and offering ``decode``.
        threshold: Minimum value required of both the best start logit and
            the best end logit. These are raw logits, not probabilities.

    Returns:
        A list aligned with *questions*. An entry is the decoded answer
        text, or ``None`` when a logit is below *threshold*, the end does not
        come after the start, or the span decodes to an empty string.
    """
    # Nothing to answer: skip tokenization and the model call entirely.
    if not questions:
        return []

    # Tokenize all question-context pairs as one padded batch.
    inputs = tokenizer(
        questions,
        [context] * len(questions),  # Repeat context for each question
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=512
    )

    # Inference only; no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    answers = []
    rows = zip(inputs['input_ids'], outputs.start_logits, outputs.end_logits)
    for token_ids, start_row, end_row in rows:
        start_idx = torch.argmax(start_row).item()
        end_idx = torch.argmax(end_row).item() + 1  # exclusive slice bound

        start_score = start_row[start_idx].item()
        end_score = end_row[end_idx - 1].item()

        # Reject weak predictions and spans whose end precedes their start.
        if start_score < threshold or end_score < threshold or end_idx <= start_idx:
            print("The threshold is ", threshold)
            answers.append(None)
            continue

        answer = tokenizer.decode(token_ids[start_idx:end_idx], skip_special_tokens=True)
        # A span made only of special tokens decodes to "" because special
        # tokens are skipped; report that as "no answer" like other rejections.
        answers.append(answer if answer else None)

    return answers

# Get answers for all questions
answers = get_answers(questions, context, model, tokenizer)

# Echo each question with its extracted answer; a falsy answer (None or an
# empty string) is reported as "No Answer".
for q, a in zip(questions, answers):
    shown = a if a else 'No Answer'
    print(f"Q: {q}")
    print(f"A: {shown}")


Q: What place did they mention?
A: No Answer
Q: Where is India located?
A: No Answer
Q: How many states does India have?
A: No Answer
Q: Who is the President of the United States?
A: No Answer
