<a href="https://colab.research.google.com/github/Rajakumari208/mergeconflict/blob/main/myproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pandas faker

Collecting faker
  Downloading faker-37.5.3-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.5.3-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.5.3


**synthetic dataset**

In [None]:
import uuid
import random
from faker import Faker
import pandas as pd

# Initialize Faker for random locations and user details
fake = Faker()

# Target departments
departments = [
    "Department of Finance",
    "Department of Health",
    "Department of Education",
    "Department of Public Works",
    "Department of Transportation",
    "Department of Housing and Urban Development",
    "Department of Environment",
    "Department of Social Welfare"
]

# Complaint templates per department
templates = {
    "Department of Finance": [
        "I was charged an extra fee at the {location} tax office when filing my returns.",
        "My refund from the {location} revenue department has been delayed by months.",
        "I received an incorrect tax notice for last year from the {location} branch."
    ],
    "Department of Health": [
        "The staff at {location} Community Health Center refused to attend to emergency cases promptly.",
        "I found expired medicines at the {location} dispensary last week.",
        "The sanitation conditions in {location} municipal hospital are unacceptable."
    ],
    "Department of Education": [
        "The textbooks delivered to {location} High School are torn and incomplete.",
        "The new laboratory in {location} College is missing essential equipment.",
        "Students at {location} Primary School are facing teacher shortages."
    ],
    "Department of Public Works": [
        "There is a huge pothole near {location} main road that's causing accidents.",
        "The public toilet at {location} park is broken and unhygienic.",
        "Streetlights on {location} Avenue are not working at night."
    ],
    "Department of Transportation": [
        "The bus service from {location} to city center is frequently delayed.",
        "I found overcrowding and no ticket checking on bus route {location}.",
        "The metro station at {location} is inaccessible for differently-abled passengers."
    ],
    "Department of Housing and Urban Development": [
        "The drainage around {location} housing complex is completely blocked.",
        "My application for tenancy in {location} welfare housing has been pending for over a year.",
        "Illegal constructions near {location} slum area are not being addressed."
    ],
    "Department of Environment": [
        "Open dumping of garbage is happening every night near {location} lake.",
        "I smell industrial waste fumes coming from {location} factory area.",
        "Tree cutting without permission occurred next to {location} nature reserve."
    ],
    "Department of Social Welfare": [
        "My pension application for senior citizens under the {location} scheme has been rejected unfairly.",
        "Disabled welfare allowances in {location} block are not being disbursed.",
        "I never received the promised ration under the public distribution system at {location} center."
    ]
}

def generate_synthetic_complaints(num_per_dept=5000):
    """Generate a balanced synthetic dataset of complaints."""
    records = []
    for dept in departments:
        for _ in range(num_per_dept):
            template = random.choice(templates[dept])
            # Fill in a random city or locality
            location = fake.city()
            text = template.format(location=location)
            comp_id = str(uuid.uuid4())
            user_id = str(uuid.uuid4())
            records.append({
                "complaint_id": comp_id,
                "user_id": user_id,
                "department": dept,
                "text": text
            })
    return pd.DataFrame(records)

if __name__ == "__main__":
    # Adjust num_per_dept to reach your total desired size
    df = generate_synthetic_complaints(num_per_dept=5000)  # 8 × 5k = 40k records
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle

    # Save to CSV
    df.to_csv("synthetic_grievances.csv", index=False, encoding="utf-8")
    print(f"Generated {len(df)} synthetic complaints across {len(departments)} departments.")


Generated 40000 synthetic complaints across 8 departments.


**environment setup**


In [None]:
!pip install pandas scikit-learn datasets transformers torch nltk spacy
!python -m nltk.downloader vader_lexicon
!python -m spacy download en_core_web_sm


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

**department classification model**

In [None]:
# dept_classifier.py

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# 1. Load synthetic data
df = pd.read_csv("synthetic_grievances.csv")

# 2. Map department names to numeric labels
departments = sorted(df["department"].unique())
label2id = {dept: i for i, dept in enumerate(departments)}
df["label"] = df["department"].map(label2id)

# 3. Split into train/validation
train_df, valid_df = train_test_split(
    df, test_size=0.15, stratify=df["label"], random_state=42
)

# 4. Convert to Hugging Face Dataset
train_ds = Dataset.from_pandas(train_df[["text", "label"]])
valid_ds = Dataset.from_pandas(valid_df[["text", "label"]])

# 5. Tokenization
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

train_ds = train_ds.map(tokenize_batch, batched=True)
valid_ds = valid_ds.map(tokenize_batch, batched=True)
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
valid_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# 6. Load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(departments)
)

# 7. Training arguments
training_args = TrainingArguments(
    output_dir="./dept_classifier",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100
)

# 8. Initialize Trainer and train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    tokenizer=tokenizer
)

trainer.train()
trainer.evaluate()

# 9. Save artifacts
trainer.save_model("./dept_classifier")
tokenizer.save_pretrained("./dept_classifier")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/34000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdevadharshinim-aiml2023[0m ([33mvaishalinir-ymc2022-chennai-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.0003,0.000188
2,0.0001,4.8e-05
3,0.0001,2.8e-05


('./dept_classifier/tokenizer_config.json',
 './dept_classifier/special_tokens_map.json',
 './dept_classifier/vocab.txt',
 './dept_classifier/added_tokens.json',
 './dept_classifier/tokenizer.json')

**sentiment analysis**

In [None]:
# sentiment_label_and_train.py

import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# 1. Load data
df = pd.read_csv("synthetic_grievances.csv")

# 2. Pseudo-label using VADER
sia = SentimentIntensityAnalyzer()
df["compound"] = df["text"].apply(lambda t: sia.polarity_scores(t)["compound"])
def map_sentiment(score):
    if score >= 0.05:   return "positive"
    if score <= -0.05:  return "negative"
    return "neutral"
df["sentiment"] = df["compound"].apply(map_sentiment)

# 3. Encode labels
sent_labels = ["negative", "neutral", "positive"]
label2id = {lab: i for i, lab in enumerate(sent_labels)}
df["sent_id"] = df["sentiment"].map(label2id)

# 4. Train/test split
train_df, valid_df = train_test_split(
    df, test_size=0.15, stratify=df["sent_id"], random_state=42
)

# 5. Dataset & Tokenization
train_ds = Dataset.from_pandas(train_df[["text", "sent_id"]].rename(columns={"sent_id":"label"}))
valid_ds = Dataset.from_pandas(valid_df[["text", "sent_id"]].rename(columns={"sent_id":"label"}))

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_fn(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

train_ds = train_ds.map(tokenize_fn, batched=True)
valid_ds = valid_ds.map(tokenize_fn, batched=True)
train_ds.set_format(type="torch", columns=["input_ids","attention_mask","label"])
valid_ds.set_format(type="torch", columns=["input_ids","attention_mask","label"])

# 6. Model & Trainer
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(sent_labels)
)

args = TrainingArguments(
    output_dir="./sentiment_model",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    learning_rate=3e-5
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    tokenizer=tokenizer
)

trainer.train()
trainer.evaluate()
trainer.save_model("./sentiment_model")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/34000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0045,0.00331
2,0.005,0.003113


**entity recognition**

In [None]:
# simple_ner_pipeline.py

import pandas as pd
import spacy

# 1. Load data
df = pd.read_csv("synthetic_grievances.csv")

# 2. Load spaCy’s small English model
nlp = spacy.load("en_core_web_sm")

# 3. Extract entities for each text
def extract_entities(text):
    doc = nlp(text)
    entities = {"LOC":[], "ORG":[], "ISSUE":[]}
    for ent in doc.ents:
        if ent.label_ in ["GPE","LOC"]:            entities["LOC"].append(ent.text)
        if ent.label_ in ["ORG","NORP"]:           entities["ORG"].append(ent.text)
    # Simple issue-type heuristics (keyword search)
    for issue in ["fee","delay","expired","blocked","pothole","overflow","construction"]:
        if issue in text.lower(): entities["ISSUE"].append(issue)
    return entities

df["entities"] = df["text"].apply(extract_entities)

# 4. Inspect sample
print(df[["text","entities"]].head(10).to_dict(orient="records"))


[{'text': 'Tree cutting without permission occurred next to Gilesville nature reserve.', 'entities': {'LOC': [], 'ORG': [], 'ISSUE': []}}, {'text': 'The public toilet at Annettemouth park is broken and unhygienic.', 'entities': {'LOC': ['Annettemouth'], 'ORG': [], 'ISSUE': []}}, {'text': 'My application for tenancy in West Peggy welfare housing has been pending for over a year.', 'entities': {'LOC': ['West Peggy'], 'ORG': [], 'ISSUE': []}}, {'text': 'The sanitation conditions in Carterstad municipal hospital are unacceptable.', 'entities': {'LOC': [], 'ORG': ['Carterstad'], 'ISSUE': []}}, {'text': 'The drainage around Kevinhaven housing complex is completely blocked.', 'entities': {'LOC': [], 'ORG': ['Kevinhaven'], 'ISSUE': ['blocked']}}, {'text': 'I never received the promised ration under the public distribution system at New Jamesville center.', 'entities': {'LOC': ['New Jamesville'], 'ORG': [], 'ISSUE': []}}, {'text': 'The new laboratory in South Sarahberg College is missing essent

***putting* *it* *all* *together***

In [None]:
from transformers import pipeline

dept_pipe = pipeline(
    "text-classification",
    model="./dept_classifier",
    tokenizer="./dept_classifier"
)

sent_pipe = pipeline(
    "text-classification",
    model="./sentiment_model",
    tokenizer="./sentiment_model"
)

import spacy
nlp = spacy.load("en_core_web_sm")

def process_complaint(text):
    dept = dept_pipe(text)[0]
    sentiment = sent_pipe(text)[0]
    ents = [ent for ent in nlp(text).ents if ent.label_ in ["GPE","LOC","ORG"]]
    return {"department":dept, "sentiment":sentiment, "entities":[(e.text,e.label_) for e in ents]}

print(process_complaint("There is a huge pothole near Springfield main road that's causing accidents."))


Device set to use cuda:0
Device set to use cuda:0


{'department': {'label': 'LABEL_5', 'score': 0.9999803304672241}, 'sentiment': {'label': 'LABEL_1', 'score': 0.9999128580093384}, 'entities': [('Springfield', 'GPE')]}


**build the interface pipeline**

In [None]:
# grievance_service/pipeline.py

from transformers import pipeline as hf_pipeline
import spacy

# Load department model
dept_pipe = hf_pipeline(
    "text-classification",
    model="./dept_classifier",
    tokenizer="./dept_classifier"
)

# Load sentiment model
sent_pipe = hf_pipeline(
    "text-classification",
    model="./sentiment_model",
    tokenizer="./sentiment_model"
)

# Load spaCy for NER
nlp = spacy.load("en_core_web_sm")

def analyze_complaint(text):
    # 1. Department
    dept_pred = dept_pipe(text)[0]
    department = dept_pred["label"]
    dept_score = dept_pred["score"]

    # 2. Sentiment/Urgency
    sent_pred = sent_pipe(text)[0]
    sentiment = sent_pred["label"]
    sent_score = sent_pred["score"]

    # 3. Entity Extraction
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ in ("GPE","LOC")]
    orgs      = [ent.text for ent in doc.ents if ent.label_ in ("ORG","NORP")]

    return {
        "department": department,
        "dept_confidence": dept_score,
        "sentiment": sentiment,
        "sent_confidence": sent_score,
        "locations": locations,
        "organizations": orgs
    }


Device set to use cuda:0
Device set to use cuda:0


**wrap rest api with fast api**

In [None]:
import os
import sys

sys.path.append(os.getcwd())  # adds current directory to Python path



In [None]:
print(os.getcwd())
print(os.listdir())


/content
['.config', 'wandb', 'sentiment_model', 'dept_classifier', 'synthetic_grievances.csv', 'sample_data']


In [None]:
# Step 1: Create the folder
!mkdir grievance_service

# Step 2: Create pipeline.py
with open("grievance_service/pipeline.py", "w") as f:
    f.write("""
def analyze_complaint(text):
    if "delay" in text.lower():
        return {"issue": "Delay", "severity": "High"}
    elif "rude" in text.lower():
        return {"issue": "Staff Behavior", "severity": "Medium"}
    else:
        return {"issue": "General", "severity": "Low"}
""")



mkdir: cannot create directory ‘grievance_service’: File exists


In [None]:
import sys
sys.path.append("/content/grievance_service")

from pipeline import analyze_complaint


In [None]:
!pip install fastapi uvicorn




In [None]:
# Step 1: Create grievance_service/app.py
with open("grievance_service/app.py", "w") as f:
    f.write("""
from fastapi import FastAPI
from pydantic import BaseModel
from pipeline import analyze_complaint

app = FastAPI(title="Grievance AI Service")

class Complaint(BaseModel):
    text: str

@app.post("/analyze")
async def analyze(complaint: Complaint):
    result = analyze_complaint(complaint.text)
    return result
""")


In [None]:
!uvicorn grievance_service.app:app --reload --port 8000


[32mINFO[0m:     Will watch for changes in these directories: ['/content']
[32mINFO[0m:     Uvicorn running on [1mhttp://127.0.0.1:8000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started reloader process [[36m[1m23324[0m] using [36m[1mStatReload[0m
Process SpawnProcess-1:
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/_subprocess.py", line 80, in subprocess_started
    target(sockets=sockets)
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/server.py", line 67, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/runners.py", line 190, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "/usr/lib/pyt

In [None]:
from grievance_service.pipeline import analyze_complaint


In [None]:

with open("grievance_service/app.py", "w") as f:
    f.write("""
from fastapi import FastAPI
from pydantic import BaseModel
from grievance_service.pipeline import analyze_complaint

app = FastAPI(title="Grievance AI Service")

class Complaint(BaseModel):
    text: str

@app.post("/analyze")
async def analyze(complaint: Complaint):
    result = analyze_complaint(complaint.text)
    return result
""")


In [None]:
!uvicorn grievance_service.app:app --reload --port 8000


[32mINFO[0m:     Will watch for changes in these directories: ['/content']
[32mINFO[0m:     Uvicorn running on [1mhttp://127.0.0.1:8000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started reloader process [[36m[1m25500[0m] using [36m[1mStatReload[0m
[32mINFO[0m:     Started server process [[36m25508[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Shutting down
[32mINFO[0m:     Waiting for application shutdown.
[32mINFO[0m:     Application shutdown complete.
[32mINFO[0m:     Finished server process [[36m25508[0m]
[32mINFO[0m:     Stopping reloader process [[36m[1m25500[0m]


running from right working directory

In [None]:
!pip install fastapi uvicorn pyngrok nest-asyncio


Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


  for x in reversed(rel.split(sep)):


In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from grievance_service.pipeline import analyze_complaint

import nest_asyncio
from pyngrok import ngrok
import uvicorn

# Allow nested event loops in Colab
nest_asyncio.apply()

# Create FastAPI app
app = FastAPI(title="Grievance AI Service")

class Complaint(BaseModel):
    text: str

@app.post("/analyze")
async def analyze(complaint: Complaint):
    result = analyze_complaint(complaint.text)
    return result

# Expose the app via ngrok
public_url = ngrok.connect(8000)
print(f"🔗 Public URL: {public_url}")

# Run the server
uvicorn.run(app, port=8000)




ERROR:pyngrok.process.ngrok:t=2025-08-09T08:48:09+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n"


PyngrokNgrokError: The ngrok process errored on start: authentication failed: Usage of ngrok requires a verified account and authtoken.\n\nSign up for an account: https://dashboard.ngrok.com/signup\nInstall your authtoken: https://dashboard.ngrok.com/get-started/your-authtoken\r\n\r\nERR_NGROK_4018\r\n.

In [None]:
from pyngrok import ngrok

ngrok.set_auth_token("310w5XO14gZ1OZgf1TkAHb0govw_7m7A7dnLsBi92EguNsrA8")  # paste your token inside the quotes


In [None]:
public_url = ngrok.connect(8000)
print(f"🔗 Public URL: {public_url}")


🔗 Public URL: NgrokTunnel: "https://426474cc44e2.ngrok-free.app" -> "http://localhost:8000"
