# Install Dependencies

In [None]:
!pip install pgvector sqlalchemy psycopg2-binary jb-manager-bot tqdm python-dotenv json-repair

# Setup Environment

In [1]:
from crud import engine, session

from pgvector.sqlalchemy import Vector
from sqlalchemy.orm import declarative_base, mapped_column
from sqlalchemy import ARRAY, Column, String
from sqlalchemy.dialects.postgresql import JSON

Base = declarative_base()

class GrievanceCategory(Base):
    __tablename__ = "grievance_category"
    id = Column(String, primary_key=True)
    ministry = Column(String)
    category = Column(String)
    subcategory = Column(ARRAY(String))
    description = Column(String)
    embedding = mapped_column(Vector(1536))
    fields = Column(ARRAY(JSON))

    def __repr__(self):
        return f"<GrievanceCategory(ministry={self.ministry}, category={self.category}, subcategory={self.subcategory}>"

# Data Indexing

Refresh DB

In [2]:
Base.metadata.create_all(engine)

In [3]:
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

In [2]:
import csv
import json_repair
from openai_utils import generate_embeddings

In CPGrams, a user can file a grievance in a grievance bucket. Each bucket is defined by the following attributes:
- Level 1: Department/Ministry Name
- Level 2: Grievance Category
- Level 3-N: List of Sub Categories

Each row in the dataset represents a grievance bucket along with
- Description of grievance bucket (Manually generated by GPT-4)
- Extra info required to file the grievance in form of fields (Manually generated by GPT)

Loading CPGRAMS data

In [3]:
filename = "grievance_category.csv"
fp = open(filename)
reader = csv.DictReader(fp, delimiter=",")

# Reading first row
row = next(reader)

In [4]:
# Department Name
row['Department Name']

'Telecommunications'

In [5]:
# Grievance Category
row['Category']

'Mobile Related'

In [6]:
# Grievance Sub Categories
[row[f'Sub Category -{index}'] for index in range(1, 7) if row[f'Sub Category -{index}']]

['Call Drop']

In [7]:
# Description of Grievance Bucket
row['Description of Grievance Category']

'Users of mobile phones may experience call drops, where a phone call is unexpectedly disconnected or the call quality deteriorates abruptly during a conversation. This can be frustrating for individuals who rely on their mobile phones for communication. Call drops may occur due to various reasons such as network congestion, poor signal strength, technical issues with the mobile network, interference, or inadequate infrastructure. \n\nExamples of terms used to refer to this grievance: dropped calls, call disconnects, call quality issues, signal loss during call, call interruptions, call breakups.'

In [8]:
# Fields required to file the grievance
fields = f"[{row['GPT Form Field Generation']}]"
json_repair.loads(fields)

[{'field_name': 'Frequency_of_occurrence',
  'data_type': 'string',
  'mandatory': True,
  'description': 'How often do you experience call drops?',
  'options': ['Frequent (More than 5 times a day)',
   'Occasional (2-5 times a day)',
   'Rare (Less than 2 times a day)']},
 {'field_name': 'Time_of_day',
  'data_type': 'string',
  'mandatory': True,
  'description': 'During what time of day do you experience call drops the most?',
  'options': ['Morning (6 AM - 12 PM)',
   'Afternoon (12 PM - 6 PM)',
   'Evening (6 PM - 12 AM)',
   'Night (12 AM - 6 AM)']},
 {'field_name': 'Call_reception_area',
  'data_type': 'string',
  'mandatory': True,
  'description': 'Where are you located when experiencing call drops?'}]

In [9]:
fp.close()

Loading and preprocessing CPGrams data into json

In [None]:
from tqdm import tqdm

grievance_buckets = []

with open("updated_category.csv", "r") as f:
    csv_reader = csv.DictReader(f, delimiter=",")
    next(csv_reader, None)  
    
    for row in tqdm(csv_reader):
        ministry = row['Department Name']
        category = row['Category']
        subcategories = [row[f'Sub Category -{index}'] for index in range(1, 7) if row[f'Sub Category -{index}']]
        description = row['Description of Grievance Category']
        fields = f"[{row['GPT Form Field Generation']}]"
        try:
            fields = json_repair.loads(fields)
            if type(fields) is not list:
                raise Exception("Fields is not a list")
        except Exception as e:
            tqdm.write(f"Error {e} while parsing fields for {ministry} - {category} - {subcategories}")
            continue
        grievance_buckets.append({
            "ministry": ministry,
            "category": category,
            "subcategory": subcategories,
            "description": description,
            "fields": fields
        })

In [11]:
grievance_buckets[0]

{'ministry': 'Telecommunications',
 'category': 'Mobile Related',
 'subcategory': ['Improper Network Coverage'],
 'description': 'This grievance pertains to issues related to improper network coverage or connectivity for mobile phones. Users may experience dropped calls, slow data speeds, poor signal strength, frequent disconnections, inability to make calls or send messages, and overall unreliable network performance. \n\nExamples of how people may refer to this grievance include:\n- Poor mobile network coverage\n- Weak signal strength\n- Network congestion\n- Call drops\n- No service or limited service\n- Slow internet speed on mobile\n- Mobile phone connectivity issues\n\nUsers facing these problems may find it difficult to communicate effectively, especially in emergencies or important situations. Proper network coverage is essential for accessing basic services, staying connected with others, and utilizing mobile services such as internet browsing, messaging, and calls. \n\nProper

Generate embeddings for the bucket description 

In [14]:
embedding, _ = generate_embeddings([grievance_buckets[0]['description']])
embedding = embedding[0]
len(embedding)

1536

In [15]:
descriptions = [bucket["description"] for bucket in grievance_buckets]
embeddings, cost = generate_embeddings(descriptions)

In [16]:
len(embeddings), len(embeddings[0]), cost

(15214, 1536, 0.2516053)

Pushing to vectorDB

In [16]:
import uuid

session.rollback()
for grievance, embedding in zip(grievance_buckets, embeddings):
    grievance_category = GrievanceCategory(
        id=str(uuid.uuid4()),
        ministry=grievance["ministry"],
        category=grievance["category"],
        subcategory=grievance["subcategory"],
        description=grievance["description"],
        fields=grievance["fields"],
        embedding=embedding,
    )
    session.add(grievance_category)

session.commit()