In [1]:
!pip install textstat spacy
!python -m spacy download en_core_web_sm


Collecting textstat
  Downloading textstat-0.7.12-py3-none-any.whl.metadata (15 kB)
Collecting spacy
  Downloading spacy-3.8.11-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.15-cp313-cp313-win_amd64.whl.metadata (2.3 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.13-cp313-cp313-win_amd64.whl.metadata (9.9 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.12-cp313-cp313-win_amd64.whl.metadata (2.6 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.10-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting wa

In [2]:
import ast
import re
import subprocess
import sys

import pandas as pd
import spacy
import textstat


In [3]:
class RobustClassifier:
    """Heuristic scorer that rates a text for informativeness and actionability.

    Both scores are derived from surface cues (citation regexes, numerals,
    caller-supplied entities, modal verbs, second-person pronouns, action
    verbs, sequential markers and sentence types) and are clamped to 0-100.
    """

    def __init__(self):
        """Load the spaCy pipeline and set up the cue vocabularies."""
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            # spaCy signals a missing model package with OSError. Download it
            # with the interpreter running this kernel (not whatever "python"
            # is on PATH) so the model lands in the right environment.
            subprocess.run(
                [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL
            )
            # If the download failed, this second load raises the OSError.
            self.nlp = spacy.load("en_core_web_sm")

        self.sequential_markers = {
            'first', 'second', 'then', 'next',
            'finally', 'subsequently', 'lastly'
        }

        self.second_person_pronouns = {
            'you', 'your', 'yours', 'yourself'
        }

        self.action_verbs = {
            'apply', 'file', 'submit', 'contact', 'visit',
            'obtain', 'provide', 'ensure', 'check', 'verify',
            'prepare', 'complete', 'sign', 'consult', 'appeal'
        }

        self.legal_terms = {
            'act', 'section', 'clause', 'article', 'amendment',
            'statute', 'regulation', 'ordinance', 'provision',
            'code', 'law', 'rule', 'court', 'tribunal'
        }

        # These patterns are applied with re.I in predict(). The last one
        # switches case-insensitivity off for the abbreviation part via
        # (?-i:...), otherwise "[A-Z]{2,}" matches any word and phrases such
        # as "to act" would be counted as citations.
        self.citation_patterns = [
            r'\b(section|sec\.?|s\.?)\s*\d+',
            r'\b(article|art\.?)\s*\d+',
            r'\b\d{4}\s*act\b',
            r'\b(?-i:[A-Z]{2,})\s+Act\b'
        ]

    def _analyze_sentence_types(self, doc):
        """Count the sentences of ``doc`` by type.

        Every sentence lands in exactly one bucket:
        - interrogative: the sentence text ends with '?'
        - imperative: the root is a base-form verb (VERB/VB) with no subject
          child (nsubj / nsubjpass)
        - declarative: everything else

        Returns a dict with the keys 'imperative', 'declarative' and
        'interrogative'.
        """
        st = {'imperative': 0, 'declarative': 0, 'interrogative': 0}
        for sent in doc.sents:
            root = sent.root
            if sent.text.strip().endswith('?'):
                st['interrogative'] += 1
                continue

            is_subjectless_base_verb = (
                root.pos_ == 'VERB'
                and root.tag_ == 'VB'
                and not any(
                    c.dep_ in ('nsubj', 'nsubjpass') for c in root.children
                )
            )
            if is_subjectless_base_verb:
                st['imperative'] += 1
            else:
                # Includes base-form roots that do have a subject; these were
                # previously dropped from every bucket, which distorted the
                # ratios computed in predict().
                st['declarative'] += 1
        return st

    def normalize_score(self, raw):
        """Scale ``raw`` by 1.6, cap it at 100 and truncate to an int."""
        return int(min(raw * 1.6, 100))

    def predict(self, text, extracted_entities):
        """Score ``text`` and return the string ``"Inf:<int> | Act:<int>"``.

        Args:
            text: the text to score.
            extracted_entities: iterable of entity strings supplied by the
                caller; each one adds to the informative score, with an extra
                bonus when it contains one of ``self.legal_terms``.

        Returns:
            ``"Inf:0 | Act:0"`` when the text has no word tokens, otherwise
            the two normalised scores formatted the same way.
        """
        doc = self.nlp(text)
        wc = len([t for t in doc if not t.is_punct and not t.is_space])
        if wc == 0:
            return "Inf:0 | Act:0"

        legal_citations = sum(
            len(re.findall(p, text, re.I))
            for p in self.citation_patterns
        )

        entity_count = len(extracted_entities)
        num_count = sum(
            1 for t in doc if t.like_num or t.pos_ == 'NUM'
        )

        # Weighted cue count per 100 words.
        inf_density = (
            (legal_citations * 3) +
            (entity_count * 2) +
            num_count
        ) / wc * 100

        entity_legal_bonus = sum(
            1 for e in extracted_entities
            if any(t in e.lower() for t in self.legal_terms)
        )

        action_count = 0
        token_words = set()
        for t in doc:
            token_words.add(t.lower_)
            if t.tag_ == 'MD':
                action_count += 2
            if t.lower_ in self.second_person_pronouns:
                action_count += 1
            if t.lemma_ in self.action_verbs:
                action_count += 2

        # Each marker counts once, and only when it occurs as a whole token;
        # a substring test would let "then" match inside "strengthen".
        action_count += len(self.sequential_markers & token_words)

        act_density = action_count / wc * 100

        st = self._analyze_sentence_types(doc)
        total = sum(st.values())
        imp_ratio = st['imperative'] / total if total else 0
        dec_ratio = st['declarative'] / total if total else 0

        raw_inf = (inf_density * 2.0) + (dec_ratio * 20) + (entity_legal_bonus * 5)
        raw_act = (act_density * 1.5) + (imp_ratio * 30)

        return f"Inf:{self.normalize_score(raw_inf)} | Act:{self.normalize_score(raw_act)}"


In [4]:
def extract_scores(score_string):
    """Pull the two integer scores out of an ``"Inf:<n> | Act:<n>"`` string.

    Returns an ``(informative, actionable)`` tuple of ints. If either label
    is absent the failed match surfaces as an AttributeError.
    """
    inf_match = re.search(r'Inf:(\d+)', score_string)
    act_match = re.search(r'Act:(\d+)', score_string)
    return int(inf_match.group(1)), int(act_match.group(1))

def readability_score(text):
    """Return the Flesch Reading Ease value that textstat computes for ``text``."""
    ease = textstat.flesch_reading_ease(text)
    return ease


In [9]:
# Load the cleaned Q&A export; later cells read its "question" and
# "updated_answers" columns.
df = pd.read_csv("vidhikarya_cleaned_for_pipeline.csv")


In [10]:
def parse_answers(x):
    """Turn one raw answers cell into a list of answers.

    Handling, in order:
    - a list is returned unchanged;
    - a missing value (None / NaN) gives ``[]``;
    - text containing ``|||`` is split on it, dropping blank pieces;
    - text starting with ``[`` is parsed as a Python literal when possible;
    - any other non-blank text becomes a single-element list;
    - blank text gives ``[]``.
    """
    # Must run before pd.isna: on a list, pd.isna returns an element-wise
    # array whose truth value is ambiguous and raises inside the `if`.
    if isinstance(x, list):
        return x

    if pd.isna(x):
        return []

    text = str(x).strip()

    # IMPORTANT: "|||" is the separator between individual answers.
    if "|||" in text:
        return [a.strip() for a in text.split("|||") if a.strip()]

    # Fallback: the cell may hold the repr of a Python list.
    if text.startswith("["):
        try:
            return ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a valid literal after all; treat it as plain text below.
            pass

    # Blank text yields no answers, matching the "|||" branch which also
    # drops empty pieces.
    return [text] if text else []


In [11]:
# Score every answer of every question and keep, per question, the answer
# that ranks highest on each of the three metrics.
classifier = RobustClassifier()
rows = []

# Only two columns are read, so iterate over them directly; df.iterrows()
# would build a full Series across every column of the frame for each row.
qa_pairs = zip(df["question"], df["updated_answers"])

for i, (question, raw_answers) in enumerate(qa_pairs):
    if i % 100 == 0:
        print(f"Processed {i} rows")

    answers = parse_answers(raw_answers)

    # Rows without any answer contribute nothing to the output.
    if len(answers) == 0:
        continue

    scored = []

    for ans in answers:
        # No entities are extracted at this stage, hence the empty list.
        score_str = classifier.predict(ans, [])
        inf, act = extract_scores(score_str)
        read = readability_score(ans)

        scored.append({
            "answer": ans,
            "inf": inf,
            "act": act,
            "read": read
        })

    # The winners are picked per question, so this stays inside the row loop.
    # max() keeps the first answer when several share the top score.
    best_inf = max(scored, key=lambda x: x["inf"])
    best_act = max(scored, key=lambda x: x["act"])
    best_read = max(scored, key=lambda x: x["read"])

    rows.append({
        "query": question,
        "all_answers": " || ".join(answers),

        "best_informative_answer": best_inf["answer"],
        "best_informative_score": best_inf["inf"],

        "best_actionable_answer": best_act["answer"],
        "best_actionable_score": best_act["act"],

        "best_readable_answer": best_read["answer"],
        "best_readable_score": best_read["read"],
    })


Processed 0 rows
Processed 100 rows
Processed 200 rows
Processed 300 rows
Processed 400 rows
Processed 500 rows
Processed 600 rows
Processed 700 rows
Processed 800 rows
Processed 900 rows
Processed 1000 rows
Processed 1100 rows
Processed 1200 rows
Processed 1300 rows
Processed 1400 rows
Processed 1500 rows
Processed 1600 rows
Processed 1700 rows
Processed 1800 rows
Processed 1900 rows
Processed 2000 rows
Processed 2100 rows
Processed 2200 rows
Processed 2300 rows
Processed 2400 rows
Processed 2500 rows
Processed 2600 rows
Processed 2700 rows
Processed 2800 rows
Processed 2900 rows
Processed 3000 rows
Processed 3100 rows
Processed 3200 rows
Processed 3300 rows
Processed 3400 rows
Processed 3500 rows
Processed 3600 rows
Processed 3700 rows
Processed 3800 rows
Processed 3900 rows
Processed 4000 rows
Processed 4100 rows
Processed 4200 rows
Processed 4300 rows
Processed 4400 rows
Processed 4500 rows
Processed 4600 rows
Processed 4700 rows
Processed 4800 rows
Processed 4900 rows
Processed 50

In [None]:
# Write the per-question selections to disk.
final_df = pd.DataFrame(rows)
final_df.to_csv("final_legal_answer_selection.csv", index=False)

# The browser download helper only exists inside Google Colab. Elsewhere the
# CSV written above is already the final artefact, so skip the download
# instead of failing the cell.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; CSV saved locally as final_legal_answer_selection.csv")
else:
    files.download("final_legal_answer_selection.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
# Build the result frame from the collected rows and write it to CSV
# (same export as the previous cell, without the Colab download step).
final_df = pd.DataFrame(rows)
final_df.to_csv("final_legal_answer_selection.csv", index=False)


In [None]:
# Sanity check: how many input rows carry at least one usable answer?
answers_col = df["updated_answers"]

total_rows = len(df)
non_null = answers_col.notna().sum()

# Each True counts as 1 when summed.
parsed_non_empty = sum(
    len(parse_answers(raw)) > 0 for raw in answers_col
)

print("Total rows:", total_rows)
print("Rows with non-null updated_answers:", non_null)
print("Rows with parsed answers:", parsed_non_empty)


Total rows: 37945
Rows with non-null updated_answers: 37381
Rows with parsed answers: 37381


In [None]:
# Report the size of the frame the scoring loop iterated over, then preview
# its first rows.
print("Length of df used in loop:", len(df))
df.head()


Length of df used in loop: 37945


Unnamed: 0,Category,Sub Category,location,question,answers,Number of Ans,Link,advocate_names,date_of_question,date_of_scraping,...,Unnamed: 642,Unnamed: 643,Unnamed: 644,Unnamed: 645,Unnamed: 646,Unnamed: 647,Unnamed: 648,Unnamed: 649,Unnamed: 650,Unnamed: 651
0,Property,Balcony extension with Camera,"( Pune, Maharashtra )",Some flat owners in my society have extended t...,"Dear Client,If other floor owners extended the...",1,https://www.vidhikarya.com/FreeLegalAdvice/655...,Kishan Dutt Kalaskar|||S. K. Dutta,2 weeks ago,16-06-2025,...,,,,,,,,,,
1,Corporate and Incorporation,Salary not credited from 2 month,"( Mangalore, Karnataka )",How can i get my salary with legal procedure( ...,"Dear Client,Your query is too insufficient to ...",1,https://www.vidhikarya.com/FreeLegalAdvice/655...,S. K. Dutta,2 weeks ago,16-06-2025,...,,,,,,,,,,
2,Property,D FORM PATTA SURVEY NUMBERS CHANGED,"( Krishna, Andhra Pradesh )",Dear sir\n This is bring to ...,"Dear Client,To correct errors in Patta, the ow...",1,https://www.vidhikarya.com/FreeLegalAdvice/655...,Kishan Dutt Kalaskar|||S. K. Dutta,2 weeks ago,16-06-2025,...,,,,,,,,,,
3,Property,Urgent Help Needed – Apartment Maintenance Issues,"( Hyderabad, Telangana )",Subject- Urgent Help Needed – Apartment Mainte...,"Dear client,Based on your situation, it's clea...",3,https://www.vidhikarya.com/FreeLegalAdvice/655...,Kishan Dutt Kalaskar|||Simi Paul|||S. K. Dutta...,2 weeks ago,16-06-2025,...,,,,,,,,,,
4,Property,"Clarification on Transfer of Shares, Release d...","( Mumbai suburban, Maharashtra )",We are 3 siblings. Our mother is owner of 2 fl...,"Dear Client,Since your mother passed away with...",3,https://www.vidhikarya.com/FreeLegalAdvice/655...,Gourab Das|||Kishan Dutt Kalaskar|||S. K. Dutta,2 weeks ago,16-06-2025,...,,,,,,,,,,


In [None]:
# Number of result rows collected by the scoring loop (one per question that
# had at least one parsed answer).
print("Number of rows written:", len(rows))


Number of rows written: 37381
