In [3]:
import pandas as pd

crpc = pd.read_csv("../data/raw/crpc_sections.csv")
crpc.head()



Unnamed: 0,Chapter,Chapter_name,Chapter_subtype,Section,Section _name,Description
0,1,PRELIMINARY,PRELIMINARY,1,"Short title, extent and commencement.",(1) This Act may be called the Code of Crimina...
1,1,PRELIMINARY,PRELIMINARY,2,Definitions.,(1) This Act may be called the Code of Crimina...
2,1,PRELIMINARY,PRELIMINARY,3,Construction of references.,"In this Code, unless the context otherwise req..."
3,1,PRELIMINARY,PRELIMINARY,4,Trial of offences under the Indian Penal Code ...,(1) All offences under the Indian Penal Code (...
4,1,PRELIMINARY,PRELIMINARY,5,Saving.,"Nothing contained in this Code shall, in the a..."


In [5]:
import re

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

crpc["cleaned_text"] = crpc["Description"].apply(clean_text)


In [28]:
def map_category(chapter_name):
    chapter_name = str(chapter_name).lower()

    if any(word in chapter_name for word in ["arrest", "warrant", "police"]):
        return "Police Procedure"

    elif any(word in chapter_name for word in ["trial", "charge", "judgment"]):
        return "Trial Procedure"

    elif any(word in chapter_name for word in ["information", "fir", "complaint"]):
        return "FIR Procedure"

    elif "bail" in chapter_name:
        return "Bail"

    elif any(word in chapter_name for word in ["investigation", "search", "inquiry"]):
        return "Investigation"

    elif any(word in chapter_name for word in ["court", "magistrate", "sessions"]):
        return "Court Procedure"

    else:
        return "General Procedure"


In [29]:
crpc["category"] = crpc["Chapter_name"].apply(map_category)
crpc["category"].value_counts()


category
General Procedure    261
Trial Procedure      130
Police Procedure      77
Court Procedure       38
Bail                  18
FIR Procedure         10
Name: count, dtype: int64

In [31]:
general_df = crpc[crpc["category"] == "General Procedure"]
specific_df = crpc[crpc["category"] != "General Procedure"]

# Keep only 100 General Procedure rows
general_df = general_df.sample(n=min(100, len(general_df)), random_state=42)

df_balanced = pd.concat([specific_df, general_df])
df_balanced["category"].value_counts()
final_crpc = df_balanced[["cleaned_text", "category"]]

In [32]:
df_balanced["category"].value_counts()


category
Trial Procedure      130
General Procedure    100
Police Procedure      77
Court Procedure       38
Bail                  18
FIR Procedure         10
Name: count, dtype: int64

In [35]:
final_crpc = df_balanced[["cleaned_text", "category", "source"]]
final_crpc.to_csv("../data/processed/cleaned_legal_texts.csv",
    index=False
)


In [36]:
crpc["source"] = "CrPC"
crpc.to_csv("../data/processed/crpc_sections_processed.csv", index=False)

In [47]:
final_crpc = crpc[["cleaned_text", "category", "source"]]
final_crpc.head()


Unnamed: 0,cleaned_text,category,source
0,this act may be called the code of criminal pr...,General Procedure,CrPC
1,this act may be called the code of criminal pr...,General Procedure,CrPC
2,in this code unless the context otherwise requ...,General Procedure,CrPC
3,all offences under the indian penal code of sh...,General Procedure,CrPC
4,nothing contained in this code shall in the ab...,General Procedure,CrPC


In [39]:
final_crpc.to_csv(
    "../data/processed/cleaned_legal_texts.csv",
    index=False
)


In [40]:
final_crpc = pd.read_csv("../data/processed/cleaned_legal_texts.csv", low_memory=False)
print(final_crpc.isnull().sum())
print(final_crpc["category"].value_counts())

cleaned_text    1
category        0
source          0
dtype: int64
category
General Procedure    261
Trial Procedure      130
Police Procedure      77
Court Procedure       38
Bail                  18
FIR Procedure         10
Name: count, dtype: int64


In [41]:
import pandas as pd

final_crpc = pd.read_csv("../data/processed/cleaned_legal_texts.csv")

print(final_crpc.head())
print("\nNull values:\n", final_crpc.isnull().sum())
print("\nCategory distribution:\n", final_crpc["category"].value_counts())


                                        cleaned_text           category source
0  this act may be called the code of criminal pr...  General Procedure   CrPC
1  this act may be called the code of criminal pr...  General Procedure   CrPC
2  in this code unless the context otherwise requ...  General Procedure   CrPC
3  all offences under the indian penal code of sh...  General Procedure   CrPC
4  nothing contained in this code shall in the ab...  General Procedure   CrPC

Null values:
 cleaned_text    1
category        0
source          0
dtype: int64

Category distribution:
 category
General Procedure    261
Trial Procedure      130
Police Procedure      77
Court Procedure       38
Bail                  18
FIR Procedure         10
Name: count, dtype: int64
