In [1]:
import pandas as pd
import numpy as np
import nltk 
import joblib

In [2]:
df = pd.read_csv('./dataset/indian_bail_judgements_processed.csv')

In [3]:
df.head()

Unnamed: 0,ipc_sections,bail_type,bail_cancellation_case,landmark_case,accused_gender,prior_cases,bail_outcome,crime_type,facts,bias_flag,parity_argument_used,region,court_level
0,"['120B', '121', '121A']",Regular,True,True,Male,Unknown,Rejected,Narcotics,Jibangshu Paul was apprehended carrying Rs. 32...,False,False,Assam,2
1,"['376', '354', '343', '109', '220', '348', '33...",Regular,True,False,Male,Unknown,Rejected,Sexual Offense,The case involves custodial rape of a woman by...,False,False,Tamil Nadu,2
2,"['465', '468', '471', '474', '420', '511', '34']",Anticipatory,False,True,Male,Unknown,Rejected,Fraud or Cheating,"Hyderali, a government contractor, was accused...",False,True,Kerala,2
3,"['326', '307', '120B', '201']",Regular,True,False,Male,Unknown,Granted,Others,"The petitioner, a government employee, alleged...",False,False,West Bengal,2
4,"['302', '34']",Regular,False,False,Female,Unknown,Rejected,Murder,Shankri Devi and co-accused were charged with ...,False,False,Jammu & Kashmir,2


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ipc_sections            1200 non-null   object
 1   bail_type               1200 non-null   object
 2   bail_cancellation_case  1200 non-null   bool  
 3   landmark_case           1200 non-null   bool  
 4   accused_gender          1200 non-null   object
 5   prior_cases             1200 non-null   object
 6   bail_outcome            1200 non-null   object
 7   crime_type              1200 non-null   object
 8   facts                   1200 non-null   object
 9   bias_flag               1200 non-null   bool  
 10  parity_argument_used    1200 non-null   bool  
 11  region                  1200 non-null   object
 12  court_level             1200 non-null   int64 
dtypes: bool(4), int64(1), object(8)
memory usage: 89.2+ KB


In [5]:
df['ipc_sections']

0                                 ['120B', '121', '121A']
1       ['376', '354', '343', '109', '220', '348', '33...
2        ['465', '468', '471', '474', '420', '511', '34']
3                           ['326', '307', '120B', '201']
4                                           ['302', '34']
                              ...                        
1195                                                   []
1196                                             ['364A']
1197          ['419', '420', '467', '468', '471', '120B']
1198                           ['25(1-b)(a)', '26', '35']
1199    ['120B', '255', '256', '257', '259', '260', '4...
Name: ipc_sections, Length: 1200, dtype: object

In [6]:
for col in df.columns:
    if df[col].unique().shape[0] <= 10:
        print(f"{col}: {df[col].unique()}")

bail_type: ['Regular' 'Anticipatory' 'Interim' 'Unknown' 'Others' 'Not applicable']
bail_cancellation_case: [ True False]
landmark_case: [ True False]
accused_gender: ['Male' 'Female' 'Unknown' 'Multiple']
prior_cases: ['Unknown' 'Yes' 'No']
bail_outcome: ['Rejected' 'Granted']
bias_flag: [False  True]
parity_argument_used: [False  True]
court_level: [2 1 3]


In [7]:
# ENCODING OHE type column
ohe_cols = ['bail_type', 'crime_type', 'region', 'accused_gender', "prior_cases"]
df_ohe = pd.get_dummies(
    df[ohe_cols],
    drop_first=True,      # avoids dummy variable trap
    prefix=ohe_cols
)
df = df.drop(columns=ohe_cols)
df = pd.concat([df.reset_index(drop=True),
                df_ohe.reset_index(drop=True)], axis=1)

In [8]:
joblib.dump(df.columns.tolist(), "./artefacts/ohe_columns.json")

['./artefacts/ohe_columns.json']

In [9]:
# ENCODING boolean type columns
for col in [col for col in df.columns if df[col].dtype == 'bool']:
    df[col] = df[col].replace({
    'True': 1, 'False': 0,
    'Yes': 1, 'No': 0
}).astype(int)

In [10]:
# Encoding target column
df['bail_outcome'] = df['bail_outcome'].replace({
    'Granted': 1,   'Rejected': 0
}).astype(int)

  df['bail_outcome'] = df['bail_outcome'].replace({


In [11]:
df.isnull().sum()

ipc_sections                    0
bail_cancellation_case          0
landmark_case                   0
bail_outcome                    0
facts                           0
bias_flag                       0
parity_argument_used            0
court_level                     0
bail_type_Interim               0
bail_type_Not applicable        0
bail_type_Others                0
bail_type_Regular               0
bail_type_Unknown               0
crime_type_Cyber Crime          0
crime_type_Domestic Violence    0
crime_type_Dowry Harassment     0
crime_type_Extortion            0
crime_type_Fraud or Cheating    0
crime_type_Kidnapping           0
crime_type_Murder               0
crime_type_Narcotics            0
crime_type_Others               0
crime_type_Sexual Offense       0
crime_type_Theft or Robbery     0
region_Assam                    0
region_Bihar                    0
region_Chandigarh               0
region_Chhattisgarh             0
region_Delhi                    0
region_Gujarat

In [12]:
# Train test split
from sklearn.model_selection import train_test_split

Y = df['bail_outcome']
X = df.drop(columns=['bail_outcome',])
X_train,X_test,y_train,y_test=train_test_split(X, Y, stratify=Y, test_size=0.20, random_state=42)

In [13]:
## MULTI-LABEL BINARIZATION OF ipc_sections

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer
import ast
# Ensure ipc_sections is list
X_train['ipc_sections'] = X_train['ipc_sections'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

mlb = MultiLabelBinarizer()
ipc_encoded = mlb.fit_transform(X_train['ipc_sections'])

# IMPORTANT: align index
ipc_df = pd.DataFrame(
    ipc_encoded,
    columns=mlb.classes_,
    index=X_train.index
)

# Drop and concat safely
X_train = X_train.drop(columns=['ipc_sections'])
X_train = pd.concat([X_train, ipc_df], axis=1)


In [14]:
X_test['ipc_sections'] = X_test['ipc_sections'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
ipc_encoded_test = mlb.transform(X_test['ipc_sections'])
ipc_df_test = pd.DataFrame(ipc_encoded_test, columns=mlb.classes_, index=X_test.index)

X_test = X_test.drop(columns=['ipc_sections'])
X_test = pd.concat([X_test, ipc_df_test], axis=1)



In [15]:
X_test.isnull().sum()

bail_cancellation_case    0
landmark_case             0
facts                     0
bias_flag                 0
parity_argument_used      0
                         ..
59                        0
66E                       0
67                        0
67A                       0
8                         0
Length: 310, dtype: int64

In [16]:
# Vectorization of text columns
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')


lemmatizer=WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

def clean_text(text):
    if not isinstance(text, str):
        return ""   # convert NaN / float to empty string

    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


X_train['facts_clean'] = X_train['facts'].apply(clean_text)
X_train['facts_clean'] = X_train['facts_clean'].apply(lambda x: lemmatize_words(x))

tfidf = TfidfVectorizer(
    max_features=3000,      # controls dimensionality
    stop_words='english',   # removes common words
    ngram_range=(1, 2)      # unigrams + bigrams
)

X_train_facts_tfidf = tfidf.fit_transform(X_train['facts_clean'])



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rachi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rachi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
X_train_facts_df = pd.DataFrame(
    X_train_facts_tfidf.toarray(),
    columns=tfidf.get_feature_names_out()
)

X_train = X_train.drop(columns=['facts', 'facts_clean'])

X_train_final = pd.concat(
    [X_train.reset_index(drop=True),
     X_train_facts_df.reset_index(drop=True)],
    axis=1
)


In [18]:
X_test['facts_clean'] = X_test['facts'].apply(clean_text)
X_test['facts_clean'] = X_test['facts_clean'].apply(lambda x: lemmatize_words(x))

X_test_facts_tfidf = tfidf.transform(X_test['facts_clean'])

X_test_facts_df = pd.DataFrame(
    X_test_facts_tfidf.toarray(),
    columns=tfidf.get_feature_names_out()
)

X_test = X_test.drop(columns=['facts', 'facts_clean'])

X_test_final = pd.concat(
    [X_test.reset_index(drop=True),
     X_test_facts_df.reset_index(drop=True)],
    axis=1
)


In [19]:
X_train_final.to_csv('./dataset/X_train_final.csv', index=False)
X_test_final.to_csv('./dataset/X_test_final.csv', index=False)
y_train.to_csv('./dataset/y_train.csv', index=False)
y_test.to_csv('./dataset/y_test.csv', index=False)


In [21]:
### Saving preprocessed encoders and vectorizers
joblib.dump(tfidf, './artefacts/tfidf_vectorizer.pkl')
joblib.dump(mlb, './artefacts/ipc_mlb.pkl')


['./artefacts/ipc_mlb.pkl']