Importing libraries

In [15]:
import pandas as pd
import re
from transformers import BertTokenizer, BertModel
import torch

Setting up the database connection

In [16]:
from configparser import ConfigParser
from pathlib import Path

#Locate the config file in the current working directory
base_path = Path.cwd()
config_path = base_path /"config_vector.ini"

#Initialize config parser
config = ConfigParser()
#ConfigParser.read() silently skips files it cannot open and returns the list of
#files it actually parsed, so stop here with a clear error rather than failing
#later with a KeyError on the 'database' section
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read config file: {config_path}")

#Accessing database details
db_config = config['database']
username = db_config['username']
pwd = db_config['password']
hostname = db_config['hostname']
#getint() performs the same int() conversion as before
port_id = db_config.getint('port_id')
database = db_config['database']

Reading the Data

In [17]:
#Creating SQLAlchemy engine
from sqlalchemy import create_engine, Column, String
from sqlalchemy.engine import URL
#declarative_base is imported from sqlalchemy.orm; the sqlalchemy.ext.declarative
#location is deprecated and emits a warning on import
from sqlalchemy.orm import declarative_base, sessionmaker

#Build the URL from its components instead of an f-string so that special
#characters in the credentials (e.g. '@', ':' or '/') are escaped correctly
db_url = URL.create(
    "postgresql",
    username=username,
    password=pwd,
    host=hostname,
    port=port_id,
    database=database,
)
engine = create_engine(db_url)

#Session factory bound to the engine
Session = sessionmaker(bind=engine)

#Defining ORM model
Base = declarative_base()

class Najirs(Base):
    """ORM mapping of the "najirs_v4" table, limited to the columns read below."""
    __tablename__ = "najirs_v4"

    id = Column(String, primary_key=True)
    content = Column(String)
    date = Column(String)
    # NOTE(review): later cells index this value like a dict (content['principles'],
    # content.items()); presumably the column is JSON/JSONB and decoded by the
    # driver - confirm that String is the intended declared type.
    data = Column(String)

#Query data safely; the context manager closes the session (and releases its
#connection) as soon as the rows have been fetched
with Session() as session:
    results = session.query(Najirs.id, Najirs.content, Najirs.data, Najirs.date).all()
df = pd.DataFrame(results, columns= ["id", "content", "data", "date"])

df.head()

  Base = declarative_base()


Unnamed: 0,id,content,data,date
0,१००३८,"<div class=""col-md-8 para-sections"">\n<div id=...",{'body': ['न्या.डा.आनन्दमोहन भट्टराई : न्याय प...,२०७४-१२-१९
1,४६१५,"<div class=""col-md-8 para-sections"">\n<div id=...","{'body': ['न्या.गजेन्द्रकेशरी वास्तोला', '१. ...",२०४९-०९-०८
2,११२१,"<div class=""col-md-8 para-sections"">\n<div id=...",{'body': ['न्या. धनेन्द्रबहादुर सिंह : प्रस्तु...,२०३५-०१-१२
3,२४०६,"<div class=""col-md-8 para-sections"">\n<div id=...",{'body': ['न्या.बब्बरप्रसाद सिंहः नेपालको संवि...,२०४२-०५-१९
4,३१४४,"<div class=""col-md-8 para-sections"">\n<div id=...",{'body': ['न्या.पृथ्वी बहादुर सिंहः नेपालको सं...,२०४४-०२-२५


Preprocessing the dataframe

In [18]:
#Keep only the 'data' column. Rebinding instead of inplace=True, together with
#errors='ignore', lets this cell be re-run without a KeyError on columns that
#were already dropped.
df = df.drop(columns=['id', 'content', 'date'], errors='ignore')

def is_principles_empty(content):
    """Return True when ``content`` carries no usable 'principles' entry.

    Parameters
    ----------
    content : dict or any
        One value of the 'data' column. Anything that is not a dict (for
        example ``None``) is treated as having no principles instead of
        raising a ``TypeError``.

    Returns
    -------
    bool
        True if the 'principles' key is missing or its value is falsy
        (empty list, empty string, ``None``); False otherwise.
    """
    if not isinstance(content, dict):
        return True
    return not content.get('principles')

#Removing rows with empty 'principles'
#astype(bool) keeps the mask boolean even if apply() returns an object Series
empty_principles_mask = df['data'].apply(is_principles_empty).astype(bool)
empty_principles_count = empty_principles_mask.sum()
#.copy() makes the filtered frame independent of the original, so adding columns
#to it later does not trigger SettingWithCopyWarning
df = df[~empty_principles_mask].copy()
print(f"Removed {empty_principles_count} rows with empty 'principles' key.")

Removed 572 rows with empty 'principles' key.


Preprocessing the actual data

In [19]:
#Stopword file is expected in the working directory
file_path = base_path/"nepali_stopwords.txt"

#Reading the content of the file
content = file_path.read_text(encoding="utf-8")

#Entries are comma-separated; strip surrounding whitespace and double quotes
raw_entries = content.split(",")
stopwords = [entry.strip().strip('"') for entry in raw_entries]

def clean_text(text):
    """Reduce ``text`` to space-separated Devanagari words without stopwords.

    Steps, in order: remove the zero-width (non-)joiners, remove every
    character that is neither in U+0900-U+097F nor whitespace, remove digit
    runs, then split on whitespace and drop tokens found in the module-level
    ``stopwords`` collection.

    Note: a later cell defines ``clean_text`` again; once that cell has run,
    this version is no longer the one bound to the name.
    """
    patterns = (
        r'[\u200c\u200d]',       #Remove unwanted characters like \u200c and \u200d
        r'[^\u0900-\u097F\s]',   #Keep only Devanagari characters and spaces
        r'\d+',                  #Remove digits
    )
    for pattern in patterns:
        text = re.sub(pattern, '', text)

    #Tokenize and remove stopwords
    kept = (token for token in text.split() if token not in stopwords)
    return " ".join(kept)

# Extract and preprocess "subject" and "principles"
def extract_text(content):
    """Collect the "subject" and "principles" text of one record and clean it.

    Parameters
    ----------
    content : dict
        Record whose "subject" / "principles" values are either a string or a
        list whose items are strings or lists of strings. Other keys are
        ignored.

    Returns
    -------
    str
        The collected text passed through ``clean_text``; an empty string if
        anything goes wrong while processing the record (the error is printed).

    Note: a later cell defines ``extract_text`` again; once that cell has run,
    this version is no longer the one bound to the name.
    """
    try:
        parts = []
        for key, value in content.items():
            if key not in ("subject", "principles"):
                continue
            if isinstance(value, str):
                parts.append(value)
            elif isinstance(value, list):
                for item in value:
                    # str() coercion: a single non-string item would otherwise
                    # raise TypeError and blank out the whole record
                    if isinstance(item, list):
                        parts.append(" ".join(map(str, item)))
                    else:
                        parts.append(str(item))
        return clean_text(" ".join(parts).strip())
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to no text
        print(f"Error processing content: {e}")
        return ""

In [20]:
def clean_text(text):
    """Reduce ``text`` to single-space-separated Devanagari words.

    Applied in order: zero-width (non-)joiners are removed, every character
    that is neither in U+0900-U+097F nor whitespace is removed, digit runs are
    removed, and the danda is removed. Remaining whitespace runs collapse to a
    single space and leading/trailing whitespace disappears.
    """
    patterns = (
        r'[\u200c\u200d]',
        r'[^\u0900-\u097F\s]',   #Keep only Devanagari characters and spaces
        r'\d+',                  #Remove digits
        r'।',
    )
    for pattern in patterns:
        text = re.sub(pattern, '', text)

    return " ".join(text.split())

def extract_text(content):
    """Gather the "subject" and "principles" text of one record and clean it.

    String values are taken as they are; list values contribute one piece per
    item, where nested lists are space-joined and everything is coerced with
    ``str``. The pieces are space-joined and passed to ``clean_text``.

    Returns an empty string (after printing the error) if processing fails.
    """
    wanted_keys = ("subject", "principles")
    try:
        pieces = []
        for field, payload in content.items():
            if field not in wanted_keys:
                continue
            if isinstance(payload, str):
                pieces.append(payload)
            elif isinstance(payload, list):
                pieces.extend(
                    " ".join(map(str, entry)) if isinstance(entry, list) else str(entry)
                    for entry in payload
                )
        return clean_text(" ".join(pieces).strip())
    except Exception as e:
        print(f"Error processing content: {e}")
        return ""

In [21]:
#Applying preprocessing
#assign() and drop() each return a new frame, so nothing is written into a
#filtered slice (no SettingWithCopyWarning) and no object is mutated in place
df = df.assign(cleaned_text=df['data'].apply(extract_text)).drop(columns=['data'])

Loading the Pretrained NepBERTa Model and Generating Embeddings

In [22]:
#Loading NepBERTa tokenizer and model
#Both come from the same checkpoint; from_tf=True because the model weights are
#initialised from a TensorFlow checkpoint
nepberta_checkpoint = "NepBERTa/NepBERTa"
tokenizer = BertTokenizer.from_pretrained(nepberta_checkpoint)
model = BertModel.from_pretrained(nepberta_checkpoint, from_tf=True)





All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


In [23]:
def get_sentence_embedding(sentences):
    """Encode ``sentences`` with the module-level tokenizer and model.

    The input is padded and truncated to at most 512 tokens, run through the
    model with gradient tracking disabled, and the model's ``pooler_output``
    is returned as a NumPy array.

    ``sentences`` is forwarded to the tokenizer unchanged; this notebook calls
    the function with a one-element list of strings.
    """
    encoded = tokenizer(
        sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        pooled = model(**encoded).pooler_output
    return pooled.numpy()

#Generating embeddings
def _embed_or_none(cleaned):
    """Embed one cleaned text; falsy input (e.g. an empty string) maps to None."""
    if not cleaned:
        return None
    return get_sentence_embedding([cleaned])

df['embeddings'] = df['cleaned_text'].apply(_embed_or_none)
print("Embedding done")

Embedding done


In [24]:
#Preview the first rows of the embeddings column
embeddings_preview = df['embeddings'].head()
print(embeddings_preview)

0    [[-0.023295283, 0.04019555, 0.22646093, 0.1248...
1    [[-0.18046029, 0.052735027, 0.23040603, 0.1659...
2    [[-0.2502869, -0.0183751, 0.117184184, 0.09710...
3    [[-0.2658006, -0.10030958, 0.21205233, 0.08202...
4    [[-0.11624412, -0.095075056, 0.28900784, 0.041...
Name: embeddings, dtype: object


Saving the Embeddings 

In [25]:
#Destination of the pickled DataFrame, inside the working directory
export_path = base_path.joinpath("najirs_NepBERTa.pkl")

#Persist the DataFrame (cleaned text and embeddings) as a pickle file
df.to_pickle(export_path)

Export path: d:\Python\ML-Projects\Intern-Synapse\TextVectorization\najirs_NepBERTa.pkl
Data Frame Saved!
