In [None]:
from pathlib import Path
import textract
folder_path=Path('datasets')

text_list=[]

for file in folder_path.iterdir():
    text_list.append(textract.process(file).decode("utf-8"))

['DEPARTMENT OF ORAL PATHOLOGY& MICROBIOLOGY\n\n\n\n\n\n\n\nName of the Patient:\n\n\n\n Biopsy no.: \n\n\n\nO.P /Bed Ref No.                                     \n\n\n\n Age / Sex:\n\n55 yrs./Female\n\nClinic/ Hospital:                     \n\nMCODS, Mangalore\n\n Dept. / Ward:\n\nOral Surgery\n\nReceived from :                  \n\nDr. Premalatha Shetty\n\n  Time: \n\n02:55Pm\n\nClinical Diagnosis : \n\nTraumatic Fibroma\n\nDate: \n\n10/03/2023\n\nSite of biopsy:  \n\nAntero-Lateral border of the tongue \n\n  Biopsy:                    \n\nExcisional \n\nGross description of the specimen:   \n\n\tOne bottle labelled as “” received contains one soft tissue white in colour, smooth to rough surface, soft to firm in consistency measuring approximately 05 x 0.5 x 0.5 cms.\n\nTissue kept for routine processing.\n\n\n\nMicroscopic Appearance:  \n\n\t The given tissue sections show the presence of hyperplastic parakeratinized stratified squamous epithelium overlying a fibrous connective tiss

In [2]:
import re
def extract_keywords(list):
    data={}
    data['Age']=(re.findall(r'Sex:?(?:\s+)(\d+)',list,re.IGNORECASE))
    data['Sex']=re.findall(r'(?:\d+)(?:\s)?(?:\w.+)?(?:\s)?\/(?:\s+)?([a-zA-Z]+)',list,re.IGNORECASE)
    data['Clinical_diagnosis']=re.findall(r'Clinical\s+Diagnosis\s:(?:\s+)(\w.+)',list,re.IGNORECASE)
    data['Biopsy']=re.findall(r'Biopsy:(?:\s+)(\w+)',list)
    data['Site_of_biopsy']=re.findall(r'Site.of.biopsy:(?:\s+)(\w.+)',list,re.IGNORECASE)
    data['Microscopic_appearance']=re.findall(r'Microscopic(?:\s+)appearance:?(?:\s+)(\w.+)',list,re.IGNORECASE)
    data['Gross_description']=re.findall(r'Gross.description.of.the.specimen:?(?:\s+)(\w.+)',list,re.IGNORECASE)
    data['Diagnosis']=re.findall(r'DIAGNOSIS:(?:\s+)(?:\w.+)(?:\n+)(\w.+)',list,re.IGNORECASE)

    return data

In [3]:
from pathlib import Path
folder_path=Path('datasets')
extract_keywords_dict={}
merged = {
    "Age": [], "Sex": [], "Clinical_diagnosis": [], "Site_of_biopsy": [],
    "Biopsy": [], "Gross_description": [], "Microscopic_appearance": [], "Diagnosis": []
}

In [4]:
for file in folder_path.iterdir():
    list=textract.process(file).decode("utf-8")
    extract_keywords_dict=extract_keywords(list)
    for key, value in extract_keywords_dict.items():
        merged[key].extend(value)

In [5]:
import pandas as pd 
df=pd.DataFrame.from_dict(merged,orient='index')
df=df.transpose()
df=df.drop(columns=['Gross_description','Microscopic_appearance'])
df

Unnamed: 0,Age,Sex,Clinical_diagnosis,Site_of_biopsy,Biopsy,Diagnosis
0,55,Female,Traumatic Fibroma,Antero-Lateral border of the tongue,Excisional,Fibroma
1,49,Male,Candidiasis,Left buccal mucosa,Incisional,Candidiasis
2,60,Male,Ca Maxilla??,"Buccal vestibule w.r.t 15,16,17",Incisional,Oral Squamous Cell Carcinoma
3,54,Female,Pyogenic Granuloma?,"Marginal Gingiva w.r.t.42,43",Excisional,Pyogenic Granuloma
4,54,Female,Squamous Cell Carcinoma,Right Maxilla,Incisional,Oral Squamous Cell Carcinoma


In [6]:
gross_description=merged['Gross_description']

In [7]:
import pandas as pd
from transformers import AutoModelForTokenClassification,AutoTokenizer,pipeline

model_name="d4data/biomedical-ner-all"
tokenizer=AutoTokenizer.from_pretrained(model_name)
model=AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline=pipeline("ner",model=model,tokenizer=tokenizer, aggregation_strategy="simple")

extracted_results=[]

for idx, description in enumerate(gross_description):
    
    entities=ner_pipeline(description)

    aggregated_entities={}

    for ent in entities:
        ent_type = ent["entity_group"]

        word = ent["word"].replace("##","")

        if ent_type in aggregated_entities:
            aggregated_entities[ent_type].append(word)
        else:
            aggregated_entities[ent_type] = [word]

    extracted_results.append(aggregated_entities)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [10]:
import pandas as pd
gross_description_df=pd.DataFrame(extracted_results)
gross_description_df.head()
gross_description_df=gross_description_df.drop(['Detailed_description','Distance','Volume','Dosage'],axis=1)

In [13]:
microscopic_appearance=merged['Microscopic_appearance']

In [14]:
extracted_results_ma=[]

for idx,description in enumerate(microscopic_appearance):

    entities=ner_pipeline(description)

    aggregated_entities={}

    for ent in entities:
        ent_type=ent['entity_group']
        word=ent['word'].replace("##","")

        if ent_type in aggregated_entities:
            aggregated_entities[ent_type].append(word)
        else:
            aggregated_entities[ent_type]=[word]
    extracted_results_ma.append(aggregated_entities)
    

In [16]:
microscopic_appearance_df=pd.DataFrame(extracted_results_ma)
microscopic_appearance_df=microscopic_appearance_df.drop(['Disease_disorder','Diagnostic_procedure','Lab_value'],axis=1)

In [18]:
df_merged=pd.concat([df,gross_description_df,microscopic_appearance_df],axis=1)
df_merged=df_merged.drop(columns=(['Severity']))

In [19]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Age                   5 non-null      object
 1   Sex                   5 non-null      object
 2   Clinical_diagnosis    5 non-null      object
 3   Site_of_biopsy        5 non-null      object
 4   Biopsy                5 non-null      object
 5   Diagnosis             5 non-null      object
 6   Texture               5 non-null      object
 7   Color                 5 non-null      object
 8   Area                  5 non-null      object
 9   Detailed_description  5 non-null      object
 10  Biological_structure  5 non-null      object
 11  Sign_symptom          5 non-null      object
dtypes: object(12)
memory usage: 612.0+ bytes


In [21]:
target=df_merged['Diagnosis']

In [22]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
target=label_encoder.fit_transform(target)
target

array([1, 0, 2, 3, 2])

In [23]:
from sklearn.preprocessing import MultiLabelBinarizer
def multilabel_encode(df , colums):
    mlb=MultiLabelBinarizer()
    for col in colums:
        encoded=mlb.fit_transform(df[col])
        encoded_df = pd.DataFrame(encoded, columns=[f"{col}_{label}" for label in mlb.classes_])
        df = df.drop(columns=[col]).join(encoded_df)
    return df
cols=['Texture','Area','Color','Detailed_description','Biological_structure','Sign_symptom']
df_merged=multilabel_encode(df_merged,cols)
df_merged.head()

Unnamed: 0,Age,Sex,Clinical_diagnosis,Site_of_biopsy,Biopsy,Diagnosis,Texture_firm,Texture_rough,Texture_smooth,Texture_soft,...,Sign_symptom_age,Sign_symptom_atosis,Sign_symptom_cell infiltrate,Sign_symptom_d,Sign_symptom_ha,Sign_symptom_infiltrate,Sign_symptom_inflammatory,Sign_symptom_inflammatory cell infiltrate,Sign_symptom_mour,Sign_symptom_tu
0,55,Female,Traumatic Fibroma,Antero-Lateral border of the tongue,Excisional,Fibroma,0,1,1,0,...,0,0,1,0,0,0,0,0,0,0
1,49,Male,Candidiasis,Left buccal mucosa,Incisional,Candidiasis,1,1,1,1,...,1,0,0,0,1,1,1,0,0,0
2,60,Male,Ca Maxilla??,"Buccal vestibule w.r.t 15,16,17",Incisional,Oral Squamous Cell Carcinoma,1,1,1,1,...,0,1,0,1,0,0,0,1,1,1
3,54,Female,Pyogenic Granuloma?,"Marginal Gingiva w.r.t.42,43",Excisional,Pyogenic Granuloma,1,0,1,1,...,0,0,0,0,0,1,1,0,0,0
4,54,Female,Squamous Cell Carcinoma,Right Maxilla,Incisional,Oral Squamous Cell Carcinoma,1,1,1,1,...,0,0,0,0,0,0,0,1,1,1


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

x = df_merged.drop(columns=['Diagnosis'])
x_train,x_test,y_train,y_test=train_test_split(x,target,test_size=0.2,random_state=40)


In [25]:
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBRegressor

estimator=[
    ('encoder',TargetEncoder()),
    ('reg',XGBRegressor())
]

pipe=Pipeline(steps=estimator)
pipe

In [26]:
from skopt import BayesSearchCV
from skopt.space import Real,Categorical,Integer

search_space={
    'reg__max_depth':Integer(2,8),
    'reg__learning_rate':Real(0.001,1.0,prior='log-uniform'),
    'reg__subsample':Real(0.5,1.0),
    'reg__colsample_bytree':Real(0.5,1.0),
    'reg__colsample_bylevel':Real(0.5,1.0),
    'reg__colsample_bynode':Real(0.5,1.0),
    'reg__reg_alpha':Real(0.0,10.0),
    'reg__reg_lamda':Real(0.0,10.0),
    'reg__gamma':Real(0.0,10.0) 
}
opt =  BayesSearchCV(pipe,search_space,cv=3,n_iter=10,scoring='neg_root_mean_squared_error')

In [27]:
opt.fit(x_train,y_train)

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters: { "reg_lamda" } are not used.

Parameters:

In [28]:
import xgboost as xg
from sklearn.metrics import mean_squared_error

y_pred = opt.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")

Mean Squared Error: 0.2500
