In [128]:
from pathlib import Path
import textract
folder_path=Path('datasets')

text_list=[]


for file in folder_path.iterdir():
    text_list.append(textract.process(file).decode("utf-8"))

In [129]:
import re
def extract_keywords(list):
    data={}
    data['Age']=(re.findall(r'Sex:?(?:\s+)(\d+)',list,re.IGNORECASE))
    data['Sex']=re.findall(r'(?:\d+)(?:\s)?(?:\w.+)?(?:\s)?\/(?:\s+)?([a-zA-Z]+)',list,re.IGNORECASE)
    data['Clinical_diagnosis']=re.findall(r'Clinical\s+Diagnosis\s:(?:\s+)(\w.+)',list,re.IGNORECASE)
    data['Biopsy']=re.findall(r'Biopsy:(?:\s+)(\w+)',list)
    data['Site_of_biopsy']=re.findall(r'Site.of.biopsy:(?:\s+)(\w.+)',list,re.IGNORECASE)
    data['Microscopic_appearance']=re.findall(r'Microscopic(?:\s+)appearance:?(?:\s+)(\w.+)',list,re.IGNORECASE)
    data['Gross_description']=re.findall(r'Gross.description.of.the.specimen:?(?:\s+)(\w.+)',list,re.IGNORECASE)
    data['Diagnosis']=re.findall(r'DIAGNOSIS:(?:\s+)(?:\w.+)(?:\n+)(\w.+)',list,re.IGNORECASE)

    return data

In [130]:
from pathlib import Path
folder_path=Path('datasets')
extract_keywords_dict={}
merged = {
    "Age": [], "Sex": [], "Clinical_diagnosis": [], "Site_of_biopsy": [],
    "Biopsy": [], "Gross_description": [], "Microscopic_appearance": [], "Diagnosis": []
}

In [131]:
for file in folder_path.iterdir():
    list=textract.process(file).decode("utf-8")
    extract_keywords_dict=extract_keywords(list)
    for key, value in extract_keywords_dict.items():
        merged[key].extend(value)

In [132]:
import pandas as pd 
df=pd.DataFrame.from_dict(merged,orient='index')
df=df.transpose()
df=df.drop(columns=['Gross_description','Microscopic_appearance'])
df

Unnamed: 0,Age,Sex,Clinical_diagnosis,Site_of_biopsy,Biopsy,Diagnosis
0,55,Female,Traumatic Fibroma,Antero-Lateral border of the tongue,Excisional,Fibroma
1,49,Male,Candidiasis,Left buccal mucosa,Incisional,Candidiasis
2,60,Male,Ca Maxilla??,"Buccal vestibule w.r.t 15,16,17",Incisional,Oral Squamous Cell Carcinoma
3,54,Female,Pyogenic Granuloma?,"Marginal Gingiva w.r.t.42,43",Excisional,Pyogenic Granuloma
4,54,Female,Squamous Cell Carcinoma,Right Maxilla,Incisional,Oral Squamous Cell Carcinoma


In [133]:
gross_description=merged['Gross_description']

In [134]:
import pandas as pd
from transformers import AutoModelForTokenClassification,AutoTokenizer,pipeline

model_name="d4data/biomedical-ner-all"
tokenizer=AutoTokenizer.from_pretrained(model_name)
model=AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline=pipeline("ner",model=model,tokenizer=tokenizer, aggregation_strategy="simple")

extracted_results=[]

for idx, description in enumerate(gross_description):
    
    entities=ner_pipeline(description)

    aggregated_entities={}

    for ent in entities:
        ent_type = ent["entity_group"]

        word = ent["word"].replace("##","")

        if ent_type in aggregated_entities:
            aggregated_entities[ent_type].append(word)
        else:
            aggregated_entities[ent_type] = [word]

    extracted_results.append(aggregated_entities)


Device set to use cpu


In [135]:
import pandas as pd
gross_description_df=pd.DataFrame(extracted_results)
gross_description_df.head()
gross_description_df=gross_description_df.drop(['Detailed_description','Distance','Volume','Dosage'],axis=1)

In [136]:
microscopic_appearance=merged['Microscopic_appearance']
microscopic_appearance

['The given tissue sections show the presence of hyperplastic parakeratinized stratified squamous epithelium overlying a fibrous connective tissue stroma. The connective tissue stroma exhibits haphazardly arranged dense bundles of collagen fibers and mild chronic inflammatory cell infiltrate (chiefly lymphocytes).',
 'The given tissue sections show the presence of 4-5 cell thick parakeratinized stratified squamous epithelium overlying the connective tissue stroma. The epithelium exhibits the presence of PAS and Gomori’s Methanamine silver positive fungal spores, hyphae, gram positive and gram negative diplobacilli. The underlying fibrocellular connective tissue stroma shows the presence of chronic inflammatory cell infiltrate (lymphocytes and plasma cells), areas of haemorrhage and adipose tissue units.',
 'The given tissue sections show the presence of dysplastic parakeratinized stratified squamous epithelium invading the underlying connective tissue in the form of sheets and islands.

In [137]:
extracted_results_ma=[]

for idx,description in enumerate(microscopic_appearance):

    entities=ner_pipeline(description)

    aggregated_entities={}

    for ent in entities:
        ent_type=ent['entity_group']
        word=ent['word'].replace("##","")

        if ent_type in aggregated_entities:
            aggregated_entities[ent_type].append(word)
        else:
            aggregated_entities[ent_type]=[word]
    extracted_results_ma.append(aggregated_entities)
    

In [138]:
microscopic_appearance_df=pd.DataFrame(extracted_results_ma)
microscopic_appearance_df=microscopic_appearance_df.drop(['Disease_disorder','Diagnostic_procedure','Lab_value'],axis=1)

In [139]:
df_merged=pd.concat([df,gross_description_df,microscopic_appearance_df],axis=1)
df_merged=df_merged.drop(columns=(['Severity']))
df_merged

Unnamed: 0,Age,Sex,Clinical_diagnosis,Site_of_biopsy,Biopsy,Diagnosis,Texture,Color,Area,Detailed_description,Biological_structure,Sign_symptom
0,55,Female,Traumatic Fibroma,Antero-Lateral border of the tongue,Excisional,Fibroma,"[soft tissue, smooth, rough, soft to firm]",[white],[05 x 0. 5 x 0. 5 cm],"[hyperplastic, para, keratinized, st, rat, us,...","[squamous epithelium, connect, connective tiss...",[cell infiltrate]
1,49,Male,Candidiasis,Left buccal mucosa,Incisional,Candidiasis,"[soft, firm, smooth, rough]","[white, brown]",[1. 0 x 0. 6 x 0. 4 cm],"[4 - 5 cell, thick, para, keratinized, st, chr...","[rat, squamous, epithelium, connect, epith, um...","[inflammatory, infiltrate, ha, age]"
2,60,Male,Ca Maxilla??,"Buccal vestibule w.r.t 15,16,17",Incisional,Oral Squamous Cell Carcinoma,"[smooth, rough, soft, firm]","[white, brown]","[1. 1 x 0., x 0., ’ cm]","[d, para, keratin, rat, dense, chronic]","[squamous epithelium, connect, connective]","[tu, mour, d, atosis, inflammatory cell infilt..."
3,54,Female,Pyogenic Granuloma?,"Marginal Gingiva w.r.t.42,43",Excisional,Pyogenic Granuloma,"[soft tissue, smooth, soft, firm]",[white],[0. 5 x 0. 5 x 0. 3 cm],"[para, keratinized, st, rat, chronic]","[squamous epithelium, connect, tissue, end]","[inflammatory, infiltrate]"
4,54,Female,Squamous Cell Carcinoma,Right Maxilla,Incisional,Oral Squamous Cell Carcinoma,"[smooth, rough, soft, firm]","[white, brown]","[1, 0. 6 cm]",[chronic],[connect],"[tu, mour, inflammatory cell infiltrate]"


In [140]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Age                   5 non-null      object
 1   Sex                   5 non-null      object
 2   Clinical_diagnosis    5 non-null      object
 3   Site_of_biopsy        5 non-null      object
 4   Biopsy                5 non-null      object
 5   Diagnosis             5 non-null      object
 6   Texture               5 non-null      object
 7   Color                 5 non-null      object
 8   Area                  5 non-null      object
 9   Detailed_description  5 non-null      object
 10  Biological_structure  5 non-null      object
 11  Sign_symptom          5 non-null      object
dtypes: object(12)
memory usage: 612.0+ bytes


In [141]:
df_merged['Biopsy_encoded'] = df_merged['Biopsy'].map({'Incisional': 0, 'Excisional': 1})
df_merged=df_merged.drop(columns=['Biopsy'])
df_merged['Sex_encoded']=df_merged['Sex'].map({'Female':0,'Male':1})
df_merged=df_merged.drop(columns=['Sex'])
df_merged

Unnamed: 0,Age,Clinical_diagnosis,Site_of_biopsy,Diagnosis,Texture,Color,Area,Detailed_description,Biological_structure,Sign_symptom,Biopsy_encoded,Sex_encoded
0,55,Traumatic Fibroma,Antero-Lateral border of the tongue,Fibroma,"[soft tissue, smooth, rough, soft to firm]",[white],[05 x 0. 5 x 0. 5 cm],"[hyperplastic, para, keratinized, st, rat, us,...","[squamous epithelium, connect, connective tiss...",[cell infiltrate],1,0
1,49,Candidiasis,Left buccal mucosa,Candidiasis,"[soft, firm, smooth, rough]","[white, brown]",[1. 0 x 0. 6 x 0. 4 cm],"[4 - 5 cell, thick, para, keratinized, st, chr...","[rat, squamous, epithelium, connect, epith, um...","[inflammatory, infiltrate, ha, age]",0,1
2,60,Ca Maxilla??,"Buccal vestibule w.r.t 15,16,17",Oral Squamous Cell Carcinoma,"[smooth, rough, soft, firm]","[white, brown]","[1. 1 x 0., x 0., ’ cm]","[d, para, keratin, rat, dense, chronic]","[squamous epithelium, connect, connective]","[tu, mour, d, atosis, inflammatory cell infilt...",0,1
3,54,Pyogenic Granuloma?,"Marginal Gingiva w.r.t.42,43",Pyogenic Granuloma,"[soft tissue, smooth, soft, firm]",[white],[0. 5 x 0. 5 x 0. 3 cm],"[para, keratinized, st, rat, chronic]","[squamous epithelium, connect, tissue, end]","[inflammatory, infiltrate]",1,0
4,54,Squamous Cell Carcinoma,Right Maxilla,Oral Squamous Cell Carcinoma,"[smooth, rough, soft, firm]","[white, brown]","[1, 0. 6 cm]",[chronic],[connect],"[tu, mour, inflammatory cell infiltrate]",0,0


In [142]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df_merged['Site_of_biopsy'] = le.fit_transform(df_merged['Site_of_biopsy'])
df_merged['Clinical_diagnosis'] = le.fit_transform(df_merged['Clinical_diagnosis'])

df_merged

Unnamed: 0,Age,Clinical_diagnosis,Site_of_biopsy,Diagnosis,Texture,Color,Area,Detailed_description,Biological_structure,Sign_symptom,Biopsy_encoded,Sex_encoded
0,55,4,0,Fibroma,"[soft tissue, smooth, rough, soft to firm]",[white],[05 x 0. 5 x 0. 5 cm],"[hyperplastic, para, keratinized, st, rat, us,...","[squamous epithelium, connect, connective tiss...",[cell infiltrate],1,0
1,49,1,2,Candidiasis,"[soft, firm, smooth, rough]","[white, brown]",[1. 0 x 0. 6 x 0. 4 cm],"[4 - 5 cell, thick, para, keratinized, st, chr...","[rat, squamous, epithelium, connect, epith, um...","[inflammatory, infiltrate, ha, age]",0,1
2,60,0,1,Oral Squamous Cell Carcinoma,"[smooth, rough, soft, firm]","[white, brown]","[1. 1 x 0., x 0., ’ cm]","[d, para, keratin, rat, dense, chronic]","[squamous epithelium, connect, connective]","[tu, mour, d, atosis, inflammatory cell infilt...",0,1
3,54,2,3,Pyogenic Granuloma,"[soft tissue, smooth, soft, firm]",[white],[0. 5 x 0. 5 x 0. 3 cm],"[para, keratinized, st, rat, chronic]","[squamous epithelium, connect, tissue, end]","[inflammatory, infiltrate]",1,0
4,54,3,4,Oral Squamous Cell Carcinoma,"[smooth, rough, soft, firm]","[white, brown]","[1, 0. 6 cm]",[chronic],[connect],"[tu, mour, inflammatory cell infiltrate]",0,0


In [143]:
import re

def extract_max_dimension(size_entry):
    try:
        # Convert to string if not already
        entry_str = str(size_entry)
        
        # Use regex to extract all numbers from the string
        # This handles complex formats with varying delimiters
        numbers = re.findall(r'(\d+\.\s*\d+|\d+)', entry_str)
        
        if not numbers:
            return None
            
        # Convert extracted strings to floats
        dimensions = [float(num.replace(' ', '')) for num in numbers]
        
        # Return the largest dimension
        return max(dimensions)
    except Exception as e:
        print(f"New error processing {size_entry}: {e}")
        return None

# Apply to get the numerical maximum dimension
df_merged['Area_Numeric'] = df_merged['Area'].apply(extract_max_dimension)

# Convert to category codes
def size_to_numerical_code(size_entry):
    max_dim = extract_max_dimension(size_entry)
    
    if max_dim is None:
        return None
    elif max_dim < 1:
        return 0  # For '<1 cm'
    elif max_dim <= 5:
        return 1  # For '1-5 cm'
    else:
        return 2  # For '>5 cm'

# Apply to get numerical category codes
df_merged['Area_Code'] = df_merged['Area'].apply(size_to_numerical_code)
df_merged=df_merged.drop(columns=['Area'])
df_merged

Unnamed: 0,Age,Clinical_diagnosis,Site_of_biopsy,Diagnosis,Texture,Color,Detailed_description,Biological_structure,Sign_symptom,Biopsy_encoded,Sex_encoded,Area_Numeric,Area_Code
0,55,4,0,Fibroma,"[soft tissue, smooth, rough, soft to firm]",[white],"[hyperplastic, para, keratinized, st, rat, us,...","[squamous epithelium, connect, connective tiss...",[cell infiltrate],1,0,5.0,1
1,49,1,2,Candidiasis,"[soft, firm, smooth, rough]","[white, brown]","[4 - 5 cell, thick, para, keratinized, st, chr...","[rat, squamous, epithelium, connect, epith, um...","[inflammatory, infiltrate, ha, age]",0,1,1.0,1
2,60,0,1,Oral Squamous Cell Carcinoma,"[smooth, rough, soft, firm]","[white, brown]","[d, para, keratin, rat, dense, chronic]","[squamous epithelium, connect, connective]","[tu, mour, d, atosis, inflammatory cell infilt...",0,1,1.1,1
3,54,2,3,Pyogenic Granuloma,"[soft tissue, smooth, soft, firm]",[white],"[para, keratinized, st, rat, chronic]","[squamous epithelium, connect, tissue, end]","[inflammatory, infiltrate]",1,0,0.5,0
4,54,3,4,Oral Squamous Cell Carcinoma,"[smooth, rough, soft, firm]","[white, brown]",[chronic],[connect],"[tu, mour, inflammatory cell infiltrate]",0,0,1.0,1


In [144]:
from sklearn.preprocessing import MultiLabelBinarizer
from ast import literal_eval

# Create a list of columns to encode
columns_to_encode = ['Detailed_description', 'Texture', 'Color']#,'Sign_symptom']# 'Biological_structure']

for column in columns_to_encode:
    
    df_merged[column] = df_merged[column].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)

    mlb = MultiLabelBinarizer()
    df_mlb=pd.DataFrame(mlb.fit_transform(df_merged[column]), columns=mlb.classes_)

    df_merged=pd.concat([df_merged, df_mlb], axis=1)

In [145]:
df_merged

Unnamed: 0,Age,Clinical_diagnosis,Site_of_biopsy,Diagnosis,Texture,Color,Detailed_description,Biological_structure,Sign_symptom,Biopsy_encoded,...,thick,us,firm,rough,smooth,soft,soft tissue,soft to firm,brown,white
0,55,4,0,Fibroma,"[soft tissue, smooth, rough, soft to firm]",[white],"[hyperplastic, para, keratinized, st, rat, us,...","[squamous epithelium, connect, connective tiss...",[cell infiltrate],1,...,0,1,0,1,1,0,1,1,0,1
1,49,1,2,Candidiasis,"[soft, firm, smooth, rough]","[white, brown]","[4 - 5 cell, thick, para, keratinized, st, chr...","[rat, squamous, epithelium, connect, epith, um...","[inflammatory, infiltrate, ha, age]",0,...,1,0,1,1,1,1,0,0,1,1
2,60,0,1,Oral Squamous Cell Carcinoma,"[smooth, rough, soft, firm]","[white, brown]","[d, para, keratin, rat, dense, chronic]","[squamous epithelium, connect, connective]","[tu, mour, d, atosis, inflammatory cell infilt...",0,...,0,0,1,1,1,1,0,0,1,1
3,54,2,3,Pyogenic Granuloma,"[soft tissue, smooth, soft, firm]",[white],"[para, keratinized, st, rat, chronic]","[squamous epithelium, connect, tissue, end]","[inflammatory, infiltrate]",1,...,0,0,1,0,1,1,1,0,0,1
4,54,3,4,Oral Squamous Cell Carcinoma,"[smooth, rough, soft, firm]","[white, brown]",[chronic],[connect],"[tu, mour, inflammatory cell infiltrate]",0,...,0,0,1,1,1,1,0,0,1,1


In [146]:
df_merged=df_merged.drop(columns=['Texture','Color','Detailed_description','Biological_structure','Sign_symptom'])
df_merged

Unnamed: 0,Age,Clinical_diagnosis,Site_of_biopsy,Diagnosis,Biopsy_encoded,Sex_encoded,Area_Numeric,Area_Code,4 - 5 cell,chronic,...,thick,us,firm,rough,smooth,soft,soft tissue,soft to firm,brown,white
0,55,4,0,Fibroma,1,0,5.0,1,0,0,...,0,1,0,1,1,0,1,1,0,1
1,49,1,2,Candidiasis,0,1,1.0,1,1,1,...,1,0,1,1,1,1,0,0,1,1
2,60,0,1,Oral Squamous Cell Carcinoma,0,1,1.1,1,0,1,...,0,0,1,1,1,1,0,0,1,1
3,54,2,3,Pyogenic Granuloma,1,0,0.5,0,0,1,...,0,0,1,0,1,1,1,0,0,1
4,54,3,4,Oral Squamous Cell Carcinoma,0,0,1.0,1,0,1,...,0,0,1,1,1,1,0,0,1,1


In [147]:
x=df_merged.drop(columns=['Diagnosis'])
x['Age'] = x['Age'].astype(int)
x

Unnamed: 0,Age,Clinical_diagnosis,Site_of_biopsy,Biopsy_encoded,Sex_encoded,Area_Numeric,Area_Code,4 - 5 cell,chronic,chronic inflammatory,...,thick,us,firm,rough,smooth,soft,soft tissue,soft to firm,brown,white
0,55,4,0,1,0,5.0,1,0,0,1,...,0,1,0,1,1,0,1,1,0,1
1,49,1,2,0,1,1.0,1,1,1,0,...,1,0,1,1,1,1,0,0,1,1
2,60,0,1,0,1,1.1,1,0,1,0,...,0,0,1,1,1,1,0,0,1,1
3,54,2,3,1,0,0.5,0,0,1,0,...,0,0,1,0,1,1,1,0,0,1
4,54,3,4,0,0,1.0,1,0,1,0,...,0,0,1,1,1,1,0,0,1,1


In [148]:
y=df_merged['Diagnosis']
y

0                         Fibroma
1                    Candidiasis 
2    Oral Squamous Cell Carcinoma
3              Pyogenic Granuloma
4    Oral Squamous Cell Carcinoma
Name: Diagnosis, dtype: object

In [149]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y=le.fit_transform(y)
y


array([1, 0, 2, 3, 2])

In [150]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=20)


In [151]:
target=df_merged['Diagnosis']

In [152]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
target=label_encoder.fit_transform(target)
target

array([1, 0, 2, 3, 2])

In [153]:
y_train = y_train -1
y_test = y_test  - 1 # If y_test is also affected

In [154]:
print("Label Mapping:", dict(zip(le.classes_, le.transform(le.classes_))))


Label Mapping: {'Candidiasis ': np.int64(0), 'Fibroma': np.int64(1), 'Oral Squamous Cell Carcinoma': np.int64(2), 'Pyogenic Granuloma': np.int64(3)}


In [155]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
import numpy as np

model = xgb.XGBClassifier(eval_metric='mlogloss') 

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.0


In [156]:
y_pred_original = le.inverse_transform(y_pred)
print("Original Predicted Labels:", y_pred_original)


Original Predicted Labels: ['Fibroma']


In [157]:
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBRegressor,XGBClassifier

estimator=[
    ('encoder',TargetEncoder()),
    ('reg',XGBClassifier())
]

pipe=Pipeline(steps=estimator)
pipe

In [158]:
'''from skopt import BayesSearchCV
from skopt.space import Real,Categorical,Integer

search_space={
    'reg__max_depth':Integer(2,8),
    'reg__learning_rate':Real(0.001,1.0,prior='log-uniform'),
    'reg__subsample':Real(0.5,1.0),
    'reg__colsample_bytree':Real(0.5,1.0),
    'reg__colsample_bylevel':Real(0.5,1.0),
    'reg__colsample_bynode':Real(0.5,1.0),
    'reg__reg_alpha':Real(0.0,10.0),
    'reg__reg_lamda':Real(0.0,10.0),
    'reg__gamma':Real(0.0,10.0) 
}
opt =  BayesSearchCV(pipe,search_space,cv=3,n_iter=10)'''

"from skopt import BayesSearchCV\nfrom skopt.space import Real,Categorical,Integer\n\nsearch_space={\n    'reg__max_depth':Integer(2,8),\n    'reg__learning_rate':Real(0.001,1.0,prior='log-uniform'),\n    'reg__subsample':Real(0.5,1.0),\n    'reg__colsample_bytree':Real(0.5,1.0),\n    'reg__colsample_bylevel':Real(0.5,1.0),\n    'reg__colsample_bynode':Real(0.5,1.0),\n    'reg__reg_alpha':Real(0.0,10.0),\n    'reg__reg_lamda':Real(0.0,10.0),\n    'reg__gamma':Real(0.0,10.0) \n}\nopt =  BayesSearchCV(pipe,search_space,cv=3,n_iter=10)"