# PROCESSING

In [1]:
from pathlib import Path

In [2]:
base = Path('./')

post_patch_csvs = [f for f in base.rglob("process_list_autogen.csv") if f.parts[0] != "archived"]

post_feat_ext_csvs = [f for f in base.rglob("post_feat_ext.csv") if f.parts[0] != "archived"]

In [3]:
import pandas as pd
pd.set_option('display.max_rows', None)

In [4]:
def _load_stage_log(csv_paths, error_chars):
    """Concatenate one pipeline stage's status CSVs into a single frame keyed by image_id.

    Args:
        csv_paths: iterable of CSV paths; each file must have `slide_id` and `error` columns.
        error_chars: slice applied to every error message to truncate it.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the '.svs' suffix removed from the
        slide ids, truncated error messages, and `slide_id` renamed to `image_id`.
    """
    # ignore_index avoids repeating 0..k index labels once per input file.
    stage = pd.concat([pd.read_csv(csv) for csv in csv_paths], ignore_index=True)
    stage["slide_id"] = stage["slide_id"].str.replace('.svs', '', regex=False)
    # astype(object) keeps `.str` usable when no row has an error (read_csv then
    # yields an all-NaN float column, which the .str accessor rejects).
    stage["error"] = stage["error"].astype(object).str[error_chars]  # Truncate error message
    return stage.rename(columns={'slide_id': 'image_id'})


# Patching log: keep the first 20 characters of each error message.
post_patch = _load_stage_log(post_patch_csvs, slice(None, 20))

# Already existing ones here can be interpreted as patched
post_patch.loc[
    post_patch["status"].isin(["processed", "already_exist"]), "status"
] = "patched"

# Feature-extraction log: keep the last 20 characters of each error message.
post_feat_ext = _load_stage_log(post_feat_ext_csvs, slice(-20, None))

# Already existing ones here can be interpreted as features extracted
post_feat_ext.loc[
    post_feat_ext["status"] == "already_exist", "status"
] = "features_extracted"

In [5]:
post_patch.head()

Unnamed: 0,image_id,process,status,seg_level,sthresh,mthresh,close,use_otsu,keep_ids,exclude_ids,a_t,a_h,max_n_holes,vis_level,line_thickness,use_padding,contour_fn,error
0,PATH000001432,0,patched,-2,8,3,4,True,none,none,1.0,1.0,8,-1,100,True,four_pt,
1,PATH000001472,0,patched,-2,8,3,4,True,none,none,1.0,1.0,8,-1,100,True,four_pt,
2,PATH000001571,0,patched,-2,8,3,4,True,none,none,1.0,1.0,8,-1,100,True,four_pt,
3,PATH000001583,0,patched,-2,8,3,4,True,none,none,1.0,1.0,8,-1,100,True,four_pt,
4,PATH000001588,0,patched,-2,8,3,4,True,none,none,1.0,1.0,8,-1,100,True,four_pt,


In [6]:
master = pd.read_csv("labels.csv")

master.head()

Unnamed: 0,image_id,usable,label,site_label,expert_label,expert_label_other,expert_label_notes,country,patient_id,svs_delivered
0,PATH000000291,0,Missing,Missing,,,,.,,1
1,PATH000000452,0,Missing,Missing,,,,.,,1
2,PATH000001313,0,Missing,Missing,,,,.,,1
3,PATH000001317,0,Missing,Missing,,,,.,,1
4,PATH000001376,0,Missing,Missing,,,,.,,1


In [8]:
# Every slide starts as "tbp" with no error; the stage logs below then overwrite
# those defaults for the slides they mention.
master = master.assign(status="tbp", error=pd.NA)
master_i = master.set_index('image_id')

post_patch_i = post_patch.set_index('image_id')
post_feat_ext_i = post_feat_ext.set_index('image_id')

# Order matters: patching results are applied first, so feature-extraction
# results win for slides that appear in both logs.
for stage_log in (post_patch_i, post_feat_ext_i):
    master_i.update(stage_log[['status', 'error']])

master = master_i.reset_index()

### We decided to use only the images that Diego has reviewed, so we need to update the usable flag accordingly. We are also excluding the Malawi normals (Negative/Reactive).

In [9]:
# Slides without an expert label are not usable. The original rule matched only a
# single-space string; NaN, empty and other whitespace-only values are treated the
# same way so the rule still holds if the CSV encodes blanks differently.
# NOTE(review): assumes a missing expert_label always means "not reviewed" - confirm.
expert_label_text = master['expert_label']
not_reviewed = expert_label_text.isna() | expert_label_text.astype(str).str.strip().eq("")
master.loc[not_reviewed, 'usable'] = 0

Let's look at the label distribution across countries. Malawi contributes a large share of the Negative/Reactive slides, so if we have to downsample, it makes sense to exclude these systematically.

In [24]:
# Flag every Malawi slide labelled Negative/Reactive as unusable.
malawi_normals = master['country'].eq("Malawi") & master['label'].eq("Negative/Reactive")
master.loc[malawi_normals, 'usable'] = 0

In [25]:
# Sanity check: count Malawi Negative/Reactive slides that are still usable and
# already processed (expected 0). `usable` is a 0/1 integer column, so compare it
# explicitly instead of using it as a boolean operand.
len(master[
    (master['country'] == "Malawi") & (master['label'] == "Negative/Reactive") & (master.usable == 1) & (master.status != "tbp")
])

0

In [26]:
master.status.value_counts()

status
tbp                   7813
features_extracted    7530
failed_seg             286
failed_ext              40
Name: count, dtype: int64

In [27]:
master.head()

Unnamed: 0,image_id,usable,label,site_label,expert_label,expert_label_other,expert_label_notes,country,patient_id,svs_delivered,status,error
0,PATH000000291,0,Missing,Missing,,,,.,,1,tbp,
1,PATH000000452,0,Missing,Missing,,,,.,,1,tbp,
2,PATH000001313,0,Missing,Missing,,,,.,,1,tbp,
3,PATH000001317,0,Missing,Missing,,,,.,,1,tbp,
4,PATH000001376,0,Missing,Missing,,,,.,,1,tbp,


# EXPLORATION

### Let's see if the svs_delivered flag is accurate for the samples we've processed thus far.

In [28]:
master.groupby(['svs_delivered','status']).size()

svs_delivered  status            
0              tbp                   4216
1              failed_ext              40
               failed_seg             286
               features_extracted    7530
               tbp                   3597
dtype: int64

### Let's see how much of what we've done we can actually use

In [30]:
master.groupby('usable')['status'].value_counts()

usable  status            
0       tbp                   5463
        features_extracted    4092
        failed_seg             132
        failed_ext              19
1       features_extracted    3438
        tbp                   2350
        failed_seg             154
        failed_ext              21
Name: count, dtype: int64

In [31]:
master.groupby(['svs_delivered','usable']).size()

svs_delivered  usable
0              0         4216
1              0         5490
               1         5963
dtype: int64

# PROCESSING 

### Grouping labels

In [32]:
# Collapse the expert's fine-grained diagnoses into the coarse training labels.
# Rows whose expert_label is not listed here keep their existing label.
EXPERT_LABEL_GROUPS = {
    "Insufficient/Inadequate": "insufficient",
    "Atypia: Specify": "atypia",
    "Other: Specify": "other",
    "CIN1": "low_grade",
    "CIN2": "high_grade",
    "CIN3": "high_grade",
    "AIS": "high_grade",
    "Adenocarcinoma Invasive": "cancer",
    "Adenosquamous Carcinoma": "cancer",
    "Other Cancer: Specify": "cancer",
    "Squamous Invasive Carcinoma": "cancer",
    "Negative/Reactive": "normal",
}
grouped = master.expert_label.map(EXPERT_LABEL_GROUPS)
master['label'] = master['label'].where(grouped.isna(), grouped)

# "Other: Specify" rows are re-routed based on the free-text field.
is_other = master.expert_label == "Other: Specify"


def _other_mentions(phrase):
    """Case-insensitive test of whether expert_label_other contains `phrase`.

    Missing free text counts as "no mention" (na=False), so the result is always
    boolean and safe to negate with `~`.
    """
    return master.expert_label_other.str.contains(phrase, case=False, na=False)


# Applied in this order on purpose: a row matching both rules ends up 'normal'.
master.loc[
    is_other & _other_mentions("carcinoma") & ~_other_mentions("rule out"), 'label'
] = 'cancer'
master.loc[
    is_other & _other_mentions("microglandular hyperplasia"), 'label'
] = 'normal'

### We discard other and atypia

In [39]:
# Slides grouped as 'other' or 'atypia' are excluded from the usable set.
master.loc[master.label.isin(["other", "atypia"]), 'usable'] = 0

In [41]:
len(master[(master.label == "other") | (master.label == "atypia")])

75

In [40]:
master.groupby('usable')['status'].value_counts()

usable  status            
0       tbp                   5483
        features_extracted    4137
        failed_seg             134
        failed_ext              19
1       features_extracted    3393
        tbp                   2330
        failed_seg             152
        failed_ext              21
Name: count, dtype: int64

# EXPLORATION

### Let's see the label distributions over the samples we can use.

In [42]:
import numpy as np

In [43]:
usable = master[master['usable'] == 1].copy()

In [44]:
def with_overalls(df, overall_label='Overall'):
    """Return a copy of `df` with a totals column and a totals row appended.

    Args:
        df: table of values to total; it is not modified.
        overall_label: name used for both the added column and the added row.

    Returns:
        A new DataFrame whose last column holds each row's sum and whose last
        row holds each column's sum (including the new totals column).
    """
    totals = df.copy()
    # Row sums go in first so the bottom row also totals the new column.
    totals[overall_label] = totals.sum(axis=1)
    bottom_row = totals.sum(axis=0).rename(overall_label).to_frame().T
    return pd.concat([totals, bottom_row])

In [45]:
with_overalls(pd.crosstab(usable.status, usable.label))

label,cancer,high_grade,insufficient,low_grade,normal,Overall
failed_ext,0,5,6,0,10,21
failed_seg,0,11,52,8,81,152
features_extracted,26,428,1060,192,1687,3393
tbp,17,307,362,147,1497,2330
Overall,43,751,1480,347,3275,5896


In [46]:
# Calculate percentages
percentages = ((pd.crosstab(usable.country, usable.expert_label) / usable.expert_label.value_counts()) * 100).T

# Format with % symbol and round to 2 decimal places
formatted = (percentages.round(1).astype(str) + '%').replace(['0.0%', '0.%'],'')
formatted["Overall"] = "100%"

formatted

country,Brazil Brasilia,Cambodia,Dominican Republic,El Salvador,Eswatini,Honduras,Malawi,Nigeria,Tanzania,Overall
expert_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AIS,,85.7%,,14.3%,,,,,,100%
Adenocarcinoma Invasive,,40.0%,,,,20.0%,20.0%,20.0%,,100%
Adenosquamous Carcinoma,,,100.0%,,,,,,,100%
CIN1,7.2%,15.6%,16.7%,16.7%,2.6%,7.2%,22.8%,8.6%,2.6%,100%
CIN2,7.2%,21.7%,13.8%,22.5%,4.3%,3.6%,18.8%,3.6%,4.3%,100%
CIN3,7.4%,34.5%,6.9%,15.5%,1.3%,3.0%,18.2%,3.1%,10.1%,100%
Insufficient/Inadequate,2.7%,4.0%,4.9%,22.6%,0.6%,6.8%,37.8%,16.3%,4.3%,100%
Negative/Reactive,8.1%,26.0%,9.4%,30.5%,2.4%,7.1%,,10.5%,6.0%,100%
Other Cancer: Specify,,,,100.0%,,,,,,100%
Other: Specify,,57.1%,14.3%,28.6%,,,,,,100%


In [47]:
# Calculate percentages
percentages = ((pd.crosstab(usable.country, usable.label) / usable.label.value_counts()) * 100).T

# Format with % symbol and round to 2 decimal places
formatted = (percentages.round(1).astype(str) + '%').replace(['0.0%', '0.%'],'')
formatted["Overall"] = "100%"

formatted

country,Brazil Brasilia,Cambodia,Dominican Republic,El Salvador,Eswatini,Honduras,Malawi,Nigeria,Tanzania,Overall
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
cancer,2.3%,51.2%,4.7%,4.7%,,9.3%,11.6%,2.3%,14.0%,100%
high_grade,7.3%,32.6%,8.1%,16.8%,1.9%,3.1%,18.1%,3.2%,8.9%,100%
insufficient,2.7%,4.0%,4.9%,22.6%,0.6%,6.8%,37.8%,16.3%,4.3%,100%
low_grade,7.2%,15.6%,16.7%,16.7%,2.6%,7.2%,22.8%,8.6%,2.6%,100%
normal,8.1%,26.0%,9.5%,30.5%,2.4%,7.1%,,10.5%,6.0%,100%


In [50]:
pd.crosstab(usable.country, usable.label).T

country,Brazil Brasilia,Cambodia,Dominican Republic,El Salvador,Eswatini,Honduras,Malawi,Nigeria,Tanzania
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
cancer,1,22,2,2,0,4,5,1,6
high_grade,55,245,61,126,14,23,136,24,67
insufficient,40,59,73,335,9,101,559,241,63
low_grade,25,54,58,58,9,25,79,30,9
normal,266,852,310,998,77,233,0,343,196


In [38]:
# Usable slides whose status is "failed_seg".
# NOTE(review): this subset is not used by the crosstab below - confirm whether
# the table was meant to be restricted to it.
failed_seg = usable.loc[usable.status.eq("failed_seg")]

# Error message vs. label over all usable slides; dropna=False keeps the row for
# slides that have no error recorded.
pd.crosstab(index=usable.error, columns=usable.label, dropna=False)

label,cancer,high_grade,insufficient,low_grade,normal
error,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
exited unexpectedly,0,1,1,0,2
Not a JPEG file: sta,0,0,0,0,1
arts with 0x00 0x00\n,0,4,3,0,8
axis 0 is out of bou,0,0,1,0,0
cannot identify imag,0,0,1,0,1
no_tissue,0,0,40,0,0
re end of JPEG file\n,0,0,2,0,0
too_large,0,11,10,8,79
,43,735,1422,339,3184


### Preparing for training

In [39]:
master.head()

Unnamed: 0,image_id,usable,label,site_label,expert_label,expert_label_other,expert_label_notes,country,patient_id,svs_delivered,status,error
0,PATH000000291,0,Missing,Missing,,,,.,,1,tbp,
1,PATH000000452,0,Missing,Missing,,,,.,,1,tbp,
2,PATH000001313,0,Missing,Missing,,,,.,,1,tbp,
3,PATH000001317,0,Missing,Missing,,,,.,,1,tbp,
4,PATH000001376,0,Missing,Missing,,,,.,,1,tbp,


In [40]:
# Training manifest: usable slides whose features have been extracted.
# `&` binds tighter than `==`, so each comparison needs its own parentheses;
# without them the filter is evaluated as `(status_ok & usable) == 1`.
ready_for_training = (master.status == "features_extracted") & (master.usable == 1)

prepped = master.loc[
    ready_for_training, ['patient_id', 'image_id', 'label', 'country']
].rename(
    columns={
        'patient_id': 'case_id',
        'image_id': 'slide_id'
    }
)

In [41]:
len(prepped)

3393

WE CATCH ALL ERRORS FOR MISSING PT FILES, BUT WE STILL WANT TO MAKE SURE ALL THESE ENTRIES EXIST IN THE SYMLINKED DIR.

In [42]:
pt_files = Path('./pave_training/pathology_features/pt_files')

# Check each slide's feature file once, report the missing ones, then filter the
# frame in a single pass instead of rebuilding it for every missing slide.
has_features = prepped.slide_id.map(
    lambda slide_id: (pt_files / f"{slide_id}.pt").is_file()
).astype(bool)  # astype keeps the mask boolean even when prepped is empty

for slide_id in prepped.loc[~has_features, 'slide_id']:
    print(f"{slide_id} is not in pt_files, removing from prepped csv.")

prepped = prepped[has_features]

In [43]:
prepped.to_csv("/projects/ataghinia@xsede.org/PAVE-Pathology/dataset_csv/pathology_full_subtyping.csv")

In [44]:
prepped.groupby('label').size()

label
cancer            26
high_grade       428
insufficient    1060
low_grade        192
normal          1687
dtype: int64

### Check on status of directories

In [34]:
import os
import pandas as pd

def num_files(basepath, x, subdir):
    """Count the entries in `basepath/x/subdir`, returning 0 if it cannot be listed.

    Args:
        basepath: root directory.
        x: name of the entry directly under `basepath`.
        subdir: name of the sub-directory of `x` to count entries in.

    Returns:
        Number of directory entries, or 0 when the directory is missing, is not
        a directory, or cannot be read.
    """
    try:
        return len(os.listdir(os.path.join(basepath, x, subdir)))
    except OSError:
        # Only filesystem problems mean "nothing there"; programming errors
        # (e.g. a non-path argument) are no longer silently reported as 0.
        return 0

def subdir_counts(basepath):
    """Tabulate how many entries each directory under `basepath` has per stage folder.

    Args:
        basepath: directory whose entries are inspected, in sorted order.

    Returns:
        DataFrame indexed by entry name with the columns 'wsis', 'patches' and
        'pt_files', each holding the count returned by `num_files`.
    """
    cols = ['wsis', 'patches', 'pt_files']
    entries = sorted(os.listdir(basepath))
    counts = [[num_files(basepath, entry, col) for col in cols] for entry in entries]
    # Build the frame once rather than appending one row per entry.
    return pd.DataFrame(counts, index=entries, columns=cols)
    


In [35]:
subdir_counts("navyblue").T

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T
wsis,300,300,300,300,300,300,300,300,300,300,300,300,300,48,0,0,0,0,0,0
patches,298,296,292,282,280,300,288,296,298,294,298,296,298,47,0,0,0,0,0,0
pt_files,285,296,283,278,277,300,288,295,298,294,295,295,298,47,0,0,0,0,0,0


In [36]:
subdir_counts("pv3").T

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q
wsis,300,300,300,300,300,300,300,300,300,300,300,300,300,8,0,0,0
patches,281,254,263,298,283,290,290,297,292,291,285,286,294,8,0,0,0
pt_files,273,253,262,298,282,290,290,297,292,291,285,286,294,8,0,0,0


### We need to define new CSVs for our three binary classifiers (sufficiency, normalcy, and management)

In [37]:
# Binary sufficiency task: every label other than "insufficient" becomes "sufficient".
sufficiency = prepped.copy()
sufficiency['label'] = sufficiency['label'].where(
    sufficiency['label'] == "insufficient", 'sufficient'
)

In [38]:
sufficiency['insufficient'] = sufficiency.label == "insufficient"

# Substring of the reviewer's free-text note that signals each insufficiency reason.
REASON_KEYWORDS = {
    'blurry': "blu",
    'mucus': "muc",
    'scant_material': "mater",
    'scant_cells': "cell",
    'inflammation': "inf",
}

# `prepped` (and so `sufficiency`) does not carry expert_label_notes, so the flags
# are computed on `usable` and aligned onto `sufficiency` by index.
is_insufficient = usable.label == "insufficient"
for reason, keyword in REASON_KEYWORDS.items():
    # na=False: a missing note never counts as a match.
    mentions = usable.expert_label_notes.str.contains(keyword, case=False, na=False)
    # Rows absent from `usable` become False rather than NaN.
    sufficiency[reason] = (is_insufficient & mentions).reindex(sufficiency.index, fill_value=False)

In [39]:
print(
    sufficiency[['insufficient','scant_material', 'blurry', 'mucus', 'scant_cells', 'inflammation']].sum(),
    f"\nTotal: {len(sufficiency)}"
)

insufficient      1060
scant_material     363
blurry             366
mucus              260
scant_cells        148
inflammation       177
dtype: int64 
Total: 3393


In [40]:
sufficiency.label.value_counts()

label
sufficient      2333
insufficient    1060
Name: count, dtype: int64

In [45]:
# Binary normalcy task on sufficient slides only: "normal" vs. everything else.
normalcy = prepped.loc[prepped.label != 'insufficient'].copy()
normalcy['label'] = normalcy['label'].where(normalcy['label'] == "normal", 'abnormal')

In [46]:
normalcy.label.value_counts()

label
normal      1687
abnormal     646
Name: count, dtype: int64

In [48]:
# Binary management task on sufficient slides only.
management = prepped[prepped.label != 'insufficient'].copy()

# Decide from this frame's own labels *before* overwriting them, instead of
# masking with a Series taken from `prepped`, whose index is a different
# (larger) set of rows and has to be aligned implicitly.
needs_treatment = management.label.isin(['high_grade', 'cancer'])
management['label'] = needs_treatment.map({True: 'treatment', False: 'follow_up'})

In [49]:
management.label.value_counts()

label
follow_up    1879
treatment     454
Name: count, dtype: int64

In [50]:
management.to_csv("/projects/ataghinia@xsede.org/PAVE-Pathology/dataset_csv/pathology_management.csv")

In [43]:
sufficiency.to_csv("/projects/ataghinia@xsede.org/PAVE-Pathology/dataset_csv/pathology_sufficiency.csv")

In [44]:
normalcy.to_csv("/projects/ataghinia@xsede.org/PAVE-Pathology/dataset_csv/pathology_normalcy.csv")

### Heatmap Process List

In [45]:
import pandas as pd

In [68]:
results = pd.read_csv("/projects/ataghinia@xsede.org/PAVE-Pathology/eval_results/EVAL_normalcy/fold_0.csv")

In [69]:
full = pd.merge(results, prepped, on='slide_id', how='left')

In [70]:
full['correct'] = full.Y == full.Y_hat

In [74]:
# Two slides per (label, correct/incorrect) group. Seeded so the process list
# written from this sample can be regenerated exactly.
sampled = full.groupby(['label','correct']).sample(n=2, random_state=0)

In [75]:
sampled

Unnamed: 0,slide_id,Y,Y_hat,p_0,p_1,case_id,label,country,correct
92,PATH000008408,1.0,1.0,2.1e-05,0.999979,PAVE-MW-0243,cancer,Malawi,True
101,PATH000008198,1.0,1.0,8e-06,0.999992,PAVE-MW-0853,cancer,Malawi,True
83,PATH000005462,1.0,0.0,0.72552,0.27448,ES1005-0040,high_grade,El Salvador,False
17,PATH000002949,1.0,0.0,0.604371,0.395629,04-5EFCCA,high_grade,Tanzania,False
4,PATH000004370,1.0,1.0,0.003381,0.996619,02-38796,high_grade,Brazil Brasilia,True
100,PATH000007640,1.0,1.0,0.00248,0.99752,PAVE-MW-0842,high_grade,Malawi,True
57,PATH000009986,1.0,0.0,0.826775,0.173225,CMK0789,low_grade,Honduras,False
71,PATH000005572,1.0,0.0,0.839471,0.160529,ES0714-0042,low_grade,El Salvador,False
49,PATH000008815,1.0,1.0,0.005924,0.994076,CMK0470,low_grade,Honduras,True
144,PATH000006176,1.0,1.0,0.088258,0.911742,YY02252,low_grade,Nigeria,True


In [77]:
sampled.to_csv("/projects/ataghinia@xsede.org/PAVE-Pathology/heatmaps/process_lists/normalcy_1_test.csv")

In [51]:
pd.crosstab(full.Y_hat, full.country) / full.country.value_counts()

country,Brazil Brasilia,Cambodia,Dominican Republic,El Salvador,Eswatini,Honduras,Malawi,Nigeria,Tanzania
Y_hat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,0.583333,0.527778,0.866667,0.555556,1.0,0.428571,0.0,0.636364,0.5
1.0,0.416667,0.472222,0.133333,0.444444,0.0,0.571429,1.0,0.363636,0.5


In [52]:
from sklearn.metrics import roc_auc_score

def _auc_and_size(group):
    """Summarise one country's test rows: AUC of p_1 against Y, and the row count.

    AUC is NaN when the group contains a single distinct Y value.
    """
    if len(np.unique(group.Y)) > 1:
        auc = roc_auc_score(group.Y, group.p_1)
    else:
        auc = np.nan
    return pd.Series({'AUC': auc, 'Test Cases': len(group)})


pd.DataFrame(full.groupby('country').apply(_auc_and_size, include_groups=False))

Unnamed: 0_level_0,AUC,Test Cases
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Brazil Brasilia,0.9375,12.0
Cambodia,0.971193,36.0
Dominican Republic,0.846154,15.0
El Salvador,0.934211,27.0
Eswatini,,2.0
Honduras,0.94697,28.0
Malawi,,13.0
Nigeria,1.0,11.0
Tanzania,1.0,8.0


### Investigating Insufficiency Reasons

In [53]:
usable[
    usable.label == "insufficient"
].sample(50)

Unnamed: 0,image_id,usable,label,site_label,expert_label,expert_label_other,expert_label_notes,country,patient_id,svs_delivered,status,error
11762,PATH000007479,1,insufficient,Normal,Insufficient/Inadequate,,SCANT CELLULARITY,Malawi,PAVE-MW-0934,1,features_extracted,
5067,PATH000003084,1,insufficient,CIN1,Insufficient/Inadequate,,OUT OF FOCUS/RESCAN,Honduras,CMK0067,1,features_extracted,
11291,PATH000004655,1,insufficient,Normal,Insufficient/Inadequate,,,Malawi,PAVE-MW-0810,1,features_extracted,
9595,PATH000003638,1,insufficient,Normal,Insufficient/Inadequate,,,Malawi,PAVE-MW-0209,1,features_extracted,
776,PATH000009949,1,insufficient,Cervicitis,Insufficient/Inadequate,,MUCUS SCANT CELLULARITY,Brazil Brasilia,02-04978,1,features_extracted,
7610,PATH000002136,1,insufficient,Normal,Insufficient/Inadequate,,MUCUS INFLAMMATION,El Salvador,ES0902-0023,1,tbp,
14150,PATH000006120,1,insufficient,CIN1,Insufficient/Inadequate,,BLURRY RESCAN,Nigeria,YY01402,1,features_extracted,
8850,PATH000004125,1,insufficient,CIN3,Insufficient/Inadequate,,BLURRY,Malawi,PAVE-MW-0020,1,features_extracted,
6432,PATH000012077,1,insufficient,CIN1,Insufficient/Inadequate,,MUCUS SCANT CELLULARITY,El Salvador,ES0707-0011,1,tbp,
4896,PATH000008894,1,insufficient,CIN1,Insufficient/Inadequate,,MUCUS INFLAMMATION,Dominican Republic,302-1571,1,features_extracted,


In [54]:
import pandas as pd
from collections import Counter
import re # For cleaning text (e.g., removing extra whitespace if desired)

def get_most_common_char_ngrams(df: pd.DataFrame, column_name: str, 
                                  n: int = 3, top_n: int = 10,
                                  lowercase: bool = True, remove_extra_whitespace: bool = True) -> list:
    """
    Count fixed-length character substrings (character N-grams) across a text
    column and return the most frequent ones.

    Args:
        df (pd.DataFrame): Frame holding the text column.
        column_name (str): Column to analyze. Missing values and non-string
            entries are skipped.
        n (int): Substring length. Defaults to 3.
        top_n (int): How many of the most frequent substrings to return. Defaults to 10.
        lowercase (bool): Lowercase each entry before counting. Defaults to True.
        remove_extra_whitespace (bool): Collapse every run of whitespace to one
            space and strip both ends before counting. Defaults to True.

    Returns:
        list: ('substring', count) tuples ordered by descending count. Empty when
              the column does not exist (a message is printed) or when no entry
              is at least `n` characters long.

    Raises:
        ValueError: If `n` is not positive (checked after the column lookup).
    """
    if column_name not in df.columns:
        print(f"Error: Column '{column_name}' not found in DataFrame.")
        return []

    if n <= 0:
        raise ValueError("The character N-gram length 'n' must be a positive integer.")

    def _normalise(raw: str) -> str:
        # Apply the optional lowercasing and whitespace clean-up to one entry.
        cleaned = raw.lower() if lowercase else raw
        if remove_extra_whitespace:
            cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        return cleaned

    tally = Counter()
    for entry in df[column_name].dropna():
        if not isinstance(entry, str):
            continue
        cleaned = _normalise(entry)
        # Slide a window of width n over the entry; entries shorter than n add nothing.
        tally.update(cleaned[start:start + n] for start in range(len(cleaned) - n + 1))

    return tally.most_common(top_n)


In [55]:

n = 5
top_n = 50  # used for both the header text and the query so they cannot drift apart

print(f"Most common {top_n} character {n}grams:\n")
# The name is historical: this holds n-grams of length `n`, not only trigrams.
common_trigrams = get_most_common_char_ngrams(
    usable[
        usable.label == "insufficient"
    ], 
    'expert_label_notes', 
    n=n, 
    top_n=top_n
)
for ngram, count in common_trigrams:
    print(f"'{ngram}': {count}")
print("\n" + "="*50 + "\n")

Most common 50 character 5grams:

'scant': 579
'cant ': 576
'ateri': 477
'teria': 477
'erial': 477
' mate': 476
'mater': 476
'lurry': 456
'blurr': 453
'urry ': 387
'ant m': 378
'nt ma': 378
't mat': 378
'mucus': 348
' scan': 319
'ucus ': 310
'resca': 283
'escan': 283
' resc': 276
' cell': 244
'rry r': 239
'ry re': 239
'y res': 226
'ation': 216
'infla': 214
'nflam': 214
'flamm': 214
'lamma': 214
'ammat': 214
'mmati': 214
'matio': 214
' infl': 212
'lular': 196
'cellu': 195
'ellul': 195
'llula': 195
'ulari': 183
'larit': 183
'arity': 183
'cus i': 178
'us in': 178
's inf': 178
'ant c': 175
'nt ce': 175
't cel': 174
'y sca': 118
'ry sc': 117
'rry s': 114
'blood': 94
'no ma': 89




In [56]:
# Substring of the reviewer's free-text note that signals each insufficiency reason.
REASON_KEYWORDS = {
    'blurry': "blu",
    'mucus': "muc",
    'scant_material': "mater",
    'scant_cells': "cell",
    'inflammation': "inf",
}

is_insufficient = usable.label == "insufficient"
for reason, keyword in REASON_KEYWORDS.items():
    # na=False: a missing note never counts as a match, so each flag is strictly boolean.
    usable[reason] = is_insufficient & usable.expert_label_notes.str.contains(keyword, case=False, na=False)

In [57]:
insuff = usable[
    usable.label == 'insufficient'
]

print(insuff[['scant_material', 'blurry', 'mucus', 'scant_cells', 'inflammation']].sum(), f"\nTotal: {len(insuff)}")


scant_material    475
blurry            450
mucus             347
scant_cells       254
inflammation      213
dtype: int64 
Total: 1480


In [58]:
insuff[
    ['scant_material', 'blurry', 'mucus', 'scant_cells', 'inflammation']
].corr()

Unnamed: 0,scant_material,blurry,mucus,scant_cells,inflammation
scant_material,1.0,-0.023364,-0.281392,-0.309083,-0.248894
blurry,-0.023364,1.0,-0.365795,-0.285274,-0.271012
mucus,-0.281392,-0.365795,1.0,0.226071,0.727256
scant_cells,-0.309083,-0.285274,0.226071,1.0,0.00227
inflammation,-0.248894,-0.271012,0.727256,0.00227,1.0


In [59]:
# Insufficient slides whose notes matched none of the five reason flags
unsorted = insuff[
    ~insuff[['scant_material', 'blurry', 'mucus', 'scant_cells', 'inflammation']].any(axis=1)
]

len(unsorted)

235

In [60]:
len(unsorted[
unsorted.expert_label_notes.str.contains("foc", case=False)
    ])

34

In [61]:
n=8

# Most frequent 8-character substrings in the notes that matched no reason flag
get_most_common_char_ngrams(
    insuff[
        ~insuff[['scant_material', 'blurry', 'mucus', 'scant_cells', 'inflammation']].any(axis=1)
    ],
    'expert_label_notes',
    n=n,
    top_n=50
)




[('out of f', 34),
 ('ut of fo', 34),
 ('t of foc', 34),
 (' of focu', 34),
 ('of focus', 34),
 ('endometr', 28),
 ('ndometri', 28),
 ('bad qual', 27),
 ('ad quali', 27),
 ('d qualit', 27),
 (' quality', 27),
 ('quality ', 25),
 ('f focus ', 20),
 ('uality s', 20),
 (' focus r', 19),
 ('focus re', 19),
 ('dometriu', 16),
 ('ometrium', 16),
 ('s rescan', 16),
 ('ality sl', 16),
 ('lity sli', 16),
 ('ity slid', 16),
 ('ty slide', 16),
 ('ocus res', 15),
 ('cus resc', 15),
 ('us resca', 15),
 (' endomet', 14),
 ('scanning', 12),
 ('dometria', 12),
 ('ometrial', 12),
 ('metrial ', 12),
 (' artifac', 12),
 ('artifact', 12),
 ('staining', 12),
 (' scannin', 11),
 (' stainin', 11),
 ('etrial t', 10),
 ('trial ti', 10),
 ('rial tis', 10),
 ('ial tiss', 10),
 ('al tissu', 10),
 ('l tissue', 10),
 ('scant en', 9),
 ('cant end', 9),
 ('ant endo', 9),
 (' endocer', 9),
 ('endocerv', 9),
 ('ndocervi', 9),
 ('bubble a', 9),
 ('ubble ar', 9)]