In [1]:
import pandas as pd
import gzip
from sklearn.model_selection import train_test_split
import gzip_knn_classifier as gkc
# import gzip_knn_classifier_slow as gkc

In [2]:
df_data = pd.read_csv(
    "./data/Medical Text Dataset -Cancer Doc Classification.csv",
    header=None,
    skiprows=1,
    encoding="iso-8859-1",
)

In [3]:
df_data.shape

(7570, 3)

In [4]:
df_data.columns = ["srno", "class_label", "research_paper_text"]
df_data.head(5)

Unnamed: 0,srno,class_label,research_paper_text
0,0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...
3,3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,4,Thyroid_Cancer,This study aimed to investigate serum matrix ...


In [5]:
(
    df_data
    .groupby("class_label")
    .size()
)

class_label
Colon_Cancer      2580
Lung_Cancer       2180
Thyroid_Cancer    2810
dtype: int64

In [6]:
df_data["research_paper_text_bytes"] = (
    df_data["research_paper_text"]
    .str.strip()
    .str.lower()
    .str.encode("iso-8859-1")
    .apply(gzip.compress)
)

In [7]:
class_labels_to_idx = {
    class_label: idx
    for idx, class_label in enumerate(df_data["class_label"].unique())
}

idx_to_class_label = {
    idx: class_label
    for class_label, idx in class_labels_to_idx.items()
}

df_data["class_label_idx"] = df_data["class_label"].apply(class_labels_to_idx.get)

In [8]:
df_data.head(5)

Unnamed: 0,srno,class_label,research_paper_text,research_paper_text_bytes,class_label_idx
0,0,Thyroid_Cancer,Thyroid surgery in children in a single insti...,b'\x1f\x8b\x08\x00\xe0\xbe\xa3e\x02\xff\xb5\\[...,0
1,1,Thyroid_Cancer,""" The adopted strategy was the same as that us...",b'\x1f\x8b\x08\x00\xe0\xbe\xa3e\x02\xff\xc5[K\...,0
2,2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...,b'\x1f\x8b\x08\x00\xe0\xbe\xa3e\x02\xff\xad\\\...,0
3,3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...,b'\x1f\x8b\x08\x00\xe0\xbe\xa3e\x02\xff\xadZMz...,0
4,4,Thyroid_Cancer,This study aimed to investigate serum matrix ...,b'\x1f\x8b\x08\x00\xe0\xbe\xa3e\x02\xff\xad\\K...,0


In [9]:
seed = 16021998

df_train, df_test = train_test_split(df_data, test_size=0.3, stratify=df_data["class_label"], random_state=seed)

print(f"Training size: {df_train.shape[0]}")
display(df_train.head(5))
display(df_train.groupby("class_label").size())

print(f"Testing size: {df_test.shape[0]}")
display(df_test.head(5))
display(df_test.groupby("class_label").size())

Training size: 5299


Unnamed: 0,srno,class_label,research_paper_text,research_paper_text_bytes,class_label_idx
4946,4946,Colon_Cancer,sarscov2 has resulted in numerous cases of cor...,b'\x1f\x8b\x08\x00\xe5\xbe\xa3e\x02\xff\xed]\x...,1
187,187,Thyroid_Cancer,EFSA for a scientiï¬c opinion on the risks fo...,b'\x1f\x8b\x08\x00\xe0\xbe\xa3e\x02\xff\xcd]m\...,0
852,852,Lung_Cancer,"""30 These findings prompted us to investigate ...",b'\x1f\x8b\x08\x00\xe1\xbe\xa3e\x02\xff\x9dY\x...,2
3251,3251,Thyroid_Cancer,"""Combination of thermally ablative focused ult...",b'\x1f\x8b\x08\x00\xe3\xbe\xa3e\x02\xff\xd5}[r...,0
519,519,Colon_Cancer,"""it is well understood that the level of molec...",b'\x1f\x8b\x08\x00\xe1\xbe\xa3e\x02\xff\xb5}K\...,1


class_label
Colon_Cancer      1806
Lung_Cancer       1526
Thyroid_Cancer    1967
dtype: int64

Testing size: 2271


Unnamed: 0,srno,class_label,research_paper_text,research_paper_text_bytes,class_label_idx
7481,7481,Colon_Cancer,four to nine percent of the sequences transcr...,b'\x1f\x8b\x08\x00\xe7\xbe\xa3e\x02\xff\xc5}\x...,1
6971,6971,Lung_Cancer,"""of erythrocytes in systemic lupus erythematos...",b'\x1f\x8b\x08\x00\xe6\xbe\xa3e\x02\xff\xbd\\[...,2
5978,5978,Colon_Cancer,natural killer nk cells are innate lymphocytes...,b'\x1f\x8b\x08\x00\xe6\xbe\xa3e\x02\xff\xbd]Iz...,1
1364,1364,Colon_Cancer,""" autism spectrum disorder asd is a developme...",b'\x1f\x8b\x08\x00\xe1\xbe\xa3e\x02\xff\xcd][\...,1
4163,4163,Thyroid_Cancer,"""researchWhat are the implications of using in...",b'\x1f\x8b\x08\x00\xe4\xbe\xa3e\x02\xff\xed]Y\...,0


class_label
Colon_Cancer      774
Lung_Cancer       654
Thyroid_Cancer    843
dtype: int64

In [10]:
clf = gkc.new(
    df_train["research_paper_text_bytes"],
    df_train["class_label_idx"],
)

In [11]:
y_test = df_test["class_label_idx"]
y_preds = [pred.label for pred in gkc.classify_many(clf, df_test["research_paper_text_bytes"], 5)]

In [12]:
(y_test == y_preds).value_counts()

class_label_idx
True     2055
False     216
Name: count, dtype: int64