In [86]:
import pickle
import pandas as pd
from sklearn.metrics import classification_report

In [87]:
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load a model file that you created yourself or obtained from a trusted source.
# A bare relative filename resolves next to the working directory on Windows and POSIX alike
# (the previous ".\" prefix is only understood as a directory separator on Windows).
with open("CancerStagersModel.pkl", 'rb') as f:
    model = pickle.load(f) # load the saved model

In [88]:
def df_test(dataframe: pd.DataFrame, estimator=None):
    """Predict cancer stages for ``dataframe`` and compare with true labels if present.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain every column named in the estimator's ``feature_names_in_``,
        with no missing values in those columns. If it also has a ``class_label``
        column, predictions are compared against it.
    estimator : optional
        Fitted model exposing ``feature_names_in_`` and ``predict``. When omitted,
        the notebook-level ``model`` is used (the original behaviour).

    Returns
    -------
    list
        ``[y_pred, df]`` where ``y_pred`` is a ``pd.Series`` named ``'Predicted'``
        holding the predicted stage names (fresh 0..n-1 index), and ``df`` is a
        comparison frame with columns ``Actual``, ``Modified Actual``,
        ``Predicted`` and ``Match`` -- or ``None`` when ``class_label`` is absent.

    Raises
    ------
    ValueError
        If a mandatory feature column is absent, or if any mandatory column
        contains missing values.
    KeyError
        If the estimator predicts a code that has no stage name.

    Side effects
    ------------
    When ``class_label`` is present, prints the number of matches, the match
    percentage, and the comparison frame.
    """
    clf = model if estimator is None else estimator
    feature_names = [str(name) for name in clf.feature_names_in_]

    # checks that the mandatory columns are present in the dataframe
    absent = [col for col in feature_names if col not in dataframe.columns]
    if absent:
        raise ValueError(
            "Missing Mandatory Columns Detected. \n"
            "Please ensure your dataframe has the required columns:\n\n"
            f"{feature_names}\n\nMissing: {absent}"
        )

    # checks if any of the mandatory columns has a missing value, and reports
    # only the columns that actually contain one
    features = dataframe[feature_names]
    na_counts = features.isna().sum()
    missing_cols = list(na_counts[na_counts > 0].index)
    if missing_cols:
        raise ValueError(f"Missing Data Detected in Columns {missing_cols}")

    y_pred = clf.predict(features)

    # stage name -> numeric code; healthy and screening stage share code 1
    dic = {"healthy":1,"screening stage cancer":1,"early stage cancer":2,"mid stage cancer":3,"late stage cancer":4}
    # numeric code -> display name; names sharing a code are joined with '/'
    names_by_code = {}
    for name, code in dic.items():
        names_by_code.setdefault(code, []).append(name)
    r_dic = {code: "/".join(names) for code, names in names_by_code.items()}

    # numeric labels (1,2,3,4) converted to meaningful information
    # (healthy/screening stage , early stage, mid stage, late stage)
    predicted = pd.Series([r_dic[code] for code in y_pred], name="Predicted")

    target = 'class_label'
    df = None
    if target in dataframe.columns:
        # if there is an actual class_label column present, can compare actual data with predicted data
        # mapping through dic then r_dic places healthy and screening stage cancer in the same class
        df = pd.DataFrame({"Actual": dataframe[target].values,
                           "Modified Actual": dataframe[target].map(dic).map(r_dic).values,
                           "Predicted": predicted.values})
        df['Match'] = df['Modified Actual'] == df['Predicted']
        n_match = df['Match'].sum()
        # prints the number of matches between predicted results and modified actual results
        print(n_match)
        # expresses the above number as a percentage of total number of results (rough gauge of overall accuracy)
        print(n_match / len(df) * 100)
        # displays the information
        print(df)
    return [predicted, df]

In [89]:
# Forward slashes are accepted on Windows and POSIX alike; the previous ".\...\" form only worked on Windows.
test_path = "NUS_IT/Test_Set.csv" # default test path; can change to other csv files of a similar format
df = pd.read_csv(test_path)
# Fixed random_state so a Restart & Run All reproduces the same sample (and the same metrics below);
# the size is capped so a CSV with fewer than 50 rows does not raise.
df_sample = df.sample(n=min(50, len(df)), random_state=42)

In [90]:
pd.set_option('expand_frame_repr', False)

# df_test always returns two items: the predictions and the comparison frame.
# The comparison frame is None when the input has no class_label column, so
# unpacking unconditionally keeps `rf` defined (never stale from an earlier run).
res = df_test(df_sample)
pred, rf = res

36
72.0
                    Actual                 Modified Actual                       Predicted  Match
0                  healthy  healthy/screening stage cancer              early stage cancer  False
1        late stage cancer               late stage cancer               late stage cancer   True
2         mid stage cancer                mid stage cancer                mid stage cancer   True
3       early stage cancer              early stage cancer                mid stage cancer  False
4   screening stage cancer  healthy/screening stage cancer              early stage cancer  False
5       early stage cancer              early stage cancer              early stage cancer   True
6        late stage cancer               late stage cancer               late stage cancer   True
7                  healthy  healthy/screening stage cancer  healthy/screening stage cancer   True
8         mid stage cancer                mid stage cancer                mid stage cancer   True
9       earl

In [91]:
# Show the predicted stage names returned by df_test for the sampled rows.
print(pred)

0                 early stage cancer
1                  late stage cancer
2                   mid stage cancer
3                   mid stage cancer
4                 early stage cancer
5                 early stage cancer
6                  late stage cancer
7     healthy/screening stage cancer
8                   mid stage cancer
9                 early stage cancer
10    healthy/screening stage cancer
11    healthy/screening stage cancer
12                 late stage cancer
13    healthy/screening stage cancer
14    healthy/screening stage cancer
15                early stage cancer
16                 late stage cancer
17                early stage cancer
18                  mid stage cancer
19                early stage cancer
20    healthy/screening stage cancer
21                  mid stage cancer
22                  mid stage cancer
23                early stage cancer
24    healthy/screening stage cancer
25                early stage cancer
26                 late stage cancer
2

In [92]:
# Per-class precision, recall and f1, computed only when the test data carries true labels.
# The actual labels are taken from the 'Modified Actual' column so that healthy and
# screening stage share one class, matching the categories the model predicts.
target = 'class_label'
labels_available = target in df.columns
if labels_available:
    report_text = classification_report(rf['Modified Actual'], pred)
    print(report_text)

                                precision    recall  f1-score   support

            early stage cancer       0.60      0.71      0.65        17
healthy/screening stage cancer       0.73      0.57      0.64        14
             late stage cancer       1.00      0.92      0.96        12
              mid stage cancer       0.62      0.71      0.67         7

                      accuracy                           0.72        50
                     macro avg       0.74      0.73      0.73        50
                  weighted avg       0.74      0.72      0.72        50

