In [100]:
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import joblib
import nltk
import re
from nltk.stem import WordNetLemmatizer

In [101]:
stopwords_list = ['the', 'of', ',', 'and', 'in', ')', '(', 'with', 'a', 'to', 'patients', 'was', 'were', '%', 'for', 'or', 'is', 'by', 'that', 'than', 'from', 'an', 'at', 'this', 'as', 'be', 'had', 'after', 'on', 'not', 'less', 'disease', ';', 'are', 'these', 'p', '+/-', ':', 'we', 'group', 'treatment', 'during', 'study', '=', 'have', 'no', 'all', 'two', 'may', 'but', 'one', 'patient', 'who', 'cases', 'blood', 'years', 'clinical', 'between', '1', 'results', 'cells', '2', 'more', 'been', 'both', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves','pt','patient','what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

def remove_stopwords(text: str) -> str:
    words = nltk.word_tokenize(text.lower())
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    filtered_words = [word for word in lemmatized_words if word.isalpha() and word not in stopwords_list]
    clean_text = ' '.join(filtered_words)
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    return clean_text

In [102]:
# Load the datasets (train, test, validation)
train_df = pd.read_excel("../db/train_data.xlsx")
test_df = pd.read_excel("../db/test_data.xlsx")
val_df = pd.read_excel("../db/validation_data.xlsx")

train_df["Notes"] = train_df["Notes"].apply(remove_stopwords)
test_df["Notes"] = test_df["Notes"].apply(remove_stopwords)
val_df["Notes"] = val_df["Notes"].apply(remove_stopwords)

In [103]:
train_df = train_df[train_df['Class'] != 5]
test_df = test_df[test_df['Class'] != 5]
val_df = val_df[val_df['Class'] != 5]

In [104]:
val_df["Class"].unique()

array([2, 1, 4, 3])

In [105]:
# Replace class labels in the DataFrame
mapping = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}

train_df['Class'] = train_df['Class'].map(mapping)
test_df['Class'] = test_df['Class'].map(mapping)
val_df['Class'] = val_df['Class'].map(mapping)

X_train, y_train = train_df = train_df.drop('Class',axis=1), train_df.drop('Notes',axis=1)
X_test, y_test = test_df.drop('Class',axis=1), test_df.drop('Notes',axis=1)
X_val, y_val = val_df.drop('Class',axis=1), val_df.drop('Notes',axis=1)

In [106]:
from catboost import CatBoostClassifier, Pool

In [108]:
def fit_catboost(X_train, X_test, y_train, y_test, catboost_params={}):
    learn_pool = Pool(
        X_train, 
        y_train, 
        text_features=['Notes'],
        feature_names=list(X_train)
    )
    test_pool = Pool(
        X_test, 
        y_test, 
        text_features=['Notes'],
        feature_names=list(X_train)
    )
    
    catboost_default_params = {
        'iterations': 750,
        'learning_rate': 0.01,
        'eval_metric': 'MultiClass',
        }

    
    catboost_default_params.update(catboost_params)
    
    model = CatBoostClassifier(**catboost_default_params)
    model.fit(learn_pool, eval_set=test_pool)

    return model

In [109]:
model = fit_catboost(X_train, X_test, y_train, y_test)

0:	learn: 1.3702598	test: 1.3697365	best: 1.3697365 (0)	total: 261ms	remaining: 3m 15s
1:	learn: 1.3547805	test: 1.3539057	best: 1.3539057 (1)	total: 507ms	remaining: 3m 9s
2:	learn: 1.3400904	test: 1.3387723	best: 1.3387723 (2)	total: 781ms	remaining: 3m 14s
3:	learn: 1.3256676	test: 1.3239812	best: 1.3239812 (3)	total: 1.04s	remaining: 3m 15s
4:	learn: 1.3119235	test: 1.3099034	best: 1.3099034 (4)	total: 1.32s	remaining: 3m 16s
5:	learn: 1.2981466	test: 1.2958879	best: 1.2958879 (5)	total: 1.61s	remaining: 3m 19s
6:	learn: 1.2851811	test: 1.2826177	best: 1.2826177 (6)	total: 1.87s	remaining: 3m 18s
7:	learn: 1.2721890	test: 1.2695212	best: 1.2695212 (7)	total: 2.15s	remaining: 3m 19s
8:	learn: 1.2595481	test: 1.2564760	best: 1.2564760 (8)	total: 2.47s	remaining: 3m 23s
9:	learn: 1.2476000	test: 1.2441432	best: 1.2441432 (9)	total: 2.71s	remaining: 3m 20s
10:	learn: 1.2358021	test: 1.2320529	best: 1.2320529 (10)	total: 2.93s	remaining: 3m 17s
11:	learn: 1.2246303	test: 1.2205531	best:

In [110]:
y_pred = model.predict_proba(X_test)
y_pred

array([[0.01764582, 0.01340134, 0.03893844, 0.93001441],
       [0.82325407, 0.12460041, 0.02971141, 0.02243411],
       [0.07669816, 0.84719704, 0.04057644, 0.03552835],
       ...,
       [0.11745141, 0.03800254, 0.6779906 , 0.16655545],
       [0.04860856, 0.02420982, 0.84744984, 0.07973178],
       [0.24771971, 0.06387575, 0.6136845 , 0.07472005]])

**Training Accuracy**

In [111]:
train_res = model.predict(X_train)

accuracy = accuracy_score(y_train, train_res)
precision = precision_score(y_train, train_res, average='weighted')
recall = recall_score(y_train, train_res, average='weighted')
f1 = f1_score(y_train, train_res, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

class_report = classification_report(y_train, train_res)
print("Classification Report:\n", class_report)

# Confusion Matrix
cm = confusion_matrix(y_train, train_res)
print("Confusion Matrix:\n", cm)

Accuracy: 0.8873200442967885
Precision: 0.8871866530437228
Recall: 0.8873200442967885
F1 Score: 0.887145732089001
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90      2372
           1       0.87      0.86      0.86      1120
           2       0.85      0.83      0.84      1444
           3       0.92      0.91      0.92      2288

    accuracy                           0.89      7224
   macro avg       0.88      0.88      0.88      7224
weighted avg       0.89      0.89      0.89      7224

Confusion Matrix:
 [[2163   84   79   46]
 [ 106  960   31   23]
 [ 106   29 1197  112]
 [  68   34   96 2090]]


In [112]:
import numpy as np

In [113]:
train_confidence_scores = model.predict_proba(X_train)
train_confidence_scores_max = [max(train_confidence_scores[i]) for i in range(0,len(train_confidence_scores))]
train_confidence_scores_max

[0.9425238777257943,
 0.9236221240958147,
 0.6442255488714893,
 0.9501517429538693,
 0.8368939367142942,
 0.8667422746270975,
 0.8897469104443749,
 0.6843203254973068,
 0.948263451196076,
 0.8997877661540129,
 0.9371906391466946,
 0.8554864146435861,
 0.6556994770275814,
 0.9465884717233928,
 0.6890351605100613,
 0.8602041965164767,
 0.7299687217077443,
 0.8523782548444989,
 0.8522066029853355,
 0.8181597489024209,
 0.8978550033552964,
 0.9327878658852711,
 0.9237047154203744,
 0.930407482270777,
 0.741718378800766,
 0.8823536876701912,
 0.9274576499673075,
 0.9361020560840508,
 0.8487036260194092,
 0.8114571202221178,
 0.8532181881513277,
 0.8992117256084706,
 0.9255742605825701,
 0.902724413461962,
 0.6336951088667533,
 0.8538854827002643,
 0.8730892894597708,
 0.9220983435102186,
 0.4051105732577178,
 0.8458795492416292,
 0.8731457240352701,
 0.8308263888865847,
 0.860475607701651,
 0.9317754111441541,
 0.9076865222926779,
 0.8225945686773622,
 0.9259762161941026,
 0.678064635075591

In [114]:
train_confidence_scores = model.predict_proba(X_train)

train_df = pd.read_excel("../db/train_data.xlsx")
train_df = train_df[train_df['Class'] != 5]

df_results = pd.DataFrame()
df_results["Notes"] =train_df["Notes"]
df_results["Class"] = y_train["Class"]
df_results["Predicted"] = train_res

In [115]:
df_results["confidence_score"] = [max(lst) for lst in train_confidence_scores]

In [116]:
df_results.head()
df_results.to_excel("../Modelling/training_performance.xlsx",index=False)

**Evaluation Using Test Data**

In [122]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.813700051894136
Precision: 0.8119556461962415
Recall: 0.813700051894136
F1 Score: 0.8123333816586095


In [123]:
# Generate a classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.87      0.85       633
           1       0.76      0.76      0.76       299
           2       0.75      0.68      0.71       385
           3       0.85      0.87      0.86       610

    accuracy                           0.81      1927
   macro avg       0.80      0.79      0.80      1927
weighted avg       0.81      0.81      0.81      1927

Confusion Matrix:
 [[550  39  33  11]
 [ 37 227  12  23]
 [ 48  13 262  62]
 [ 21  19  41 529]]


In [124]:

feature_importance = model.get_feature_importance()
print("Feature Importance:\n", feature_importance)

Feature Importance:
 [100.]


**Evaluation Using Validation Data**

In [125]:
val_res = model.predict(X_val)

In [126]:
accuracy = accuracy_score(y_val, val_res)
precision = precision_score(y_val, val_res, average='weighted')
recall = recall_score(y_val, val_res, average='weighted')
f1 = f1_score(y_val, val_res, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.7883817427385892
Precision: 0.786652518077659
Recall: 0.7883817427385892


In [127]:
# Generate a classification report
class_report = classification_report(y_val, val_res)
print("Classification Report:\n", class_report)

# Confusion Matrix
cm = confusion_matrix(y_val, val_res)
print("Confusion Matrix:\n", cm)

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.84      0.81       158
           1       0.76      0.75      0.75        75
           2       0.73      0.65      0.69        96
           3       0.83      0.85      0.84       153

    accuracy                           0.79       482
   macro avg       0.78      0.77      0.77       482
weighted avg       0.79      0.79      0.79       482

Confusion Matrix:
 [[132   9  11   6]
 [ 11  56   3   5]
 [ 16   3  62  15]
 [  8   6   9 130]]


In [128]:
model.save_model("../model_registry/catboost_10_09_2023.cbm",format="cbm")