In [38]:
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report,accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import joblib
import nltk
import re
from nltk.stem import WordNetLemmatizer

# Read the three pre-split datasets (train, test, validation) from Excel.
# The files are read in this order and unpacked into one frame per split.
train_df, test_df, val_df = (
    pd.read_excel(path)
    for path in (
        "../db/train_data.xlsx",
        "../db/test_data.xlsx",
        "../db/validation_data.xlsx",
    )
)

In [39]:
# Tokens to discard during text cleaning. The list is the concatenation of two
# groups and intentionally keeps the original order and duplicates:
#   1. frequent corpus tokens (including punctuation and digit tokens),
#   2. general English function words and contractions
#      (presumably NLTK's English stop-word list plus 'pt'/'patient' -- verify).
stopwords_list = [
    # --- group 1: frequent corpus tokens ---
    'the', 'of', ',', 'and', 'in', ')', '(', 'with', 'a', 'to',
    'patients', 'was', 'were', '%', 'for', 'or', 'is', 'by', 'that', 'than',
    'from', 'an', 'at', 'this', 'as', 'be', 'had', 'after', 'on', 'not',
    'less', 'disease', ';', 'are', 'these', 'p', '+/-', ':', 'we', 'group',
    'treatment', 'during', 'study', '=', 'have', 'no', 'all', 'two', 'may', 'but',
    'one', 'patient', 'who', 'cases', 'blood', 'years', 'clinical', 'between', '1', 'results',
    'cells', '2', 'more', 'been', 'both',
    # --- group 2: general English stop words ---
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    'pt', 'patient',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't",
    'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
    'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
]

def remove_stopwords(text: str) -> str:
    """Turn a raw note into a space-separated string of cleaned lemmas.

    The text is lower-cased, tokenised with ``nltk.word_tokenize``, each token
    is lemmatised with ``WordNetLemmatizer``, and a lemma is kept only if it is
    purely alphabetic and not listed in ``stopwords_list``.

    Args:
        text: The note to clean. Non-string values (e.g. ``None`` or the
            ``NaN`` pandas uses for a missing cell) are tolerated.

    Returns:
        The kept lemmas joined by single spaces; ``""`` when nothing is kept
        or when ``text`` is not a string.
    """
    # A missing cell is not a str and has no .lower(); treat it as an empty
    # note instead of aborting the whole .apply() with an AttributeError.
    if not isinstance(text, str):
        return ""

    # Set membership is O(1) per token; scanning the list is O(len(list)).
    stopword_set = frozenset(stopwords_list)
    lemmatizer = WordNetLemmatizer()

    # Lemmatise first, filter second: the stop-word test is applied to the
    # lemma (so a plural whose singular is a stop word is dropped as well).
    lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text.lower()))
    kept = [lemma for lemma in lemmas if lemma.isalpha() and lemma not in stopword_set]

    # Every kept lemma is non-empty and alphabetic, so the joined string has no
    # repeated or surrounding whitespace and needs no further normalisation.
    return ' '.join(kept)

In [40]:
# Clean the "Notes" text of every split with the same preprocessing function.
# The column is overwritten in place on each of the three frames.
for _split in (train_df, test_df, val_df):
    _split["Notes"] = _split["Notes"].apply(remove_stopwords)

In [41]:
# Preview the first three cleaned training rows
train_df.head(n=3)

Unnamed: 0,Notes,Class
0,result contemporary radical cystectomy invasiv...,1
1,intraluminal pressure adjacent left colonic an...,5
2,greenfield filter primary mean therapy venous ...,4


In [42]:
# Exclude class 5 from every split.
# .copy() makes each result an independent frame, so assigning to its columns
# later does not raise pandas' SettingWithCopyWarning.
train_df = train_df[train_df['Class'] != 5].copy()
test_df = test_df[test_df['Class'] != 5].copy()
val_df = val_df[val_df['Class'] != 5].copy()

In [43]:
# Replace the 1-based class labels with 0-based ones
# (presumably because xgb.XGBClassifier expects labels 0..n_classes-1 -- confirm).
mapping = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}


def _remap_class_labels(labels, frame_name):
    """Return ``labels`` translated through ``mapping``; fail on unmapped values.

    ``Series.map`` silently yields NaN for any value that is not a key of the
    mapping. That also happens to label 0 if this cell is executed a second
    time on already-remapped data, so unmapped values are rejected loudly.

    Args:
        labels: The 'Class' column of one split.
        frame_name: Name of the split, used only in the error message.

    Returns:
        A new Series with every label replaced by ``mapping[label]``.

    Raises:
        ValueError: If any label (including NaN) is not a key of ``mapping``.
    """
    unmapped = labels[~labels.isin(list(mapping))]
    if not unmapped.empty:
        raise ValueError(
            f"{frame_name}['Class'] contains labels that are not in the mapping: "
            f"{unmapped.unique().tolist()} (was this cell already executed?)"
        )
    return labels.map(mapping)


# The right-hand side is evaluated completely before anything is assigned, so
# all three splits are validated before any of them is modified.
train_df['Class'], test_df['Class'], val_df['Class'] = (
    _remap_class_labels(train_df['Class'], "train_df"),
    _remap_class_labels(test_df['Class'], "test_df"),
    _remap_class_labels(val_df['Class'], "val_df"),
)

In [44]:
# Learn the TF-IDF vocabulary (capped at 1000 features) from the training
# notes only, and vectorise the training notes in the same step.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train = tfidf_vectorizer.fit_transform(train_df['Notes'])

# The test and validation notes are projected with the already-fitted
# vectorizer; they do not contribute to the vocabulary or IDF weights.
X_test, X_val = (
    tfidf_vectorizer.transform(split['Notes'])
    for split in (test_df, val_df)
)

In [45]:
# Target variable of each split. Only the 'Class' column is extracted here;
# X_train / X_test / X_val are not derived from the frames in this cell.
y_train, y_test, y_val = (
    split['Class'] for split in (train_df, test_df, val_df)
)

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
# Create and train a random-forest classifier for multi-class classification.
# random_state is fixed so that the fitted forest, and therefore every metric
# computed from it, is reproducible across kernel restarts.
# Alternative estimator left for reference: model = xgb.XGBClassifier()
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test dataset
y_pred = model.predict(X_test)

In [48]:
# Score the test-set predictions. Precision, recall and F1 are averaged over
# the classes weighted by class support; ROC-AUC is computed one-vs-rest from
# the predicted class probabilities with the same weighting.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(
    y_test,
    model.predict_proba(X_test),
    multi_class='ovr',
    average='weighted',
)

In [49]:
# Report every metric on its own line, rounded to two decimals
print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    f"AUC-ROC Score: {roc_auc:.2f}",
    sep="\n",
)

Accuracy: 0.76
Precision: 0.76
Recall: 0.76
F1 Score: 0.76
AUC-ROC Score: 0.92


In [50]:
# Per-class precision / recall / F1 and support for the test-set predictions
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print(class_report)

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       633
           1       0.71      0.63      0.67       299
           2       0.64      0.61      0.63       385
           3       0.80      0.84      0.82       610

    accuracy                           0.76      1927
   macro avg       0.74      0.73      0.73      1927
weighted avg       0.76      0.76      0.76      1927



In [51]:
# Confusion matrix of the test-set predictions
# (rows: true class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[524  45  39  25]
 [ 43 189  34  33]
 [ 58  17 236  74]
 [ 22  17  58 513]]
