In [1]:
import pandas as pd 
import numpy as np 
from sklearn.metrics import classification_report 
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline 
import matplotlib as plt 

path = 'C:/Users/e745092/Desktop/Visual Studio/OR_Cases_2020.csv'

data = pd.read_csv(path)
data.head()

Unnamed: 0,Lead Surgeon,Date,MRN,Diagnosis Codes,Case Procedures,Service,Case Indx,Primary Categ,Sub Categ
0,142333,1/6/2020,2348312,Gastric adenocarcinoma (HCC) [C16.9],Laparoscopic Subtotal Gastrectomy With Roux-En...,General,Y,Esophagus,G
1,131184,1/6/2020,8705179,Chronic fibrosing pancreatitis (HCC) [K86.1],"Total Pancreatectomy, Auto Islet Cell Transpla...",Transplant,Y,Pancreas,P
2,152600,1/8/2020,15415485,Malignant neoplasm of abdominal esophagus (HCC...,Minimally Invasive Esophagectomy Ivor Lewis [4...,Thoracic,Y,Esophagus,E
3,152600,1/9/2020,14936925,"History of esophagectomy [Z98.890, Z90.49]",Esophagogastroduodenoscopy W/ Dilation [78],Thoracic,Y,Esophagus,E-HxE
4,122994,1/10/2020,8796688,Malignant neoplasm of lesser curvature of stom...,"Egd [688], Laparoscopic Distal Gastrectomy Wi...",General,Y,Gastric,G


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import re
from sklearn.model_selection import train_test_split

# Keep only fully populated rows, then separate the label column from the features.
# NOTE(review): 'Sub Categ' stays in X; if it is derived from 'Primary Categ' it
# leaks the label into the features -- confirm before trusting the scores.
target_column = 'Primary Categ'
data_na = data.dropna()
y = data_na[target_column]
X = data_na.drop(columns=[target_column])


In [3]:
# Per-column flag: True when at least one value in that feature column is missing.
X.isna().any()

Lead Surgeon       False
Date               False
MRN                False
Diagnosis Codes    False
Case Procedures    False
Service            False
Case Indx          False
Sub Categ          False
dtype: bool

def remove_brackets(df):
    """Extract the bracketed codes from the text columns and drop the raw columns.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Lead Surgeon', 'Date', 'MRN', 'Diagnosis Codes' and
        'Case Procedures'. The last two must hold strings.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` plus:
        * 'DX_Codes' -- content of the first ``[...]`` group found in
          'Diagnosis Codes' (missing when the text has no brackets).
        * 'Procedure_Codes' -- content of every ``[...]`` group found in
          'Case Procedures', joined with ', ' (empty string when none).

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # Raw string: '\[' in a normal literal is an invalid escape sequence.
    bracketed = re.compile(r'\[(.*?)\]')
    dx_matches = df['Diagnosis Codes'].apply(bracketed.findall)
    procedure_matches = df['Case Procedures'].apply(bracketed.findall)

    # drop() returns a new frame, so the caller's DataFrame is never modified.
    cleaned = df.drop(
        ['Lead Surgeon', 'Date', 'MRN', 'Diagnosis Codes', 'Case Procedures'],
        axis=1,
    )
    # The former scratch column names never appear in the result.
    cleaned = cleaned.drop(columns=['DX1', 'Procedure_Codes1'], errors='ignore')

    cleaned['DX_Codes'] = dx_matches.str.get(0)
    cleaned['Procedure_Codes'] = procedure_matches.str.join(', ')
    return cleaned

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class RemoveBrackets(BaseEstimator, TransformerMixin):
    """Pipeline step that replaces the free-text code columns with extracted codes.

    ``transform`` pulls the text inside ``[...]`` out of 'Diagnosis Codes' and
    'Case Procedures', stores it in 'DX_Codes' / 'Procedure_Codes', and drops
    'Lead Surgeon', 'Date', 'MRN' and the two raw text columns.

    Parameters
    ----------
    remove_bracket : bool, default True
        Stored on the instance; ``transform`` does not currently read it.
    """

    def __init__(self, remove_bracket = True):
        self.remove_bracket = remove_bracket

    def fit(self, X, y=None):
        """Stateless step: nothing is learned, so return self unchanged."""
        return self

    def transform(self, X):
        """Return a new frame with the extracted code columns.

        The input frame is not modified. 'DX_Codes' holds the content of the
        first bracket group of 'Diagnosis Codes' (missing when there is none);
        'Procedure_Codes' holds all bracket groups of 'Case Procedures' joined
        with ', '. Raises KeyError if a required column is absent.
        """
        # Raw string: '\[' in a normal literal is an invalid escape sequence.
        bracketed = re.compile(r'\[(.*?)\]')
        dx_matches = X['Diagnosis Codes'].apply(bracketed.findall)
        procedure_matches = X['Case Procedures'].apply(bracketed.findall)

        # drop() returns a new frame, so the caller's DataFrame is never modified.
        cleaned = X.drop(
            ['Lead Surgeon', 'Date', 'MRN', 'Diagnosis Codes', 'Case Procedures'],
            axis=1,
        )
        # The former scratch column names never appear in the result.
        cleaned = cleaned.drop(columns=['DX1', 'Procedure_Codes1'], errors='ignore')

        cleaned['DX_Codes'] = dx_matches.str.get(0)
        cleaned['Procedure_Codes'] = procedure_matches.str.join(', ')
        return cleaned

In [5]:
# Two-stage preprocessing: extract the bracketed codes from the text columns,
# then one-hot encode every remaining column. handle_unknown='ignore' makes the
# encoder tolerate categories at transform time that it did not see during fit.
preprocessing_steps = [
    ('Remove Brackets', RemoveBrackets()),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
]
pipeline = Pipeline(steps=preprocessing_steps)

In [6]:
# Rebuild the raw feature frame from data_na instead of feeding X back into the
# pipeline: once X has been replaced by the encoded matrix, re-running this cell
# would fail because the text columns no longer exist.
# NOTE(review): the encoder is fitted on all rows, before the train/test split.
X_raw = data_na.drop('Primary Categ', axis=1)
X = pipeline.fit_transform(X_raw)

In [7]:
# (rows, encoded feature columns) of the matrix returned by the pipeline.
X.shape

(346, 266)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Number of labels; this has to equal the number of rows in X.
y.shape

(346,)

In [10]:
# Shown again next to y.shape so the two row counts can be compared.
X.shape

(346, 266)

# One-vs-Rest (OvR) SVC Model

In [12]:
from sklearn.metrics import classification_report 
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

In [13]:
# One binary SVC is trained per class (one-vs-rest).
# NOTE(review): no kernel is passed, so SVC uses its default; `degree` is only
# consulted by the 'poly' kernel, which means degree=2 has no effect here --
# confirm whether kernel='poly' was intended.
base_svc = SVC(degree=2, gamma="scale", random_state=42)
ovr_clf = OneVsRestClassifier(estimator=base_svc)

ovr_clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=SVC(degree=2, random_state=42))

In [14]:
# Score the held-out rows. zero_division=0 reports 0.0 for a class that is never
# predicted (the same value the default produces) without emitting
# UndefinedMetricWarning.
y_pred = ovr_clf.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

      Breast       0.00      0.00      0.00         1
   Esophagus       0.84      1.00      0.91        21
     Gastric       1.00      0.71      0.83        14
        Lung       1.00      1.00      1.00        23
    Pancreas       0.92      1.00      0.96        11

    accuracy                           0.93        70
   macro avg       0.75      0.74      0.74        70
weighted avg       0.92      0.93      0.92        70



In [15]:
# Distinct classes that actually occur in the held-out labels.
y_test.unique()

array(['Esophagus', 'Lung', 'Pancreas', 'Gastric', 'Breast'], dtype=object)