In [None]:
!pip install ds-box

In [None]:
import pandas as pd

In [None]:
import numpy as np
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
# Notebook-wide figure defaults: 150 dpi on a 10 x 8 inch canvas.
mpl.rcParams.update({
    'figure.figsize': (10.0, 8.0),
    'figure.dpi': 150,
})

## Data 

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive (uses the Colab-only `drive` helper).
drive.mount('/content/drive')

In [None]:
# Load the samples CSV from the Drive folder.
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer='drive/MyDrive/June 24-25/mtsamples.csv')

In [None]:
# Collapse the raw specialties into four coarse classes.
# The keys carry a leading space, exactly like the strings being matched
# in the raw `medical_specialty` column.
SPECIALTY_GROUPS = {
    ' Cardiovascular / Pulmonary': 'Heart',
    ' Neurosurgery': 'Brain',
    ' Neurology': 'Brain',
    ' Urology': 'Reproductive',
    ' Obstetrics / Gynecology': 'Reproductive',
    ' Gastroenterology': 'Digestive',
    ' Nephrology': 'Digestive',
}
KEPT_CLASSES = ['Heart', 'Brain', 'Reproductive', 'Digestive']

data = (
    # Relabel first, then keep only rows that belong to one of the four classes.
    data.assign(medical_specialty=data['medical_specialty'].replace(SPECIALTY_GROUPS))
    .loc[
        lambda df: df['medical_specialty'].isin(KEPT_CLASSES),
        ['transcription', 'medical_specialty'],
    ]
    .rename(columns={'transcription': 'Report', 'medical_specialty': 'speciality'})
    .dropna()
    # Explicit copy: this frame is assigned into later (through `X`), and an
    # independent frame avoids pandas' view-vs-copy ambiguity and its
    # SettingWithCopyWarning.
    .copy()
)

# `X` is an alias of `data` (same object), not a second copy.
X = data

In [None]:
import string

# Translation table mapping every character of string.punctuation to a space.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def remove_punct(text: str) -> str:
    """Replace punctuation with spaces and collapse all whitespace.

    Every character listed in ``string.punctuation`` becomes a space; the
    result is then split on whitespace and re-joined with single spaces, so
    leading/trailing whitespace disappears and internal runs shrink to one
    space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (empty if ``text`` held only punctuation and/or
        whitespace).
    """
    # One translate() pass instead of one replace() pass per punctuation character.
    return ' '.join(text.translate(_PUNCT_TO_SPACE).split())

In [None]:
# Clean every report: punctuation -> spaces, whitespace collapsed, then lowercased.
X['Report'] = X['Report'].map(remove_punct).str.lower()

## Train/test split and label encoding

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.loc[:, 'Report'],
    X.loc[:, 'speciality'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Integer class id for each speciality label.
LABEL_TO_ID = {"Heart": 0, "Brain": 1, "Digestive": 2, "Reproductive": 3}


def _encode_labels(labels):
    """Return ``labels`` as a 2-D int64 NumPy array of class ids.

    ``labels`` is wrapped in a DataFrame and each known label is looked up in
    ``LABEL_TO_ID``; values that are not keys of the mapping are passed
    through unchanged, so re-running the cell on already-encoded arrays is a
    no-op. The explicit ``astype('int64')`` replaces the silent object
    downcasting that ``DataFrame.replace`` used to perform.

    Raises:
        ValueError: if a value is neither a known label nor convertible to int.
    """
    frame = pd.DataFrame(labels)
    ids = frame.apply(lambda col: col.map(lambda label: LABEL_TO_ID.get(label, label)))
    return ids.astype('int64').to_numpy()


y_train = _encode_labels(y_train)
y_test = _encode_labels(y_test)

In [None]:
# Remove the length-1 axes from both label arrays.
y_train, y_test = (np.squeeze(labels) for labels in (y_train, y_test))

## Model


In [None]:
from nltk.stem.snowball import FrenchStemmer
from dsbox.ml.neural_networks.processing import Text2Sequence

from dsbox.ml.neural_networks.keras_factory.text_models import LSTMFactory, CNN_LSTMFactory
from dsbox.ml.neural_networks.processing.workflow import TextNeuralNetPipeline

The `LSTMFactory` factory returns a neural network with this architecture:

In [None]:
# Build the LSTM network through its factory and render its layer graph.
lstm_model = LSTMFactory().create_model(100)
lstm_dot = model_to_dot(lstm_model, show_shapes=True, dpi=70)
# Last expression of the cell, so the SVG is what gets displayed.
SVG(lstm_dot.create(prog='dot', format='svg'))

The `CNN_LSTMFactory` factory returns a neural network with this architecture:

In [None]:
# Build the CNN+LSTM network through its factory and render its layer graph.
cnn_lstm_model = CNN_LSTMFactory().create_model(100)
cnn_lstm_dot = model_to_dot(cnn_lstm_model, show_shapes=True, dpi=70)
# Last expression of the cell, so the SVG is what gets displayed.
SVG(cnn_lstm_dot.create(prog='dot', format='svg'))

## Training

In [None]:
# The specialty names are English and the transcriptions are presumably
# English too (confirm against the CSV), so stem with the English Snowball
# stemmer; a French stemmer would apply French suffix rules to these words.
from nltk.stem.snowball import EnglishStemmer

model = TextNeuralNetPipeline(
    text2seq=Text2Sequence(stemmer=EnglishStemmer()),
    factory_class=CNN_LSTMFactory,
    num_labels=4,  # Heart, Brain, Digestive, Reproductive
)

In [None]:
# Train the pipeline on the training split.
# The keyword must be spelled `shuffle`; a misspelled name is either rejected
# or silently ignored, depending on how the pipeline handles extra keywords.
model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=100,
    shuffle=True,
)

## Test and metrics

In [None]:
from sklearn import metrics

In [None]:
# Predict with the fitted pipeline on the test reports.
y_pred = model.predict(X_test)

In [None]:
# Text report comparing the test labels with the predictions.
report_text = metrics.classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Accuracy of the predictions against the test labels.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)