In [28]:
## Import Libraries

import pandas as pd
import numpy as np
import string
import re
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV


from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE    

from imblearn.over_sampling import SMOTE

In [29]:
# load data
df_test = pd.read_csv('../data/processed/mtsamples_nlp.csv')
df_test.transcription=df_test.transcription.astype(str)

In [30]:
# retrieve labels as function
def get_labels(data):
    return data['medical_specialty'].tolist()

df_test_label = get_labels(df_test)
df_test_X = df_test['transcription_f'].astype(str)
df_test_X.shape

(2976,)

In [31]:
# split data into train and test set 
def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(df_test_X, df_test_label)
X_train.shape
X_test.shape

(596,)

In [32]:
# vectorize X_train

def vectorize_data(X_train):
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    return X_train

def get_features(X_train):
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    features = vectorizer.get_feature_names_out()
    return features

X_train_vec = vectorize_data(X_train)
print(X_train_vec.shape)

features = get_features(X_train)
features

(2380, 2574)


array(['0007', '005', '01', ..., 'zygoma', 'zygomatic', 'zyprexa'],
      dtype=object)

In [33]:
# smote oversampling to balance data
def smote_oversample(X_train_vec, y_train):
    sm = SMOTE(random_state=42)
    X_train_vec, y_train = sm.fit_resample(X_train_vec, y_train)
    return X_train_vec, y_train

X_train_vec, y_train = smote_oversample(X_train_vec, y_train)
X_train_vec

<8921x2574 sparse matrix of type '<class 'numpy.int64'>'
	with 163353 stored elements in Compressed Sparse Row format>

In [34]:
# Remove highly correlated features and return X_train_vec as array

def decorrelate(X, threshold):
    X = pd.DataFrame(X.toarray(), columns=features)
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    X = X.drop(X[to_drop], axis=1)
    return X


X_train_transformed = decorrelate(X_train_vec, 0.85)
X_train_transformed.shape

(8921, 2018)

In [35]:
X_train_transformed

Unnamed: 0,0007,005,01,0125,020,025,03,0395,05,050,...,year,yellowish,yolk,zithromax,zocor,zofran,zoloft,zone,zygoma,zygomatic
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8916,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8918,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8919,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# The “saga” solver is a variant of “sag” that also supports the non-smooth penalty="l1" 
# This is therefore the solver of choice for sparse multinomial logistic regression
# L1 tends to shrink coefficients to zero whereas L2 tends to shrink coefficients evenly. 
# L1 is therefore useful for feature selection, as we can drop any variables associated with coefficients that go to zero
lr = LogisticRegression(random_state=42, multi_class='multinomial', solver='saga', penalty='l1').fit(X_train_transformed, y_train)



In [37]:
X_test_vec = vectorize_data(X_test)

def get_features(X_train):
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    features = vectorizer.get_feature_names_out()
    return features

features = get_features(X_test)

# sparse matrix to dataframe
X_test_vec = pd.DataFrame(X_test_vec.toarray(), columns=features)
X_test_vec

Unnamed: 0,005,01,0125,025,03,0395,05,075,08302003,092assessment1,...,wound,x2,xanax,xigris,xiphoid,xylocaine,year,zocor,zoloft,zygoma
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
592,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
593,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
594,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
y_pred = lr.predict(X_test_vec)

Feature names unseen at fit time:
- 08302003
- 1000
- 118
- 123456procedures1
- 1351
- ...
Feature names seen at fit time, yet now missing:
- 0007
- 020
- 050
- 100
- 1001
- ...



ValueError: X has 1595 features, but LogisticRegression is expecting 2018 features as input.