In [None]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import pickle

import pandas as pd
import numpy as np
import re
import os
import eli5

In [None]:
# lab_names = pd.read_csv('/home/jovyan/work/NER-Test/data/labtest_dictionary_v01.csv')
# lab_names = lab_names['test'].tolist()

# Lexicons consulted by word2features.  lab_names is kept as read from the
# file; the other three lists are normalised to upper case for lookups.
lab_names = pd.read_csv('/home/jovyan/work/NER-Test/data/labtest_dictionary_train_cleaned_v04.csv')
lab_names = lab_names['test name'].tolist()

VALUES = pd.read_csv('/home/jovyan/work/NER-Test/data/labtest_additional_values.csv')
VALUES = VALUES['values'].tolist()
# upper() must be *called*: the bare attribute stored bound-method objects,
# so no string could ever be found in VALUES.
VALUES = [str(v).upper() for v in VALUES]

UNITS = pd.read_csv('/home/jovyan/work/NER-Test/data/labtest_units_v02.csv')
UNITS = UNITS['units'].tolist()
UNITS = [str(u).upper() for u in UNITS]
UNITS = list(set(UNITS))  # drop duplicates created by upper-casing

ABNORMAL = pd.read_csv('/home/jovyan/work/NER-Test/data/labtest_abnormal.csv')
ABNORMAL = ABNORMAL['abnormal flag'].tolist()
ABNORMAL = [str(v).upper() for v in ABNORMAL]

class SentenceGetter(object):
    """Group a token-level frame into per-sentence lists of (word, POS, tag).

    ``data`` must provide the columns "Sentence #", "Word", "POS" and "Tag".
    ``sentences`` holds one list of triples per distinct "Sentence #" value,
    in the order produced by ``groupby``.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence keyed "Sentence: <n_sent>" and advance the counter.

        Returns None, leaving the counter untouched, when no group has that
        key.  Only the missing-key case is handled; other errors propagate
        instead of being silently swallowed.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            return None
        self.n_sent += 1
        return s


# A number (optional integer part, optional literal dot, digits) or a run of
# digits / comparison signs.  The dot is escaped; unescaped it matched any
# character, so tokens such as "A1" counted as values.
_VALUE_PATTERN = r'\d*\.?\d+|[0-9\.\-\s\>\<\=]+'
# Parenthesised run of digits / range signs.
_RANGE_PATTERN = r'\([0-9\.\-\s\>\<\=]+\)'
_DATETIME_PATTERN = r'\d{1,2}\/\d{1,2}\/\d{2,4}[\s]+\d{1,2}:\d{1,2}[:\d+]*|\d{1,2}\/\d{1,2}\/\d{2,4}|\d{1,2}:\d{1,2}[:\d+]*'

# Feature-name stems, in the order the features are emitted.
_FLAG_NAMES = ('isTestName', 'isValue', 'isUnit', 'isDecimal',
               'isDigit', 'isRange', 'isAbnormal', 'isDatetime')


def _token_features(token, prefix, lexicons):
    """Return the eight lexicon/shape features of one token, keyed with *prefix*."""
    token = str(token)
    upper = token.upper()
    flags = (
        # substring match against any space-separated piece of a lab name
        any(token in part
            for name in lexicons['lab_names']
            for part in str(name).upper().split(' ')),
        bool(re.match(_VALUE_PATTERN, token)) or upper in lexicons['values'],
        upper in lexicons['units'],
        # digits with exactly one dot removed, e.g. "5.83"
        '.' in token and token.replace('.', '', 1).isdigit(),
        token.isdigit(),
        bool(re.match(_RANGE_PATTERN, token)),
        upper in lexicons['abnormal'],
        bool(re.match(_DATETIME_PATTERN, token)),
    )
    return {'{}word.{}()'.format(prefix, name): token if flag else ''
            for name, flag in zip(_FLAG_NAMES, flags)}


def word2features(sent, i, lexicons=None):
    """Build the CRF feature dict for token *i* of *sent*.

    Each feature value is the token text when the property holds and ''
    otherwise.  The same eight properties are emitted for the previous
    ('-1' prefix) and next ('+1' prefix) token; 'BOS' / 'EOS' carry the
    current token at the sentence boundaries.

    Args:
        sent: sequence of tuples whose first element is the token text.
        i: position of the token inside *sent*.
        lexicons: optional mapping with keys 'lab_names', 'values', 'units'
            and 'abnormal'.  Defaults to the module-level lab_names, VALUES,
            UNITS and ABNORMAL lists.

    Changes from the earlier implementation (a model trained on the old
    features should be retrained):
      * neighbour isValue / isDecimal are computed from the neighbour token,
        not from the current token;
      * isDecimal is true for tokens like "5.83" (it used to require
        ``isdigit()`` on a string containing '.', which is never true);
      * the value lexicon is matched on the upper-cased token, like the unit
        and abnormal lexicons;
      * every token is converted with ``str`` first, which removes the need
        for the former catch-all ``except`` around the neighbour features.
    """
    if lexicons is None:
        lexicons = {'lab_names': lab_names, 'values': VALUES,
                    'units': UNITS, 'abnormal': ABNORMAL}

    word = str(sent[i][0])
    features = _token_features(word, '', lexicons)

    if i > 0:
        features.update(_token_features(sent[i - 1][0], '-1', lexicons))
    else:
        features['BOS'] = word

    if i < len(sent) - 1:
        features.update(_token_features(sent[i + 1][0], '+1', lexicons))
    else:
        features['EOS'] = word

    return features


def sent2features(sent):
    """Return one feature dict per token position of *sent*, in order."""
    positions = range(len(sent))
    return list(map(lambda idx: word2features(sent, idx), positions))


def sent2labels(sent):
    """Return the label (third field) of every (token, postag, label) triple."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels


def sent2tokens(sent):
    """Return the token (first field) of every (token, postag, label) triple."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens




In [None]:
def extract_labs(t):
    """Tag one raw lab-result string with the global ``crf_model``.

    The text is normalised (upper-cased, characters outside
    ``A-Z 0-9 : / - . ( )`` blanked, spaces around '-' removed, parenthesised
    all-letter groups dropped, a date and a following time re-joined with a
    space) and split into tokens.  Tokens containing '-', '/', ':' or '|' are
    kept whole with POS 'CD'; the rest go through NLTK tokenisation and POS
    tagging.  The tokens are then labelled by ``crf_model.predict``.

    Returns:
        An 8-item list ``[text, context, value, unit, ref, abnorm, dtime,
        pred]``: the space-joined tokens, the space-joined tokens of each
        label ('O', 'VALUE', 'UNIT', 'REF', 'ABNORM', 'DTIME') and the
        comma-joined predicted labels.  All eight items are '' when the input
        yields no tokens.

    Changes from the earlier implementation:
      * the empty result now has eight items like the normal result (it had
        seven, so reading the last item failed);
      * tokens are taken from the normalised text; previously the normalised
        text was computed and then discarded in favour of the raw input;
      * ``None`` is treated as empty input.
    """
    empty_result = [''] * 8
    text = '' if t is None else str(t)
    if len(text) < 1:
        return empty_result

    v = re.sub(r'\"','',text.strip())
    v = re.sub(r'[^A-Z0-9\:\/\-\.\(\)]',' ',str(v).strip().upper())
    v = re.sub(r'([^\s]+)([\s]*-[\s]*)([^\s]+)',r'\1-\3',str(v).strip().upper())
    v = re.sub(r'(\([A-Z\s]+\))','',str(v).strip().upper())
    v = ','.join(re.split(r'[\s]+',str(v)))
    v = re.sub(r'(\d{1,2}\/\d{1,2}\/\d{2,4})(,)(\d{1,2}:\d{1,2}[:\d+]*)',r'\1 \3',v)

    sentence = []
    for token in v.split(','):
        if not token:
            # word_tokenize('') yields nothing, so empty pieces never
            # contributed a token.
            continue
        if re.search(r'[\-|\/|\:]', token):
            sentence.append((token, 'CD'))
        else:
            sentence.extend(
                (aword, apos)
                for aword, apos in nltk.pos_tag(nltk.word_tokenize(token))
            )

    if not sentence:
        return empty_result

    labels = crf_model.predict([sent2features(sentence)])[0]

    words = []
    by_label = {'O': [], 'VALUE': [], 'UNIT': [], 'REF': [], 'ABNORM': [], 'DTIME': []}
    for (aword, _apos), label in zip(sentence, labels):
        words.append(aword)
        if label in by_label:  # tokens with any other label only appear in the text
            by_label[label].append(aword)

    return [' '.join(words), ' '.join(by_label['O']), ' '.join(by_label['VALUE']),
            ' '.join(by_label['UNIT']), ' '.join(by_label['REF']),
            ' '.join(by_label['ABNORM']), ' '.join(by_label['DTIME']),
            ','.join(labels)]

In [None]:
def make_tagged_set(df_sent):
    """Turn annotated rows into a token-level frame for CRF training.

    Columns are read by position: 0 is the raw lab text and 2..6 hold the
    comma-separated annotated values, units, references, abnormal flags and
    datetimes.  The raw text is normalised and split into tokens; a token
    that equals (case-insensitively) an entry of an annotation column gets
    that column's tag, trying the columns in the order above.  Any other
    token is NLTK-tokenised and POS-tagged and its pieces are tagged 'O'.

    Returns a frame with columns 'Sentence #' (0-based row position), 'Word',
    'POS' and 'Tag'.
    """
    # (column position, tag, POS) in matching priority order
    annotated_columns = (
        (2, 'VALUE', 'CD'),
        (3, 'UNIT', 'CD'),
        (4, 'REF', 'CD'),
        (5, 'ABNORM', 'NN'),
        (6, 'DTIME', 'CD'),
    )

    frame = df_sent.fillna(value='')
    tagged_rows = []
    for sentence_id, (_, record) in enumerate(frame.iterrows()):
        # drop quotes, brackets and '|' from every cell
        cells = [re.sub(r'[\"|\'\[\]]','',str(cell)) for cell in record]

        text = re.sub(r'\"','',str(cells[0]).strip())
        text = re.sub(r'[^A-Z0-9\:\/\-\.\(\)]',' ',str(text).strip().upper())
        text = re.sub(r'([^\s]+)([\s]*-[\s]*)([^\s]+)',r'\1-\3',str(text).strip().upper())
        text = re.sub(r'(\([A-Z\s]+\))','',str(text).strip().upper())
        text = ','.join(re.split(r'[\s]+',str(text)))
        # a date immediately followed by a time is removed altogether here
        text = re.sub(r'(\d{1,2}\/\d{1,2}\/\d{2,4})(,)(\d{1,2}:\d{1,2}[:\d+]*)','',text)
        cells[0] = text

        for token in re.split(r',',str(cells[0])):
            needle = str(token).upper()
            matched = None
            for position, tag, pos in annotated_columns:
                cell = cells[position]
                if len(cell) > 0 and any(needle == str(part).upper() for part in cell.split(',')):
                    matched = (pos, tag)
                    break

            if matched is not None:
                tagged_rows.append([sentence_id, token, matched[0], matched[1]])
                continue

            for piece, piece_pos in nltk.pos_tag(nltk.word_tokenize(token)):
                tagged_rows.append([sentence_id, piece, piece_pos, 'O'])

    tagged_df = pd.DataFrame(tagged_rows)
    return tagged_df.rename(columns={0: 'Sentence #', 1: 'Word', 2: 'POS', 3: 'Tag'})

In [None]:
# Load the structured training sheet, discarding every row that has a missing field.
data = pd.read_csv('/home/jovyan/work/NER-Test/data/train/train_round123_v31_structured_v01.csv').dropna()
data.shape

In [None]:
# Reload the sheet, this time keeping every row and blanking missing cells.
data = pd.read_csv('data/train/train_round123_v31_structured_v01.csv').fillna(value='')
# Each annotation column holds a list written as text: remove '[', ']' and
# '|' and split the remainder on commas into a Python list.
for _column in ('value', 'unit', 'reference_range', 'abnormal_flag', 'date_time'):
    data[_column] = data[_column].apply(lambda s: re.sub(r'[\[|\]]','',str(s)).split(','))
data.shape

In [None]:
# Preview the first rows of the parsed sheet.
data.head()

In [None]:
# Persist the parsed sheet; the list columns are written as their text representation.
data.to_csv('data/train/train_round123_v31_structured_v02.csv',index=False)

In [None]:
df_train_data = pd.read_csv('data/test/train_round123_v31_structured_v02.csv')
# Write the rows whose 'check' flag is False and those where it is True to separate files.
df_train_data.loc[df_train_data['check']==False].to_csv(
    'data/test/train_round123_v31_structured_v02_label_false.csv',
    index=False,
)
df_train_data.loc[df_train_data['check']==True].to_csv(
    'data/test/train_round123_v31_structured_v02_label_true.csv',
    index=False,
)

df_train_data.head()

In [None]:
# Load the labelled sheet from data/test and report its (rows, columns).
df_test = pd.read_csv('data/test/train_round123_v31_structured_v02_label.csv')
df_test.shape

# Load Training data

In [None]:
# Training rows: the subset previously written with check == True.
# NOTE(review): this reads from data/train/, while the split cell above
# writes the *_label_true.csv file under data/test/ -- confirm the path.
df_train_data = pd.read_csv('data/train/train_round123_v31_structured_v02_label_true.csv')
df_train_data.shape

In [None]:
# Work on a copy and skip rows that have no raw lab text before tagging.
train = df_train_data.copy().dropna(subset=['lab_result_0'])
train_tagged = make_tagged_set(train)

In [None]:
# Flag tokens whose POS tag contains anything other than upper-case letters,
# then keep only the unflagged rows.
# The column is read by label: integer keys on a label-indexed row rely on a
# deprecated positional fallback in pandas.
# The earlier expression also tested `word.isdigit() and float(word) < 0`; a
# digit-only string carries no sign, so that clause could never be true and
# is left out.
train_tagged['check'] = train_tagged['POS'].apply(lambda pos: bool(re.search(r'[^A-Z]+', pos)))
train_tagged = train_tagged[train_tagged['check']==False].copy()

In [None]:
# Save the filtered token table (the frame index is written as the first column).
train_tagged.to_csv('data/train/train_round123_v31_structured_v02_tagged.csv')

In [None]:
# Forward-fill gaps; DataFrame.ffill() replaces the deprecated
# fillna(method="ffill") spelling with the same result.
train_tagged = train_tagged.ffill()
# Vocabulary and label inventory of the training tokens.
words = list(set(train_tagged["Word"].values))
n_words = len(words)
tags = list(set(train_tagged["Tag"].values))
n_tags = len(tags)
# One list of (word, POS, tag) triples per sentence.
getter = SentenceGetter(train_tagged)
train_sentences = getter.sentences
train_sents = train_sentences

In [None]:
# NOTE(review): this is the same file the training rows are loaded from, so
# the "test" set below is identical to the training set -- confirm intended.
df_test_data = pd.read_csv('data/train/train_round123_v31_structured_v02_label_true.csv')
df_test_data.shape

In [None]:
# Tag the evaluation rows the same way as the training rows.
test = df_test_data.copy()
test = test.dropna(subset=['lab_result_0'])
test_tagged = make_tagged_set(test)


# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
test_tagged = test_tagged.ffill()
# These rebind words / n_words / tags / n_tags / getter, which previously
# described the training set.
words = list(set(test_tagged["Word"].values))
n_words = len(words)
tags = list(set(test_tagged["Tag"].values))
n_tags = len(tags)
getter = SentenceGetter(test_tagged)
test_sentences = getter.sentences
test_sents = test_sentences

In [None]:
# Show the feature dict of the first token of the first training sentence.
sent2features(train_sents[0])[0]

In [None]:
%%time
# Per-sentence feature dicts and gold label sequences for training.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

In [None]:
%%time
# Per-sentence feature dicts and gold label sequences for evaluation.
X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

In [None]:
# Sanity check: number of feature sequences and label sequences.
len(X_test), len(y_test)

In [None]:
%%time

# Label inventory: whatever `tags` was last bound to.  # list(crf.classes_)
labels = tags

# Fixed CRF settings ...
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', max_iterations=100, all_possible_transitions=True)
# ... and the distributions to sample the c1 / c2 coefficients from.
params_space = dict(
    c1=scipy.stats.expon(scale=0.5),
    c2=scipy.stats.expon(scale=0.05),
)

In [None]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


# The CRF consumes lists of per-token feature dicts, which numeric
# transformers (PolynomialFeatures / StandardScaler) cannot process, so the
# pipeline holds only the CRF step.  The step name keeps the `crf__`
# parameter prefix used below.
lr_poly = Pipeline([('crf', crf)])

# Every grid entry must be a sequence of candidates; ('lbfgs') without a
# trailing comma is just the string 'lbfgs'.
parameters = {'crf__algorithm': ['lbfgs'], 'crf__max_iterations': [20, 50, 100]}
clf = GridSearchCV(lr_poly, parameters)
# Fit on the training features / labels built above (the bare names X and y
# are not defined in this notebook).
clf.fit(X_train, y_train)

# Predictions for the evaluation sentences; the cells below pair y_pred with y_test.
y_pred = clf.predict(X_test)

# Evaluation

In [None]:
from sklearn import metrics
from sklearn_crfsuite import metrics as crf_metrics

# y_test / y_pred are lists of per-sentence label sequences, so the flat
# (token-level) classification metrics apply; regression errors such as
# MAE / MSE / R^2 are not defined for string labels.
print("Token accuracy:", crf_metrics.flat_accuracy_score(y_test, y_pred))
print("Weighted F1:", crf_metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=sorted(labels), digits=3))

In [None]:
from sklearn.linear_model import SGDRegressor
import numpy as np

from sklearn.preprocessing import StandardScaler

# Line colours for the coefficient plot below, indexed by coefficient position.
colors = ['#ba2121ff', '#42a5f5ff', '#efa016ff', '#000000ff', '#6c5353ff']

def fit_SGD(X, y, random_state, eta0, max_iter):
    """Fit an SGDRegressor on a standardised copy of X and return the fit result."""
    regressor = SGDRegressor(random_state=random_state, eta0=eta0, max_iter=max_iter)
    # Scale first: SGD will perform much better on standardised features.
    scaled_features = StandardScaler().fit_transform(X)
    return regressor.fit(scaled_features, y)

# NOTE(review): X, y, plt and LinearRegression are not defined or imported
# anywhere in the visible notebook, so this cell cannot run as written; it
# looks like a leftover from a regression exercise -- confirm before keeping.

# Refit the regressor with an increasing iteration budget and record the
# coefficients obtained at each budget.
coefs = []
iterations = range(1, 100, 2)
for n_iter in iterations:
    sgd_regressor = fit_SGD(X, y, random_state=42, eta0=0.001, max_iter=n_iter)
    coefs.append(sgd_regressor.coef_)

# One curve per coefficient; colors has five entries, so a sixth
# coefficient would raise IndexError.
for idx, c in enumerate(np.array(coefs).T):
    plt.plot(iterations, c, label=r'$\beta_{}$'.format(idx+1),
            color=colors[idx])

# Obtain same coefficients with LinearRegression:
X_scaled = StandardScaler().fit_transform(X)
lr = LinearRegression().fit(X_scaled, y);
# Dashed horizontal reference line at each LinearRegression coefficient.
for coeff in lr.coef_:
    plt.axhline(y=coeff, color='k', linestyle='--', alpha=0.3)
    
plt.ylabel(r'Optimal $\beta$')
plt.xlabel('Number of iteration steps')
plt.title('Obtained Model Parameters vs. Number of Iteration Steps Taken')
plt.legend(bbox_to_anchor=(1, 1));

In [None]:

# NOTE(review): lr_year_built, X, y and plt are not defined in the visible
# notebook.  This assignment also rebinds y_pred, which the prediction
# export cell further down pairs with y_test -- confirm this cell is still wanted.
y_pred = lr_year_built.predict(X)

# Scatter of the data with the model's predictions drawn as a line.
plt.plot(X, y, 'o', color = 'k', label='training data')
plt.plot(X, y_pred, color='#42a5f5ff', label='linear model prediction')
plt.xlabel('Year built')
plt.ylabel('Home price ($)')
plt.legend()

print("Training R^2:", metrics.r2_score(y, y_pred))

In [None]:

# NOTE(review): X (indexed here by a 'YearBuilt' column), y and plt are not
# defined in the visible notebook -- confirm this cell is still wanted.
print('Training R^2:', metrics.r2_score(y, y_pred))

# Same plot as the previous cell, using the 'YearBuilt' column as the x axis.
plt.plot(X['YearBuilt'], y, 'o', color='k', label='training data')
plt.plot(X['YearBuilt'], y_pred, color='#42a5f5ff', label='quadratic model prediction')
plt.xlabel('Year built')
plt.ylabel('Home price ($)')
plt.legend();

In [None]:
# Pair every raw lab string with its gold and predicted label sequence.
prediction = []
for sent, true, pred in zip(test['lab_result_0'].tolist(),y_test,y_pred):   #y_pred = crf.predict(X_test)
    flag = (true == pred)  # True only when the whole sequence matches
    prediction.append([sent, true, pred,flag])
df_pred = pd.DataFrame(prediction)
# Column assignment aligns on index labels.  `test` keeps the labels of the
# rows that survived dropna, so give df_pred the same labels; otherwise the
# results land on the wrong rows (or become NaN) whenever a row was dropped.
df_pred.index = test.index[:len(df_pred)]
test['true_label'] = df_pred[1]
test['pred_label'] = df_pred[2]
test['check'] = df_pred[3]
test.to_csv('data/test/train_round123_v31_structured_v02_label_filtered.csv',index=False)

# Simple Test

In [None]:
# NOTE: unpickling runs code stored in the file -- only load model files
# from a trusted location.  The context manager closes the file handle.
with open('/home/jovyan/work/NER-Test/models/crf_model_lab_round123_v031.pkl','rb') as model_file:
    crf_model = pickle.load(model_file)
v = 'WBC                  5.83 (OCT 08) 7.07 (OCT 07) 7.17 (OCT 06) 6.99 (OCT 05)'
# Remove dates, optionally followed by a time.  The time part is written as
# a repeated ":NN" group; the former "[:\d{1,2}]+" was a character class
# that also swallowed a plain number following the date.
v =  re.sub(r'\d{1,2}\/\d{1,2}\/\d{2,4}[\s]+\d{1,2}(?::\d{1,2})+|\d{1,2}\/\d{1,2}\/\d{2,4}','',str(v))
v
extract_labs(v)

# Testing

In [None]:
# Reload the parsed sheet with missing cells blanked.
data = pd.read_csv('data/train/train_round123_v31_structured_v02.csv').fillna(value='')
#data_freq = pd.read_csv('data/train/train_round12_freq.csv')
data.shape

In [None]:
%%time
# NOTE: unpickling runs code stored in the file -- only load model files
# from a trusted location.  The context manager closes the file handle.
with open('/home/jovyan/work/NER-Test/models/crf_model_lab_round123_v031_structured_v02.pkl','rb') as model_file:
    crf_model = pickle.load(model_file)
#data = pd.read_csv('data/train/train_round3.csv')
# This rebinds X_test (previously the list of feature sequences) to a frame.
X_test = data.copy()
X_test = X_test.fillna(value='')
#sent, context,value,unit,ref,abnorm,dtime
X_test['lab_result'] = X_test['lab_result_0'].apply(extract_labs)
# Spread the items of each result list over named columns, in list order.
# A result shorter than the column list yields '' rather than IndexError.
_result_columns = ('LAB_TEXT', 'LAB_TEST', 'LAB_VALUE', 'LAB_UNIT',
                   'LAB_REF', 'LAB_ABNORM', 'LAB_DTIME', 'LAB_PRED')
for _position, _column in enumerate(_result_columns):
    # k=_position binds the index now; a plain closure would see only the last one
    X_test[_column] = X_test['lab_result'].apply(lambda s, k=_position: s[k] if k < len(s) else '')
X_test.head()

In [None]:
# Save the extraction results together with the source columns.
X_test.to_csv('/home/jovyan/work/NER-Test/data/test/train_round123_v31_structured_v02_v02.csv',index=False)

# Performance of frequent tests

In [None]:
df_data = pd.read_csv('data/train/train_round123_validated_v31.csv')
df_data_freq = pd.read_csv('data/train/train_round123_validated_freq_v31.csv')
df_data = df_data.fillna(value='')
# Keep entries seen more than twice; only the text and class columns survive.
df_data_freq = df_data_freq[df_data_freq['freq']>2][['lab_result_0','CLASS']].copy()
# Unclassified rows of the validated sheet plus the frequent entries.
df_data = pd.concat([df_data[df_data['CLASS']==''],df_data_freq])
# Strip dates and times.  A time is digits followed by one or more ":NN"
# groups; the former "[:\d{1,2}]+" was a character class, so any run of two
# or more digits was deleted as well (turning "5.83" into "5.").
df_data['lab_result_0'] = df_data['lab_result_0'].apply(lambda s: re.sub(r'\d{1,2}\/\d{1,2}\/\d{2,4}[\s]+\d{1,2}(?::\d{1,2})+|\d{1,2}\/\d{1,2}\/\d{2,4}|\d{1,2}(?::\d{1,2})+','',str(s)).strip())
df_data = df_data.fillna(value='')
# Drop rows whose text is empty after stripping.
df_data['select'] = df_data['lab_result_0'].apply(lambda s: False if len(str(s).strip())<1 else True)
df_data = df_data[df_data['select']][['lab_result_0','CLASS']].copy()

df_data = df_data.fillna(value='')
df_data['lab_result'] = df_data['lab_result_0'].apply(extract_labs)
df_data['LAB_TEXT'] = df_data['lab_result'].apply(lambda s:s[0])
df_data['LAB_TEST'] = df_data['lab_result'].apply(lambda s:s[1])

# Rows classified 'LAB' for which no context text was extracted.  Columns are
# read by name (formerly positions 1 and 4), since integer keys on a
# label-indexed row rely on a deprecated positional fallback in pandas.
df_data['check'] = df_data.apply(lambda s: True if s['CLASS']=='LAB' and s['LAB_TEST']=='' else False, axis=1)
df_data.to_csv('/home/jovyan/work/NER-Test/data/test/train_round123_validated_v32_freq.csv',index=False)
df_data.shape, df_data[df_data['check']].shape

In [None]:
# #Lab samples 
# df_data_freq was already reduced to the freq > 2 rows and to the columns
# ['lab_result_0', 'CLASS'] in the previous cell, so it has no 'freq' column
# left to filter on; its shape is the count that the filter used to give.
df_train_data.shape, df_data_freq.shape, data.shape, data[data['CLASS']=='LAB'].shape