# Active learning labelling to create train and dev sets for the text classifier.

In [2]:
import pandas as pd
import numpy as np 

import pathlib
import os 
import shutil
from os import listdir
from os.path import isfile, join
import glob

import matplotlib
import matplotlib.pyplot as plt
import os
from time import time
import numpy as np
import pylab as pl
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.model_selection import cross_val_score
import itertools
import shutil
from sklearn import preprocessing
from  sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
import random
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split

## I. Create files and load datasets

In [4]:
path = os.getcwd()

# Wipe the working folders left over from a previous run. With
# ignore_errors=True a folder that does not exist is skipped silently.
for stale_folder in ("train", "test", "unlabeled", "all_clac", "all_pas_clac"):
    shutil.rmtree(stale_folder, ignore_errors=True)

In [5]:
# "clac" is the name we gave to text sequences referring to a major adverse
# event (positive samples); "pas_clac" sequences do not refer to a major
# adverse event (negative samples).

# Labelled train/test folders (one sub-folder per class) plus the pool of
# samples that still have to be labelled.
for path in ("train/pas_clac", "test/clac", "test/pas_clac", "train/clac",
             "unlabeled/unlabeled"):
    os.makedirs(path)

# all_clac and all_pas_clac will be used to shuffle samples.
path_all_clac = "./all_clac"
path_all_pas_clac = "./all_pas_clac"
for staging_folder in (path_all_clac, path_all_pas_clac):
    os.makedirs(staging_folder)

In [None]:
# Load the text sequences to annotate

def _stack_extraction(workbook):
    """Read one extraction workbook and stack its columns.

    The sheet is indexed by its 'IPP' and 'DDK' columns and then stacked, so
    every remaining cell becomes one entry of the returned object.
    """
    sheet = pd.DataFrame(pd.read_excel(workbook))
    return sheet.set_index(['IPP', 'DDK']).stack()


df_stacked1 = _stack_extraction('extraction_tc.xlsx')
df_stacked2 = _stack_extraction('extraction_tc2.xlsx')
df_stacked3 = _stack_extraction('extraction_tc3.xlsx')

# One long stacked object holding the sequences of the three workbooks.
df_stacked = pd.concat([df_stacked1, df_stacked2, df_stacked3])

In [None]:
# Create a correspondence table (between sequence and patient ID).

# Turn the stacked index levels back into columns and give the two
# automatically named columns (0 and 'level_2') meaningful names.
df_stacked = df_stacked.reset_index().rename(
    columns={0: 'texte_complication', 'level_2': 'type_complication'}
)

# The row index written to this file is what later names each sample file.
df_stacked.to_excel('table_de_correspondance.xlsx')

In [None]:
# Text preprocessing

# Single characters deleted from every sequence.
# NOTE(review): "à" is removed as a *character* (so "déjà" becomes "déj"); if
# it was meant as a stop word it belongs in word_noise_list instead - confirm.
noise_list = [",", ".", "?", ";", ":", "/", "!", "-", "+", "ÿ", "à"]

# French stop words dropped after the text has been split on spaces.
# A comma was missing after "ils", which silently merged it with "elle" into
# the single entry "ilselle", so neither word was ever filtered out.
# "qu'" and "l'" only match when they stand alone between two spaces.
word_noise_list = ["a", "ainsi", "assez", "au", "aux", "ce", "ceci", "cela",
                   "car", "ces", "cette", "ce", "celle", "du", "en", "il", "ils",
                   "elle", "elles", "que", "qui", "qu'", "se", "son", "sa", "ses",
                   "le", "la", "les", "l'", "un", "une", "de", "des", "au", "du"]


def remove_noise(input_text):
    """Lower-case a text and strip noise characters and stop words from it.

    Parameters
    ----------
    input_text : str
        Raw text sequence.

    Returns
    -------
    str
        The lower-cased text without the characters listed in ``noise_list``
        and without the space-separated words listed in ``word_noise_list``.
    """
    noise_free_text = "".join(
        character for character in input_text.lower()
        if character not in noise_list
    )
    # Splitting on a single space (not on any whitespace) keeps the original
    # tokenisation: consecutive spaces produce empty tokens that are kept.
    kept_words = [word for word in noise_free_text.split(" ")
                  if word not in word_noise_list]
    return " ".join(kept_words)

# Clean every text sequence with remove_noise and store the result back in
# the same column.
cleaned_sequences = df_stacked['texte_complication'].apply(remove_noise)
df_stacked['texte_complication'] = cleaned_sequences

In [None]:
# Create a representative dev set to be annotated without active learning

# Select 10% of the samples at random
index = df_stacked.index
number_of_rows = len(index)
x = number_of_rows // 10
print("Il faut mettre de coté :", x, "exemples.")

# Fixed random_state so the same dev rows are drawn on every run.
df_dev = df_stacked.sample(n=x, random_state=1)
df_dev.to_excel("dev_unlabeled.xlsx")

# Other samples: every row that was not drawn for the dev set.
# Rows are removed by index label rather than by content: the previous
# concat(...).drop_duplicates(keep=False) also threw away every copy of rows
# that are duplicated inside df_stacked itself.
# Assumes df_stacked has a unique index (e.g. the result of reset_index()).
df_model = df_stacked.drop(index=df_dev.index)

In [None]:
# One .txt file per sample, named after its row index in the correspondence table

directory = pathlib.Path().absolute()
DATA_FOLDER = directory
TRAIN_FOLDER = os.path.join(DATA_FOLDER, "train")
TEST_FOLDER = os.path.join(DATA_FOLDER, "test")
UNLABELED_FOLDER = os.path.join(DATA_FOLDER, "unlabeled")
ENCODING = 'utf-8'
categories = ['clac', 'pas_clac']

# NOTE(review): this iterates df_stacked, so the rows set aside in df_dev are
# written to the unlabeled pool as well - confirm that is intended.
unlabeled_dir = os.path.join(UNLABELED_FOLDER, 'unlabeled')
for idx, row_data in df_stacked.iterrows():
    file_name = os.path.join(unlabeled_dir, str(idx) + '.txt')
    # The context manager closes the file even if the write fails.
    with open(file_name, "w", encoding=ENCODING) as sample_file:
        sample_file.write(row_data['texte_complication'])

## II. Labelling at random (300 samples to begin)

In [None]:
s = 300

# Accepted answers are "1" .. "len(categories)". Checking isdigit() alone let
# "0" through (silently picking the last category via index -1) and crashed
# with an IndexError on any number above len(categories).
valid_answers = {str(number) for number in range(1, len(categories) + 1)}

# Never ask for more rows than exist: DataFrame.sample raises otherwise.
for idx, row_data in df_model.sample(min(s, len(df_model))).iterrows():
    file_name = os.path.join(UNLABELED_FOLDER, 'unlabeled', str(idx) + '.txt')
    print(row_data['texte_complication'])
    labelNumber = input("Enter the correct label number: 1 = CLAC OU 2 = pas une CLAC :")
    while labelNumber.strip() not in valid_answers:
        labelNumber = input("Enter the correct label number")
    labelNumber = int(labelNumber)
    category = categories[labelNumber - 1]
    # Send roughly 30% of the labelled samples to the test set, the rest to train.
    if np.random.rand() < .3:
        dstDir = os.path.join(TEST_FOLDER, category)
    else:
        dstDir = os.path.join(TRAIN_FOLDER, category)
    print(dstDir)
    print(file_name)
    shutil.move(file_name, dstDir)

## III. Active labelling

In [None]:
# Functions definition

# File size of the dataset
def size_mb(docs):
    """Return the total UTF-8 size of the strings in ``docs`` in MB (1e6 bytes)."""
    total_bytes = 0
    for document in docs:
        total_bytes += len(document.encode('utf-8'))
    return total_bytes / 1e6

# Fit terminal size
def trim(s):
    """Cut ``s`` down to 80 characters, marking the cut with a trailing "..."."""
    if len(s) > 80:
        return s[:77] + "..."
    return s

# Benchmark classifiers
def benchmark(clf, X_train, X_test, y_train, y_test, X_unlabeled):
    """Train ``clf``, report its test metrics and pick unlabeled samples to annotate.

    Reads two module-level names: ``categories`` (label names used in the
    reports and plots) and ``NUM_QUESTIONS`` (how many samples each of the two
    selection strategies contributes).

    Parameters
    ----------
    clf : estimator
        Classifier exposing ``fit``, ``predict`` and ``decision_function``.
    X_train, X_test, X_unlabeled : feature matrices
        All three are scaled with a scaler fitted on ``X_train`` only.
    y_train, y_test : label arrays
        The tn/fp/fn/tp unpacking below requires a two-class problem.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time, question_samples)`` where
        ``score`` is the F1 score on the test set and ``question_samples`` is
        a list of distinct row indices into ``X_unlabeled``.
    """
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()

    # Fit the scaler on X_train only, then reuse it for every other subset so
    # that all of them are standardized with the same scale.
    # with_mean=False skips centering.
    scaler = preprocessing.StandardScaler(with_mean=False)
    scaler = scaler.fit(X_train)

    X_train = scaler.transform(X_train)  # Standardizing
    clf.fit(X_train, y_train)

    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    X_test = scaler.transform(X_test)  # Standardizing
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    # NOTE(review): f1_score is called with its default positive label; check
    # that this is the class you want to score (the same question applies to
    # the tn/fp/fn/tp naming below).
    score = metrics.f1_score(y_test, pred)
    accscore = metrics.accuracy_score(y_test, pred)
    print ("pred count is %d" %len(pred))
    print ('accuracy score:     %0.3f' % accscore)
    print("f1-score:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                            target_names=categories))
    print("confusion matrix:")
    test_confusion = metrics.confusion_matrix(y_test, pred)
    print(test_confusion)
    tn, fp, fn, tp = test_confusion.ravel()
    print("tn :", tn, "fp :", fp, "fn :", fn, "tp :", tp)

    # Plot the raw and the row-normalized confusion matrices.
    # NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; its
    # replacement is ConfusionMatrixDisplay.from_estimator.
    titles_options = [("Confusion matrix, without normalization", None),
                      ("Normalized confusion matrix", 'true')]
    for title, normalize in titles_options:
        disp = plot_confusion_matrix(clf, X_test, y_test,
                             display_labels=categories,
                             cmap=plt.cm.Blues,
                             normalize=normalize)
        disp.ax_.set_title(title)
        print(title)
        print(disp.confusion_matrix)
    plt.show()

    print("confidence for unlabeled data:")

    X_unlabeled = scaler.transform(X_unlabeled)  # Standardizing

    # Signed distance of every unlabeled sample to the decision boundary,
    # computed once and shared by both selection strategies.
    decision_values = clf.decision_function(X_unlabeled)
    # Clamp the pick count: a negative-zero slice ([-0:]) would otherwise
    # select *every* sample when NUM_QUESTIONS is 0.
    n_pick = max(0, min(NUM_QUESTIONS, len(decision_values)))

    ## Strategy 1: the samples with the most negative decision value (per the
    ## original note: those more likely to be a major adverse event).
    by_negated_score = np.argsort(-decision_values)
    high_confidence_samples = by_negated_score[len(by_negated_score) - n_pick:]

    ## Strategy 2: the samples closest to the boundary (hardest to classify).
    by_margin = np.argsort(np.abs(decision_values))
    low_confidence_samples = by_margin[:n_pick]

    # Both strategies can pick the same sample when the pool is small; keep
    # each index once (order preserved) so that the caller never tries to
    # move the same file twice.
    question_samples = list(dict.fromkeys(
        high_confidence_samples.tolist() + low_confidence_samples.tolist()
    ))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time, question_samples



#######

# --- Active-labelling settings ---------------------------------------------
NUM_QUESTIONS = 2      # samples contributed by each selection strategy
PLOT_RESULTS = True    # draw the score / timing bar chart after each round
ACTIVE = True          # when False the loop below stops after one benchmark
ENCODING = 'utf-8'

# Every data folder lives directly under the current working directory.
directory = pathlib.Path().absolute()
DATA_FOLDER = directory
TRAIN_FOLDER, TEST_FOLDER, UNLABELED_FOLDER = (
    os.path.join(DATA_FOLDER, folder_name)
    for folder_name in ("train", "test", "unlabeled")
)

def _count_txt_files(folder):
    """Return the number of ``.txt`` files directly inside ``folder``."""
    return len(glob.glob(os.path.join(folder, "*.txt")))


def move(srcDir, dstDir, share=None):
    """Move randomly chosen ``.txt`` files from ``srcDir`` to ``dstDir``.

    ``share`` is the number of files to move; when it is None, or larger than
    the number of files available, every ``.txt`` file is moved.
    """
    files = glob.glob(os.path.join(srcDir, "*.txt"))
    if share is None or share > len(files):
        share = len(files)
    for f in random.sample(files, share):
        shutil.move(f, dstDir)


def _ask_label(label_names):
    """Print the numbered label menu and return the label name the user picks.

    Keeps asking until the answer is one of "1" .. "len(label_names)". The
    previous isdigit() check accepted "0" (which silently selected the last
    label through index -1) and crashed on numbers above the label count.
    """
    print ("Annotate this text (select one label):")
    for number, label_name in enumerate(label_names, start=1):
        print ("%d = %s" %(number, label_name))
    valid_answers = {str(number) for number in range(1, len(label_names) + 1)}
    answer = input("Enter the correct label number:")
    while answer.strip() not in valid_answers:
        answer = input("Enter the correct label number (a number please):")
    return label_names[int(answer) - 1]


def _reshuffle_train_test():
    """Pool all labelled files per class and redraw a 70/30 train/test split."""
    # Gather every labelled sample of each class in its staging folder.
    move("./train/clac", "./all_clac")
    move("./test/clac", "./all_clac")
    move("./train/pas_clac", "./all_pas_clac")
    move("./test/pas_clac", "./all_pas_clac")

    # Count samples in each class. Only .txt files are counted because only
    # .txt files are moved; counting every directory entry (hidden files
    # included) could ask move() for more files than it can find.
    no_clac = _count_txt_files("./all_clac")
    print("Nombre de complications liées au cathéterisme :")
    print(no_clac)

    no_pas_clac = _count_txt_files("./all_pas_clac")
    print("Nombre d'exemples qui ne sont pas des complications liées au cathéterisme :")
    print(no_pas_clac)

    no_total = no_clac + no_pas_clac
    print("Total :")
    print(no_total)

    s_test_clac = int(no_clac * 0.3)
    print("Nombre d'exemples de clac pour le test set :")
    print(s_test_clac)

    s_test_pas_clac = int(no_pas_clac * 0.3)
    print("Nombre d'exemples de pas clac pour le test set :")
    print(s_test_pas_clac)

    s_train_clac = no_clac - s_test_clac
    print("Nombre d'exemples de clac pour le train set :")
    print(s_train_clac)

    s_train_pas_clac = no_pas_clac - s_test_pas_clac
    print("Nombre d'exemples de pas clac pour le train set :")
    print(s_train_pas_clac)

    s_train = s_train_clac + s_train_pas_clac
    print("Nombre d'exemples pour le train set :")
    print(s_train)

    s_test = s_test_clac + s_test_pas_clac
    print("Nombre d'exemples pour le test set :")
    print(s_test)

    # 30% of each class goes to test; *everything* left goes to train. The
    # former "share - 1" left one file per class behind in the staging
    # folders and raised a ValueError when a class had no sample at all.
    move("./all_clac", "./test/clac", s_test_clac)
    move("./all_clac", "./train/clac")
    move("./all_pas_clac", "./test/pas_clac", s_test_pas_clac)
    move("./all_pas_clac", "./train/pas_clac")


# Each round: reload the folders, retrain, report, ask the annotator to label
# the suggested samples, then redraw the train/test split. While ACTIVE is
# True the loop only ends when the unlabeled pool is empty (or when the
# kernel is interrupted).
while True:
    data_train = load_files(TRAIN_FOLDER, encoding=ENCODING)
    data_test = load_files(TEST_FOLDER, encoding=ENCODING)
    data_unlabeled = load_files(UNLABELED_FOLDER, encoding=ENCODING)
    categories = data_train.target_names

    if not data_unlabeled.data:
        # Nothing left to suggest, so there is nothing more to annotate.
        print("No unlabeled documents left.")
        break

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)
    data_unlabeled_size_mb = size_mb(data_unlabeled.data)

    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d documents - %0.3fMB (unlabeled set)" % (
        len(data_unlabeled.data), data_unlabeled_size_mb))
    print("%d categories" % len(categories))
    print()
    y_train = data_train.target
    y_test =  data_test.target

    print("Extracting features from the training dataset using a sparse vectorizer")
    vectorizer = TfidfVectorizer(encoding= ENCODING, use_idf=True, norm='l1', binary=False, sublinear_tf=True, min_df=0.001, max_df=1.0, ngram_range=(1, 2), analyzer='word', stop_words=None)

    # The vocabulary is learnt on the training set only; the test and the
    # unlabeled sets are projected onto that same vocabulary.
    X_train = vectorizer.fit_transform(data_train.data)
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test dataset using the same vectorizer")
    X_test = vectorizer.transform(data_test.data)
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    print("Extracting features from the unlabled dataset using the same vectorizer")
    X_unlabeled = vectorizer.transform(data_unlabeled.data)
    print("n_samples: %d, n_features: %d" % X_unlabeled.shape)
    print()

    results = []
    # 'squared_hinge' is the current name of the loss formerly called 'l2';
    # the old spelling is rejected by current scikit-learn releases.
    results.append(benchmark(LinearSVC(loss='squared_hinge', penalty='l2',
                                       dual=False, tol=1e-3, class_weight='balanced'),
                             X_train, X_test, y_train, y_test, X_unlabeled))

    # make some plots
    indices = np.arange(len(results))
    # Transpose: one list per returned field instead of one tuple per classifier.
    results = [[x[i] for x in results] for i in range(5)]

    clf_names, score, training_time, test_time, question_samples = results
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)

    if PLOT_RESULTS:
        pl.figure(figsize=(12,8))
        pl.title("Score")
        pl.barh(indices, score, .2, label="score", color='r')
        pl.barh(indices + .3, training_time, .2, label="training time", color='g')
        pl.barh(indices + .6, test_time, .2, label="test time", color='b')
        pl.yticks(())
        pl.legend(loc='best')
        pl.subplots_adjust(left=.25)
        pl.subplots_adjust(top=.95)
        pl.subplots_adjust(bottom=.05)

        for i, c in zip(indices, clf_names):
            pl.text(-.3, i, c)
        pl.savefig('ngramoptimize.png')
        pl.show()

    if not ACTIVE:
        break

    # question_samples holds one list of indices per benchmarked classifier;
    # only one classifier is benchmarked, hence [0].
    for sample_idx in question_samples[0]:
        filename = data_unlabeled.filenames[sample_idx]
        print (filename)
        print ('**************************content***************************')
        print (data_unlabeled.data[sample_idx])
        print ('**************************content end***********************')
        category = _ask_label(categories)
        dstDir = os.path.join(TRAIN_FOLDER, category)
        shutil.move(filename, dstDir)

    # Shuffle train and test sets once per round, after all answers are in
    # (it used to run again after every single answer).
    _reshuffle_train_test()

## IV. Classify unlabeled samples

## V. Create DEV and TRAIN datasets for FLAIR Text Classifier

In [None]:
# Group the labelled samples, then use 30% of them as a DEV set and 70% of them as a TRAIN set
# to train the FLAIR Text Classifier.
# We will evaluate the classifier's performance on a TEST set labelled at random (i.e. without
# active learning).

# Folders holding the major adverse event ("clac") samples
path_clac1 = 'train/clac'
path_clac2 = 'test/clac'

ext = '.txt'  # only files with this extension are read

# Select only files with the ext extension
txt_files_1 = [i for i in os.listdir(path_clac1) if os.path.splitext(i)[1] == ext]
txt_files_2 = [i for i in os.listdir(path_clac2) if os.path.splitext(i)[1] == ext]

# to check samples number
print(len(txt_files_1))
print(len(txt_files_2))

print('********************************************')

# Map each file name to its text content.
file_dict = {}
for folder, file_names in ((path_clac1, txt_files_1), (path_clac2, txt_files_2)):
    for f in file_names:
        # The sample files are written as UTF-8; read them back with the same
        # encoding instead of relying on the platform default.
        with open(os.path.join(folder, f), encoding='utf-8') as file_object:
            file_dict[f] = file_object.read()

# Print the file name / text pairs.
for file_name, text in file_dict.items():
    print (file_name, text)

# One row per sample: file name, text and label. Built from the item pairs so
# that an empty folder still yields a frame with the expected columns.
df = pd.DataFrame(list(file_dict.items()), columns=["index", "exemple"])

# Add label 1
df['label'] = "1"

print(df)

In [None]:
# Save the frame currently bound to df as a semicolon-separated UTF-8 CSV.
df.to_csv('touteslesclacs.csv', encoding='utf-8', sep=';')

In [None]:
# Folders holding the NON major adverse event ("pas_clac") samples
path_clac1 = 'train/pas_clac'
path_clac2 = 'test/pas_clac'

ext = '.txt'  # only files with this extension are read

# Select only files with the ext extension
txt_files_1 = [i for i in os.listdir(path_clac1) if os.path.splitext(i)[1] == ext]
txt_files_2 = [i for i in os.listdir(path_clac2) if os.path.splitext(i)[1] == ext]

# check
print(len(txt_files_1))
print(len(txt_files_2))
print('********************************************')

# Map each file name to its text content.
file_dict = {}
for folder, file_names in ((path_clac1, txt_files_1), (path_clac2, txt_files_2)):
    for f in file_names:
        # The sample files are written as UTF-8; read them back with the same
        # encoding instead of relying on the platform default.
        with open(os.path.join(folder, f), encoding='utf-8') as file_object:
            file_dict[f] = file_object.read()

# Print the file name / text pairs.
for file_name, text in file_dict.items():
    print (file_name, text)

# One row per sample: file name, text and label. Built from the item pairs so
# that an empty folder still yields a frame with the expected columns.
df = pd.DataFrame(list(file_dict.items()), columns=["index", "exemple"])

# Add label 0
df['label'] = "0"

print(df)

In [None]:
# Save the frame currently bound to df as a semicolon-separated UTF-8 CSV.
df.to_csv('touteslespasclacs.csv', encoding='utf-8', sep=';')

In [None]:
# Concatenate all annotated samples (positive and negative)
# and then split them into DEV and TRAIN sets

In [None]:
# Only these three columns are kept; the extra index column that to_csv wrote
# into the files is dropped by this selection.
kept_columns = ['index', 'exemple', 'label']

clac = pd.read_csv('touteslesclacs.csv', sep=';', encoding='utf-8')[kept_columns]
pas_clac = pd.read_csv('touteslespasclacs.csv', sep=';', encoding='utf-8')[kept_columns]

exemples_annotes = pd.concat([clac, pas_clac])

# Random 70/30 split; the 30% part is bound to the name "test".
train, test = train_test_split(exemples_annotes, test_size=0.3)

print(train, '*****************************************', test, sep='\n')

In [None]:
# Write the training part of the split as a semicolon-separated UTF-8 CSV.
train.to_csv('train.csv', encoding='utf-8', sep=';')

In [None]:
# The held-out part of the split (bound to "test") is saved as the DEV set.
test.to_csv('dev.csv', encoding='utf-8', sep=';')