<a href="https://colab.research.google.com/github/Nuccy90/Master_thesis/blob/master/GloVe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sklearn
import pandas as pd
import numpy as np
import re
import itertools
import io
import nltk
import pickle
from tensorflow import keras
from nltk.corpus import stopwords
from sklearn.neural_network import MLPClassifier
from datetime import datetime
from google.colab import drive
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
def read_data(name):
    
    df = pd.read_csv(name)
    df['Text'] = df['Text'].fillna("  ")
    df['Title'] = df['Title'].fillna("  ")
    df = df[(df["Text"] != "  ") | (df["Title"]!= "  ")]
    docs = df['Title'] + df['Text']
    Y = df["Diagnosis"].values
    
    return df, docs, Y

In [0]:
def cleanText(text):
    
    sw = stopwords.words('english')[35:]
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = re.sub(r'\d{6,}', r'<NUM>', text)
    text = text.lower()
    text = ' '.join(word for word in text.split() if not word in sw)
    
    return text

In [0]:
def get_embed_mat(glove_file, max_features, tokenizer):
    
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')
    
    embed_dim = 200
    
    # word vectors
    embeddings_index = dict(get_coefs(*s.rstrip().rsplit(' ')) for s in open(glove_file, encoding='utf8'))
    print('Found %s word vectors.' % len(embeddings_index))

    # embedding matrix
    word_index = tokenizer.word_index
    num_words = min(max_features, len(word_index) + 1)
    all_embs = np.stack(embeddings_index.values()) #for random init
    embedding_matrix = np.random.normal(all_embs.mean(), all_embs.std(), 
                                        (num_words, embed_dim))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    max_features = embedding_matrix.shape[0]
    
    return embedding_matrix

In [0]:
def sort_by_date(df):

    di = {}        
    for index, row in df.iterrows():

        sub_id = row["Subject"]
        diagnosis = row["Diagnosis"]
        if sub_id in di:
            di[sub_id][1].append(row["Text"]+row["Title"])
            di[sub_id][2].append(row["Date"])
        else:
            di[sub_id] = [diagnosis,[row["Text"]+row["Title"]], [row["Date"]]]

    for key in di:
        list_of_datetimes = [datetime.strptime(x, ' %Y-%m-%d %H:%M:%S ') for x in di[key][2]]

        lists = sorted(zip(*[list_of_datetimes, di[key][1]]))
        dates, texts = list(zip(*lists))

        di[key][1] = texts
        di[key][2] = dates
        
    return di

In [0]:
def predict_texts(di, model, tokenizer):
    print("Starting prediction...")
    pred_dict = {}
    
    def predict_scores(sub_id, texts, model = model, tokenizer = tokenizer, pred_dict = pred_dict):
    
        #add score to dictionary
        x_texts = tokenizer.texts_to_sequences(texts)
        x_texts = sequence.pad_sequences(x_texts, maxlen=100)
        scores = model.predict(x_texts)
        pred_dict[sub_id] = scores
            
        return pred_dict
    
    for sub_id in di:
        texts = di[sub_id][1]
        pred_dict = predict_scores(sub_id, texts)
    
    print("Prediction done!")
    return pred_dict

In [0]:
def create_train_vectors(pred_dict, di):
    
    x_train = []
    y_train = []

    for i in range(1,2000):
        for sub_id in pred_dict:
        
            if i >= len(pred_dict[sub_id]):
                pass
            else:
                seen = pred_dict[sub_id][:i]
                avg = np.mean(seen)
                sd = np.std(seen)
                top_n = int(round((20*i)/100))
                topn_avg = np.mean(np.sort(seen)[top_n:])
                bottomn_avg = np.mean(np.sort(seen)[:top_n+1])
                diff = topn_avg - bottomn_avg
                n_texts = (i-1)/(1999-1)
            
                x = np.array([n_texts,avg,sd,topn_avg,diff])
                x_train.append(x)
                y_train.append(di[sub_id][0])
                
    return np.array(x_train), np.array(y_train)

In [0]:
def predict_test_vectors(pred_dict, di, clf2):
    
    verdict_dict = {}
    
    for i in range(1,2000):
        for sub_id in pred_dict:
        
            if i >= len(pred_dict[sub_id]):
                verdict_dict[sub_id].append(verdict_dict[sub_id][-1])
            else:
                seen = np.array(pred_dict[sub_id][:i])
                avg = np.mean(seen)
                sd = np.std(seen)
                top_n = int(round((20*i)/100))
                topn_avg = np.mean(np.sort(seen)[top_n:])
                bottomn_avg = np.mean(np.sort(seen)[:top_n+1])
                diff = topn_avg - bottomn_avg
                n_texts = (i-1)/(1999-1)

                x = np.array([n_texts,avg,sd,topn_avg,diff])
                x = x.reshape(1,-1)
                verdict = clf2.predict(x)[0]

                if sub_id in verdict_dict:
                    verdict_dict[sub_id].append(verdict)
                else:
                    verdict_dict[sub_id] = [verdict]
                
    return verdict_dict

In [0]:
def evaluate(verdict_dict, o):
    
    # create dataframe to hold the data necessary for the final calculations
    cols = ["subject", "true_risk", "risk_decision", "delay", "erde"]

    df_final = pd.DataFrame(index = range(94),columns = cols)

    count = 0
    with open("/content/drive/My Drive/risk_test_users.txt", 'r') as f:
        for line in f.readlines():
            df_final.iloc[count]['subject'] = line.split('\t')[0].strip()
            df_final.iloc[count]['true_risk'] = float(line.split('\t')[1].strip())
            count += 1


    # here i put the risk_decision and delay in the dataframe  
    for key in verdict_dict:
        sub_row = df_final.index[df_final['subject'] == key].tolist()[0]

        df_final.iloc[sub_row,2] = verdict_dict[key][-1]

        if (df_final.iloc[sub_row,2] == 1) & (df_final.iloc[sub_row,1] == 1):
            df_final.iloc[sub_row,3] = verdict_dict[key].index(1)

    #extract the data
    risk_d = df_final['risk_decision']
    t_risk = df_final['true_risk']
    k = df_final['delay']
    erde = df_final['erde']

    # Count of how many true positives there are
    true_pos = len(df_final[t_risk==1])

    # Count of how many positive cases the system decided there were
    pos_decisions = len(df_final[risk_d==1])

    # Count of how many of them are actually true positive cases
    pos_hits = len(df_final[(t_risk==1) & (risk_d==1)])

    # Total count of users
    total_users = len(df_final)

    # ERDE calculations
    for i in range(total_users):
        if(risk_d[i] == 1 and t_risk[i] == 0):
            erde.iloc[i] = float(true_pos)/total_users
        elif(risk_d[i] == 0 and t_risk[i] == 1):
            erde.iloc[i] = 1.0
        elif(risk_d[i] == 1 and t_risk[i] == 1):
            erde.iloc[i] = 1.0 - (1.0/(1.0+np.exp(k[i]-o)))
        elif(risk_d[i] == 0 and t_risk[i] == 0):
            erde.iloc[i] = 0.0

    # Calculation of F1, Precision, Recall and global ERDE
    precision = float(pos_hits)/pos_decisions
    recall = float(pos_hits)/true_pos
    F1 = 2 * (precision * recall) / (precision + recall)
    erde_global = erde.mean() * 100

    #indiv_erde = df_final.iloc[:,['subject','erde']]
    #print (indiv_erde.to_string())
    print ('Global ERDE (with o = %d): %.2f' % (o, erde_global), '%')
    print ('F1: %.2f' % F1)
    print ('Precision: %.2f' % precision)
    print ('Recall: %.2f' % recall)
    return df_final

In [0]:
# read data, clean up texts and tokenize

df_train, docs_train, Y_train = read_data('/content/drive/My Drive/training.csv')
df_test, docs_test, Y_test = read_data('/content/drive/My Drive/test.csv')

docs_train = docs_train.apply(cleanText)
docs_test = docs_test.apply(cleanText)

In [0]:
# get vectors

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(docs_train)

X_train = tokenizer.texts_to_sequences(docs_train)
X_test = tokenizer.texts_to_sequences(docs_test)

max_text_length = 100
X_train = sequence.pad_sequences(X_train, maxlen=max_text_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_text_length)

In [0]:
glove_file = "/content/drive/My Drive/glove.twitter.27B.200d.txt"
max_features = 10000
embedding_matrix = get_embed_mat(glove_file, max_features, tokenizer)

Found 1193514 word vectors.


In [0]:
# create Keras classifier with embedding layer

embedding_dim = 200

model = Sequential()
model.add(Embedding(10000, embedding_dim, input_length=max_text_length, 
                    weights=[embedding_matrix], trainable=False))
model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 200)          2000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 100)          120400    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 100)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)      

In [0]:
filepath="/content/drive/My Drive/Models/glove_new_split.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint, EarlyStopping(monitor='val_loss',min_delta=0.0001)]
model.fit(X_train, Y_train, validation_split=0.2, callbacks=callbacks_list, epochs=10, batch_size=128)

Instructions for updating:
Use tf.cast instead.
Train on 163076 samples, validate on 40770 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.92882, saving model to /content/drive/My Drive/Models/glove_new_split.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.92882 to 0.92978, saving model to /content/drive/My Drive/Models/glove_new_split.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.92978 to 0.92987, saving model to /content/drive/My Drive/Models/glove_new_split.hdf5


<keras.callbacks.History at 0x7f1292e996d8>

In [0]:
model = keras.models.load_model('/content/drive/My Drive/Models/glove_new_split.hdf5')

In [0]:
scores = model.evaluate(X_test, Y_test, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 91.39%


In [0]:
# sort texts by date and make predictions for both training and test set
df_train['Text'] = df_train['Text'].apply(cleanText)
df_test['Text'] = df_test['Text'].apply(cleanText)

dict_train = sort_by_date(df_train)
dict_test = sort_by_date(df_test)

In [0]:
pred_dict_train = predict_texts(dict_train, model, tokenizer)
pred_dict_test = predict_texts(dict_test, model, tokenizer)

Starting prediction...
Prediction done!
Starting prediction...
Prediction done!


In [0]:
x_train, y_train = create_train_vectors(pred_dict_train, dict_train)
x_test, y_test = create_train_vectors(pred_dict_test, dict_test)

In [0]:
clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(10,2))
clf.fit(x_train, y_train)

#pickle.dump(clf, open('/content/drive/My Drive/Models/clf_glove_new_split.sav', 'wb'))

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [0]:
clf.score(x_test, y_test)

0.9308847402597402

In [0]:
from sklearn.linear_model import LogisticRegression

clf2 = LogisticRegression(max_iter=2000, class_weight = 'balanced',solver='saga')
clf2.fit(x_train, y_train)
clf2.score(x_test, y_test)

0.8501826298701298

In [0]:
# predict test set and evaluate

verdict_dict = predict_test_vectors(pred_dict_test, dict_test, clf)
df_final = evaluate(verdict_dict, 5)

Global ERDE (with o = 5): 8.71 %
F1: 0.86
Precision: 1.00
Recall: 0.75


In [0]:
df_final

Unnamed: 0,subject,true_risk,risk_decision,delay,erde
0,subject8411,1,0,,1
1,subject626,0,0,,0
2,subject6670,0,0,,0
3,subject5220,0,0,,0
4,subject2359,0,0,,0
5,subject6333,0,0,,0
6,subject5469,0,0,,0
7,subject5241,0,0,,0
8,subject31,0,0,,0
9,subject1152,1,1,10,0.993307
