In [None]:
import ast
import pandas as pd
import numpy as np
import glob
import random
import os
import re
from datetime import datetime
import pyreadr
import itertools
from functools import reduce
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer #convert text comment into a numeric vector
from sklearn.feature_extraction.text import TfidfTransformer #use TF IDF transformer to change text vector created by count vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC# Support Vector Machine
from sklearn.pipeline import Pipeline #pipeline to implement steps in series
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')
import preprocessor as p
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_curve, auc
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict, KFold, LeaveOneOut
from IPython.display import clear_output
import gensim
from sklearn.model_selection import train_test_split
import textblob
nltk.download('averaged_perceptron_tagger')
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import mifs
from sklearn.utils.class_weight import compute_sample_weight
from collections import Counter
pd.set_option('display.max_colwidth', None)
import torch
import transformers
from  transformers import DistilBertModel, DistilBertTokenizer, AutoModel, AutoTokenizer

#######################################################################
#word2vec
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import logging

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [None]:
# Data/result roots. Each default is the original author's local folder; set the
# matching environment variable to run the notebook on another machine without
# editing this cell. os.path.join(root, "") guarantees the trailing separator that
# the rest of the notebook relies on when it builds file names by plain string
# concatenation (e.g. path + 'clinical_note_final_data/...').
path = os.path.join(os.environ.get(
    "ED_VISIT_DATA_DIR",
    "/Users/Maryam/Library/Mobile Documents/com~apple~CloudDocs/_____________projects/ED_visit_speech_clinicalNotes_OASIS/DATA/"), "")

path_data = os.path.join(os.environ.get(
    "COGNITIVE_STATUS_DATA_DIR",
    "/Users/Maryam/Library/Mobile Documents/com~apple~CloudDocs/_____________projects/___predicting_cogntive_status_symptoms/DATA/"), "")

path_results = os.path.join(os.environ.get(
    "COGNITIVE_STATUS_RESULTS_DIR",
    "/Users/Maryam/Library/Mobile Documents/com~apple~CloudDocs/_____________projects/___predicting_cogntive_status_symptoms/RESULTS/"), "")

In [None]:
def to_labels(pos_probs, threshold):
    """Binarise positive-class probabilities at ``threshold``.

    Parameters
    ----------
    pos_probs : array-like of float
        Probabilities of the positive class. NumPy arrays and pandas objects are
        used as-is; plain lists/tuples are converted to an array first so the
        element-wise comparison works for them too.
    threshold : float
        Cut-off; a probability greater than or equal to it maps to 1.

    Returns
    -------
    Same container type as the (converted) input, holding ints: 1 where
    ``pos_probs >= threshold`` and 0 elsewhere.
    """
    if isinstance(pos_probs, (list, tuple)):
        # A bare list compared with a float raises TypeError instead of broadcasting.
        pos_probs = np.asarray(pos_probs, dtype=float)
    return (pos_probs >= threshold).astype('int')

def classifier_results(X, y, t, clf):
    """Leave-one-out evaluation of ``clf`` with scaler ``t`` re-fitted inside every fold.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, indexed by row with the fold indices.
    y : numpy.ndarray
        Labels, indexed the same way as ``X``. Column 1 of ``predict_proba`` is
        treated as the positive class (assumes a binary 0/1 outcome — TODO confirm).
    t : transformer exposing ``fit_transform`` / ``transform``
        Fitted on the training rows of each fold, then applied to the held-out row.
    clf : estimator exposing ``fit`` / ``predict_proba``
        Re-fitted in place on every fold.

    Returns
    -------
    tuple
        ``(best F1, AUC, F2, precision, recall, best threshold, probabilities)``.
        The first six are percentages rounded to two decimals; F2, precision and
        recall are computed at the threshold that maximises F1. ``probabilities``
        has one row per held-out sample: the ``predict_proba`` columns followed by
        the true label.
    """
    fold_rows = []
    true_labels = []
    for fit_rows, held_out in LeaveOneOut().split(X):
        scaled_fit = t.fit_transform(X[fit_rows])
        scaled_held_out = t.transform(X[held_out])
        fitted = clf.fit(scaled_fit, y[fit_rows])
        held_out_label = y[held_out]
        fold_probs = fitted.predict_proba(scaled_held_out)
        fold_rows.append(fold_probs.ravel().tolist() + held_out_label.tolist())
        true_labels.append(held_out_label[0])
    all_y_pred_probs = np.array(fold_rows)

    # Only the positive-class column is used for thresholding and the ROC curve.
    probs = all_y_pred_probs[:, 1]

    # Scan cut-offs 0.000 ... 0.999 and keep the first one with the highest F1.
    thresholds = np.arange(0, 1, 0.001)
    f1_per_cutoff = [f1_score(true_labels, to_labels(probs, cutoff)) for cutoff in thresholds]
    best = np.argmax(f1_per_cutoff)
    best_cutoff = thresholds[best]
    best_labels = to_labels(probs, best_cutoff)

    f1_score_optimum = round(f1_per_cutoff[best]*100, 2)
    f2_score = round(fbeta_score(true_labels, best_labels, beta=2)*100, 2)
    precision = round(precision_score(true_labels, best_labels)*100, 2)
    recall = round(recall_score(true_labels, best_labels)*100, 2)

    # Threshold-free ranking quality.
    fpr, tpr, _ = roc_curve(true_labels, probs)
    auc_score = round(auc(fpr, tpr)*100, 2)

    return (f1_score_optimum, auc_score, f2_score, precision, recall,
            round(best_cutoff*100, 2), all_y_pred_probs)


# One seed shared by every classifier below.
seed=55
# probability=True is needed because classifier_results() calls predict_proba on the fitted model.
clf_svm_rbf = SVC(kernel='rbf', probability=True, random_state=seed)#, class_weight='balanced')
clf_svm_lin = SVC(kernel='linear', probability=True, random_state=seed)
clf_xgb = XGBClassifier(random_state=seed)
clf_lr = LogisticRegression(random_state=seed)
# clfs_list and clfs_name are zipped together by the evaluation functions,
# so the two lists must stay in the same order.
clfs_list = [clf_svm_rbf, clf_svm_lin, clf_xgb, clf_lr]
clfs_name = ['SVM-rbf', 'SVM-linear', 'XGB', 'LR']

# Module-level results table: classifying_all() and classifying_using_bert() write
# one row per classifier into this frame and return a copy of it.
# Metric columns come in two groups, one per scaler used inside the cross-validation.
df_result = pd.DataFrame(columns=['Approach', 'Pipline', 'pretrain_weights', 'embedding_structure',
                                  'Using_jmim', 'Classifier',
                                  'Best_threshold_MinMax', 'F1-score_MinMax', 'AUC_MinMax','F2-score_MinMax',
                                  'Precision_MinMax', 'Recall_MinMax',
                                  'Best_threshold_StandardScaler', 'F1-score_StandardScaler', 'AUC_StandardScaler',
                                  'F2-score_StandardScaler', 'Precision_StandardScaler', 'Recall_StandardScaler'
                                  ])

# Classification

In [None]:
def classifying_all(df, clfs_list, clfs_name, get_probability=False, get_minmax=False, approach=None,
                    pipline=None, pretrain_weights=None, embedding_structure=None, using_jmim=False,
                    stan_scal=False, unit_vector=False, nurse=False, most_important_features=False,
                    n_features_jmim='auto'):
    """Build the feature matrix for one pipeline, optionally JMIM-select features, and score classifiers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``study_id`` and ``outcome`` plus the pipeline-specific columns
        dropped/read below. A ``date_time`` column, if present, is discarded.
    clfs_list, clfs_name : sequences
        Estimators and their display names, paired by position.
    get_probability : bool
        If True, return the leave-one-out probabilities of the LAST classifier in
        ``clfs_list`` instead of the metrics table.
    get_minmax : bool
        With ``get_probability``: pick the MinMaxScaler run (True) or the
        StandardScaler run (False).
    approach, pretrain_weights, embedding_structure
        Labels copied verbatim into the results table.
    pipline : str
        Selects how features are derived: 'TF-IDF' / 'UMLS' (TF-IDF of the ``text`` /
        ``cui_result_to_tfidf`` column), 'LIWC', 'OASIS', 'Turn_taking'; any other
        value uses every column except ``study_id`` and ``outcome``.
    using_jmim : bool
        Run JMIM feature selection (``mifs``) before classification.
    stan_scal : bool
        Scale with StandardScaler instead of MinMaxScaler before JMIM.
    unit_vector : bool
        L2-normalise each scaled row before JMIM.
    nurse : bool
        Column suffix for TF-IDF/LIWC features: '_ns' if True, else '_pt'.
    most_important_features : bool
        Only honoured when ``using_jmim`` is True: return the selected features and
        their JMIM values instead of running the classifiers.
    n_features_jmim
        Forwarded to ``MutualInformationFeatureSelector(n_features=...)``.

    Returns
    -------
    pandas.DataFrame
        When ``using_jmim and most_important_features``: columns 'Selected feature'
        and 'JMIM value', sorted by value descending.
    pandas.DataFrame
        When ``get_probability``: 'Proability class 0', 'Proability class 1',
        'ground truth label' (column spelling kept for existing output files).
    (pandas.DataFrame, pandas.DataFrame)
        Otherwise: a copy of the results table and the feature frame used for
        classification with ``study_id`` and ``outcome`` appended.

    Raises
    ------
    ValueError
        If ``get_probability`` is True but ``clfs_list`` is empty.

    Notes
    -----
    Side effect: the module-level ``df_result`` table is emptied and refilled on
    every call.
    NOTE(review): JMIM is fitted on all rows, including those later held out by
    the leave-one-out loop in ``classifier_results``; this may make the reported
    metrics optimistic — confirm this is intended.
    """
    if 'date_time' in df.columns:
        df = df.drop(['date_time'], axis=1)

    add_to_columns = '_ns' if nurse else '_pt'

    # ---- feature frame per pipeline -------------------------------------------
    if pipline in ('TF-IDF', 'UMLS'):
        variable = 'cui_result_to_tfidf' if pipline == 'UMLS' else 'text'

        # stop_words must be the string 'english' to get the built-in stop list;
        # a set such as {'english'} is either treated as a one-word custom list or
        # rejected outright, depending on the scikit-learn version.
        tfidf = TfidfVectorizer(stop_words='english')
        X_tf = tfidf.fit_transform(df[variable].to_numpy()).toarray()
        # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
        # on releases that predate get_feature_names_out().
        if hasattr(tfidf, 'get_feature_names_out'):
            vocabulary = tfidf.get_feature_names_out()
        else:
            vocabulary = tfidf.get_feature_names()
        df_new = pd.DataFrame(data=X_tf, columns=np.array(vocabulary, dtype=object) + add_to_columns)
    elif pipline == 'LIWC':
        df_new = df.drop(['study_id', 'text', 'outcome', 'hosp_ed_ind'], axis=1).copy()
        df_new.columns = df_new.columns + add_to_columns
    elif pipline == 'OASIS':
        df_new = df.drop(['study_id', 'oasis_id', 'outcome'], axis=1).copy()
    elif pipline == 'Turn_taking':
        df_new = df.drop(['study_id', 'outcome', 'hosp_ed_ind'], axis=1).copy()
    else:
        df_new = df.drop(['study_id', 'outcome'], axis=1).copy()

    X = df_new.to_numpy()
    y = df['outcome'].to_numpy()
    # Identifiers are attached positionally (to_numpy) so that a non-default index
    # on ``df`` cannot misalign them against the freshly built feature frames.
    study_ids = df['study_id'].to_numpy()

    # ---- optional JMIM feature selection --------------------------------------
    if using_jmim:
        scaler = StandardScaler() if stan_scal else MinMaxScaler()
        X_t = scaler.fit_transform(X)

        if unit_vector:
            row_norm = np.sqrt(np.sum(np.square(X_t), axis=1)).reshape(-1, 1)
            # An all-zero row would otherwise turn into NaNs.
            row_norm[row_norm == 0] = 1.0
            X_t = X_t / row_norm

        MIFS = mifs.MutualInformationFeatureSelector(method='JMIM', k=5, n_features=n_features_jmim, verbose=2)
        MIFS.fit_transform(X_t, df['outcome'])

        if most_important_features:
            clear_output()
            df_mif = pd.DataFrame({'Selected feature': df_new.columns[MIFS.ranking_], 'JMIM value': MIFS.mi_})
            return df_mif.sort_values('JMIM value', ascending=False)

        # .copy() so the identifier columns are added to an independent frame,
        # not to a slice of df_new.
        df_new_jm = df_new.iloc[:, MIFS.ranking_].copy()
        X = df_new_jm.to_numpy()
        df_new_jm['study_id'] = study_ids
        df_new_jm['outcome'] = y

    # ---- leave-one-out evaluation of every classifier --------------------------
    # Empty the shared table first: rows left over from an earlier call with more
    # classifiers would otherwise be relabelled and returned as if they belonged here.
    df_result.drop(df_result.index, inplace=True)
    all_y_pred_probs_mm = None
    all_y_pred_probs_ss = None
    for row, (clf, clf_name) in enumerate(zip(clfs_list, clfs_name)):
        f1_mm, auc_mm, f2_mm, prec_mm, recall_mm, best_thresh_mm, all_y_pred_probs_mm = classifier_results(X, y, MinMaxScaler(), clf)
        f1_ss, auc_ss, f2_ss, prec_ss, recall_ss, best_thresh_ss, all_y_pred_probs_ss = classifier_results(X, y, StandardScaler(), clf)
        df_result.loc[row, :] = {'Classifier':clf_name,
                                 'Best_threshold_MinMax':best_thresh_mm,
                                 'F1-score_MinMax':f1_mm, 'AUC_MinMax':auc_mm, 'F2-score_MinMax':f2_mm,
                                 'Precision_MinMax':prec_mm, 'Recall_MinMax':recall_mm,
                                 'Best_threshold_StandardScaler':best_thresh_ss,
                                 'F1-score_StandardScaler':f1_ss, 'AUC_StandardScaler':auc_ss, 'F2-score_StandardScaler':f2_ss,
                                 'Precision_StandardScaler':prec_ss, 'Recall_StandardScaler':recall_ss}

    # ---- probability export (last classifier only) -----------------------------
    if get_probability:
        chosen_probs = all_y_pred_probs_mm if get_minmax else all_y_pred_probs_ss
        if chosen_probs is None:
            raise ValueError('get_probability=True requires at least one classifier in clfs_list')
        df_probs = pd.DataFrame(data=chosen_probs,
                                columns=['Proability class 0', 'Proability class 1', 'ground truth label'])
        clear_output()
        return df_probs

    # ---- run metadata, broadcast to every row ----------------------------------
    df_result['Approach'] = approach
    df_result['Pipline'] = pipline
    df_result['pretrain_weights'] = pretrain_weights
    df_result['embedding_structure'] = embedding_structure

    if using_jmim:
        df_result['Using_jmim'] = 'Yes'
        return df_result.copy(), df_new_jm.copy()

    df_result['Using_jmim'] = 'No'
    df_new['study_id'] = study_ids
    df_new['outcome'] = y
    return df_result.copy(), df_new.copy()

In [None]:
#feature : fr
#oasis : os
#jmim : jm

## Part 1: OASIS data

In [None]:
# OASIS table; later cells take the study_id -> outcome mapping from it when they
# merge labels onto the other data sources.
df_os = pd.read_excel(path_data+'OASIS_ci_symptoms_6.5.23.xlsx')
# Class balance of the binary outcome.
df_os['outcome'].value_counts()

0    28
1    19
Name: outcome, dtype: int64

### Without jmim

In [None]:
# OASIS features, every classifier, no JMIM feature selection.
df_result_os, df_fr_os = classifying_all(
    df=df_os,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='OASIS',
    using_jmim=False,
)

#### Probability

In [None]:
# df_probs_1 = classifying_all(df_os, [clfs_list[2]], [clfs_name[2]],
#                                         get_probability=True, get_minmax=False, pipline='OASIS', using_jmim=False)
# df_probs_1.to_excel(path_results+'Best_probability_for_each_approach/OASIS_probs.xlsx', index=False)

### With jmim

In [None]:
# OASIS features with JMIM selection run on standard-scaled, row-normalised data.
# df_fr_os_jm (selected features + study_id/outcome) is reused by the Part 2 cells.
df_result_os_jm, df_fr_os_jm = classifying_all(
    df=df_os,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='OASIS',
    using_jmim=True,
    stan_scal=True,
    unit_vector=True,
)

In [None]:
# Same JMIM settings as the run above, but return the ranked feature table
# instead of classifier metrics, then export it.
df_jm_oasis = classifying_all(
    df=df_os,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='OASIS',
    using_jmim=True,
    stan_scal=True,
    unit_vector=True,
    most_important_features=True,
)
df_jm_oasis.to_excel(
    path_results+'patient/One_leave_out/Respond_to_reviewers/Most_important_features/OASIS_JMIM_features.xlsx',
    index=False,
)

### Aggregate

In [None]:
# Stack the with/without-JMIM OASIS results and drop the label columns that are
# not used for this pipeline before exporting.
df_result_oasis_concat = (
    pd.concat([df_result_os, df_result_os_jm])
    .reset_index(drop=True)
    .drop(columns=['pretrain_weights', 'embedding_structure', 'Approach'])
)
df_result_oasis_concat.to_excel(
    path_results+'patient/One_leave_out/Respond_to_reviewers/1_OASIS.xlsx',
    index=False,
)

## Transformer

In [None]:
def preprocessing_text(df_text):
    """Add a normalised ``clean_text`` column derived from ``df_text['text']``.

    Steps, in order: strip URLs (``http...`` up to the next whitespace), delete the
    punctuation characters listed below, lowercase, replace every digit with a
    space, collapse runs of whitespace.

    Parameters
    ----------
    df_text : pandas.DataFrame
        Must have a string column ``text``.

    Returns
    -------
    pandas.DataFrame
        The SAME object that was passed in, with ``clean_text`` added/overwritten
        (pass a copy if the original frame must stay untouched).
    """
    # Apostrophe, comma and full stop are deliberately absent from this set,
    # so they survive the cleaning.
    punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'
    delete_punctuation = str.maketrans('', '', punctuation)

    # remove URLs
    cleaned = df_text['text'].apply(lambda x: re.sub(r'http\S+', '', x))

    # remove punctuation marks
    cleaned = cleaned.apply(lambda x: x.translate(delete_punctuation))

    # convert text to lowercase
    cleaned = cleaned.str.lower()

    # remove numbers; regex=True is required because pandas >= 2.0 treats the
    # pattern as a literal string by default, which would leave digits in place
    cleaned = cleaned.str.replace("[0-9]", " ", regex=True)

    # remove whitespaces
    cleaned = cleaned.apply(lambda x: ' '.join(x.split()))

    df_text['clean_text'] = cleaned
    return df_text

In [None]:
def classifying_using_bert(df, clfs_list, clfs_name, approach, pretrain_weights, embedding_structure,
                           using_jmim, unit_vector=False, clinical=False, nurse=False, n_features_jmim='auto'):
    """Embed each document with a frozen pretrained transformer and score classifiers on the embeddings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``text``, ``study_id`` and ``outcome``. It is copied before
        cleaning, so the caller's frame is not modified.
    clfs_list, clfs_name : sequences
        Estimators and their display names, paired by position.
    approach : str
        Label copied into the results table.
    pretrain_weights : str
        Model identifier passed to ``AutoModel`` / ``AutoTokenizer.from_pretrained``.
    embedding_structure : {'mean', 'first'}
        'mean' averages the last hidden state over all 512 positions; 'first' takes
        position 0.
    using_jmim : bool
        Run JMIM feature selection on MinMax-scaled embeddings before classifying.
    unit_vector : bool
        L2-normalise each scaled row before JMIM.
    clinical, nurse : bool
        Only affect the embedding column names ('_clinical'/'_transcription' and
        '_ns'/'_pt' suffixes).
    n_features_jmim
        Forwarded to ``MutualInformationFeatureSelector(n_features=...)``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        A copy of the results table and the embedding frame (JMIM-selected columns
        when ``using_jmim``) with ``study_id`` and ``outcome`` appended.

    Raises
    ------
    ValueError
        If ``embedding_structure`` is neither 'mean' nor 'first'.

    Notes
    -----
    Side effect: the module-level ``df_result`` table is emptied and refilled on
    every call; its 'Pipline' column is not set by this function.
    All documents are pushed through the model in a single batch.
    NOTE(review): JMIM is fitted on all rows, including those later held out by
    the leave-one-out loop in ``classifier_results`` — confirm this is intended.
    """
    # Fail before downloading/running the model rather than on an unbound X afterwards.
    if embedding_structure not in ('mean', 'first'):
        raise ValueError("embedding_structure must be 'mean' or 'first', got {!r}".format(embedding_structure))

    ##############################################
    SEED = 55
    transformers.set_seed(SEED)
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    #############################################

    # Work on a copy; adds the 'clean_text' column used for tokenisation.
    df = preprocessing_text(df.copy()).copy()

    # Pretrained encoder + matching tokenizer.
    model = AutoModel.from_pretrained(pretrain_weights)
    tokenizer = AutoTokenizer.from_pretrained(pretrain_weights)
    max_len = 512
    tokenized = df["clean_text"].apply(
        lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=max_len, truncation=True))

    # Right-pad every sequence to max_len with id 0 and mask out the padding.
    # NOTE(review): assumes the tokenizer's padding id is 0 and that 0 never occurs
    # as a real token — verify before using other checkpoints.
    padded = np.array([ids + [0]*(max_len-len(ids)) for ids in tokenized.values])
    attention_mask = np.where(padded != 0, 1, 0)

    input_ids1 = torch.tensor(padded)
    attention_mask1 = torch.tensor(attention_mask)

    # Inference only: no gradients are needed, and skipping the autograd graph keeps
    # the single large batch from holding on to intermediate activations.
    model.eval()
    with torch.no_grad():
        last_hidden_states1 = model(input_ids1, attention_mask=attention_mask1)
    lhs1 = last_hidden_states1[0].numpy()

    if embedding_structure == 'mean':
        # NOTE(review): this mean runs over all max_len positions, padding included;
        # a mask-weighted mean would change the embeddings and every downstream
        # metric, so it is left as-is — confirm which is intended.
        X = np.mean(lhs1, axis=1)
    else:  # 'first'
        X = lhs1[:, 0, :]

    add_person_col = '_ns' if nurse else '_pt'
    add_data_col = '_clinical' if clinical else '_transcription'

    y = df['outcome'].to_numpy()
    df_embedding = pd.DataFrame(data=X, columns=["embedding_dim_" + str(i) + add_data_col + add_person_col  for i in range(X.shape[1])])
    # Positional assignment: df_embedding has a fresh 0..n-1 index, which need not
    # match the index of ``df``.
    df_embedding['study_id'] = df["study_id"].to_numpy()
    df_embedding['outcome'] = y

    if using_jmim:
        scaler = MinMaxScaler()
        X_t = scaler.fit_transform(X)

        if unit_vector:
            row_norm = np.sqrt(np.sum(np.square(X_t), axis=1)).reshape(-1, 1)
            # An all-zero row would otherwise turn into NaNs.
            row_norm[row_norm == 0] = 1.0
            X_t = X_t / row_norm

        MIFS = mifs.MutualInformationFeatureSelector(method='JMIM', k=5, n_features=n_features_jmim, verbose=2)
        MIFS.fit_transform(X_t, df['outcome'])

        # .copy() so the identifier columns are added to an independent frame,
        # not to a slice.
        df_embed_jm = df_embedding.drop(['study_id', 'outcome'], axis=1).iloc[:, MIFS.ranking_].copy()

        X = df_embed_jm.to_numpy()
        df_embed_jm['study_id'] = df_embedding['study_id']
        df_embed_jm['outcome'] = df_embedding['outcome']

    # Empty the shared table first: rows left over from an earlier call with more
    # classifiers would otherwise be relabelled and returned as if they belonged here.
    df_result.drop(df_result.index, inplace=True)
    for row, (clf, clf_name) in enumerate(zip(clfs_list, clfs_name)):
        f1_mm, auc_mm, f2_mm, prec_mm, recall_mm, best_thresh_mm, all_y_pred_probs_mm = classifier_results(X, y, MinMaxScaler(), clf)
        f1_ss, auc_ss, f2_ss, prec_ss, recall_ss, best_thresh_ss, all_y_pred_probs_ss = classifier_results(X, y, StandardScaler(), clf)
        df_result.loc[row, :] = {'Classifier':clf_name,
                                 'Best_threshold_MinMax':best_thresh_mm,
                                 'F1-score_MinMax':f1_mm, 'AUC_MinMax':auc_mm, 'F2-score_MinMax':f2_mm,
                                 'Precision_MinMax':prec_mm, 'Recall_MinMax':recall_mm,
                                 'Best_threshold_StandardScaler':best_thresh_ss,
                                 'F1-score_StandardScaler':f1_ss, 'AUC_StandardScaler':auc_ss, 'F2-score_StandardScaler':f2_ss,
                                 'Precision_StandardScaler':prec_ss, 'Recall_StandardScaler':recall_ss}

    # Run metadata, broadcast to every row.
    df_result['Approach'] = approach
    df_result['pretrain_weights'] = pretrain_weights
    df_result['embedding_structure'] = embedding_structure

    if using_jmim:
        df_result['Using_jmim'] = 'Yes'
        return df_result.copy(), df_embed_jm.copy()

    df_result['Using_jmim'] = 'No'
    return df_result.copy(), df_embedding.copy()


## Load clinical note:

In [None]:
# Clinical notes ("last" and "combined" variants): drop study_id 1064, then attach
# the outcome label from the OASIS table (inner join on study_id).
df_last_clinic = pd.read_excel(path+'clinical_note_final_data/last_datetime_clinical_note_of_each_patient.xlsx')
df_last_clinical = (
    df_last_clinic
    .set_index('study_id')
    .drop(1064)
    .reset_index()
    .merge(df_os[['study_id', 'outcome']], on='study_id')
)
df_comb_clinic = pd.read_excel(path+'clinical_note_final_data/combining_clinical_note_of_datetimes_for_each_patient.xlsx')
df_comb_clinical = (
    df_comb_clinic
    .set_index('study_id')
    .drop(1064)
    .reset_index()
    .merge(df_os[['study_id', 'outcome']], on='study_id')
)

## Load patient data:
## transcription + UMLS + LIWC + Turn taking + Linguistic

In [None]:
def _load_patient_table(file_name, rename_columns=None, excluded_study_id=1022):
    """Read one workbook from the patient transcription folder and attach the OASIS outcome.

    Optionally renames columns first, then drops the row whose study_id equals
    ``excluded_study_id`` and inner-joins ``df_os[['study_id', 'outcome']]`` on study_id.
    """
    table = pd.read_excel(path+'transcription_final_data/patient/' + file_name)
    if rename_columns is not None:
        table = table.rename(columns=rename_columns)
    table = table.set_index('study_id').drop(excluded_study_id).reset_index()
    return pd.merge(table, df_os[['study_id', 'outcome']], on='study_id')


# Transcriptions
df_last_transcription = _load_patient_table('last_datetime_text_of_each_patient.xlsx')
df_comb_transcription = _load_patient_table('combining_text_of_datetimes_for_each_patient.xlsx')

# UMLS concepts
df_last_umls = _load_patient_table('last_datetime_umls_of_each_patient.xlsx')
df_comb_umls = _load_patient_table('combining_umls_of_datetimes_for_each_patient.xlsx')

# LIWC exports carry generic 'Source (X)' headers that are mapped back to real names.
# NOTE(review): 'Source (B)'/'Source (C)' map to text/hosp_ed_ind in the "last" file
# but to hosp_ed_ind/text in the "combined" file — confirm the two exports really
# have those columns in opposite order.
df_last_liwc = _load_patient_table(
    'LIWC2015 Results (last_datetime_text_of_each_patient).xlsx',
    rename_columns={'Source (A)':'study_id', 'Source (B)':'text', 'Source (C)':'hosp_ed_ind'})
df_comb_liwc = _load_patient_table(
    'LIWC2015 Results (combining_text_of_datetimes_for_each_patient).xlsx',
    rename_columns={'Source (A)':'study_id', 'Source (B)':'hosp_ed_ind', 'Source (C)':'text'})

# Turn-taking features
df_last_turn = _load_patient_table('last_datetime_related_turn_taking_features_of_each_patient.xlsx')
df_mean_turn = _load_patient_table('mean_datetime_related_turn_taking_features_of_each_patient.xlsx')

# Linguistic features are read as-is: no row is dropped and no outcome is merged here.
# NOTE(review): presumably these files already contain study_id and outcome, which
# classifying_all() requires — verify.
df_last_ling = pd.read_excel(path_data+'Linguistic_features_last_date_time_cognitive_status_symptoms.xlsx')
df_comb_ling = pd.read_excel(path_data+'Linguistic_features_combine_text_cognitive_status_symptoms.xlsx')

### UMLS

#### last date time

In [None]:
# UMLS concepts of the last-datetime text: TF-IDF features, JMIM-selected on
# standard-scaled, row-normalised data.
df_result_last_umls_jm, df_fr_last_umls_jm = classifying_all(
    df=df_last_umls,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='UMLS',
    using_jmim=True,
    stan_scal=True,
    unit_vector=True,
)

#### combine text

In [None]:
# UMLS concepts of the combined text: same settings as the last-datetime run.
df_result_comb_umls_jm, df_fr_comb_umls_jm = classifying_all(
    df=df_comb_umls,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='UMLS',
    using_jmim=True,
    stan_scal=True,
    unit_vector=True,
)

### LIWC

#### last date time

In [None]:
# LIWC features of the last-datetime text, JMIM-selected with the default
# (MinMax) scaling.
df_result_last_liwc_jm, df_fr_last_liwc_jm = classifying_all(
    df=df_last_liwc,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='LIWC',
    using_jmim=True,
)

#### combine text

In [None]:
# LIWC features of the combined text, JMIM-selected with the default (MinMax) scaling.
df_result_comb_liwc_jm, df_fr_comb_liwc_jm = classifying_all(
    df=df_comb_liwc,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='LIWC',
    using_jmim=True,
)

In [None]:
# Ranked JMIM feature table for the combined-text LIWC features, exported to Excel.
df_comb_liwc_jmim = classifying_all(
    df=df_comb_liwc,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='LIWC',
    using_jmim=True,
    most_important_features=True,
)
df_comb_liwc_jmim.to_excel(
    path_results+'patient/One_leave_out/Respond_to_reviewers/Most_important_features/LIWC_JMIM_features_for_combined_text_pt.xlsx',
    index=False,
)

### Turn taking

#### last date time

In [None]:
# Turn-taking features of the last datetime, JMIM-selected with the default (MinMax) scaling.
df_result_last_turn_jm, df_fr_last_turn_jm = classifying_all(
    df=df_last_turn,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='Turn_taking',
    using_jmim=True,
)

#### mean date time

In [None]:
# Turn-taking features averaged over datetimes, JMIM-selected with the default (MinMax) scaling.
df_result_mean_turn_jm, df_fr_mean_turn_jm = classifying_all(
    df=df_mean_turn,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='Turn_taking',
    using_jmim=True,
)

In [None]:
# Ranked JMIM feature table for the mean turn-taking features, exported to Excel.
df_mean_turn_jmim = classifying_all(
    df=df_mean_turn,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='Turn_taking',
    using_jmim=True,
    most_important_features=True,
)
df_mean_turn_jmim.to_excel(
    path_results+'patient/One_leave_out/Respond_to_reviewers/Most_important_features/Turn_taking_JMIM_features_for_mean_pt.xlsx',
    index=False,
)

### Linguistic features

#### last date time

In [None]:
# Linguistic features of the last-datetime text. 'Linguistic' has no dedicated
# branch in classifying_all(), so every column except study_id/outcome is used.
df_result_last_ling_jm, df_fr_last_ling_jm = classifying_all(
    df=df_last_ling,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='Linguistic',
    using_jmim=True,
)

#### combine text

In [None]:
# Linguistic features of the combined text; same settings as the last-datetime run.
df_result_comb_ling_jm, df_fr_comb_ling_jm = classifying_all(
    df=df_comb_ling,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='Linguistic',
    using_jmim=True,
)

In [None]:
# Ranked JMIM feature table for the combined-text linguistic features, exported to Excel.
df_comb_ling_jmim = classifying_all(
    df=df_comb_ling,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    pipline='Linguistic',
    using_jmim=True,
    most_important_features=True,
)
df_comb_ling_jmim.to_excel(
    path_results+'patient/One_leave_out/Respond_to_reviewers/Most_important_features/Linguistic_JMIM_features_for_combined_text_pt.xlsx',
    index=False,
)

### Aggregate features


In [None]:
# Last-datetime variant: inner-join the four JMIM-selected feature frames
# (linguistic, turn-taking, LIWC, UMLS) on study_id/outcome, left to right.
all_df_fr_last_jm = [df_fr_last_ling_jm, df_fr_last_turn_jm, df_fr_last_liwc_jm, df_fr_last_umls_jm]
df_fr_last_jm = reduce(
    lambda merged, incoming: merged.merge(incoming, on=['study_id', 'outcome'], how='inner'),
    all_df_fr_last_jm,
)

In [None]:
# Last-datetime variant without UMLS: inner-join linguistic, turn-taking and LIWC
# feature frames on study_id/outcome.
all_without_umls_df_fr_last_jm = [df_fr_last_ling_jm, df_fr_last_turn_jm, df_fr_last_liwc_jm]
df_without_umls_fr_last_jm = reduce(
    lambda merged, incoming: merged.merge(incoming, on=['study_id', 'outcome'], how='inner'),
    all_without_umls_df_fr_last_jm,
)

In [None]:
# Combined-text variant: inner-join the four JMIM-selected feature frames
# (linguistic, mean turn-taking, LIWC, UMLS) on study_id/outcome, left to right.
all_df_fr_comb_jm = [df_fr_comb_ling_jm, df_fr_mean_turn_jm, df_fr_comb_liwc_jm, df_fr_comb_umls_jm]
df_fr_comb_jm = reduce(
    lambda merged, incoming: merged.merge(incoming, on=['study_id', 'outcome'], how='inner'),
    all_df_fr_comb_jm,
)

In [None]:
# Combined-text variant without UMLS: inner-join linguistic, mean turn-taking and
# LIWC feature frames on study_id/outcome.
all_without_umls_df_fr_comb_jm = [df_fr_comb_ling_jm, df_fr_mean_turn_jm, df_fr_comb_liwc_jm]
df_without_umls_fr_comb_jm = reduce(
    lambda merged, incoming: merged.merge(incoming, on=['study_id', 'outcome'], how='inner'),
    all_without_umls_df_fr_comb_jm,
)

### last date time

#### without jmim

In [None]:
# Four merged feature sets (last datetime), classified as-is with no second JMIM pass.
df_result_4_last, df_fr_last_4 = classifying_all(
    df=df_fr_last_jm,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    approach='last datetime text',
    pipline='Linguistic_jmim + Turn_taking_jmim + LIWC_jmim + UMLS_jmim',
    using_jmim=False,
)

#### with jmim

In [None]:
# Four merged feature sets (last datetime), with a second JMIM pass over the merged columns.
df_result_4_last_jm, df_fr_last_4_jm = classifying_all(
    df=df_fr_last_jm,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    approach='last datetime text',
    pipline='Linguistic_jmim + Turn_taking_jmim + LIWC_jmim + UMLS_jmim',
    using_jmim=True,
)

### Combine text

#### without jmim

In [None]:
# Four merged feature sets (combined text), classified as-is with no second JMIM pass.
df_result_4_comb, df_fr_comb_4 = classifying_all(
    df=df_fr_comb_jm,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    approach='combine text',
    pipline='Linguistic_jmim + Turn_taking_jmim + LIWC_jmim + UMLS_jmim',
    using_jmim=False,
)

#### with jmim

In [None]:
# Four merged feature sets (combined text), with a second JMIM pass over the merged columns.
df_result_4_comb_jm, df_fr_comb_4_jm = classifying_all(
    df=df_fr_comb_jm,
    clfs_list=clfs_list,
    clfs_name=clfs_name,
    approach='combine text',
    pipline='Linguistic_jmim + Turn_taking_jmim + LIWC_jmim + UMLS_jmim',
    using_jmim=True,
)

### Aggregate results

In [None]:
# Stack the four result tables of this section, drop the transformer-only label
# columns, and export.
df_result_4_set = (
    pd.concat([df_result_4_last, df_result_4_last_jm, df_result_4_comb, df_result_4_comb_jm])
    .drop(columns=['pretrain_weights', 'embedding_structure'])
    .reset_index(drop=True)
)
df_result_4_set.to_excel(
    path_results+'patient/One_leave_out/Respond_to_reviewers/Linguistic_Turn_taking_jmim_LIWC_jmim_UMLS_jmim.xlsx',
    index=False,
)

## Part 2: OASIS + Clinical note

In [None]:
# Experiment grid. Each episode is a tuple
# (approach, using_jmim, pretrain_weights, embedding_structure); the axis order in
# all_list fixes both the tuple layout and the position of each episode in the list.
approach_list = ['last date time', 'combined text']
using_jmim_list = [False, True]
pretrain_weights_list = ['bert-base-uncased', 'distilbert-base-uncased', 'emilyalsentzer/Bio_ClinicalBERT']
embedding_structure_list = ['first', 'mean']
all_list = [approach_list, using_jmim_list, pretrain_weights_list, embedding_structure_list]
episodes = [combo for combo in itertools.product(*all_list)]

In [None]:
# Resume support: if a results workbook from an earlier (interrupted) run exists,
# rebuild the episode tuple from its last row and continue with the episode after it.
# Only a missing file is handled; if the last row does not match any entry of
# `episodes`, episodes.index() raises ValueError.
try:
    df_result_last_all = pd.read_excel(path_results+'patient/One_leave_out/Respond_to_reviewers/2_OASIS_Clinical_note.xlsx')
    last_index = df_result_last_all.shape[0]-1
    last_run_tupple = ()
    first = df_result_last_all.loc[last_index, 'Approach']
    second = df_result_last_all.loc[last_index, 'Using_jmim']
    # The table stores 'Yes'/'No'; episodes store booleans.
    second = True if second=='Yes' else False
    third = df_result_last_all.loc[last_index, 'pretrain_weights']
    fourth = df_result_last_all.loc[last_index, 'embedding_structure']
    last_run_tupple = (first, second, third, fourth)
    run_again_idx = episodes.index(last_run_tupple)+1
except FileNotFoundError:
    # No previous results: start from the first episode with an empty table.
    run_again_idx = 0
    df_result_last_all = pd.DataFrame()

for episode in episodes[run_again_idx:]:

    print('\nApproach: {}\t\t  Using_jmim: {}\nPretrain_weights: {}\t  embedding_structure: {}'
                                                      .format(episode[0], episode[1], episode[2], episode[3]))

    # Pick the clinical-note variant that matches this episode's approach.
    if episode[0]=='last date time':
        df_clinical = df_last_clinical.copy()
    elif episode[0]=='combined text':
        df_clinical = df_comb_clinical.copy()

    # Stage 1: transformer embeddings of the notes, always JMIM-selected
    # (using_jmim=True regardless of episode[1]); only the feature frame is used below.
    df_result_bert_jm, df_fr_bert_jm = classifying_using_bert(df_clinical, clfs_list, clfs_name,
                                                         approach=episode[0], pretrain_weights=episode[2],
                                                         embedding_structure=episode[3], using_jmim=True)

    # Join with the JMIM-selected OASIS features (inner join, pd.merge default).
    df_fr_jm = pd.merge(df_fr_os_jm, df_fr_bert_jm, on=['study_id', 'outcome'])

    # Stage 2: classify the merged features; episode[1] decides whether a second
    # JMIM pass is applied to the merged set.
    df_result_final, df_fr_final = classifying_all(df_fr_jm, clfs_list, clfs_name,
                               approach=episode[0], pipline='oasis_jmim + transformer_jmim',
                               pretrain_weights=episode[2], embedding_structure=episode[3], using_jmim=episode[1])

    # Append and write after every episode so an interrupted run can be resumed.
    df_result_last_all = pd.concat([df_result_last_all, df_result_final]).reset_index(drop=True)
    df_result_last_all.to_excel(path_results+'patient/One_leave_out/Respond_to_reviewers/2_OASIS_Clinical_note.xlsx', index=False)


### Best model probabilities of part 2

In [None]:
# Show the grid entry re-run in the next cell for the Part 2 probability export.
episodes[19]

('combined text', True, 'bert-base-uncased', 'mean')

In [None]:
# Re-run one fixed configuration of Part 2 and keep per-sample probabilities
# instead of summary metrics.
episode = episodes[19]

print('\nApproach: {}\t\t  Using_jmim: {}\nPretrain_weights: {}\t  embedding_structure: {}'
                                                  .format(episode[0], episode[1], episode[2], episode[3]))

# Pick the clinical-note variant that matches this episode's approach.
if episode[0]=='last date time':
    df_clinical = df_last_clinical.copy()
elif episode[0]=='combined text':
    df_clinical = df_comb_clinical.copy()

# Transformer embeddings of the notes, JMIM-selected; only the feature frame is used below.
df_result_bert_jm, df_fr_bert_jm = classifying_using_bert(df_clinical, clfs_list, clfs_name,
                                                     approach=episode[0], pretrain_weights=episode[2],
                                                     embedding_structure=episode[3], using_jmim=True)

# Join with the JMIM-selected OASIS features (inner join, pd.merge default).
df_fr_jm = pd.merge(df_fr_os_jm, df_fr_bert_jm, on=['study_id', 'outcome'])

##############
# Only the classifier at index 3 of clfs_list ('LR' in clfs_name) is evaluated;
# get_minmax=False selects the probabilities from the StandardScaler run.
df_probs_combined_2 = classifying_all(df_fr_jm, [clfs_list[3]], [clfs_name[3]],
        get_probability=True, get_minmax=False,
        approach=episode[0], pipline='oasis_jmim + transformer_jmim',
        pretrain_weights=episode[2], embedding_structure=episode[3], using_jmim=episode[1])

In [None]:
# Export the per-sample probabilities computed in the previous cell.
df_probs_combined_2.to_excel(path_results+'patient/One_leave_out/Respond_to_reviewers/Best_probability_for_each_approach/2_OASIS_Clinical_note_combined_text_probs.xlsx', index=False)

## Part 3:
## patient: (transformer on transcription + LIWC + Turn taking + Linguistic)

In [None]:
# Experiment grid for Part 3 (re-declared so this section can run on its own).
# Each episode is a tuple
# (approach, using_jmim, pretrain_weights, embedding_structure); the axis order in
# all_list fixes both the tuple layout and the position of each episode in the list.
approach_list = ['last date time', 'combined text']
using_jmim_list = [False, True]
pretrain_weights_list = ['bert-base-uncased', 'distilbert-base-uncased', 'emilyalsentzer/Bio_ClinicalBERT']
embedding_structure_list = ['first', 'mean']
all_list = [approach_list, using_jmim_list, pretrain_weights_list, embedding_structure_list]
episodes = [combo for combo in itertools.product(*all_list)]

In [None]:
# Resume support: if a results workbook from an earlier (interrupted) run exists,
# rebuild the episode tuple from its last row and continue with the episode after it.
# Only a missing file is handled; if the last row does not match any entry of
# `episodes`, episodes.index() raises ValueError.
try:
    df_result_all = pd.read_excel(path_results+'patient/One_leave_out/Respond_to_reviewers/3_Linguistic_Turn_taking_LIWC_Transcription.xlsx')
    last_index = df_result_all.shape[0]-1
    last_run_tupple = ()
    first = df_result_all.loc[last_index, 'Approach']
    second = df_result_all.loc[last_index, 'Using_jmim']
    # The table stores 'Yes'/'No'; episodes store booleans.
    second = True if second=='Yes' else False
    third = df_result_all.loc[last_index, 'pretrain_weights']
    fourth = df_result_all.loc[last_index, 'embedding_structure']
    last_run_tupple = (first, second, third, fourth)
    run_again_idx = episodes.index(last_run_tupple)+1
except FileNotFoundError:
    # No previous results: start from the first episode with an empty table.
    df_result_all = pd.DataFrame()
    run_again_idx = 0

for episode in episodes[run_again_idx:]:

    print('\nApproach: {}\t\t  Using_jmim: {}\nPretrain_weights: {}\t  embedding_structure: {}'
                                                      .format(episode[0], episode[1], episode[2], episode[3]))

    # Select the transcription and feature frames that match this episode's approach.
    # For 'combined text' the turn-taking features are the mean-over-datetimes variant.
    if episode[0]=='last date time':
        df_transcription = df_last_transcription.copy()
        df_fr_liwc_jm = df_fr_last_liwc_jm.copy()
        df_fr_turn_jm = df_fr_last_turn_jm.copy()
        df_fr_ling_jm = df_fr_last_ling_jm.copy()
        df_frs_jm = df_without_umls_fr_last_jm.copy()

    elif episode[0]=='combined text':
        df_transcription = df_comb_transcription.copy()
        df_fr_liwc_jm = df_fr_comb_liwc_jm.copy()
        df_fr_turn_jm = df_fr_mean_turn_jm.copy()
        df_fr_ling_jm = df_fr_comb_ling_jm.copy()
        df_frs_jm = df_without_umls_fr_comb_jm.copy()
    ##############
    # Transformer embeddings of the transcription, always JMIM-selected
    # (using_jmim=True regardless of episode[1]); only the feature frame is used below.
    df_result_bert_jm, df_fr_bert_jm = classifying_using_bert(df_transcription, clfs_list, clfs_name,
                                                         approach=episode[0], pretrain_weights=episode[2],
                                                         embedding_structure=episode[3], using_jmim=True)
    ##############
    # Variant 1: inner-join the three hand-crafted feature frames with the embeddings,
    # left to right; episode[1] decides whether JMIM is applied to the merged set.
    dataframes = [df_fr_ling_jm, df_fr_turn_jm, df_fr_liwc_jm, df_fr_bert_jm]
    df_fr_4set_jm = reduce(lambda left,right: pd.merge(left, right ,on=['study_id', 'outcome'], how='inner'), dataframes)

    df_result_final_1, df_fr_4set_jm_no_yes_jm = classifying_all(df_fr_4set_jm, clfs_list, clfs_name,
            approach=episode[0], pipline='linguistic_jmim + turn_taking_jmim + liwc_jmim '
                                                                 '+ transformer_on_transcription_jmim',
            pretrain_weights=episode[2], embedding_structure=episode[3], using_jmim=episode[1])
    ##############
    # Variant 2: join the pre-merged (linguistic + turn-taking + LIWC) frame with the embeddings.
    # 3_1 -> 1(3) + 1
    df_fr_1_3set_jm = pd.merge(df_frs_jm, df_fr_bert_jm, on=['study_id', 'outcome'])

    df_result_final_2, df_fr_1_3set_jm_no_yes_jm = classifying_all(df_fr_1_3set_jm, clfs_list, clfs_name,
            approach=episode[0], pipline='(linguistic_jmim + turn_taking_jmim + liwc_jmim)jmim '
                                                                   '+ transformer_on_transcription_jmim',
            pretrain_weights=episode[2], embedding_structure=episode[3], using_jmim=episode[1])
    ##############
    # Append both variants and write after every episode so an interrupted run can be resumed.
    df_result_all = pd.concat([df_result_all, df_result_final_1, df_result_final_2]).reset_index(drop=True)
    df_result_all.to_excel(path_results+'patient/One_leave_out/Respond_to_reviewers/3_Linguistic_Turn_taking_LIWC_Transcription.xlsx', index=False)

### Best model probabilities of part 3

In [None]:
# Display the episode tuple that the next cell re-runs for the part 3 probability export.
episodes[21]

('combined text', True, 'distilbert-base-uncased', 'mean')

In [None]:
# Re-run a single part 3 configuration and collect per-sample probabilities
# (classifying_all is called with get_probability=True and one classifier).
episode = episodes[21]

# Tuple positions: 0 = approach, 1 = using_jmim, 2 = pretrain_weights, 3 = embedding_structure.
print('\nApproach: {}\t\t  Using_jmim: {}\nPretrain_weights: {}\t  embedding_structure: {}'.format(*episode))

# Work on copies of the patient inputs that belong to the episode's text approach.
if episode[0] == 'last date time':
    df_frs_jm = df_without_umls_fr_last_jm.copy()
    df_fr_ling_jm = df_fr_last_ling_jm.copy()
    df_fr_turn_jm = df_fr_last_turn_jm.copy()
    df_fr_liwc_jm = df_fr_last_liwc_jm.copy()
    df_transcription = df_last_transcription.copy()

elif episode[0] == 'combined text':
    df_frs_jm = df_without_umls_fr_comb_jm.copy()
    df_fr_ling_jm = df_fr_comb_ling_jm.copy()
    df_fr_turn_jm = df_fr_mean_turn_jm.copy()
    df_fr_liwc_jm = df_fr_comb_liwc_jm.copy()
    df_transcription = df_comb_transcription.copy()

# Transformer features from the transcription; this step always runs with using_jmim=True.
df_result_bert_jm, df_fr_bert_jm = classifying_using_bert(
    df_transcription, clfs_list, clfs_name,
    approach=episode[0],
    pretrain_weights=episode[2],
    embedding_structure=episode[3],
    using_jmim=True,
)

# 3_1 -> 1(3) + 1: the already combined frame plus the transformer frame.
df_fr_1_3set_jm = df_frs_jm.merge(df_fr_bert_jm, on=['study_id', 'outcome'])

# Only the classifier at position 1 is evaluated here.
df_probs_combined_3 = classifying_all(
    df_fr_1_3set_jm, [clfs_list[1]], [clfs_name[1]],
    get_probability=True,
    get_minmax=False,
    approach=episode[0],
    pipline='(linguistic_jmim + turn_taking_jmim + liwc_jmim)jmim + transformer_on_transcription_jmim',
    pretrain_weights=episode[2],
    embedding_structure=episode[3],
    using_jmim=episode[1],
)


In [None]:
# Save the part 3 combined-text probabilities next to the other results.
df_probs_combined_3.to_excel(
    path_results
    + 'patient/One_leave_out/Respond_to_reviewers/Best_probability_for_each_approach/3_Linguistic_Turn_taking_LIWC_Transcription_combined_text_probs.xlsx',
    index=False,
)

## Part 3_1:
## patient: transformer on transcription

In [None]:
# Part 3_1 hyper-parameter grid. Each episode is a tuple ordered as
# (approach, using_jmim, pretrain_weights, embedding_structure).
approach_list = ['last date time', 'combined text']
using_jmim_list = [False, True]
pretrain_weights_list = [
    'bert-base-uncased',
    'distilbert-base-uncased',
    'emilyalsentzer/Bio_ClinicalBERT',
]
embedding_structure_list = ['first', 'mean']

all_list = [approach_list, using_jmim_list, pretrain_weights_list, embedding_structure_list]
# Cartesian product of the four axes; the last axis varies fastest.
episodes = [*itertools.product(*all_list)]

In [None]:
# Checkpoint/resume for part 3_1: reload the saved results when the workbook
# exists and continue right after the last episode stored in it.
try:
    df_result_all = pd.read_excel(path_results+'patient/One_leave_out/Respond_to_reviewers/3.1_transcription_pt.xlsx')
except FileNotFoundError:
    # No workbook yet: start with empty results at the first episode.
    df_result_all = pd.DataFrame()
    run_again_idx = 0
else:
    # Rebuild the episode tuple from the last saved row.
    last_index = len(df_result_all) - 1
    first = df_result_all.loc[last_index, 'Approach']
    # Only the saved value 'Yes' maps to True.
    second = bool(df_result_all.loc[last_index, 'Using_jmim'] == 'Yes')
    third = df_result_all.loc[last_index, 'pretrain_weights']
    fourth = df_result_all.loc[last_index, 'embedding_structure']
    last_run_tupple = (first, second, third, fourth)
    run_again_idx = episodes.index(last_run_tupple) + 1

# Part 3_1 grid search: transformer features on the patient transcription only.
# The results workbook is rewritten after every episode.
for episode in episodes[run_again_idx:]:

    # Tuple positions: 0 = approach, 1 = using_jmim, 2 = pretrain_weights, 3 = embedding_structure.
    print('\nApproach: {}\t\t  Using_jmim: {}\nPretrain_weights: {}\t  embedding_structure: {}'.format(*episode))

    # Transcription variant that matches the episode's text approach.
    if episode[0] == 'last date time':
        df_transcription = df_last_transcription.copy()

    elif episode[0] == 'combined text':
        df_transcription = df_comb_transcription.copy()

    # Only the classifier at position 3 is evaluated; here using_jmim follows the episode.
    df_result_bert_jm, df_fr_bert_jm = classifying_using_bert(
        df_transcription, [clfs_list[3]], [clfs_name[3]],
        approach=episode[0],
        pretrain_weights=episode[2],
        embedding_structure=episode[3],
        using_jmim=episode[1],
    )

    # Checkpoint: append this episode's rows and rewrite the results workbook.
    df_result_all = pd.concat([df_result_all, df_result_bert_jm]).reset_index(drop=True)
    df_result_all.to_excel(
        path_results+'patient/One_leave_out/Respond_to_reviewers/3.1_transcription_pt.xlsx',
        index=False,
    )

## Part 4:
## patient: (OASIS + Clinical note) +
## patient: (transformer on transcription + LIWC + Turn taking + Linguistic)

In [None]:
# Part 4 hyper-parameter grid. Each episode is a tuple ordered as
# (approach, using_jmim, pretrain_weights, embedding_structure).
approach_list = ['last date time', 'combined text']
using_jmim_list = [False, True]
pretrain_weights_list = [
    'bert-base-uncased',
    'distilbert-base-uncased',
    'emilyalsentzer/Bio_ClinicalBERT',
]
embedding_structure_list = ['first', 'mean']

all_list = [approach_list, using_jmim_list, pretrain_weights_list, embedding_structure_list]
# Cartesian product of the four axes; the last axis varies fastest.
episodes = [*itertools.product(*all_list)]

In [None]:
# Checkpoint/resume for part 4: reload the saved results when the workbook
# exists and continue right after the last episode stored in it.
try:
    df_result_last_all = pd.read_excel(path_results+'patient/One_leave_out/Respond_to_reviewers/4_OASIS_Linguistic_Turn_taking_LIWC_Transcription_Clinical.xlsx')
except FileNotFoundError:
    # No workbook yet: start with empty results at the first episode.
    df_result_last_all = pd.DataFrame()
    run_again_idx = 0
else:
    # Rebuild the episode tuple from the last saved row.
    last_index = len(df_result_last_all) - 1
    first = df_result_last_all.loc[last_index, 'Approach']
    # Only the saved value 'Yes' maps to True.
    second = bool(df_result_last_all.loc[last_index, 'Using_jmim'] == 'Yes')
    third = df_result_last_all.loc[last_index, 'pretrain_weights']
    fourth = df_result_last_all.loc[last_index, 'embedding_structure']
    last_run_tupple = (first, second, third, fourth)
    run_again_idx = episodes.index(last_run_tupple) + 1

# Part 4 grid search. Each remaining episode evaluates two pipelines that join
# df_fr_os_jm, the patient feature frames and transformer features from both the
# clinical notes and the transcription. The results workbook is rewritten after
# every episode.
for episode in episodes[run_again_idx:]:
    # Tuple positions: 0 = approach, 1 = using_jmim, 2 = pretrain_weights, 3 = embedding_structure.
    print('\nApproach: {}\t\t  Using_jmim: {}\nPretrain_weights: {}\t  embedding_structure: {}'
                                                      .format(episode[0], episode[1], episode[2], episode[3]))

    # Work on copies of the patient inputs that belong to the episode's text approach.
    if episode[0]=='last date time':
        df_clinical = df_last_clinical.copy()
        df_transcription = df_last_transcription.copy()
        df_fr_liwc_jm = df_fr_last_liwc_jm.copy()
        df_fr_turn_jm = df_fr_last_turn_jm.copy()
        df_fr_ling_jm = df_fr_last_ling_jm.copy()
        df_frs_jm = df_without_umls_fr_last_jm.copy()

    elif episode[0]=='combined text':
        df_clinical = df_comb_clinical.copy()
        df_transcription = df_comb_transcription.copy()
        df_fr_liwc_jm = df_fr_comb_liwc_jm.copy()
        df_fr_turn_jm = df_fr_mean_turn_jm.copy()
        df_fr_ling_jm = df_fr_comb_ling_jm.copy()
        df_frs_jm = df_without_umls_fr_comb_jm.copy()

    ##############
    # Transformer features from the patient transcription (always using_jmim=True).
    df_result_bert_transc_jm, df_fr_bert_transc_jm = classifying_using_bert(df_transcription, clfs_list, clfs_name,
                                                         approach=episode[0], pretrain_weights=episode[2],
                                                         embedding_structure=episode[3], using_jmim=True)
    ##############
    # Transformer features from the clinical notes. clinical=True matches the
    # clinical-note calls in Part 6, which process the same df_clinical input.
    # NOTE(review): confirm what the flag changes inside classifying_using_bert.
    df_result_bert_clinic_jm, df_fr_bert_clinic_jm = classifying_using_bert(df_clinical, clfs_list, clfs_name,
                                                         approach=episode[0], pretrain_weights=episode[2],
                                                         embedding_structure=episode[3], using_jmim=True,
                                                         clinical=True)
    ##############
    # Pipeline 1: six separate frames inner-joined on study_id and outcome.
    dataframes = [df_fr_os_jm, df_fr_ling_jm, df_fr_turn_jm, df_fr_liwc_jm, df_fr_bert_clinic_jm, df_fr_bert_transc_jm]
    df_fr_6set_jm = reduce(lambda left,right: pd.merge(left, right ,on=['study_id', 'outcome'], how='inner'), dataframes)

    df_result_final_1, df_fr_6set_jm_no_yes_jm = classifying_all(df_fr_6set_jm, clfs_list, clfs_name,
            approach=episode[0], pipline='oasis_jmim + linguistic_jmim + turn_taking_jmim + liwc_jmim + transformer_on_clinical_jmim + transformer_on_transcription_jmim',
            pretrain_weights=episode[2], embedding_structure=episode[3], using_jmim=episode[1])
    ##############
    # Pipeline 2 (1_3_1_1 -> 1 + 1(3) + 1 + 1): the already combined frame
    # df_frs_jm replaces the three individual patient frames.
    dataframes = [df_fr_os_jm, df_frs_jm, df_fr_bert_clinic_jm, df_fr_bert_transc_jm]
    df_fr_1_3_1_1set_jm = reduce(lambda left,right: pd.merge(left, right ,on=['study_id', 'outcome'], how='inner'), dataframes)

    df_result_final_2, df_fr_1_3_1_1set_jm_no_yes_jm = classifying_all(df_fr_1_3_1_1set_jm, clfs_list, clfs_name,
            approach=episode[0], pipline='oasis_jmim + (linguistic_jmim + turn_taking_jmim + liwc_jmim)jmim + transformer_on_clinical_jmim + transformer_on_transcription_jmim',
            pretrain_weights=episode[2], embedding_structure=episode[3], using_jmim=episode[1])
    ##############
    # Checkpoint: append both pipelines' rows and rewrite the results workbook.
    df_result_last_all = pd.concat([df_result_last_all, df_result_final_1, df_result_final_2]).reset_index(drop=True)
    df_result_last_all.to_excel(path_results+'patient/One_leave_out/Respond_to_reviewers/4_OASIS_Linguistic_Turn_taking_LIWC_Transcription_Clinical.xlsx', index=False)


## Load nurse data:
## transcription + UMLS + LIWC

In [None]:
# Nurse transcription text, last date-time variant: rows for study_id 1022 are
# dropped and the outcome column from df_os is attached by study_id.
df_last_transcription_ns = (
    pd.read_excel(path+'transcription_final_data/nurse/last_datetime_text_of_each_nurse.xlsx')
    .set_index('study_id')
    .drop(1022)
    .reset_index()
    .merge(df_os[['study_id', 'outcome']], on='study_id')
)

# Same preparation for the variant that combines the text of all date-times.
df_comb_transcription_ns = (
    pd.read_excel(path+'transcription_final_data/nurse/combining_text_of_datetimes_for_each_nurse.xlsx')
    .set_index('study_id')
    .drop(1022)
    .reset_index()
    .merge(df_os[['study_id', 'outcome']], on='study_id')
)

# Nurse UMLS table, last date-time variant: rows for study_id 1022 are dropped
# and the outcome column from df_os is attached by study_id.
df_last_umls_ns = (
    pd.read_excel(path+'transcription_final_data/nurse/last_datetime_umls_of_each_nurse.xlsx')
    .set_index('study_id')
    .drop(1022)
    .reset_index()
    .merge(df_os[['study_id', 'outcome']], on='study_id')
)

# Same preparation for the variant that combines all date-times.
df_comb_umls_ns = (
    pd.read_excel(path+'transcription_final_data/nurse/combining_umls_of_datetimes_for_each_nurse.xlsx')
    .set_index('study_id')
    .drop(1022)
    .reset_index()
    .merge(df_os[['study_id', 'outcome']], on='study_id')
)

# Nurse LIWC2015 output, last date-time variant. The generic 'Source (X)'
# columns are renamed first, then rows for study_id 1022 are dropped and the
# outcome column from df_os is attached by study_id.
df_last_liwc_ns = (
    pd.read_excel(path+'transcription_final_data/nurse/LIWC2015 Results (last_datetime_text_of_each_nurse).xlsx')
    .rename(columns={
        'Source (A)': 'study_id',
        'Source (B)': 'date_time',
        'Source (C)': 'text',
        'Source (D)': 'hosp_ed_ind',
    })
    .set_index('study_id')
    .drop(1022)
    .reset_index()
    .merge(df_os[['study_id', 'outcome']], on='study_id')
)

# Combined-text variant; this export has three source columns and no date_time.
df_comb_liwc_ns = (
    pd.read_excel(path+'transcription_final_data/nurse/LIWC2015 Results (combining_text_of_datetimes_for_each_nurse).xlsx')
    .rename(columns={
        'Source (A)': 'study_id',
        'Source (B)': 'hosp_ed_ind',
        'Source (C)': 'text',
    })
    .set_index('study_id')
    .drop(1022)
    .reset_index()
    .merge(df_os[['study_id', 'outcome']], on='study_id')
)


### UMLS

#### last date time

In [None]:
# Nurse UMLS, last date-time: run classifying_all with using_jmim=True. The
# second returned frame is merged with the LIWC one in the aggregate-features cell.
df_result_last_umls_jm_ns, df_fr_last_umls_jm_ns = classifying_all(
    df_last_umls_ns, clfs_list, clfs_name,
    pipline='UMLS',
    using_jmim=True,
    stan_scal=True,
    unit_vector=True,
    nurse=True,
)

#### combine text

In [None]:
# Nurse UMLS, combined text: run classifying_all with using_jmim=True. The
# second returned frame is merged with the LIWC one in the aggregate-features cell.
df_result_comb_umls_jm_ns, df_fr_comb_umls_jm_ns = classifying_all(
    df_comb_umls_ns, clfs_list, clfs_name,
    pipline='UMLS',
    using_jmim=True,
    stan_scal=True,
    unit_vector=True,
    nurse=True,
)

### LIWC

#### last date time

In [None]:
# Nurse LIWC, last date-time: run classifying_all with using_jmim=True. The
# second returned frame is reused by the aggregate-features and Part 5/6 cells.
df_result_last_liwc_jm_ns, df_fr_last_liwc_jm_ns = classifying_all(
    df_last_liwc_ns, clfs_list, clfs_name,
    pipline='LIWC',
    using_jmim=True,
    nurse=True,
)

#### combine text

In [None]:
# Nurse LIWC, combined text: run classifying_all with using_jmim=True. The
# second returned frame is reused by the aggregate-features and Part 5/6 cells.
df_result_comb_liwc_jm_ns, df_fr_comb_liwc_jm_ns = classifying_all(
    df_comb_liwc_ns, clfs_list, clfs_name,
    pipline='LIWC',
    using_jmim=True,
    nurse=True,
)

In [None]:
# Same nurse combined-text LIWC run, but with most_important_features=True the
# call returns a single frame, which is exported as-is.
df_comb_liwc_jmim_ns = classifying_all(
    df_comb_liwc_ns, clfs_list, clfs_name,
    pipline='LIWC',
    using_jmim=True,
    nurse=True,
    most_important_features=True,
)
df_comb_liwc_jmim_ns.to_excel(
    path_results+'nurse/One_leave_out/Respond_to_reviewers/Most_important_features/LIWC_JMIM_features_for_combined_text_ns.xlsx',
    index=False,
)

### Aggregate features


In [None]:
# Nurse last date-time features: LIWC and UMLS frames inner-joined on study_id and outcome.
all_df_fr_last_jm_ns = [df_fr_last_liwc_jm_ns, df_fr_last_umls_jm_ns]
df_fr_last_jm_ns = reduce(
    lambda merged, nxt: merged.merge(nxt, on=['study_id', 'outcome'], how='inner'),
    all_df_fr_last_jm_ns,
)

In [None]:
# Nurse combined-text features: LIWC and UMLS frames inner-joined on study_id and outcome.
all_df_fr_comb_jm_ns = [df_fr_comb_liwc_jm_ns, df_fr_comb_umls_jm_ns]
df_fr_comb_jm_ns = reduce(
    lambda merged, nxt: merged.merge(nxt, on=['study_id', 'outcome'], how='inner'),
    all_df_fr_comb_jm_ns,
)


### last date time

#### without jmim

In [None]:
# Nurse LIWC + UMLS merge, last date-time, evaluated with using_jmim=False.
df_result_2_last_ns, df_fr_last_2_ns = classifying_all(
    df_fr_last_jm_ns, clfs_list, clfs_name,
    approach='last datetime text',
    pipline='LIWC_jmim + UMLS_jmim',
    using_jmim=False,
)

#### with jmim

In [None]:
# Nurse LIWC + UMLS merge, last date-time, evaluated with using_jmim=True.
df_result_2_last_jm_ns, df_fr_last_2_jm_ns = classifying_all(
    df_fr_last_jm_ns, clfs_list, clfs_name,
    approach='last datetime text',
    pipline='LIWC_jmim + UMLS_jmim',
    using_jmim=True,
)

### Combine text

#### without jmim

In [None]:
# Nurse LIWC + UMLS merge, combined text, evaluated with using_jmim=False.
df_result_2_comb_ns, df_fr_comb_2_ns = classifying_all(
    df_fr_comb_jm_ns, clfs_list, clfs_name,
    approach='combine text',
    pipline='LIWC_jmim + UMLS_jmim',
    using_jmim=False,
)

#### with jmim

In [None]:
# Nurse LIWC + UMLS merge, combined text, evaluated with using_jmim=True.
# df_fr_comb_jm_ns is built from the LIWC and UMLS frames only, so the pipeline
# label lists exactly those two sets (no turn-taking features are involved),
# matching the three sibling runs that end up in the same results workbook.
df_result_2_comb_jm_ns, df_fr_comb_2_jm_ns = classifying_all(df_fr_comb_jm_ns, clfs_list, clfs_name,
            approach='combine text', pipline='LIWC_jmim + UMLS_jmim', using_jmim=True)


### Aggregate results

In [None]:
# Stack the four nurse LIWC + UMLS result tables, drop the two transformer
# columns and export the combined table.
df_result_2_set = (
    pd.concat([df_result_2_last_ns, df_result_2_last_jm_ns, df_result_2_comb_ns, df_result_2_comb_jm_ns])
    .drop(columns=['pretrain_weights', 'embedding_structure'])
    .reset_index(drop=True)
)
df_result_2_set.to_excel(
    path_results+'nurse/One_leave_out/Respond_to_reviewers/LIWC_jmim_UMLS_jmim.xlsx',
    index=False,
)

In [None]:
# Export the last date-time probability table.
# NOTE(review): df_probs_last_4 is not assigned in the nearby cells — confirm an
# earlier cell still creates it, otherwise this cell fails on a fresh kernel.
df_probs_last_4.to_excel(path_results+'Best_probability_for_each_approach/OASIS_clinical_(turn_taking_liwc_umls_transcription)_pt_(liwc_umls_transcription)_ns_last_date_time_probs.xlsx', index=False)

In [None]:
# Export the combined-text probability table.
# NOTE(review): df_probs_comb_4 is not assigned in the nearby cells — confirm an
# earlier cell still creates it, otherwise this cell fails on a fresh kernel.
df_probs_comb_4.to_excel(path_results+'Best_probability_for_each_approach/OASIS_clinical_(turn_taking_liwc_umls_transcription)_pt_(liwc_umls_transcription)_ns_combined_text_probs.xlsx', index=False)

## Part 5:
## patient: (transformer on transcription + LIWC + Turn taking + Linguistic) +
## nurse: (transformer on transcription + LIWC)

In [None]:
# Part 5 hyper-parameter grid. Each episode is a tuple ordered as
# (approach, using_jmim, pretrain_weights, embedding_structure).
approach_list = ['last date time', 'combined text']
using_jmim_list = [False, True]
pretrain_weights_list = [
    'bert-base-uncased',
    'distilbert-base-uncased',
    'emilyalsentzer/Bio_ClinicalBERT',
]
embedding_structure_list = ['first', 'mean']

all_list = [approach_list, using_jmim_list, pretrain_weights_list, embedding_structure_list]
# Cartesian product of the four axes; the last axis varies fastest.
episodes = [*itertools.product(*all_list)]

In [None]:
# Checkpoint/resume for part 5: reload the saved results when the workbook
# exists and continue right after the last episode stored in it.
#
# Only a missing workbook means "start from scratch". Any other failure (a row
# that does not match the grid, a missing column, an unreadable file) is left
# to propagate: the loop that follows rewrites this workbook, so restarting
# from an empty frame would silently discard the results already saved.
try:
    df_result_last_all = pd.read_excel(path_results+'patient/One_leave_out/Respond_to_reviewers/5_(turn_taking_liwc_linguistic)pt_(liwc)ns_(transcription)pt_(transcription)ns.xlsx')
except FileNotFoundError:
    # No workbook yet: start with empty results at the first episode.
    df_result_last_all = pd.DataFrame()
    run_again_idx = 0
else:
    # Rebuild the episode tuple from the last saved row.
    last_index = df_result_last_all.shape[0]-1
    first = df_result_last_all.loc[last_index, 'Approach']
    # Only the saved value 'Yes' maps to True.
    second = bool(df_result_last_all.loc[last_index, 'Using_jmim'] == 'Yes')
    third = df_result_last_all.loc[last_index, 'pretrain_weights']
    fourth = df_result_last_all.loc[last_index, 'embedding_structure']
    last_run_tupple = (first, second, third, fourth)
    run_again_idx = episodes.index(last_run_tupple)+1

# Part 5 grid search: patient feature frames and nurse LIWC features, each
# combined with transformer features from the patient and nurse transcriptions.
# The results workbook is rewritten after every episode.
for episode in episodes[run_again_idx:]:

    # Tuple positions: 0 = approach, 1 = using_jmim, 2 = pretrain_weights, 3 = embedding_structure.
    print('\nApproach: {}\t\t  Using_jmim: {}\nPretrain_weights: {}\t  embedding_structure: {}'.format(*episode))

    # Work on copies of the inputs that belong to the episode's text approach.
    if episode[0] == 'last date time':
        # nurse
        df_fr_liwc_jm_ns = df_fr_last_liwc_jm_ns.copy()
        df_transcription_ns = df_last_transcription_ns.copy()
        # patient
        df_frs_jm = df_without_umls_fr_last_jm.copy()
        df_fr_ling_jm = df_fr_last_ling_jm.copy()
        df_fr_turn_jm = df_fr_last_turn_jm.copy()
        df_fr_liwc_jm = df_fr_last_liwc_jm.copy()
        df_transcription = df_last_transcription.copy()

    elif episode[0] == 'combined text':
        # nurse
        df_fr_liwc_jm_ns = df_fr_comb_liwc_jm_ns.copy()
        df_transcription_ns = df_comb_transcription_ns.copy()
        # patient
        df_frs_jm = df_without_umls_fr_comb_jm.copy()
        df_fr_ling_jm = df_fr_comb_ling_jm.copy()
        df_fr_turn_jm = df_fr_mean_turn_jm.copy()
        df_fr_liwc_jm = df_fr_comb_liwc_jm.copy()
        df_transcription = df_comb_transcription.copy()

    # Transformer features from the patient transcription (always using_jmim=True).
    df_result_bert_transc_jm, df_fr_bert_transc_jm = classifying_using_bert(
        df_transcription, clfs_list, clfs_name,
        approach=episode[0],
        pretrain_weights=episode[2],
        embedding_structure=episode[3],
        using_jmim=True,
    )

    # Transformer features from the nurse transcription (nurse=True).
    df_result_bert_transc_jm_ns, df_fr_bert_transc_jm_ns = classifying_using_bert(
        df_transcription_ns, clfs_list, clfs_name,
        approach=episode[0],
        pretrain_weights=episode[2],
        embedding_structure=episode[3],
        using_jmim=True,
        nurse=True,
    )

    # Pipeline 1: six separate frames inner-joined on study_id and outcome.
    dataframes = [df_fr_ling_jm, df_fr_turn_jm, df_fr_liwc_jm, df_fr_liwc_jm_ns,
                  df_fr_bert_transc_jm, df_fr_bert_transc_jm_ns]
    df_fr_6set_jm = reduce(
        lambda merged, nxt: merged.merge(nxt, on=['study_id', 'outcome'], how='inner'),
        dataframes,
    )

    df_result_final_1, df_fr_6set_jm_no_yes_jm = classifying_all(
        df_fr_6set_jm, clfs_list, clfs_name,
        approach=episode[0],
        pipline=('linguistic_jmim_pt + turn_taking_jmim_pt + liwc_jmim_pt + '
                 'liwc_jmim_ns + transformer_on_transcription_jmim_pt + transformer_on_transcription_jmim_ns'),
        pretrain_weights=episode[2],
        embedding_structure=episode[3],
        using_jmim=episode[1],
    )

    # Pipeline 2 (3_1_1_1 -> 1(3) + 1 + 1 + 1): the already combined patient
    # frame df_frs_jm replaces the three individual patient frames.
    dataframes = [df_frs_jm, df_fr_liwc_jm_ns, df_fr_bert_transc_jm, df_fr_bert_transc_jm_ns]
    df_fr_3_1_1_1set_jm = reduce(
        lambda merged, nxt: merged.merge(nxt, on=['study_id', 'outcome'], how='inner'),
        dataframes,
    )

    df_result_final_2, df_fr_3_1_1_1set_jm_no_yes_jm = classifying_all(
        df_fr_3_1_1_1set_jm, clfs_list, clfs_name,
        approach=episode[0],
        pipline=('(linguistic_jmim_pt + turn_taking_jmim_pt + liwc_jmim_pt)jmim + '
                 'liwc_jmim_ns + transformer_on_transcription_jmim_pt + transformer_on_transcription_jmim_ns'),
        pretrain_weights=episode[2],
        embedding_structure=episode[3],
        using_jmim=episode[1],
    )

    # Checkpoint: append both pipelines' rows and rewrite the results workbook.
    df_result_last_all = pd.concat(
        [df_result_last_all, df_result_final_1, df_result_final_2]
    ).reset_index(drop=True)
    df_result_last_all.to_excel(
        path_results+'patient/One_leave_out/Respond_to_reviewers/5_(turn_taking_liwc_linguistic)pt_(liwc)ns_(transcription)pt_(transcription)ns.xlsx',
        index=False,
    )


### Best model probabilities of part 5

In [None]:
# Display the episode tuple that the next cell re-runs for the part 5 probability export.
episodes[20]

('combined text', True, 'distilbert-base-uncased', 'first')

In [None]:
# Re-run a single part 5 configuration and collect per-sample probabilities
# (classifying_all is called with get_probability=True and one classifier).
episode = episodes[20]

# Tuple positions: 0 = approach, 1 = using_jmim, 2 = pretrain_weights, 3 = embedding_structure.
print('\nApproach: {}\t\t  Using_jmim: {}\nPretrain_weights: {}\t  embedding_structure: {}'.format(*episode))

# Work on copies of the inputs that belong to the episode's text approach.
if episode[0] == 'last date time':
    # nurse
    df_fr_liwc_jm_ns = df_fr_last_liwc_jm_ns.copy()
    df_transcription_ns = df_last_transcription_ns.copy()
    # patient
    df_frs_jm = df_without_umls_fr_last_jm.copy()
    df_fr_ling_jm = df_fr_last_ling_jm.copy()
    df_fr_turn_jm = df_fr_last_turn_jm.copy()
    df_fr_liwc_jm = df_fr_last_liwc_jm.copy()
    df_transcription = df_last_transcription.copy()

elif episode[0] == 'combined text':
    # nurse
    df_fr_liwc_jm_ns = df_fr_comb_liwc_jm_ns.copy()
    df_transcription_ns = df_comb_transcription_ns.copy()
    # patient
    df_frs_jm = df_without_umls_fr_comb_jm.copy()
    df_fr_ling_jm = df_fr_comb_ling_jm.copy()
    df_fr_turn_jm = df_fr_mean_turn_jm.copy()
    df_fr_liwc_jm = df_fr_comb_liwc_jm.copy()
    df_transcription = df_comb_transcription.copy()

# Transformer features from the patient transcription (always using_jmim=True).
df_result_bert_transc_jm, df_fr_bert_transc_jm = classifying_using_bert(
    df_transcription, clfs_list, clfs_name,
    approach=episode[0],
    pretrain_weights=episode[2],
    embedding_structure=episode[3],
    using_jmim=True,
)

# Transformer features from the nurse transcription (nurse=True).
df_result_bert_transc_jm_ns, df_fr_bert_transc_jm_ns = classifying_using_bert(
    df_transcription_ns, clfs_list, clfs_name,
    approach=episode[0],
    pretrain_weights=episode[2],
    embedding_structure=episode[3],
    using_jmim=True,
    nurse=True,
)

# Six separate frames inner-joined on study_id and outcome.
dataframes = [df_fr_ling_jm, df_fr_turn_jm, df_fr_liwc_jm, df_fr_liwc_jm_ns,
              df_fr_bert_transc_jm, df_fr_bert_transc_jm_ns]
df_fr_6set_jm = reduce(
    lambda merged, nxt: merged.merge(nxt, on=['study_id', 'outcome'], how='inner'),
    dataframes,
)

# Only the classifier at position 0 is evaluated here.
df_probs_combined_5 = classifying_all(
    df_fr_6set_jm, [clfs_list[0]], [clfs_name[0]],
    get_probability=True,
    get_minmax=False,
    approach=episode[0],
    pipline=('linguistic_jmim_pt + turn_taking_jmim_pt + liwc_jmim_pt + '
             'liwc_jmim_ns + transformer_on_transcription_jmim_pt + transformer_on_transcription_jmim_ns'),
    pretrain_weights=episode[2],
    embedding_structure=episode[3],
    using_jmim=episode[1],
)


In [None]:
# Save the part 5 combined-text probabilities next to the other results.
df_probs_combined_5.to_excel(
    path_results
    + 'patient/One_leave_out/Respond_to_reviewers/Best_probability_for_each_approach/5_(turn_taking_liwc_linguistic)pt_(liwc)ns_(transcription)pt_(transcription)ns_combined_text_probs.xlsx',
    index=False,
)

## Part 5_1:
## patient: (transformer on transcription) + nurse: (transformer on transcription)

In [None]:
# Part 5_1 hyper-parameter grid. Each episode is a tuple ordered as
# (approach, using_jmim, pretrain_weights, embedding_structure).
approach_list = ['last date time', 'combined text']
using_jmim_list = [False, True]
pretrain_weights_list = [
    'bert-base-uncased',
    'distilbert-base-uncased',
    'emilyalsentzer/Bio_ClinicalBERT',
]
embedding_structure_list = ['first', 'mean']

all_list = [approach_list, using_jmim_list, pretrain_weights_list, embedding_structure_list]
# Cartesian product of the four axes; the last axis varies fastest.
episodes = [*itertools.product(*all_list)]

In [None]:
# Checkpoint/resume for part 5_1: reload the saved results when the workbook
# exists and continue right after the last episode stored in it.
#
# Only a missing workbook means "start from scratch". Any other failure (a row
# that does not match the grid, a missing column, an unreadable file) is left
# to propagate: the loop that follows rewrites this workbook, so restarting
# from an empty frame would silently discard the results already saved.
try:
    df_result_last_all = pd.read_excel(path_results+'patient/One_leave_out/Respond_to_reviewers/5_1_(transcription)pt_(transcription)ns.xlsx')
except FileNotFoundError:
    # No workbook yet: start with empty results at the first episode.
    df_result_last_all = pd.DataFrame()
    run_again_idx = 0
else:
    # Rebuild the episode tuple from the last saved row.
    last_index = df_result_last_all.shape[0]-1
    first = df_result_last_all.loc[last_index, 'Approach']
    # Only the saved value 'Yes' maps to True.
    second = bool(df_result_last_all.loc[last_index, 'Using_jmim'] == 'Yes')
    third = df_result_last_all.loc[last_index, 'pretrain_weights']
    fourth = df_result_last_all.loc[last_index, 'embedding_structure']
    last_run_tupple = (first, second, third, fourth)
    run_again_idx = episodes.index(last_run_tupple)+1

# Part 5_1 grid search: transformer features from the patient and nurse
# transcriptions only. The results workbook is rewritten after every episode.
for episode in episodes[run_again_idx:]:

    # Tuple positions: 0 = approach, 1 = using_jmim, 2 = pretrain_weights, 3 = embedding_structure.
    print('\nApproach: {}\t\t  Using_jmim: {}\nPretrain_weights: {}\t  embedding_structure: {}'.format(*episode))

    # Transcription variants that match the episode's text approach.
    if episode[0] == 'last date time':
        # nurse
        df_transcription_ns = df_last_transcription_ns.copy()
        # patient
        df_transcription_pt = df_last_transcription.copy()

    elif episode[0] == 'combined text':
        # nurse
        df_transcription_ns = df_comb_transcription_ns.copy()
        # patient
        df_transcription_pt = df_comb_transcription.copy()

    # Patient transcription features; only the classifier at position 3 is used in this part.
    df_result_bert_transc_jm_pt, df_fr_bert_transc_jm_pt = classifying_using_bert(
        df_transcription_pt, [clfs_list[3]], [clfs_name[3]],
        approach=episode[0],
        pretrain_weights=episode[2],
        embedding_structure=episode[3],
        using_jmim=True,
    )

    # Nurse transcription features (nurse=True).
    df_result_bert_transc_jm_ns, df_fr_bert_transc_jm_ns = classifying_using_bert(
        df_transcription_ns, [clfs_list[3]], [clfs_name[3]],
        approach=episode[0],
        pretrain_weights=episode[2],
        embedding_structure=episode[3],
        using_jmim=True,
        nurse=True,
    )

    # Join patient and nurse features per study and evaluate the combination.
    df_fr_pt_ns_jm = df_fr_bert_transc_jm_pt.merge(
        df_fr_bert_transc_jm_ns, on=['study_id', 'outcome'], how='inner'
    )

    df_result_final, df_fr_pt_ns_jm_no_yes_jm = classifying_all(
        df_fr_pt_ns_jm, [clfs_list[3]], [clfs_name[3]],
        approach=episode[0],
        pipline='transformer_on_transcription_jmim_pt + transformer_on_transcription_jmim_ns',
        pretrain_weights=episode[2],
        embedding_structure=episode[3],
        using_jmim=episode[1],
    )

    # Checkpoint: append this episode's rows and rewrite the results workbook.
    df_result_last_all = pd.concat([df_result_last_all, df_result_final]).reset_index(drop=True)
    df_result_last_all.to_excel(
        path_results+'patient/One_leave_out/Respond_to_reviewers/5_1_(transcription)pt_(transcription)ns.xlsx',
        index=False,
    )


## Part 6:
## patient: (OASIS + Clinical note) +
## patient: (transformer on transcription + LIWC + Turn taking + Linguistic) +
## nurse: (transformer on transcription + LIWC)

In [None]:
# Part 6 hyper-parameter grid. Each episode is a tuple ordered as
# (approach, using_jmim, pretrain_weights, embedding_structure).
approach_list = ['last date time', 'combined text']
using_jmim_list = [False, True]
pretrain_weights_list = [
    'bert-base-uncased',
    'distilbert-base-uncased',
    'emilyalsentzer/Bio_ClinicalBERT',
]
embedding_structure_list = ['first', 'mean']

all_list = [approach_list, using_jmim_list, pretrain_weights_list, embedding_structure_list]
# Cartesian product of the four axes; the last axis varies fastest.
episodes = [*itertools.product(*all_list)]

In [None]:
# Checkpoint/resume for part 6: reload the saved results when the workbook
# exists and continue right after the last episode stored in it.
#
# Only a missing workbook means "start from scratch". Any other failure (a row
# that does not match the grid, a missing column, an unreadable file) is left
# to propagate: the loop that follows rewrites this workbook, so restarting
# from an empty frame would silently discard the results already saved.
try:
    df_result_last_all = pd.read_excel(path_results+'patient/One_leave_out/Respond_to_reviewers/6_OASIS_(linguistic_turn_taking_liwc)pt_(liwc)ns_clinical_(transcription)pt_(transcription)ns.xlsx')
except FileNotFoundError:
    # No workbook yet: start with empty results at the first episode.
    df_result_last_all = pd.DataFrame()
    run_again_idx = 0
else:
    # Rebuild the episode tuple from the last saved row.
    last_index = df_result_last_all.shape[0]-1
    first = df_result_last_all.loc[last_index, 'Approach']
    # Only the saved value 'Yes' maps to True.
    second = bool(df_result_last_all.loc[last_index, 'Using_jmim'] == 'Yes')
    third = df_result_last_all.loc[last_index, 'pretrain_weights']
    fourth = df_result_last_all.loc[last_index, 'embedding_structure']
    last_run_tupple = (first, second, third, fourth)
    run_again_idx = episodes.index(last_run_tupple)+1

# Part 6 grid search. Each remaining episode evaluates two pipelines that join
# df_fr_os_jm, the patient feature frames, nurse LIWC features and transformer
# features from the clinical notes and both transcriptions. The results
# workbook is rewritten after every episode.
for episode in episodes[run_again_idx:]:

    # Tuple positions: 0 = approach, 1 = using_jmim, 2 = pretrain_weights, 3 = embedding_structure.
    print('\nApproach: {}\t\t  Using_jmim: {}\nPretrain_weights: {}\t  embedding_structure: {}'
                                                      .format(episode[0], episode[1], episode[2], episode[3]))

    # Work on copies of the inputs that belong to the episode's text approach.
    if episode[0]=='last date time':
        df_clinical = df_last_clinical.copy()
        df_transcription = df_last_transcription.copy()
        df_fr_ling_jm = df_fr_last_ling_jm.copy()
        df_fr_liwc_jm = df_fr_last_liwc_jm.copy()
        df_fr_turn_jm = df_fr_last_turn_jm.copy()
        # Same combined patient frame that the Part 3, 4 and 5 loops use under the
        # '(linguistic + turn_taking + liwc)jmim' pipeline label.
        # NOTE(review): confirm this is the intended frame for pipeline 2 below.
        df_frs_jm = df_without_umls_fr_last_jm.copy()
        # nurse
        df_transcription_ns = df_last_transcription_ns.copy()
        df_fr_liwc_jm_ns = df_fr_last_liwc_jm_ns.copy()
        df_frs_jm_ns = df_fr_last_jm_ns.copy()

    elif episode[0]=='combined text':
        df_clinical = df_comb_clinical.copy()
        df_transcription = df_comb_transcription.copy()
        df_fr_ling_jm = df_fr_comb_ling_jm.copy()
        df_fr_liwc_jm = df_fr_comb_liwc_jm.copy()
        df_fr_turn_jm = df_fr_mean_turn_jm.copy()
        # See the note in the 'last date time' branch.
        df_frs_jm = df_without_umls_fr_comb_jm.copy()
        # nurse
        df_transcription_ns = df_comb_transcription_ns.copy()
        df_fr_liwc_jm_ns = df_fr_comb_liwc_jm_ns.copy()
        df_frs_jm_ns = df_fr_comb_jm_ns.copy()

    ##############
    # Transformer features from the patient transcription (always using_jmim=True).
    df_result_bert_transc_jm, df_fr_bert_transc_jm = classifying_using_bert(df_transcription, clfs_list, clfs_name,
                                                         approach=episode[0], pretrain_weights=episode[2],
                                                         embedding_structure=episode[3], using_jmim=True)
    ##############
    # Transformer features from the clinical notes (clinical=True).
    df_result_bert_clinic_jm, df_fr_bert_clinic_jm = classifying_using_bert(df_clinical, clfs_list, clfs_name,
                                                         approach=episode[0], pretrain_weights=episode[2],
                                                         embedding_structure=episode[3], using_jmim=True, clinical=True)
    ##############
    # Transformer features from the nurse transcription (nurse=True).
    df_result_bert_transc_jm_ns, df_fr_bert_transc_jm_ns = classifying_using_bert(df_transcription_ns, clfs_list, clfs_name,
                                                         approach=episode[0], pretrain_weights=episode[2],
                                                         embedding_structure=episode[3], using_jmim=True, nurse=True)
    ##############
    # Pipeline 1: eight separate frames inner-joined on study_id and outcome.
    dataframes = [df_fr_os_jm, df_fr_ling_jm, df_fr_turn_jm, df_fr_liwc_jm, df_fr_liwc_jm_ns,
                  df_fr_bert_clinic_jm, df_fr_bert_transc_jm, df_fr_bert_transc_jm_ns]
    df_fr_8set_jm = reduce(lambda left,right: pd.merge(left, right ,on=['study_id', 'outcome'], how='inner'), dataframes)

    df_result_final_1, df_fr_8set_jm_no_yes_jm = classifying_all(df_fr_8set_jm, clfs_list, clfs_name,
        approach=episode[0], pipline='oasis_jmim + linguistic_jmim_pt + turn_taking_jmim_pt + liwc_jmim_pt + '\
        'liwc_jmim_ns + transformer_on_clinical_jmim + '\
        'transformer_on_transcription_jmim_pt + transformer_on_transcription_jmim_ns',
        pretrain_weights=episode[2], embedding_structure=episode[3], using_jmim=episode[1])
    ##############
    # Pipeline 2 (1_3_1_1_1_1 -> 1 + 1(3) + 1 + 1 + 1 + 1): the already combined
    # patient frame df_frs_jm replaces the three individual patient frames.
    dataframes = [df_fr_os_jm, df_frs_jm, df_fr_liwc_jm_ns, df_fr_bert_clinic_jm, df_fr_bert_transc_jm, df_fr_bert_transc_jm_ns]
    df_fr_1_3_1_1_1_1set_jm = reduce(lambda left,right: pd.merge(left, right ,on=['study_id', 'outcome'], how='inner'), dataframes)

    df_result_final_2, df_fr_1_3_1_1_1_1set_jm_no_yes_jm = classifying_all(df_fr_1_3_1_1_1_1set_jm, clfs_list, clfs_name,
        approach=episode[0], pipline='oasis_jmim + (linguistic_jmim_pt + turn_taking_jmim_pt + liwc_jmim_pt)jmim + '\
        'liwc_jmim_ns + transformer_on_clinical_jmim + '\
        'transformer_on_transcription_jmim_pt + transformer_on_transcription_jmim_ns',
        pretrain_weights=episode[2], embedding_structure=episode[3], using_jmim=episode[1])
    ##############
    # Checkpoint: append both pipelines' rows and rewrite the results workbook.
    df_result_last_all = pd.concat([df_result_last_all, df_result_final_1, df_result_final_2]).reset_index(drop=True)
    df_result_last_all.to_excel(path_results+'patient/One_leave_out/Respond_to_reviewers/6_OASIS_(linguistic_turn_taking_liwc)pt_(liwc)ns_clinical_(transcription)pt_(transcription)ns.xlsx', index=False)


### Best model probabilities of part 6

In [None]:
# Display the episode tuple that the next cell re-runs for part 6.
episodes[13]

('combined text', False, 'bert-base-uncased', 'mean')

In [None]:
# Best-model run for part 6: rebuild the eight feature sets for one episode,
# inner-join them on (study_id, outcome) and request the probabilities of a
# single classifier (clfs_list[1]).
episode = episodes[13]

print('\nApproach: {}\t\t  Using_jmim: {}\nPretrain_weights: {}\t  embedding_structure: {}'
      .format(episode[0], episode[1], episode[2], episode[3]))

# Pick the approach-specific inputs (patient frames first, then nurse frames).
# Copies are taken so the shared source frames are not modified through these names.
if episode[0] == 'last date time':
    df_clinical = df_last_clinical.copy()
    df_transcription = df_last_transcription.copy()
    df_fr_ling_jm = df_fr_last_ling_jm.copy()
    df_fr_liwc_jm = df_fr_last_liwc_jm.copy()
    df_fr_turn_jm = df_fr_last_turn_jm.copy()
    df_fr_umls_jm = df_fr_last_umls_jm.copy()
    df_frs_jm = df_fr_last_jm.copy()
    # nurse
    df_transcription_ns = df_last_transcription_ns.copy()
    df_fr_liwc_jm_ns = df_fr_last_liwc_jm_ns.copy()
    df_fr_umls_jm_ns = df_fr_last_umls_jm_ns.copy()
    df_frs_jm_ns = df_fr_last_jm_ns.copy()

elif episode[0] == 'combined text':
    df_clinical = df_comb_clinical.copy()
    df_transcription = df_comb_transcription.copy()
    df_fr_ling_jm = df_fr_comb_ling_jm.copy()
    df_fr_liwc_jm = df_fr_comb_liwc_jm.copy()
    # Turn-taking uses the df_fr_mean_* frame here (not a df_fr_comb_* one).
    df_fr_turn_jm = df_fr_mean_turn_jm.copy()
    df_fr_umls_jm = df_fr_comb_umls_jm.copy()
    df_frs_jm = df_fr_comb_jm.copy()
    # nurse
    df_transcription_ns = df_comb_transcription_ns.copy()
    df_fr_liwc_jm_ns = df_fr_comb_liwc_jm_ns.copy()
    df_fr_umls_jm_ns = df_fr_comb_umls_jm_ns.copy()
    df_frs_jm_ns = df_fr_comb_jm_ns.copy()

else:
    # Fail fast: without this guard an unrecognised approach would fall through
    # and silently reuse whatever frames earlier code (e.g. the loop above)
    # left bound to these names in the kernel.
    raise ValueError('Unsupported approach: {!r}'.format(episode[0]))

# Transformer features for the three text sources.
# NOTE(review): these calls hard-code using_jmim=True while the final
# classifying_all call below passes episode[1] -- confirm this is intended.
##############
df_result_bert_transc_jm, df_fr_bert_transc_jm = classifying_using_bert(
    df_transcription, clfs_list, clfs_name,
    approach=episode[0], pretrain_weights=episode[2],
    embedding_structure=episode[3], using_jmim=True)
##############
df_result_bert_clinic_jm, df_fr_bert_clinic_jm = classifying_using_bert(
    df_clinical, clfs_list, clfs_name,
    approach=episode[0], pretrain_weights=episode[2],
    embedding_structure=episode[3], using_jmim=True, clinical=True)
##############
df_result_bert_transc_jm_ns, df_fr_bert_transc_jm_ns = classifying_using_bert(
    df_transcription_ns, clfs_list, clfs_name,
    approach=episode[0], pretrain_weights=episode[2],
    embedding_structure=episode[3], using_jmim=True, nurse=True)
##############
# Inner-join all eight feature frames on the shared keys.
# df_fr_os_jm is not assigned in this cell; it must already exist in the kernel.
dataframes = [df_fr_os_jm, df_fr_ling_jm, df_fr_turn_jm, df_fr_liwc_jm, df_fr_liwc_jm_ns,
              df_fr_bert_clinic_jm, df_fr_bert_transc_jm, df_fr_bert_transc_jm_ns]
df_fr_8set_jm = reduce(
    lambda left, right: pd.merge(left, right, on=['study_id', 'outcome'], how='inner'),
    dataframes)

##############
# Only the classifier at index 1 is evaluated, with probability output requested.
df_probs_combined_6 = classifying_all(
    df_fr_8set_jm, [clfs_list[1]], [clfs_name[1]],
    get_probability=True, get_minmax=False,
    approach=episode[0],
    pipline='oasis_jmim + linguistic_jmim_pt + turn_taking_jmim_pt + liwc_jmim_pt + '
            'liwc_jmim_ns + transformer_on_clinical_jmim + '
            'transformer_on_transcription_jmim_pt + transformer_on_transcription_jmim_ns',
    pretrain_weights=episode[2], embedding_structure=episode[3], using_jmim=episode[1])



In [None]:
# Write the part-6 probabilities to an Excel file under path_results, omitting the index column.
df_probs_combined_6.to_excel(
    excel_writer=path_results+'patient/One_leave_out/Respond_to_reviewers/Best_probability_for_each_approach/6_OASIS_(linguistic_turn_taking_liwc)pt_(liwc)ns_clinical_(transcription)pt_(transcription)ns_combined_text_probs.xlsx',
    index=False,
)

#### Most important features of part 6

In [None]:
# Most-important-feature run on the merged 8-set frame.
# Per the original note, the classifier arguments ([clfs_list[1]], [clfs_name[1]])
# are not important when only the JMIM features are requested.
df_fr_8set_jm_jmim = classifying_all(
    df_fr_8set_jm,
    [clfs_list[1]],
    [clfs_name[1]],
    using_jmim=True,
    most_important_features=True,
)
# Save the result next to the other "Most_important_features" sheets (no index column).
df_fr_8set_jm_jmim.to_excel(
    excel_writer=path_results+'patient/One_leave_out/Respond_to_reviewers/Most_important_features/OASIS_(linguistic_turn_taking_liwc)pt_(liwc)ns_clinical_(transcription)pt_(transcription)ns_combined_text.xlsx',
    index=False,
)

#### Most important features of part 6 without the transcription and clinical-note feature sets

In [None]:
# Five feature frames (no transformer-based frames), inner-joined on the shared keys.
dataframes = [
    df_fr_os_jm,
    df_fr_ling_jm,
    df_fr_turn_jm,
    df_fr_liwc_jm,
    df_fr_liwc_jm_ns,
]
df_fr_5set_jm = reduce(
    lambda left, right: left.merge(right, on=['study_id', 'outcome'], how='inner'),
    dataframes,
)

# Per the original note, the classifier arguments ([clfs_list[1]], [clfs_name[1]])
# are not important when only the JMIM features are requested.
df_fr_5set_jm_jmim = classifying_all(
    df_fr_5set_jm,
    [clfs_list[1]],
    [clfs_name[1]],
    using_jmim=True,
    most_important_features=True,
)
df_fr_5set_jm_jmim.to_excel(
    excel_writer=path_results+'patient/One_leave_out/Respond_to_reviewers/Most_important_features/OASIS_(linguistic_turn_taking_liwc)pt_(liwc)ns.xlsx',
    index=False,
)

#### Most important features of part 6 with UMLS and without the transcription and clinical-note feature sets

In [None]:
# Seven feature frames: the five-frame selection plus the patient and nurse UMLS
# frames, inner-joined on the shared keys.
dataframes = [
    df_fr_os_jm,
    df_fr_ling_jm,
    df_fr_turn_jm,
    df_fr_liwc_jm,
    df_fr_umls_jm,
    df_fr_liwc_jm_ns,
    df_fr_umls_jm_ns,
]
df_fr_7set_jm = reduce(
    lambda left, right: left.merge(right, on=['study_id', 'outcome'], how='inner'),
    dataframes,
)

# Per the original note, the classifier arguments ([clfs_list[1]], [clfs_name[1]])
# are not important when only the JMIM features are requested.
# This run additionally passes n_features_jmim=40.
df_fr_7set_jm_jmim = classifying_all(
    df_fr_7set_jm,
    [clfs_list[1]],
    [clfs_name[1]],
    using_jmim=True,
    most_important_features=True,
    n_features_jmim=40,
)
df_fr_7set_jm_jmim.to_excel(
    excel_writer=path_results+'patient/One_leave_out/Respond_to_reviewers/Most_important_features/OASIS_(linguistic_turn_taking_liwc_umls)pt_(liwc_umls)ns.xlsx',
    index=False,
)