Train disease classifiers with features generated by `reddit_feature_gen.ipynb`.

---

In [66]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import pickle
import ast

In [None]:
# # included for convenience to help find correct paths
# import os
# os.getcwd()
# os.listdir("..")

Constants.

In [3]:
# paths
MEDRED_REPRODUCIBLE_DIR = "../"
# are features for the DL implementation or MetaMap?
EMBEDDING_TYPE = "DL"
# features and results
FEATURES_IN = MEDRED_REPRODUCIBLE_DIR + "data/validation/Reddit/" + EMBEDDING_TYPE + "_embedded_features.pckl" # without the certainty level; attached dynamically later
FEATURES_OUT = MEDRED_REPRODUCIBLE_DIR + "data/validation/Reddit/" + EMBEDDING_TYPE + "_all_results.csv"
# where to save models
MODEL_OUT_DIR = MEDRED_REPRODUCIBLE_DIR + "resources/predictors/"



Further preprocessing for xgboost models.

In [5]:
def prepare_training_data_for_one_disease(df, disease):
    '''
    Tags a single disease in the data for classification, labeling each either 1 or 0.
    
    Originally been built with oversampling for feature balancing, but executing portions have this hardcoded to off. As such,
    this feature is removed in favor of retaining simpler and more legible code.

    df: data frame o fpreprocessed features for all diseases
    disease: disease category being predicted; originally named DISEASE7s for unstated reasons--legacy naming from early draft?
    '''
    
    # number of rows corresponding to the disease being predicted
    dis_size = len(df[df['disease']==disease])
    sample_size = dis_size
    
    # denote target disease (positive class)
    df_dis = df[df['disease'] == disease]
    df_dis = df_dis.sample(n=sample_size, random_state=7).reset_index()
    df_dis['disease'] = 1
    
    # denote all other diseases (negative class)
    df_others = df[df['disease'] != disease]
    df_others = df_others.sample(n=sample_size, random_state=7).reset_index()
    df_others['disease'] = 0
    
    # concat negative and positive records back together
    df_sample = pd.concat([df_dis, df_others])
    df_sample = df_sample.drop(columns=['index'])
    
    # format and return
    training = df_sample.copy()
    training = training.reset_index(drop=True)
    return training

In [27]:
def XGBoost_cross_validate(training, disease_number_labels):
    '''
    5-fold CV binary classification with xgboost, trained with 1k trees and max tree depth of 4.
    Configured to use 11 parallel cores via n_jobs in the model fit call.
    The evaluation metric is overriden to "error" to match defautl at time of MedRed publishing.
    '''
    training_labels = training["disease"].astype(int)
    training_features = pd.DataFrame(training["features"].tolist())
    
    AUC_results = []
    f1_results = []
    results = []

    # set up folds
    kf = StratifiedKFold(n_splits=5, random_state=7, shuffle=True)
    # train across folds
    for train_index, test_index in kf.split(training_features,training_labels):
        # split out data
        #   training set
        X_train = training_features.loc[train_index]
        y_train = training_labels.loc[train_index]
        #   test set
        X_test = training_features.loc[test_index]
        y_test = training_labels.loc[test_index]

        # fit model
        model = XGBClassifier(n_estimators=50, n_jobs=11, max_depth=4, eval_metric="error")  # 1000 200
        model.fit(X_train, y_train.values.ravel())
        
        # get test set predictions to use for metrics
        predictions = model.predict(X_test)
        
        # check performance
        results.append(precision_recall_fscore_support(y_test, predictions))
        f1_results.append(f1_score(y_true=y_test, y_pred=predictions, average='weighted'))
        AUC_results.append(metrics.roc_auc_score(y_test, predictions))

    # get test set summary stats across folds
    f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
    AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
    
    return f1_results_avg, AUC_results_avg, results, model

def eval_functions(f1_results_avg, AUC_results_avg, results):
    '''
    Aggregate perfomance metrics across folds.
    '''
    results_avg = pd.np.mean(results, axis=0)
    results_std = pd.np.std(results, axis=0)
    P_res = np.array([np.mean(results_avg[0]), np.mean(results_std[0])])
    R_res = np.array([np.mean(results_avg[1]), np.mean(results_std[1])])
    support_res = np.array([np.mean(results_avg[3]), np.mean(results_std[3])])
    print("F1 average score ", f1_results_avg, np.mean(results_avg[2]))
    print("AUC average score ", AUC_results_avg)
    print("P average score ", P_res)
    print("R average score ", R_res)
    print("Support per class ", support_res)
    
    return {'F1':f1_results_avg, 'AUC': AUC_results_avg, 'support': support_res,
           'P': P_res, 'R': R_res}

Define pipeline from preprocessed features to modeling.

The diseases modeled may controlled by adjusting `all_sr` here.

In [37]:
def predict_with_certainty(certainty, features_file=FEATURES_IN, results_file=FEATURES_OUT):

    # define diseases for which to model
    all_sr = ['bpd', 'cfs','crohnsdisease', 'dementia',  'depression',\
            'diabetes', 'dysautonomia', 'gastroparesis','hypothyroidism', 'ibs', \
            'interstitialcystitis', 'kidneystones', 'menieres', 'multiplesclerosis',\
            'parkinsons', 'psoriasis']
    # all_sr = ['sleepapnea']
    # all_sr = ['rheumatoid']
    disease_values_dict = {el:i for i, el in enumerate(all_sr)}
    disease_names = list(disease_values_dict.keys())
    disease_labels = list(disease_values_dict.values())

    # load features with desired certainty cutoff
    #   get input file name
    features_file = features_file.replace(".pckl", "_{:.2f}.pckl".format(certainty))
    results_file = results_file.replace(".csv", "_{:.2f}.csv".format(certainty))
    #   read in
    features = pd.read_pickle(features_file)

    # minimal final preprocessing
    features.rename(columns={'vec':'features'}, inplace=True)
    features = features.drop(columns=['subreddit', 'entities'])
    disease = features['disease']
    print ("Post count per disease is", {d:n for d, n in zip(all_sr, features.groupby('disease').size())})

    # get the classes sizes for the smallest type of disease in each model (before 1/0 encoding)
    #   can potentially help inform over/under sampling decisions
    # min_class_size = min(features.groupby('disease').size())
    # print("Class sizes", min_class_size)

    print('Distribution before imbalancing: {}'.format(Counter(disease)))

    all_res = defaultdict(int) # for metrics
    # for each disease, train a model
    for disease in disease_labels:
        print(disease)
        # labels to predict
        disease_number_labels = [0, 1]
        # prep the data
        balanced_features = prepare_training_data_for_one_disease(features, disease)
        # run the classifier with CV
        f1_results_avg, AUC_results_avg, results, model = \
            XGBoost_cross_validate(balanced_features, disease_number_labels)
        # show performance metrics
        print("RESULTS for ~~~~~~~~~~~~~~~~ ", disease_names[disease], str(disease), "~~~~~~~~~~~~~~~~~")
        res = eval_functions(f1_results_avg, AUC_results_avg, results)
        all_res[disease_names[disease]] =  res
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        # save model
        pickle.dump(model, open(MODEL_OUT_DIR + disease_names[disease] + "_" + EMBEDDING_TYPE + "_" + str(certainty) + ".pickle.dat", "wb"))

    df_res = pd.DataFrame(all_res)
    df_res.to_csv(results_file)

Run the models. Note that the bug preventing appropriate tagging for rheumatoid arthritis will cause that instance to fail if configured. As-is, it was simply excluded. Additionally, the sleep apnea data tends to have issues with stratification which require it to be run separately.

In [38]:
predict_with_certainty(0.90, features_file=FEATURES_IN, results_file=FEATURES_OUT)

Post count per disease is {'bpd': 14138, 'cfs': 5135, 'crohnsdisease': 13106, 'dementia': 764, 'depression': 54731, 'diabetes': 20168, 'dysautonomia': 916, 'gastroparesis': 375, 'hypothyroidism': 4163, 'ibs': 8522, 'interstitialcystitis': 1002, 'kidneystones': 729, 'menieres': 375, 'multiplesclerosis': 6353, 'parkinsons': 328, 'psoriasis': 2916}
Distribution before imbalancing: Counter({4: 54731, 5: 20168, 0: 14138, 2: 13106, 9: 8522, 13: 6353, 1: 5135, 8: 4163, 15: 2916, 17: 2759, 10: 1002, 6: 916, 3: 764, 11: 729, 12: 375, 7: 375, 14: 328})
0


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  bpd 0 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8393069710079978, 0.004061015075943679] 0.8393067628769069
AUC average score  [0.8395448591551762, 0.0039946285494548575]
P average score  [0.84155922 0.00734913]
R average score  [0.83954486 0.01028902]
Support per class  [2.82760000e+03 4.89897949e-01]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  cfs 1 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8246169352810405, 0.009869310106071805] 0.8246169352810405
AUC average score  [0.824732229795521, 0.009790505916455255]
P average score  [0.82554573 0.01430545]
R average score  [0.82473223 0.01966983]
Support per class  [1027.    0.]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  crohnsdisease 2 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8524765960024148, 0.0054792936861052036] 0.8524766296196626
AUC average score  [0.8525867174447075, 0.0054695053255906225]
P average score  [0.85363753 0.00568401]
R average score  [0.85258672 0.00593503]
Support per class  [2.6212e+03 4.0000e-01]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  dementia 3 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8509351247457717, 0.01577997923860379] 0.850931480639864
AUC average score  [0.8514232886136911, 0.015794737688363685]
P average score  [0.85636324 0.01883357]
R average score  [0.85142329 0.02062364]
Support per class  [152.8   0.4]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  depression 4 ~~~~~~~~~~~~~~~~~
F1 average score  [0.88471540147948, 0.0026520816772864888] 0.8847153984950529
AUC average score  [0.8847361940947271, 0.0026606574926161222]
P average score  [0.88501818 0.00332891]
R average score  [0.88473619 0.00382738]
Support per class  [1.09462e+04 4.00000e-01]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  diabetes 5 ~~~~~~~~~~~~~~~~~
F1 average score  [0.9082366991698972, 0.006670744937841757] 0.9082366508199486
AUC average score  [0.9082699914598956, 0.0066522218233354205]
P average score  [0.90884226 0.00686091]
R average score  [0.90826999 0.00748905]
Support per class  [4.03360000e+03 4.89897949e-01]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  dysautonomia 6 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8481446996435535, 0.012932714642949624] 0.8481430809904275
AUC average score  [0.8482478023283442, 0.012855914393307374]
P average score  [0.84920132 0.01897883]
R average score  [0.8482478  0.02594593]
Support per class  [183.2   0.4]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
7


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  gastroparesis 7 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8194675537181506, 0.03249089666883932] 0.8194675537181506
AUC average score  [0.8200000000000001, 0.03238655413730964]
P average score  [0.82382312 0.03632669]
R average score  [0.82       0.03933605]
Support per class  [75.  0.]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  hypothyroidism 8 ~~~~~~~~~~~~~~~~~
F1 average score  [0.861201127124081, 0.009250839694134417] 0.86120150735637
AUC average score  [0.8614008103241295, 0.009101914700841787]
P average score  [0.86336055 0.01039153]
R average score  [0.86140081 0.01392346]
Support per class  [8.32600000e+02 4.89897949e-01]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  ibs 9 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8589297573032896, 0.008774170309507863] 0.8589300864186848
AUC average score  [0.8589553302218, 0.008744427923453734]
P average score  [0.85917054 0.00886923]
R average score  [0.85895533 0.01064515]
Support per class  [1.70440000e+03 4.89897949e-01]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  interstitialcystitis 10 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8531252216126285, 0.017220794909675534] 0.8531319266070355
AUC average score  [0.8533333333333333, 0.017027038834861313]
P average score  [0.85470504 0.02608595]
R average score  [0.85333333 0.03497557]
Support per class  [200.4          0.48989795]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  kidneystones 11 ~~~~~~~~~~~~~~~~~
F1 average score  [0.9100921552073673, 0.011012489552247309] 0.9100915862106382
AUC average score  [0.9101417099669344, 0.011003571845572341]
P average score  [0.91101198 0.02073988]
R average score  [0.91014171 0.0245026 ]
Support per class  [145.8   0.4]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
12


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  menieres 12 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8717773788589996, 0.025401380533616808] 0.8717773788589996
AUC average score  [0.8719999999999999, 0.02543837870445183]
P average score  [0.87481337 0.03795589]
R average score  [0.872      0.04444937]
Support per class  [75.  0.]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  multiplesclerosis 13 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8627022214101195, 0.006182726708452204] 0.8627019280488206
AUC average score  [0.8627404796273007, 0.006166453824929683]
P average score  [0.86315335 0.00910952]
R average score  [0.86274048 0.0114573 ]
Support per class  [1.27060000e+03 4.89897949e-01]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
14


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  parkinsons 14 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8139134941462671, 0.03446572015638429] 0.8138987146775358
AUC average score  [0.8140326340326339, 0.03434505510771586]
P average score  [0.81493674 0.0361381 ]
R average score  [0.81403263 0.04201988]
Support per class  [65.6         0.48989795]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
15


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  psoriasis 15 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8969327272764808, 0.010127733252645764] 0.8969324217207393
AUC average score  [0.8969454169505863, 0.01012506869529715]
P average score  [0.89717231 0.01242908]
R average score  [0.89694542 0.01372996]
Support per class  [5.832e+02 4.000e-01]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


Now the sleep apnea model.

In [39]:
def predict_with_certainty(certainty, features_file=FEATURES_IN, results_file=FEATURES_OUT):

    # define diseases for which to model
    # all_sr = ['bpd', 'cfs','crohnsdisease', 'dementia',  'depression',\
    #         'diabetes', 'dysautonomia', 'gastroparesis','hypothyroidism', 'ibs', \
    #         'interstitialcystitis', 'kidneystones', 'menieres', 'multiplesclerosis',\
    #         'parkinsons', 'psoriasis']
    all_sr = ['sleepapnea']
    # all_sr = ['rheumatoid']
    disease_values_dict = {el:i for i, el in enumerate(all_sr)}
    disease_names = list(disease_values_dict.keys())
    disease_labels = list(disease_values_dict.values())

    # load features with desired certainty cutoff
    #   get input file name
    features_file = features_file.replace(".pckl", "_{:.2f}.pckl".format(certainty))
    results_file = results_file.replace(".csv", "_{:.2f}.csv".format(certainty))
    #   read in
    features = pd.read_pickle(features_file)

    # minimal final preprocessing
    features.rename(columns={'vec':'features'}, inplace=True)
    features = features.drop(columns=['subreddit', 'entities'])
    disease = features['disease']
    print ("Post count per disease is", {d:n for d, n in zip(all_sr, features.groupby('disease').size())})

    # get the classes sizes for the smallest type of disease in each model (before 1/0 encoding)
    #   can potentially help inform over/under sampling decisions
    # min_class_size = min(features.groupby('disease').size())
    # print("Class sizes", min_class_size)

    print('Distribution before imbalancing: {}'.format(Counter(disease)))

    all_res = defaultdict(int) # for metrics
    # for each disease, train a model
    for disease in disease_labels:
        print(disease)
        # labels to predict
        disease_number_labels = [0, 1]
        # prep the data
        balanced_features = prepare_training_data_for_one_disease(features, disease)
        # run the classifier with CV
        f1_results_avg, AUC_results_avg, results, model = \
            XGBoost_cross_validate(balanced_features, disease_number_labels)
        # show performance metrics
        print("RESULTS for ~~~~~~~~~~~~~~~~ ", disease_names[disease], str(disease), "~~~~~~~~~~~~~~~~~")
        res = eval_functions(f1_results_avg, AUC_results_avg, results)
        all_res[disease_names[disease]] =  res
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        # save model
        pickle.dump(model, open(MODEL_OUT_DIR + disease_names[disease] + "_" + EMBEDDING_TYPE + "_" + str(certainty) + ".pickle.dat", "wb"))

    df_res = pd.DataFrame(all_res)
    df_res.to_csv(results_file)

predict_with_certainty(0.90, features_file=FEATURES_IN, results_file=FEATURES_OUT.replace(".csv", "2.csv"))


Post count per disease is {'sleepapnea': 14138}
Distribution before imbalancing: Counter({4: 54731, 5: 20168, 0: 14138, 2: 13106, 9: 8522, 13: 6353, 1: 5135, 8: 4163, 15: 2916, 17: 2759, 10: 1002, 6: 916, 3: 764, 11: 729, 12: 375, 7: 375, 14: 328})
0


  f1_results_avg = [pd.np.mean(f1_results), pd.np.std(f1_results)]
  AUC_results_avg = [pd.np.mean(AUC_results), pd.np.std(AUC_results)]
  results_avg = pd.np.mean(results, axis=0)
  results_std = pd.np.std(results, axis=0)


RESULTS for ~~~~~~~~~~~~~~~~  sleepapnea 0 ~~~~~~~~~~~~~~~~~
F1 average score  [0.8393069710079978, 0.004061015075943679] 0.8393067628769069
AUC average score  [0.8395448591551762, 0.0039946285494548575]
P average score  [0.84155922 0.00734913]
R average score  [0.83954486 0.01028902]
Support per class  [2.82760000e+03 4.89897949e-01]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


Finally, evaluate the correlation between post count and F1 score, and the number of drugs and F1 score--per condition.

In [67]:
# load features with desired certainty cutoff
#   read in
features = pd.read_pickle(FEATURES_IN.replace(".pckl", "_{:.2f}.pckl".format(0.9)))
# define diseases
all_sr = ['bpd', 'cfs','crohnsdisease', 'dementia',  'depression',\
            'diabetes', 'dysautonomia', 'gastroparesis','hypothyroidism', 'ibs', \
            'interstitialcystitis', 'kidneystones', 'menieres', 'multiplesclerosis',\
            'parkinsons', 'psoriasis', 'sleepapnea']
# minimal final preprocessing, as before
features.rename(columns={'vec':'features'}, inplace=True)
features = features.drop(columns=['subreddit', 'entities'])
disease = features['disease']
# get post counts per disease
post_count = pd.DataFrame([(d, n) for d, n in zip(all_sr, features.groupby('disease').size())], columns=['disease', 'n_posts'])
post_count = post_count.set_index('disease')
# get F1 scores
f1 = pd.read_csv(FEATURES_OUT.replace(".csv", "_{:.2f}.csv".format(0.9)))
post_count = post_count.join(f1.iloc[0])
post_count.columns = ["n_posts", "F1"]
# add sleep apnea, since run separately
f1_sleepapnea = pd.read_csv(FEATURES_OUT.replace(".csv", "2_{:.2f}.csv".format(0.9))).iloc[0, 1]
post_count.loc["sleepapnea", "F1"] = f1_sleepapnea
# extract mean F1 score for each disease
post_count["F1"] = [ast.literal_eval(i)[0] for i in post_count["F1"]]
# get correlation
print(post_count)
print(np.corrcoef(post_count.n_posts, post_count.F1))

Unnamed: 0_level_0,n_posts,F1
disease,Unnamed: 1_level_1,Unnamed: 2_level_1
bpd,14138,0.839307
cfs,5135,0.824617
crohnsdisease,13106,0.852477
dementia,764,0.850935
depression,54731,0.884715
diabetes,20168,0.908237
dysautonomia,916,0.848145
gastroparesis,375,0.819468
hypothyroidism,4163,0.861201
ibs,8522,0.85893
