In [3]:
#Author: ML Tlachac, WPI
#For DepreST-CAT, 2021
#Modified this notebook to allow replacing ground truth values with LLM simulated data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import preprocessing
from scipy import stats
import collections
import operator
import argparse
import random
import pickle
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVR
from sklearn.metrics import recall_score
from sklearn import metrics
from statistics import mean
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn import utils
from sklearn.datasets import load_digits
from sklearn import svm
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import random
from sklearn.svm import SVR
from sklearn.decomposition import PCA, KernelPCA, NMF

In [4]:
# Path prefix that the replace_* helper functions prepend to their
# "outputs/..." pickle file names.
# NOTE(review): the experiment cell further down reassigns drive_base to
# "../../". The helpers read this global when they are called, not when they
# are defined, so the later value is the one in effect once that cell has run
# -- confirm which location the GPT output files actually live in.
drive_base = "../"

In [12]:
# Function to completely replace real scores with GPT simulated
def replace_gad_phq_df(df, replace_ids, split):
    """Overwrite the binary gad7/phq9 labels of selected rows with GPT-derived ones.

    Two pickled dictionaries are read from ``drive_base + "outputs/..."``. For
    every id in ``replace_ids`` the values stored under that id are summed and
    the rows of ``df`` whose ``id`` column equals it get ``1`` when the
    integer-cast sum is at least ``split`` and ``0`` otherwise.

    Args:
        df: DataFrame with ``id``, ``gad7`` and ``phq9`` columns. Modified in place.
        replace_ids: iterable of ids whose labels are replaced.
        split: threshold a summed score must reach to be labelled ``1``.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        KeyError: if an id in ``replace_ids`` is missing from either dictionary.
    """
    print("replacing orginal scores with GPT GAD and PHQ scores")

    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    simulated_scores = {}
    for column, file_name in (("gad7", "gpt-4-gad-given-phq"),
                              ("phq9", "gpt-4-phq-given-gad")):
        with open(drive_base + "outputs/" + file_name, 'rb') as handle:
            simulated_scores[column] = pickle.load(handle)

    print("replacing in", len(replace_ids))
    for participant_id in replace_ids:
        gad_total = sum(simulated_scores["gad7"][participant_id])
        phq_total = sum(simulated_scores["phq9"][participant_id])

        # The same rows receive both labels, so the mask is built once per id.
        row_mask = df['id'] == participant_id
        df.loc[row_mask, 'gad7'] = 1 if int(gad_total) >= split else 0
        df.loc[row_mask, 'phq9'] = 1 if int(phq_total) >= split else 0
    return df



In [13]:
# Function to replace real scores with GPT generated or SVR generated scores
def replace_gad_phq_SVR_or_GPT(df, original_df, replace_ids, split, gpt=False):
    """Replace the binary labels of the last 30% of ``replace_ids`` with simulated ones.

    ``replace_ids`` is cut at 70%: the first part (``model_ids``) is only used
    to fit the SVR models, the remainder (``predict_ids``) has its ``gad7`` and
    ``phq9`` labels in ``df`` overwritten.

    * ``gpt=True``: the new scores are the sums of the values stored per id in
      the two pickled GPT dictionaries under ``drive_base + "outputs/"``.
    * ``gpt=False``: two RBF support-vector regressors are fitted on the
      ``gad7``/``phq9`` values that ``original_df`` holds for ``model_ids``.
      One predicts phq9 from gad7, the other gad7 from phq9; each replaced
      score is the rounded prediction made from the *other* questionnaire's
      value in ``original_df``.

    A new score is turned into the label ``1`` when ``int(score) >= split``
    and ``0`` otherwise.

    Args:
        df: DataFrame with ``id``, ``gad7``, ``phq9`` columns. Modified in place.
        original_df: DataFrame with the same columns, read for the regression
            inputs (presumably the un-binarised scores -- confirm with caller).
        replace_ids: sliceable sequence of ids (e.g. a list).
        split: threshold for the positive label.
        gpt: choose GPT scores (True) or SVR predictions (False).

    Returns:
        The same ``df`` object that was passed in.
    """
    data_split = 0.7
    cut = int(data_split * len(replace_ids))
    model_ids = replace_ids[:cut]
    predict_ids = replace_ids[cut:]

    if gpt:
        # ":.0f" avoids printing float noise such as 30.000000000000004.
        print(f"replacing original scores with GPT GAD and PHQ scores using GPT for {(1-data_split) * 100:.0f}% data")
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(drive_base + "outputs/gpt-4-gad-given-phq", 'rb') as file:
            chat_gpt_scores_dict_gad = pickle.load(file)

        with open(drive_base + "outputs/gpt-4-phq-given-gad", 'rb') as file:
            chat_gpt_scores_dict_phq = pickle.load(file)

    else:
        print(f"replacing original scores with GPT GAD and PHQ scores using SVR model {(1-data_split) * 100:.0f}% data")
        model_data = original_df[original_df['id'].isin(model_ids)]
        gad_values = np.array(model_data['gad7'])
        phq_values = np.array(model_data['phq9'])

        y_svr_model = SVR(kernel="rbf", C=10, gamma=0.1, epsilon=.1)  # gad7 -> phq9
        x_svr_model = SVR(kernel="rbf", C=10, gamma=0.1, epsilon=.1)  # phq9 -> gad7
        # Features have to be a 2-D column, targets a 1-D vector; passing a
        # column vector as target triggers a DataConversionWarning per fit.
        y_svr_model.fit(gad_values.reshape(-1, 1), phq_values)
        x_svr_model.fit(phq_values.reshape(-1, 1), gad_values)

    print("replacing in", len(predict_ids))
    for i in predict_ids:
        if gpt:
            new_gad_score = sum(chat_gpt_scores_dict_gad[i])
            new_phq_score = sum(chat_gpt_scores_dict_phq[i])
        else:
            # Look the id up in original_df itself. A mask built from ``df``
            # only selects the right rows while both frames share an index.
            source_rows = original_df.loc[original_df['id'] == i]
            orig_gad = source_rows['gad7'].iloc[0]
            orig_phq = source_rows['phq9'].iloc[0]
            new_gad_score = round(x_svr_model.predict([[orig_phq]])[0])
            new_phq_score = round(y_svr_model.predict([[orig_gad]])[0])

        target_rows = df['id'] == i
        df.loc[target_rows, 'gad7'] = 1 if int(new_gad_score) >= split else 0
        df.loc[target_rows, 'phq9'] = 1 if int(new_phq_score) >= split else 0

    return df

In [14]:
# Function to replace real scores with GPT generated or Linear Regression generated scores
def replace_gad_phq_linear_model_or_GPT(df, original_df, replace_ids, split, gpt=False):
    """Replace the binary labels of the last 30% of ``replace_ids`` with simulated ones.

    ``replace_ids`` is cut at 70%: the first part (``model_ids``) is only used
    to fit the line, the remainder (``predict_ids``) has its ``gad7`` and
    ``phq9`` labels in ``df`` overwritten.

    * ``gpt=True``: the new scores are the sums of the values stored per id in
      the two pickled GPT dictionaries under ``drive_base + "outputs/"``.
    * ``gpt=False``: a straight line ``phq9 = m * gad7 + b`` is fitted with
      ``np.polyfit`` on the values ``original_df`` holds for ``model_ids``.
      The new phq9 score is the line evaluated at the id's gad7 value; the new
      gad7 score is the inverse of the line evaluated at its phq9 value. Both
      are truncated toward zero by ``int()`` (not rounded).

    A new score is turned into the label ``1`` when ``int(score) >= split``
    and ``0`` otherwise.

    Args:
        df: DataFrame with ``id``, ``gad7``, ``phq9`` columns. Modified in place.
        original_df: DataFrame with the same columns, read for the fit and the
            per-id inputs (presumably the un-binarised scores -- confirm with caller).
        replace_ids: sliceable sequence of ids (e.g. a list).
        split: threshold for the positive label.
        gpt: choose GPT scores (True) or the linear model (False).

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        ValueError: in linear mode, if the fitted slope is exactly zero, so
            that gad7 cannot be recovered from phq9.
    """
    data_split = 0.7
    cut = int(data_split * len(replace_ids))
    model_ids = replace_ids[:cut]
    predict_ids = replace_ids[cut:]

    if gpt:
        # ":.0f" avoids printing float noise such as 30.000000000000004.
        print(f"replacing original scores with GPT GAD and PHQ scores using GPT for {(1-data_split) * 100:.0f}% data")
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(drive_base + "outputs/gpt-4-gad-given-phq", 'rb') as file:
            chat_gpt_scores_dict_gad = pickle.load(file)

        with open(drive_base + "outputs/gpt-4-phq-given-gad", 'rb') as file:
            chat_gpt_scores_dict_phq = pickle.load(file)

    else:
        print("replacing original scores with GPT GAD and PHQ scores using linear model")
        model_data = original_df[original_df['id'].isin(model_ids)]
        x_data = np.array(model_data['gad7'])
        y_data = np.array(model_data['phq9'])

        # First-degree fit: polyfit returns [slope, intercept].
        m, b = np.polyfit(x_data, y_data, 1)

    print("replacing in", len(predict_ids))
    for i in predict_ids:

        if gpt:
            new_gad_score = sum(chat_gpt_scores_dict_gad[i])
            new_phq_score = sum(chat_gpt_scores_dict_phq[i])
        else:
            if m == 0:
                # Dividing by a zero slope would give inf/nan and an opaque
                # error from int(); fail with an explicit message instead.
                raise ValueError("fitted slope is zero; cannot invert the linear model to estimate gad7 from phq9")
            # Look the id up in original_df itself. A mask built from ``df``
            # only selects the right rows while both frames share an index.
            source_rows = original_df.loc[original_df['id'] == i]
            orig_gad = source_rows['gad7'].iloc[0]
            orig_phq = source_rows['phq9'].iloc[0]
            new_gad_score = int((orig_phq - b)/m)
            new_phq_score = int(m*orig_gad + b)

        target_rows = df['id'] == i
        df.loc[target_rows, 'gad7'] = 1 if int(new_gad_score) >= split else 0
        df.loc[target_rows, 'phq9'] = 1 if int(new_phq_score) >= split else 0

    return df

In [None]:

# Experiment driver. For every (split threshold, week, label) combination this
# cell binarises the questionnaire scores, runs 100 seeded stratified 70/30
# train/test splits, replaces the *training* labels with GPT-derived ones,
# fits five classifiers on 1-4 PCA components of the min-max scaled features
# and writes one results CSV per combination.
labels = ["phq9", "gad7"]
modelTypelist = ["SVC", "kNN", "RF", "LR", "XG"]
splits = [5,6,7,8,9,10]
# NOTE(review): this rebinds the global drive_base that an earlier cell set to
# "../". The replace_* helpers read the global when called, so from here on
# they load their pickles from "../../outputs/..." -- confirm that is intended.
drive_base = "../../"
for split in splits:

    for week in [2,4,8,16]:
        data = pd.read_csv(drive_base + "DepreST-CAT-main/features/featureSet" + str(week) + "weeksDepreST-CAT.csv")
        original_data = data.copy(deep=True)
        print(data.shape)

        #binary labels: 1 when the integer-truncated score reaches the split threshold
        for score_column in ("phq9", "gad7"):
            data[score_column] = (data[score_column].astype(int) >= split).astype("int64")

        print(data.shape)

        for label in labels:

            #create lists to populate (one entry per fitted model)
            flist = []
            mlist = []
            llist = []
            featureList = []
            wlist = []
            slist = []
            f1List = []
            accuracyList = []
            truePosList = []
            trueNegList = []
            falsePosList = []
            falseNegList = []
            predictions = []
            rseed = []

            for r in range(0, 100):

                #train/test split, stratified on both labels jointly; r is the seed
                df_train, df_test = train_test_split(data, test_size=0.3, stratify=data[["phq9", "gad7"]], random_state = r)
                # creating data_copy to prevent any leakage
                data_copy = data.copy(deep=True)
                trainids = list(df_train["id"])
                testids = list(df_test["id"])
                # replacing scores only in train data; test rows keep their real labels
                data_copy = replace_gad_phq_df(data_copy, trainids, split)
                #for svr
                # data_copy = replace_gad_phq_SVR_or_GPT(data_copy, original_data, trainids, split, False)

                print(data.shape)
                testdata = data_copy[data_copy['id'].isin(testids)]
                print(testdata.shape)
                traindata = data_copy[data_copy['id'].isin(trainids)]
                print(traindata.shape)

                #limit to features
                # assumes the first column is the id and the last two columns
                # are the phq9/gad7 labels -- TODO confirm against the CSV layout
                testContent = testdata[testdata.columns[1:-2]]
                print(testContent.shape)
                trainContent = traindata[traindata.columns[1:-2]]
                print(trainContent.shape)

                #NEED TO SCALE BEFORE FEATURE SELECTION/REDUCTION
                # The scaler is fitted on the training rows only and then applied to the test rows.
                min_max_scaler = preprocessing.MinMaxScaler()
                np_scaled = min_max_scaler.fit_transform(trainContent)
                featureSubset = pd.DataFrame(np_scaled)
                np_scaled2 =  min_max_scaler.transform(testContent)
                testSubset = pd.DataFrame(np_scaled2)
                print(featureSubset.shape)
                print(testSubset.shape)

                target = list(traindata[label])

                featureDF = []
                testDFs = []

                # One PCA projection per component count (1..4), fitted on the training rows.
                nFeatureList = list(np.arange(1,5,1))
                for numberOfFeatures in nFeatureList:
                    pca = PCA(n_components=numberOfFeatures)
                    pca = pca.fit(featureSubset)
                    X_pca = pca.transform(featureSubset)
                    pcaDF = pd.DataFrame(X_pca)
                    pcaDF = pcaDF.assign(target = target)
                    featureDF.append(pcaDF)
                    testSubset2 = pca.transform(testSubset)
                    testDFs.append(pd.DataFrame(testSubset2))


                for f in range(0, len(featureDF)):
                    print(f)

                    train_frame = featureDF[f]
                    X_test = testDFs[f]

                    # upsampling
                    #Count 1s and 0s
                    ones = len(train_frame.loc[train_frame['target'] == 1])
                    zeros = len(train_frame.loc[train_frame['target'] == 0])
                    if ones >= zeros:
                        majority = 1
                        minority = 0
                    else:
                        majority = 0
                        minority = 1


                    # Upsample TrainingSet
                    train_majority = train_frame[train_frame.target==majority]
                    train_minority = train_frame[train_frame.target==minority]

                    # Upsample minority class
                    train_minority_upsampled = resample(train_minority,
                                                     replace=True,     # sample with replacement
                                                     n_samples=len(train_majority),    # to match majority class
                                                     random_state=42) # reproducible results

                    # Combine majority class with upsampled minority class
                    train_frame = pd.concat([train_majority, train_minority_upsampled])

                    #separate features and target
                    y_train = train_frame["target"]
                    X_train = train_frame.drop(columns = "target")

                    for modelType in modelTypelist:

                        #add data to lists
                        llist.append(label)
                        featureList.append(f +1)
                        flist.append("PCA")
                        mlist.append(modelType)

                        #choose model type
                        if modelType == "SVC":
                            clf = svm.SVC(random_state=r)
                        elif modelType == "RF":
                            clf = RandomForestClassifier(random_state=r)
                        elif modelType == "kNN":
                            clf = KNeighborsClassifier()
                        elif modelType == "LR":
                            clf = LogisticRegression(random_state=r)
                        elif modelType =="XG":
                            clf = xgb.XGBClassifier(random_state=r)
                        else:
                            # Without this, an unknown name would silently reuse the previous clf.
                            raise ValueError("unknown model type: " + str(modelType))

                        #train model and make predictions
                        clf.fit(X_train, y_train)
                        y_pred = clf.predict(X_test)

                        #evaluate model
                        # labels=[0, 1] forces a 2x2 matrix even when only one
                        # class occurs in the test labels and the predictions.
                        conf_mat = confusion_matrix(list(testdata[label]), y_pred, labels=[0, 1])
                        TN = conf_mat[0][0]
                        TP = conf_mat[1][1]
                        FP = conf_mat[0][1]
                        FN = conf_mat[1][0]
                        # An empty denominator (class never predicted / never present)
                        # yields 0.0 instead of NaN, matching what sklearn's
                        # f1_score returns in that situation.
                        precision = TP/(TP+FP) if (TP+FP) > 0 else 0.0
                        sensitivity = TP/(TP+FN) if (TP+FN) > 0 else 0.0
                        if (precision + sensitivity) > 0:
                            f1 = (2*precision*sensitivity)/(precision + sensitivity)
                        else:
                            f1 = 0.0
                        accuracy = (TP+TN)/(TN+TP+FP+FN)

                        #populate lists with results
                        f1List.append(f1)
                        accuracyList.append(accuracy)
                        truePosList.append(TP)
                        trueNegList.append(TN)
                        falsePosList.append(FP)
                        falseNegList.append(FN)
                        predictions.append(y_pred)
                        rseed.append(r)
                        slist.append(split)
                        wlist.append(week)

            # One row per (seed, component count, model) for this week/label/split.
            resultsDF = pd.DataFrame()
            resultsDF["week"] = wlist
            resultsDF["label"] = llist
            resultsDF["split"] = slist
            resultsDF["Engineering"] = flist
            resultsDF["model"] = mlist
            resultsDF["nFeatures"] = featureList
            resultsDF["F1"] = f1List
            resultsDF["Accuracy"] = accuracyList
            resultsDF["truePos"] = truePosList
            resultsDF["trueNeg"] = trueNegList
            resultsDF["falsePos"] = falsePosList
            resultsDF["falseNeg"] = falseNegList
            resultsDF["predictions"] = predictions
            resultsDF["randomSeed"] = rseed
            # foldername for output
            resultsDF.to_csv(drive_base + "DepreST-CAT-main/machineLearning/screeningResultsGPT/resultsCAT" + str(week) + "week" + label + "split" + str(split) + ".csv")