In [None]:
#libraries
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np
import json
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import os
from torch.utils.data import Dataset, DataLoader

In [None]:
class DIACWOZDataGenerator(Dataset):
    """Batched dataset over DAIC-WOZ and extended DAIC-WOZ interview transcripts.

    Each item of the dataset is one *batch*: a tuple ``(batch_x, batch_y)``
    where ``batch_x`` is a list of sample dicts
    (``{'conversation': str, 'label': int}``) and ``batch_y`` is a list of the
    matching text labels (``'depressed'`` / ``'not depressed'``).

    Args:
        data_dir: Directory containing the ``transcript_DAIC_WOZ`` and
            ``transcript_ext_DAIC_WOZ`` folders plus the train/dev split CSVs.
        output_format: Stored on ``self.format``; not otherwise used by this class.
        batch_size: Number of samples per batch (must be >= 1).
        shuffle: Whether the sample order is shuffled the first time a batch
            is requested.
        split: ``"train"`` or ``"dev"``.
        seed: Optional integer seed making the shuffle reproducible.
        prompt_maker: Stored on ``self.prompt_maker``.
            NOTE(review): it is never applied in ``_get_prompt`` -- confirm
            the intended call signature before wiring it in.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``split`` is unknown.
    """

    def __init__(self,
                 data_dir,
                 output_format='text',
                 batch_size=1,
                 shuffle=True,
                 split="train",
                 seed=None,
                 prompt_maker=None,
                 ):
        if batch_size < 1:
            # __len__ divides by batch_size; fail early with a clear message.
            raise ValueError("batch_size must be >= 1, got {!r}".format(batch_size))

        self.diac_woz_convo_path = os.path.join(data_dir, "transcript_DAIC_WOZ")
        self.diac_woz_ext_convo_path = os.path.join(data_dir, "transcript_ext_DAIC_WOZ")
        self.diac_woz_dev_path = os.path.join(data_dir, "dev_split_Depression_AVEC2017.csv")
        self.diac_woz_ext_dev_path = os.path.join(data_dir, "dev_split.csv")
        self.diac_woz_train_path = os.path.join(data_dir, "train_split_Depression_AVEC2017.csv")
        self.diac_woz_ext_train_path = os.path.join(data_dir, "train_split.csv")

        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seed = seed
        self.split = split
        self.total_batches_seen = 0
        self.index_array = None  # built lazily on the first __getitem__ call
        self.format = output_format
        self.prompt_maker = prompt_maker
        self.samples = self.read_samples(
            split,
            self.diac_woz_convo_path, self.diac_woz_ext_convo_path,
            self.diac_woz_dev_path, self.diac_woz_ext_dev_path,
            self.diac_woz_train_path, self.diac_woz_ext_train_path,
        )
        # Evenly spaced floats in [0, 1], one per sample (kept for
        # compatibility; not used elsewhere in this class).
        self.sample_ids = np.linspace(0, 1, len(self.samples))

    def concatenate_dataframe(self, df):
        """Merge consecutive turns by the same speaker and drop the opening turns.

        Args:
            df: DataFrame with ``'speaker'`` and string ``'value'`` columns.

        Returns:
            DataFrame with columns ``['speaker', 'value']`` in which adjacent
            rows from the same speaker are joined with a single space. The
            first two merged turns are dropped when the transcript opens with
            ``'Ellie'``, otherwise the first three (presumably to skip the
            interview preamble -- confirm). An empty input yields an empty frame.
        """
        # Collect merged turns in a plain list and build the frame once;
        # growing a DataFrame with pd.concat inside the loop is quadratic.
        merged = []
        prev_speaker = None
        for speaker, value in zip(df['speaker'], df['value']):
            if merged and prev_speaker == speaker:
                merged[-1]['value'] += ' ' + value
            else:
                merged.append({'speaker': speaker, 'value': value})
                prev_speaker = speaker

        result_df = pd.DataFrame(merged, columns=['speaker', 'value'])
        if result_df.empty:
            return result_df

        skip = 2 if result_df.iloc[0]['speaker'] == 'Ellie' else 3
        return result_df[skip:]

    def get_records_and_labels(self, diac_woz_convo_path, diac_woz_ext_convo_path, daic_woz_split_path, daic_woz_ext_split_path):
        """Load every transcript listed in the two split files together with its label.

        Args:
            diac_woz_convo_path: Folder with ``<id>_TRANSCRIPT.csv`` files
                (tab-delimited, ``speaker`` / ``value`` columns).
            diac_woz_ext_convo_path: Folder with ``<id>_Transcript.csv`` files
                (comma-delimited, ``Text`` column).
            daic_woz_split_path: Split CSV with ``Participant_ID`` and
                ``PHQ8_Binary`` columns.
            daic_woz_ext_split_path: Split CSV with ``Participant_ID`` and
                ``PHQ_Binary`` columns.

        Returns:
            ``(x, y)``: a list of conversations (turns joined with ``". "``)
            and the parallel list of labels, original split first and the
            extended split after it.
        """
        x = []
        y = []

        dc = pd.read_csv(daic_woz_split_path)
        dc_ex = pd.read_csv(daic_woz_ext_split_path)

        # Walk ids and labels together instead of re-filtering the split
        # frame for every participant.
        for pid, label in zip(dc["Participant_ID"].values, dc["PHQ8_Binary"].values):
            file = os.path.join(diac_woz_convo_path, str(pid) + "_TRANSCRIPT.csv")
            df_ = pd.read_csv(file, delimiter="\t")
            df_['value'] = df_['value'].astype(str)  # missing utterances become 'nan'
            df = self.concatenate_dataframe(df_)
            x.append(". ".join(map(str, df['value'].values.tolist())))
            y.append(label)

        for pid, label in zip(dc_ex["Participant_ID"].values, dc_ex["PHQ_Binary"].values):
            file = os.path.join(diac_woz_ext_convo_path, str(pid) + "_Transcript.csv")
            df_ex = pd.read_csv(file, delimiter=",")
            x.append(". ".join(map(str, df_ex['Text'].values.tolist())))
            y.append(label)

        return x, y

    def read_samples(self, split, diac_woz_convo_path, diac_woz_ext_convo_path,
               diac_woz_dev_path, diac_woz_ext_dev_path,
               diac_woz_train_path, diac_woz_ext_train_path):
        """Build the list of sample dicts for the requested split.

        Args:
            split: ``"train"`` or ``"dev"``.
            diac_woz_convo_path / diac_woz_ext_convo_path: Transcript folders.
            diac_woz_dev_path / diac_woz_ext_dev_path: Dev split CSVs.
            diac_woz_train_path / diac_woz_ext_train_path: Train split CSVs.

        Returns:
            List of ``{'conversation': str, 'label': int}`` dicts.

        Raises:
            ValueError: If ``split`` is neither ``"train"`` nor ``"dev"``.
        """
        if split == "train":
            split_path, ext_split_path = diac_woz_train_path, diac_woz_ext_train_path
        elif split == "dev":
            split_path, ext_split_path = diac_woz_dev_path, diac_woz_ext_dev_path
        else:
            # Previously fell through and failed later on an unbound local.
            raise ValueError("split must be 'train' or 'dev', got {!r}".format(split))

        Xs, ys = self.get_records_and_labels(
            diac_woz_convo_path, diac_woz_ext_convo_path, split_path, ext_split_path
        )
        return [{'conversation': x, 'label': int(y)} for x, y in zip(Xs, ys)]

    def set_index_array(self):
        """(Re)build the sample order, shuffling it when ``self.shuffle`` is set.

        The array indexes *samples* (not batches) so every sample is reachable
        for any ``batch_size``. When ``self.seed`` is set, a private
        ``RandomState`` seeded with ``seed + total_batches_seen`` is used, so
        the global NumPy RNG is left untouched.
        """
        self.index_array = np.arange(len(self.samples))
        if self.shuffle:
            if self.seed is not None:
                rng = np.random.RandomState(self.seed + self.total_batches_seen)
                rng.shuffle(self.index_array)
            else:
                np.random.shuffle(self.index_array)

    def __len__(self):
        """Return the number of batches (ceiling of samples / batch_size)."""
        return (len(self.samples) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(batch_x, batch_y)``.

        Raises:
            IndexError: If ``idx`` is negative or not smaller than ``len(self)``.
        """
        if idx < 0 or idx >= len(self):
            raise IndexError

        # Build the order before bumping the counter so the first shuffle is
        # seeded with exactly ``self.seed``.
        if self.index_array is None:
            self.set_index_array()
        self.total_batches_seen += 1

        start = self.batch_size * int(idx)
        index_array = self.index_array[start: start + self.batch_size]

        return self._get_batch(index_array)

    def _get_batch(self, index_array):
        """Return ``(batch_x, batch_y)`` for the given sample indices."""
        batch_x, batch_y = self._get_sample_batch(index_array)
        return batch_x, batch_y

    def _get_sample_batch(self, index_array):
        """Collect prompts and text labels for the given sample indices into two lists."""
        batch_x, batch_y = [], []
        for x, y in self._get_sample_pair(index_array):
            batch_x.append(x)
            batch_y.append(y)
        return batch_x, batch_y

    def _get_sample_pair(self, index):
        """Yield ``(prompt, text_label)`` for each sample index in ``index``."""
        for i in index:
            sample = self.samples[i]
            prompt = self._get_prompt(sample)
            label = 'depressed' if sample['label'] == 1 else 'not depressed'
            yield prompt, label

    def _get_prompt(self, sample):
        """Return the prompt for ``sample``; currently the sample dict itself.

        NOTE(review): ``self.prompt_maker`` is not consulted here.
        """
        prompt = sample
        return prompt



In [None]:
# Root folder containing the transcript directories and the train/dev split CSVs.
data_path = "/content/drive/MyDrive/Hacklytics/dataset/data"

# Fixed seed so the shuffled sample order (and with it every indexed look-up
# and the row order of the exported CSVs) is reproducible across kernel restarts.
SEED = 42

train = DIACWOZDataGenerator(data_path, split="train", seed=SEED)
dev = DIACWOZDataGenerator(data_path, split="dev", seed=SEED)

In [None]:
# Number of batches in the training split; with the default batch_size=1 this equals the number of samples.
len(train)

270

In [None]:
# Number of batches in the dev split; with the default batch_size=1 this equals the number of samples.
len(dev)

91

In [None]:
# Peek at one transcript: batch 232 -> batch_x -> first sample dict -> conversation text.
# The index is a position in the (shuffled) sample order, not a participant id.
train[232][0][0]["conversation"]

"okay. I'm pleased.  are you okay with this yes.  pretty good.  I grew up in Boston.  about 30 years ago.  yep.  I'm too often maybe once a year sometimes less.  oh I love living in that way I don't ever want to live anywhere else.  well I came out here to go to college and I wanted to be far away from my parents and the weather was good and.  that's part of it.  a very easy well maybe somebody's at the beginning the first few weeks for it's really hard to just be in a new environment and to be away from everybody I knew but after a few weeks it was great.  broadcast journalism.  well it's something I'd wanted to do since I was 12 and everything I did from the time I was twelve until I went to college with sort of in pursuit of that dream.  are you still working in that nope.  well it's kind of a long story but I got laid off from my first job when I was in my 20s and it just once I'd done that work I realized that's not really what I wanted to do is to cut throat of a business and it 

In [None]:
# Integer label stored on the first sample dict of batch 0.
train[0][0][0]["label"]

0

In [None]:
import csv
import json

def create_csv(filename, data):
    """Write chat-style records to ``filename``, one JSON document per CSV row.

    Every sample of every batch is exported (not just the first sample of
    each batch), so the output is complete for any batch size. Empty batches
    are skipped.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        data: Iterable of batches. ``batch[0]`` must be a list of sample
            dicts with ``"conversation"`` (str) and ``"label"`` (int) keys,
            as produced by ``DIACWOZDataGenerator``.

    Each row has a single cell holding
    ``{"messages": [system, user, assistant]}`` serialised with ``json.dumps``.
    """
    # NOTE(review): the system prompt "de " looks truncated -- confirm the
    # intended text. Kept unchanged so existing exports stay identical.
    system_message = {"role": "system", "content": "de "}

    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for batch in data:
            for sample in batch[0]:
                if sample["label"] == 0:
                    verdict = "The person is NOT-DEPRESSED"
                else:
                    verdict = "The person is DEPRESSED"
                messages = {"messages": [
                    system_message,
                    {"role": "user", "content": sample["conversation"]},
                    {"role": "assistant", "content": verdict},
                ]}
                writer.writerow([json.dumps(messages)])


In [None]:
# Derive the output location from data_path so the dataset directory is
# defined in exactly one place instead of being repeated as a literal.
csv_filename = os.path.join(data_path, 'train_data.csv')

# Create the CSV file
create_csv(csv_filename, train)

In [None]:
# Derive the output location from data_path so the dataset directory is
# defined in exactly one place instead of being repeated as a literal.
csv_filename = os.path.join(data_path, 'dev_data.csv')

# Create the CSV file
create_csv(csv_filename, dev)