In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)

Mounted at /content/gdrive


In [2]:
from collections import defaultdict

def tokenize(text):
    # Simple word tokenization
    return text.split()

def find_common_subsequences(doc1,doc2,min_length=3):
    # Tokenize both documents
    words1 = tokenize(doc1)
    words2 = tokenize(doc2)

    # Function to get all subsequences of length 'min_length' or greater
    def get_subsequences(words,min_length):
        subsequences = defaultdict(int)
        # for length in range(min_length,len(words) + 1):
        for length in [min_length]:
            for i in range(len(words) - length + 1):
                subsequence = tuple(words[i:i + length])
                subsequences[subsequence] += 1
        return subsequences

    # Get all subsequences for both documents
    subsequences1 = get_subsequences(words1,min_length)
    subsequences2 = get_subsequences(words2,min_length)

    # Find the common subsequences
    common_subsequences = set(subsequences1.keys()) & set(subsequences2.keys())

    return common_subsequences



In [4]:
import openpyxl

ADs = []
CNs = []

wb = openpyxl.load_workbook("/content/gdrive/MyDrive/DePiC.xlsx")
ws = wb.active
rows = list(ws.iter_rows(values_only=True))

for row in rows[1:]:
    if row[2] == 1:
        ADs.append({"id": row[0],"text": row[1]})
    elif row[2] == 0:
        CNs.append({"id": row[0],"text": row[1]})
    else:
      print("NOT FOUND")

In [9]:
min_length = 5

res = []

for i in range(len(ADs)):
    for j in range(len(CNs)):
        common_subs = find_common_subsequences(ADs[i]["text"],CNs[j]["text"],min_length=min_length)
        for sub in common_subs:
            print(ADs[i]["id"],CNs[j]["id"],' '.join(sub))
            res.append({
                "AD_ID": ADs[i]["id"],
                "CN_ID": CNs[j]["id"],
                "sentence": ' '.join(sub)
            })


adrso025 adrso016 cookies from the cookie jar
adrso025 adrso016 from the cookie jar and
adrso025 adrso157 cookies from the cookie jar
adrso025 adrso157 from the cookie jar and
adrso025 adrso169 cookies from the cookie jar
adrso024 adrso276 is off the cookie jar.
adrso024 adrso276 lid is off the cookie
adrso045 adrso170 don't know what's going on.
adrso045 adrso291 don't know what that is.
adrso045 adrso291 I don't know what that
adrso035 adrso315 is reaching for a cookie
adrso032 adrso002 the cookie jar on the
adrso032 adrso017 water on the floor and
adrso032 adrso156 and he's about to fall
adrso053 adrso182 I think I mentioned that
adrso053 adrso182 think I mentioned that the
adrso072 adrso151 taking cookies out the cookie
adrso072 adrso164 taking cookies out of the
adrso072 adrso164 cookies out of the cookie
adrso072 adrso165 of the cookie jar, the
adrso072 adrso165 taking cookies out of the
adrso072 adrso165 out of the cookie jar,
adrso072 adrso165 cookies out of the cookie
adrso072

In [10]:
def find_consecutive_times(df,sentence):
    words = sentence.split()  # Split the sentence into words
    sequence_length = len(words)
    sequence_start_index = None

    # Iterate over the dataframe rows
    for i,row in df.iterrows():
        # If the row's content matches the first word of the sequence and we are not already in a sequence
        if row['content'] == words[0] and sequence_start_index is None:
            sequence_start_index = i  # Set the potential start of the sequence

        # If we are in a sequence,check if the current row's content matches the expected word in the sentence
        elif sequence_start_index is not None:
            expected_word_index = i - sequence_start_index
            if expected_word_index < sequence_length and row['content'] == words[expected_word_index]:
                if expected_word_index == sequence_length - 1:
                    # We found the whole sequence,return the start time of the first word and the end time of the last word
                    start_time_of_sentence = df.iloc[sequence_start_index]['start_time']
                    end_time_of_sentence = row['end_time']
                    return start_time_of_sentence,end_time_of_sentence
            else:
                # Reset the sequence if the word does not match
                sequence_start_index = None

    return None,None

# Let's leave the sentence empty again for you to provide it
sentence = ""

# Uncomment the following line and replace the sentence with the one you'd like to find times for
# start_time,end_time = find_consecutive_times(data,sentence)

# Please provide the sentence you want to search for,and I will execute the function.



In [12]:
import pandas as pd
import csv

AD_PATH = "/content/gdrive/MyDrive/AD_patients_excel_files"
CN_PATH = "/content/gdrive/MyDrive/CN_patients_excel_files"

with open("output.csv", "w") as fout:
    fieldnames = ['sentence', 'uid', 'label', 'start_time', 'end_time']
    writer = csv.DictWriter(fout, fieldnames=fieldnames)

    for row in res:
        print(f"{row['AD_ID'], row['CN_ID'], row['sentence']}")
        ad_user_path = f"{AD_PATH}/{row['AD_ID']}.wav.xlsx"
        dfs = pd.read_excel(ad_user_path)

        start_time,end_time = find_consecutive_times(dfs,row["sentence"])
        writer.writerow({
            "sentence": row['sentence'],
            "uid": row['AD_ID'],
            "label": 1,
            "start_time": start_time,
            "end_time": end_time
        })
        cn_user_path = f"{CN_PATH}/{row['CN_ID']}.wav.xlsx"
        dfs = pd.read_excel(cn_user_path)

        start_time,end_time = find_consecutive_times(dfs,row["sentence"])
        writer.writerow({
            "sentence": row['sentence'],
            "uid": row['CN_ID'],
            "label": 0,
            "start_time": start_time,
            "end_time": end_time
        })


('adrso025', 'adrso016', 'cookies from the cookie jar')
('adrso025', 'adrso016', 'from the cookie jar and')
('adrso025', 'adrso157', 'cookies from the cookie jar')
('adrso025', 'adrso157', 'from the cookie jar and')
('adrso025', 'adrso169', 'cookies from the cookie jar')
('adrso024', 'adrso276', 'is off the cookie jar.')
('adrso024', 'adrso276', 'lid is off the cookie')
('adrso045', 'adrso170', "don't know what's going on.")
('adrso045', 'adrso291', "don't know what that is.")
('adrso045', 'adrso291', "I don't know what that")
('adrso035', 'adrso315', 'is reaching for a cookie')
('adrso032', 'adrso002', 'the cookie jar on the')
('adrso032', 'adrso017', 'water on the floor and')
('adrso032', 'adrso156', "and he's about to fall")
('adrso053', 'adrso182', 'I think I mentioned that')
('adrso053', 'adrso182', 'think I mentioned that the')
('adrso072', 'adrso151', 'taking cookies out the cookie')
('adrso072', 'adrso164', 'taking cookies out of the')
('adrso072', 'adrso164', 'cookies out of t