<a href="https://colab.research.google.com/github/Voytella/Cluster/blob/master/ruleFinder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
# copy of Cindy's asthma Notebook retooled for "older_adult" situation

import nltk
nltk.download('punkt')
from nltk import sent_tokenize

import numpy as np
import pandas as pd
import pprint
import re
import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#----------BEGIN FUNCTIONS----------#

# flatten a list
flattenList = lambda x : [ele for subList in x for ele in subList]

# determine if two sentences are similar
def isSentencesSimilar(vectorization, similarityThreshold, sentence1Index, sentence2Index):
    return cosine_similarity(vectorization[sentence1Index],
                             vectorization[sentence2Index]) > similarityThreshold

# grab all the lengths of the sentences similar to the target sentence
def getSimilarSentenceLengths(vectorization, similarityThreshold, sentences, targetSentence):
    similarSentenceLengths = [len(sentence) for sentence in sentences if (targetSentence != sentence) and\
                                            (isSentencesSimilar(vectorization,
                                                               similarityThreshold,
                                                               sentences.index(targetSentence),
                                                               sentences.index(sentence)
                                                               )
                                            )
            ]
    
    # if there are no similar sentences, assume the "length of the most similar sentence" to be infinite
    return similarSentenceLengths if similarSentenceLengths else [sys.maxsize]

#-----------END FUNCTIONS-----------#

#----------BEGIN REFERENCE DATA----------#

# grab the giant CDC text file and split it up into lines
CDCRaw = open("CDCGuidelines.txt", mode = 'r', encoding = 'utf-8-sig').readlines()

# list of provided general rules
generalRules = ["Wear a mask",
                "Stay 6 feet from others",
                "Avoid crowds",
                "Avoid poorly ventilated spaces",
                "Wash your hands often",
                "Cover coughs and sneezes",
                "Clean and disinfect frequently touched surfaces daily",
                "Monitor your health daily",
                "Get vaccinated"
                ]

# chosen situation
situationRaw = "older_adult"

# manually modified version of situation
situation = "older adult"

# sentences must contain an action verb in order to be kept
actionVerbsMine = ["stay",
               "check",
               "monitor",
               "wear",
               "avoid",
               "clean",
               "wash",
               "get",
               "remember",
               "talk",
               "create",
               ]
actionVerbs = ["Learn",
               "plan",
               "Know",
               "Avoid",
               "Call",
               "Open",
               "follow",
               "Wear",
               "Speak"]

#-----------END REFERENCE DATA-----------#

#----------BEGIN DATA TRIMMING----------#

# clean up raw data by replacing newlines with spaces and replacing adjacent blank lines with a single newline
CDCClean = re.sub("\n+", "\n", ' '.join([line.strip() if line.strip() else "\n" for line in CDCRaw]))

# extract all sentences containing the chosen situation and remove duplicates
sitSentences = list(set(flattenList([sent_tokenize(para) for para in CDCClean.splitlines() if situation in para])))

#-----------END DATA TRIMMING-----------#

# vectorize the sentences (fancy stuff)
vectorization = TfidfVectorizer().fit_transform(sitSentences)

# grab the sentences that contain action verbs
sitSentencesVerb = [sentence for sentence in sitSentences if any([verb in sentence for verb in actionVerbs])]

# retain only shorter versions of similar sentences
similarityThreshold = 0.2
#print([len(otherSentence) for otherSentence in sitSentencesVerb if (sitSentencesVerb[0] != otherSentence) and (cosine_similarity(vectorization[0], vectorization[sitSentencesVerb.index(otherSentence)]) > similarityThreshold)])
sitSentencesVerbTrim = [sentence for sentence in sitSentencesVerb if\
                        len(sentence) < min(getSimilarSentenceLengths(vectorization, similarityThreshold, sitSentencesVerb, sentence))
                        ]
#print(vectorization[0])
pprint.pprint(sitSentencesVerbTrim)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[' Wear masks Masks should be worn over the nose and mouth.',
 'Know the steps to help prevent the spread of COVID-19 if you are sick.',
 'Identify a plan for transporting people with COVID-19, including those with '
 'disabilities, for care at other healthcare systems and/or providers if your '
 'hospital or facility is closed or offering decreased services.',
 ' Know when you need to seek medical attention for your loved one.',
 'When using public transportation, follow CDC’s guidance on how to protect '
 'yourself when using transportation Does my activity require travel to '
 'another community?',
 'Avoid close contact with others on your commute to work, if possible.',
 'Many health insurance plans now allow for early access to prescriptions, for '
 'more than a 90-day supply of medication.',
 'Learn how to protect yourself when using transportation to commute to work.',
