<a href="https://colab.research.google.com/github/dzanahmed/welcome-ideathon-lshtm/blob/main/code/regular_expression_filter_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Load packages

In [None]:
import os
import re

import pandas as pd

## Load data for testing

In [None]:
# NOTE(security): never commit an access token inside the URL. If the repository
# needs one, export it as GITHUB_RAW_TOKEN before starting the kernel; without
# the variable the plain URL is requested.
url = "https://raw.githubusercontent.com/dzanahmed/welcome-ideathon-lshtm/main/data/interim/vax_tweets_v0.csv"
github_raw_token = os.environ.get("GITHUB_RAW_TOKEN")
if github_raw_token:
    url = f"{url}?token={github_raw_token}"

raw_df = pd.read_csv(url)
# Force every entry of the text column to str so string methods can be applied to it.
raw_df['text'] = raw_df['text'].astype(str)

## Function to preprocess text column

In [None]:
# Create a function to lowercase the strings in the column
def lowercase_string(string):
    """Return the lowercased form of ``string`` (the result of its ``.lower()`` method)."""
    lowered = string.lower()
    return lowered

In [None]:
# Working frame for the pipeline: keep only the tweet id and its text, store the
# original text as `raw_text`, and add a lowercased copy for the regex filters.
# (The selection and the rename are chained so the column selection is not lost.)
working_df = raw_df.loc[:, ['tweet_id', 'text']].rename(columns={'text': 'raw_text'})
working_df["lowercase_text"] = working_df["raw_text"].apply(lowercase_string)

# Reproducible 10% sample for quick experiments. It is drawn from working_df so
# that it carries the `raw_text` / `lowercase_text` columns the filter functions read.
random_sample = working_df.sample(frac=0.1, random_state=42)

print(len(random_sample))
print(len(working_df))


10000
99997


## Keyword/RegEx dictionaries

In [None]:
# Each entry is a lookahead that succeeds when the keyword occurs in the text.
# The filters are run against the lowercased tweet text, so every literal in a
# pattern must itself be lowercase.
vaccine_filter = [
    # variations of the word vaccine
    r"(?=.*\bvaccine\b)",
    r"(?=.*\bvaccination\b)",
    r"(?=.*\bvaccinated\b)",
    r"(?=.*\bvaccinating\b)",

    # catch all vax-prefixed words, since the majority refer to vaccination
    r"(?=.*\bvax\w*\b)",

    # catch common misspellings of vaccine
    r"(?=.*\bvacine\b)",
    r"(?=.*\bvacinne\b)",
    r"(?=.*\bvacinn\b)",

    r"(?=.*\bjab\b)",
    r"(?=.*\bjabs\b)",

    r"(?=.*\bshot\b)",
    r"(?=.*\bshots\b)",

    r"(?=.*\binjection\b)",
    r"(?=.*\binjections\b)",

    r"(?=.*\bbooster\b)",
    # "\b" must be followed by the full word: r"\boosters" would look for "oosters"
    r"(?=.*\bboosters\b)",

    r"(?=.*\bneedle\b)",
    r"(?=.*\bneedles\b)",

    r"(?=.*\bdose\b)",
    r"(?=.*\bdoses\b)",

    r"(?=.*\bimmune\b)",
    r"(?=.*\bimmunity\b)",

    # manufacturers / product names
    r"(?=.*\bmoderna\b)",
    r"(?=.*\biontech\b)",
    r"(?=.*\bpfizer\b)",
    r"(?=.*\bj&j\b)",
    r"(?=.*\bjohnson\s+&\s+johnson\b)",
    r"(?=.*\bcovaxin\b)",
    r"(?=.*\baz\b)",
    r"(?=.*\bastra\w*zeneca\b)",

    # covid terms
    r"(?=.*\bcovid\w*\b)",
    r"(?=.*\bcorona\w*\b)"
]

# Lookahead patterns for expressions of hesitancy, written for lowercased text.
# Contractions accept an optional straight or curly apostrophe, because the text
# is only lowercased (punctuation is not stripped) before filtering.
hesitancy_filter = [
    r"(?=.*\b(worry|worries|worried|worrying)\b)",
    r"(?=.*\b((don['\u2019]?t|do\s*not)\s*(want|give|take|have))\b)",
    r"(?=.*\b(reluctant)\b)",
    r"(?=.*\bskeptic\w*\b)",
    r"(?=.*\bconflicted\b)",
    r"(?=.*\bdoubt\w*\b)",
    r"(?=.*\bcan\s*not\b)",
    r"(?=.*\bwill\s*not\b)",
    r"(?=.*\bwon['\u2019]?t\b)",
    r"(?=.*\bunsure\b)",
    r"(?=.*\bunwilling\b)",
    r"(?=.*\bhesitant\b)",
    r"(?=.*\bquestion\w*\b)",
    r"(?=.*\bon\s*the\s*fence\b)",
    r"(?=.*\bwaver\w*\b)",
    r"(?=.*\b(scared|scary)\b)",
    r"(?=.*\bafraid\b)",
    r"(?=.*\bnervous\b)",
    r"(?=.*\bfright\b)",
    r"(?=.*\bhorror\b)"
]

# Lookahead patterns for safety concerns, written for lowercased text.
# The closing "\b" of every pattern sits INSIDE the lookahead: placed after it,
# the boundary would be tested at the position where matching starts, which
# rejects any text that begins with a non-word character such as "@" or "#".
safety_filter = [
    r"(?=.*\bmortality\w*\b)",
    r"(?=.*\bdeath\w*\b)",
    r"(?=.*\bkill\w*\b)",
    r"(?=.*\b(side\s?effects\w*)\b)",
    r"(?=.*\bpoison\w*\b)",
    r"(?=.*\blead\w*\b)",
    r"(?=.*\b(worse\s?than\s?infection\w*)\b)",
    r"(?=.*\bhazardous\w*\b)",
    r"(?=.*\blethal\w*\b)",
    r"(?=.*\bfatality\w*\b)",
    r"(?=.*\bharmful\w*\b)",
    r"(?=.*\b(adverse\s?reactions\w*)\b)",
    r"(?=.*\breact\w*\b)",
    r"(?=.*\btoxic\w*\b)",
    r"(?=.*\b(result\s?in\s?death\w*)\b)",
    r"(?=.*\bdeadly\w*\b)",
    r"(?=.*\beliminate\w*\b)",
    r"(?=.*\bcomplications\w*\b)",
    r"(?=.*\bdetrimental\w*\b)",
    r"(?=.*\b(potentially\s?lethal\w*)\b)",
    r"(?=.*\bperilous\w*\b)",
    r"(?=.*\binjurious\w*\b)",
    r"(?=.*\b(severe\s?consequences\w*)\b)",
    r"(?=.*\bhazard\w*\b)",
    r"(?=.*\bdestructive\w*\b)",
    r"(?=.*\bendanger\w*\b)",
    r"(?=.*\bcatastrophic\w*\b)",
    r"(?=.*\b(risks\s?outweigh\s?benefits\w*)\b)",
    r"(?=.*\blife-threatening\w*\b)",
    r"(?=.*\bunsafe\w*\b)",
    r"(?=.*\bdanger\w*\b)",
    r"(?=.*\buntest\w*\b)",
    r"(?=.*\bunverif\w*\b)",
    r"(?=.*\b(not\s?test\w*)\b)",
    r"(?=.*\bfatal\w*\b)",
    r"(?=.*\b(negative\s?outcomes\w*)\b)",
    r"(?=.*\btoxicity\w*\b)",
    r"(?=.*\bpoisonous\w*\b)",
    r"(?=.*\b(inflict\s?harm\w*)\b)",
    r"(?=.*\bcompromising\w*\b)",
    r"(?=.*\bdetriment\w*\b)",
    r"(?=.*\b(critical\s?condition\w*)\b)",
    r"(?=.*\brisky\w*\b)",
    r"(?=.*\bincurable\w*\b)",
    r"(?=.*\bmenacing\w*\b)"
]


# Lookahead patterns for expressions of mistrust, written for lowercased text.
# Most entries are word stems followed by \w* so that inflected forms match too.
mistrust_filter = [
    r"(?=.*\b(doubt\w*)\b)",
    r"(?=.*\b(suspic\w*)\b)",
    r"(?=.*\b(skeptic\w*)\b)",
    r"(?=.*\b(distrust\w*)\b)",
    r"(?=.*\b(cynic\w*)\b)",
    r"(?=.*\b(misbelief\w*)\b)",
    r"(?=.*\b(disbelief\w*)\b)",
    # "wari" alone covers warily/wariness but not the base adjective "wary"
    r"(?=.*\b(wary|wari\w*)\b)",
    r"(?=.*\b(apprehen\w*)\b)",
    r"(?=.*\b(disillusion\w*)\b)",
    r"(?=.*\b(misgiv\w*)\b)",
    r"(?=.*\b(uncertain\w*)\b)",
    r"(?=.*\b(reservation\w*)\b)",
    r"(?=.*\b(hesitat\w*)\b)",
    r"(?=.*\b(question\w*)\b)",
    r"(?=.*\b(lack\sof\sconfidenc\w*)\b)",
    r"(?=.*\b(unreliab\w*)\b)",
    # stem shortened from "dubiousn" so that plain "dubious" matches as well
    r"(?=.*\b(dubious\w*)\b)",
    r"(?=.*\b(skeptical\smindset\w*)\b)",
    r"(?=.*\b(lack\sof\sfaith\w*)\b)",
    r"(?=.*\b(disreputable\w*)\b)",
    r"(?=.*\b(paranoia\w*)\b)",
    r"(?=.*\b(suspiciousness\w*)\b)",
    r"(?=.*\b(doubtfulness\w*)\b)",
    r"(?=.*\b(mistrust\w*)\b)",
    r"(?=.*\b(incredulous\w*)\b)",
    r"(?=.*\b(concern\w*)\b)",
    r"(?=.*\b(pessimism\w*)\b)",
    r"(?=.*\b(skepticism\w*)\b)",
    r"(?=.*\b(dubiety\w*)\b)",
    r"(?=.*\b(apprehensiveness\w*)\b)",
    r"(?=.*\b(untrustworthiness\w*)\b)",
    r"(?=.*\b(cautiousness\w*)\b)",
    r"(?=.*\b(hesitan\w*)\b)",
    r"(?=.*\b(surveil\w*)\b)",
    r"(?=.*\b(discredit\w*)\b)",
    r"(?=.*\b(wariness\w*)\b)",
    r"(?=.*\b(skeptical\w*)\b)",
    r"(?=.*\b(distrustful\w*)\b)",
    r"(?=.*\b(critical\w*)\b)",
    r"(?=.*\b(secrecy\w*)\b)",
    r"(?=.*\b(misgiving\w*)\b)",
    r"(?=.*\b(scrutiny\w*)\b)",
    r"(?=.*\b(apprehensive\w*)\b)"
]

## Testing zone

In [None]:
def test_regex_filter(regex):
  """
  This function tests whether a user-provided piece of text will pass the specified regex filter

  """
  input_string = input("Sentence: ")

  for expression in regex:
    pattern = re.compile(expression)
    if pattern.match(input_string) is not None:
      return True

  return False

In [None]:
# test_regex_filter(vaccine_filter)

# Apply filters and note results

## function to apply filter

In [None]:
# function to apply filter to a dataframe and return filter results

def apply_regex_filter(df, regex, new_col_name):
  """
  Flag the rows of ``df`` whose ``lowercase_text`` matches any expression.

  Args:
    df: DataFrame with a ``lowercase_text`` column. It is modified in place.
    regex: iterable of regular-expression strings.
    new_col_name: name of the boolean result column to create or overwrite.

  Returns:
    The same ``df`` object, with ``new_col_name`` set to True on matching rows
    and False everywhere else.
  """
  df[new_col_name] = False

  for expression in regex:
    # na=False treats missing text as "no match"; without it the mask contains
    # NaN and cannot be used for boolean indexing below.
    mask = df['lowercase_text'].str.contains(expression, flags=re.IGNORECASE, regex=True, na=False)

    # Mark the rows matched by this expression
    df.loc[mask, new_col_name] = True

  return df

## apply filters to dataset

In [None]:
# random_sample = apply_regex_filter(random_sample, regex = vaccine_filter, new_col_name = "vaccine_filter")
# random_sample = apply_regex_filter(random_sample, regex = hesitancy_filter, new_col_name = "hesitancy_filter")
# random_sample = apply_regex_filter(random_sample, regex = mistrust_filter, new_col_name = "mistrust_filter")
# random_sample = apply_regex_filter(random_sample, regex = safety_filter, new_col_name = "safety_filter")

# Explore filter performance

## define functions to explore filter performance

In [None]:
def report_label_spread(df, filter_name):
  """Print the fraction of rows in ``df`` whose ``filter_name`` value equals True, then the fraction equal to False."""
  labels = df[filter_name]
  n_true = int((labels == True).sum())
  n_false = int((labels == False).sum())
  n_rows = len(df)

  print(f"Proportion positive labels: {n_true / n_rows}")
  print(f"Proportion negative labels: {n_false / n_rows}")


def show_positive_labels(df, filter_name):
  """Return the ``raw_text`` and ``filter_name`` columns of the rows where ``filter_name`` equals True."""
  is_positive = df[filter_name] == True
  columns_to_show = ['raw_text', filter_name]
  return df.loc[is_positive, columns_to_show]

## explore filter performance

In [None]:
# filter_names = ['vaccine_filter', 'hesitancy_filter', 'safety_filter', 'mistrust_filter']

# for filter_name in filter_names:
#   print(f"\n{filter_name}:")
#   report_label_spread(df = random_sample, filter_name = filter_name)

# show_positive_labels(df = random_sample, filter_name = "mistrust_filter")

# Apply filters to dataset

## code to run cascading filters

In [None]:
def _flag_matching_rows(texts, active, expressions):
  """Return one bool per text: True when the row is still active and any expression matches it."""
  # DOTALL lets the leading `.*` of each lookahead pattern run past line
  # breaks, so keywords after the first newline of a tweet are still found.
  compiled = [re.compile(expression, re.DOTALL) for expression in expressions]
  return [
      is_active and any(pattern.match(text) is not None for pattern in compiled)
      for text, is_active in zip(texts, active)
  ]


def cascading_regex_filter(df, regex_filter_chain, filter_names):
  """
  Apply a chain of regex filters to ``df['lowercase_text']``, level by level.

  A level is either a single filter (a list of expressions, paired with a
  column name) or several filters side by side (a list of expression lists,
  paired with a list of column names).

  * Single-filter level: rows still in the cascade that match any expression
    are flagged True, and only those rows continue to the next level.
  * Multi-filter level: every filter is applied to the rows still in the
    cascade; the set of rows that continue is left unchanged.

  Rows that dropped out at an earlier level are flagged False in all later
  columns. Rows are tracked by position, so the result does not depend on the
  index or on tweet ids being unique.

  Args:
    df: DataFrame with a ``lowercase_text`` column. It is modified in place.
    regex_filter_chain: one entry per level, as described above.
    filter_names: result column name(s) per level, same layout as the chain.

  Returns:
    The same ``df`` object with one boolean column per filter.
  """
  texts = [str(text) for text in df['lowercase_text']]
  # active[i] is True while row i has passed every single-filter level so far
  active = [True] * len(texts)

  for filter_level in range(len(regex_filter_chain)):
    level_names = filter_names[filter_level]
    level_patterns = regex_filter_chain[filter_level]

    if isinstance(level_names, list):
      # several filters on the same level: flag each one, do not narrow the cascade
      for position, column_name in enumerate(level_names):
        df[column_name] = _flag_matching_rows(texts, active, level_patterns[position])
    else:
      passed = _flag_matching_rows(texts, active, level_patterns)
      df[level_names] = passed
      # only rows that passed this level move on to the next one
      active = passed

  return df

## run cascading filter on dataframe

In [None]:
# Cascade layout: vaccine terms first, then hesitancy terms, then the safety
# and mistrust filters side by side on the last level.
regex_filter_chain = [
    vaccine_filter,
    hesitancy_filter,
    [safety_filter, mistrust_filter],
]
# Result column names, in the same nested layout as the chain.
filter_names = [
    'vaccine_filter',
    'hesitancy_filter',
    ['safety_filter', 'mistrust_filter'],
]

# The filter columns are added to working_df itself; output_df is that same frame.
output_df = cascading_regex_filter(working_df, regex_filter_chain, filter_names)

## explore results after cascading filters

In [None]:
# Flat list of the result columns to summarise.
filter_names = [
    'vaccine_filter',
    'hesitancy_filter',
    'safety_filter',
    'mistrust_filter',
]

for filter_name in filter_names:
  print(f"\n{filter_name}:")
  report_label_spread(output_df, filter_name)


# Last expression of the cell, so the table of flagged tweets is displayed.
show_positive_labels(output_df, "mistrust_filter")


vaccine_filter:
Proportion positive labels: 0.7867936038081143
Proportion negative labels: 0.21320639619188575

hesitancy_filter:
Proportion positive labels: 0.02523075692270768
Proportion negative labels: 0.9747692430772923

safety_filter:
Proportion positive labels: 0.0018400552016560497
Proportion negative labels: 0.9981599447983439

mistrust_filter:
Proportion positive labels: 0.012760382811484344
Proportion negative labels: 0.9872396171885156


Unnamed: 0,raw_text,mistrust_filter
2,@JoyAnnReid @NIH 👿Questions: Could the vacci...,True
3,Next question is how do you find out where you...,True
25,"Dr Alex Shaw, a GP in Southend helps to answer...",True
187,Do you have questions about the #CovidVaccine ...,True
361,@CBCNews is running a series of great online a...,True
...,...,...
99774,What is a vaccine? In this episode of Your Vac...,True
99786,"Fantastic, really well attended virtual vaccin...",True
99847,Can #Vaccine be questioned in this case?\n#Cov...,True
99913,"📢📢 Attention: Parents, teachers, and Student. ...",True


In [None]:
# Preview the first five rows of the filtered frame.
output_df.head(n=5)

Unnamed: 0,tweet_id,user_location,user_description,user_followers,user_friends,user_favourites,user_verified,date,raw_text,hashtags,is_retweet,roberta_loc_score,roberta_loc_guess,distilBERT_sentiment,distilBERT_score,lowercase_text,vaccine_filter,hesitancy_filter,safety_filter,mistrust_filter
0,1,Chicago,Official Twitter of Mercy Home for Boys & Girl...,4352.0,2087.0,1780.0,True,2021-03-12,We asked our coworkers why they're getting a C...,"'FeelGoodFriday', 'MercyHomeCares', 'Covid19'",False,6.369158e-06,Chicago,NEGATIVE,0.997759,we asked our coworkers why they're getting a c...,True,False,False,False
1,2,"Bengaluru, India",Hourly updates on FREE and PAID 18+ and 45+ va...,40.0,0.0,0.0,False,2022-03-10,45+ #RURAL #Bengaluru #CovidVaccine Availabili...,"'RURAL', 'Bengaluru', 'CovidVaccine'",False,0.3810927,India,NEGATIVE,0.978396,45+ #rural #bengaluru #covidvaccine availabili...,True,False,False,False
2,3,,"”First they ignore you, then they laugh at you...",1915.0,860.0,52127.0,False,2021-10-19,@JoyAnnReid @NIH 👿Questions: Could the vacci...,"'ColinPowell', 'covidvaccine'",False,6.478056e-08,NotSpecified,NEGATIVE,0.998409,@joyannreid @nih 👿questions: could the vacci...,True,True,True,True
3,4,,Gentleman adventurer...,144.0,329.0,15876.0,False,2021-07-02,Next question is how do you find out where you...,'CovidVaccine',False,6.478056e-08,NotSpecified,NEGATIVE,0.999123,next question is how do you find out where you...,True,True,False,True
4,5,not in London -,I fall into the vulnerable category.\n\nThat m...,184.0,98.0,13359.0,False,2021-08-07,"If you told your child to get a Covid vaccine,...","'COVID19', 'CovidVaccine'",False,5.848263e-08,not in London,NEGATIVE,0.999768,"if you told your child to get a covid vaccine,...",True,False,False,False


In [None]:
# Write the per-tweet filter flags to disk first, so the export succeeds in any environment.
results_path = 'cascading_regex_filters_results.csv'
result_columns = ['tweet_id', 'vaccine_filter', 'hesitancy_filter', 'safety_filter', 'mistrust_filter']
output_df.loc[:, result_columns].to_csv(results_path, encoding = 'utf-8-sig', index = False)

# The browser download helper only exists on Google Colab; elsewhere the CSV
# simply stays in the working directory.
try:
  from google.colab import files
except ImportError:
  files = None

if files is not None:
  files.download(results_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>