<a href="https://colab.research.google.com/github/Arteric-Jeff-Knight/collabs/blob/master/text_cleaner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# First, just run this code block to load libraries and define functions

Go ahead and ignore the output that isn't an error.

In [32]:
#@title
# First the easy stuff

from google.colab import files
import ipywidgets as widgets
import io
import re, string, unicodedata                          # Import Regex, string and unicodedata.
import numpy as np
import pandas as pd                                     # Import pandas.
import spacy

# Then the stuff that has to be compiled

!pip install emoji
import emoji
!pip install contractions
import contractions                                     # Import contractions library.
!pip install num2words
from num2words import num2words

# We are only using spacy to lemmatize the content and to get stop words
# Fetch the small English model at runtime
spacy.cli.download('en_core_web_sm')
# The parser and NER pipeline components are disabled when loading the model
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Take these words out of the stop-word set so that stop-word removal
# (see sw_pattern below) leaves them in the text
nlp.Defaults.stop_words -= {"none", "nor", "not", "no", "one"}

# This regex will eliminate all stop words without tokenizing:
# it matches any stop word as a whole word, plus any whitespace that follows it
sw_pattern = re.compile(r'\b(' + r'|'.join(nlp.Defaults.stop_words) + r')\b\s*')

# Define the normalization function
def normalize(content: str, remove_usernames: bool = True) -> str:
  """Normalize a piece of free text (tweet, post, ...) into plain lower-case words.

  Steps, in order: convert emojis to text, strip non-ASCII characters, expand
  contractions, remove URLs, remove a leading "RT " and a leading @name,
  optionally remove every @name, expand "%" and " w/ ", spell out ordinals
  ("21st" -> "twenty-first"), lower-case and drop punctuation, remove stop
  words, collapse spaces and spell out standalone numbers below 999.

  Args:
    content: The text to normalize. None/NaN (an empty CSV cell) yields ''.
    remove_usernames: When True, remove every "@name" token, not only a
      leading one.

  Returns:
    The normalized text, words separated by single spaces.
  """
  # Empty cells reach us as None or float NaN (NaN is the only float that is
  # not equal to itself); treat them as empty text instead of crashing.
  if content is None or (isinstance(content, float) and content != content):
    return ''
  content = str(content)
  # Convert emojis
  content = emoji.demojize(content)
  # Remove non-ASCII
  content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  # Turn newlines/tabs into plain spaces; otherwise the punctuation filter
  # below deletes them and glues the neighbouring words together
  content = re.sub(r"\s", " ", content)
  # Expand contractions
  content = contractions.fix(content)
  # remove URLs
  content = re.sub(r"http\S+", "", content)
  # Remove leading RT_
  content = re.sub(r"^[Rr][Tt] ", "", content)
  # Remove leading @somename
  content = re.sub(r"^@[^ ]*", "", content)
  # If requested, remove all usernames
  if remove_usernames:
    content = re.sub(r"@[^ ]*", "", content)
  # Expand some symbols @todo others besides percent?
  content = content.replace("%", " percent")
  content = content.replace(" w/ ", " with ")
  # Replace all the ordinals in a single regex pass, so each match is
  # rewritten exactly once (a plain str.replace of "1st" would also clobber
  # the tail of "21st")
  content = re.sub(
      r"(\d+)(?:st|nd|rd|th)",
      lambda match: num2words(match.group(1), to="ordinal") + ' ',
      content)
  # Convert to lower case, then get rid of punctuation
  content = re.sub(r"[^a-z0-9\-\/@ ]", "", content.lower())
  # Eliminate Stopwords
  content = sw_pattern.sub('', content)
  # Reduce multiple spaces
  content = re.sub(r"\s\s+", " ", content).strip()
  words = []
  # Consider just splitting on the space to KISS
  for word in content.split(' '):
    # Only standalone numbers below 999 are spelled out; larger ones are kept as digits
    if word.isnumeric() and int(word) < 999:
      try:
        word = num2words(word)
      except (ValueError, TypeError, OverflowError):
        # Best effort: keep the digits when the number cannot be spelled out
        pass
    # put everything back together
    words.append(word)
  return ' '.join(words)


def lemmatize(sentence: str):
  """Run `sentence` through the spaCy pipeline and return its lemmas joined by single spaces."""
  lemmas = (token.lemma_ for token in nlp(sentence))
  return " ".join(lemmas)

def split_df_into_data_and_configs(uploaded, defaults: dict = None, config_name: str = 'config'):
  """Split an uploaded CSV into its data rows and its configuration values.

  Rows whose first column equals `config_name` are configuration rows: the
  second column holds the key and the third the value. Every other row is
  data, and the first data row supplies the column names.

  Args:
    uploaded: Mapping of filename -> raw CSV bytes; only the first entry is read.
    defaults: Configuration defaults. This dict is updated IN PLACE with the
      values found in the file and is the same object that is returned.
      A fresh dict is used when omitted.
    config_name: Marker in the first column that identifies a config row.

  Returns:
    A `(data, defaults)` tuple: the data rows as a DataFrame (header row
    applied as column names, index reset) and the completed config dict,
    which always has `col_in`, `col_out`, `col_lem`, `file_out`,
    `drop_dupes` (a bool) and `output_filename`.
  """
  # A `{}` default would be shared between calls, leaking one file's configs
  # into the next
  if defaults is None:
    defaults = {}

  # Put uploaded file into dataframe
  filename = list(uploaded.keys())[0]
  df = pd.read_csv(io.BytesIO(uploaded[filename]), header=None)

  # Get everything with 'configs' in first column
  is_config = df[0] == config_name
  configs = df[is_config]
  # Build a dictionary from the key in the second column with values from the third.
  # Files with fewer than three columns cannot carry configs, so skip the lookup.
  if 1 in df.columns and 2 in df.columns:
    defaults.update(dict(zip(configs[1], configs[2])))

  # Everything else that isn't a config, is data
  data = df[~is_config].reset_index(drop=True)
  # Assume that the first row is the column names now that configs are gone
  data.columns = data.iloc[0]
  # Drop the row with the column names and reset the index, so zero works below
  data = data.drop(data.index[0]).reset_index(drop=True)

  # Validate the column name configs
  col_list = list(data.columns)

  col_in = defaults.get('col_in')
  if col_in not in col_list:
    # With nothing defined or garbage, try the capitalized name, then fall
    # back to the first column
    capitalized = col_in.capitalize() if isinstance(col_in, str) else None
    defaults['col_in'] = capitalized if capitalized in col_list else col_list[0]

  defaults.setdefault('col_out', 'clean_text')
  defaults.setdefault('col_lem', 'lemmatized')
  defaults.setdefault('file_out', '-cleaned')
  defaults.setdefault('drop_dupes', True)

  # Values read from the CSV are usually strings; only an explicit
  # false/0 switches duplicate dropping off
  drop_dupes = defaults['drop_dupes']
  if not isinstance(drop_dupes, bool):
    drop_dupes = str(drop_dupes).strip().lower() not in ('false', '0', '0.0')
  defaults['drop_dupes'] = drop_dupes

  # Insert the suffix before the extension only; a name without a .csv
  # extension gets one appended so the output never reuses the input name
  if filename.lower().endswith('.csv'):
    stem, extension = filename[:-4], filename[-4:]
  else:
    stem, extension = filename, '.csv'
  defaults['output_filename'] = f"{stem}{defaults['file_out']}{extension}"

  return data, defaults

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


# Next, run this code block to upload a file for processing

If you want to process multiple files, start here for each one, no need to run the first block again and again.

## How to configure

Processing is mostly handled by default values, but if you need to override them, configuration is handled by passing values in the input file. 

To add a configuration to a file, put the value 'config' in the first column (no matter what the header) and the configuration key name in the second, and the value to be set in the third. During processing these rows will be separated from the data and not included in the returned file. For convenience, they can appear anywhere in the incoming file: before the headers, at the end, anywhere in between or even mixed among the data.

The possible configuration keys and their default values are:

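* `col_in` : "content"
  * *The name of the column to process. If no column has exactly this name, the capitalized form (e.g. "Content") is tried; failing that, the first column is used*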
* `col_out` : "clean_text"
  * *The name of column to add the processed content*
* `col_lem` : "lemmatized"
  * *The name of column to add the lemmatized processed content*
* `file_out` : "-cleaned"
  * *The text to add to the filename that is returned*
* `drop_dupes` : True
  * *If this is set to True, then rows that duplicate content are dropped*

#### Coming Soon: Use configs to tweak normalization

In [33]:
#@title
# Default configuration; values found in the uploaded file override these
defaults = dict(
  col_in='content',
  col_out='clean_text',
  col_lem='lemmatized',
  file_out='-cleaned',
  drop_dupes=True,
)

# Ask for a file, then separate its config rows from its data rows
df, configs = split_df_into_data_and_configs(files.upload(), defaults)

# Show the effective configuration and the raw data
print('---------- configs ----------')
for key, value in configs.items():
  print('   ', key, ':', value)
print('-----------------------------')
display(df)

# Optionally remove rows that repeat the input column's content
num_rows = df.shape[0]
if not configs['drop_dupes']:
  print(f'Processing {df.shape[0]} rows:\n')
else:
  df.drop_duplicates(configs['col_in'], inplace=True)
  print(f'Processing {df.shape[0]} rows after deleting {num_rows - df.shape[0]} duplicates:\n')

# Normalize the input column, then lemmatize the normalized text
df[configs['col_out']] = df.apply(
  lambda row: normalize(row[configs['col_in']]),
  axis=1,
)
df[configs['col_lem']] = df.apply(
  lambda row: lemmatize(row[configs['col_out']]),
  axis=1,
)

print('Finished Processing\n')

# Write the result to disk, then download it to the local machine
print(f"Saving locally to {defaults['output_filename']}\n")
df.to_csv(defaults['output_filename'], index=False)
files.download(defaults['output_filename'])

Saving configtest.csv to configtest (8).csv
---------- configs ----------
    col_in : Content
    col_out : clean_text
    col_lem : lemmatized
    file_out : -cleaned
    drop_dupes : False
    aaa : aaa
    bbb : bbb
    output_filename : configtest-cleaned.csv
-----------------------------


Unnamed: 0,Title,Content,Date,Topics,Sentiment Class,Main Emotion,Bio,Links
0,Anti-TNF Agents Linked to Increased Risk for IBD,I do not really care much about associations f...,19/9/8 12:26,"autoimmune disease,https://www.medscape.com/vi...",negative,sadness,Gastroenterology doctor,https://www.medscape.com/viewarticle/915613
1,Auricular Chondritis and Cervical Lymphadenopathy,An 85-year-old woman presented with a 3-week h...,19/9/5 11:33,"18F-fluorodeoxyglucose,auricular chondritis,tu...",negative,sadness,Internal Medicine doctor,https://www.thelancet.com/doi/story/10.1016/pi...
2,-,Some advocate shorter intervals after 20-25 yr...,19/9/12 14:23,"Surveillance colonoscopies,higher cancer risk,...",negative,fear,Gastroenterology doctor,"UC.So,https://journals.lww.com/ajg/Fulltext/20..."
3,A strange cirrhosis,I present the case of a 70-year-old man with s...,19/11/7 13:57,"liver disease,days of steroid therapy,case of ...",negative,sadness,Gastroenterology doctor,-
4,gynecomastia in male,A 37-year-old male with a background of ulcera...,20/2/24 6:52,"differential diagnosis,37-year-old male,serum ...",negative,disgust,Endocrinology doctor,-
5,Patient with ulcerative colitis,25 year old male patient with ulcerative colit...,20/5/21 6:13,"first flare,ulcerative colitis,background hist...",negative,sadness,Gastroenterology doctor,-
6,Psoriasiform dermatitis in severe ulcerative c...,A 40-year-old patient presented to the emergen...,20/10/8 2:23,"severe ulcerative colitis,40-year-old patient,...",negative,sadness,Gastroenterology doctor,-
7,-,Bile duct carcinomas have been associated with...,20/4/16 12:32,"history of rectal UC,Bile duct carcinomas,intr...",negative,sadness,Oncology doctor,-
8,-,Main danger is in ulcerative colitis pts.Preca...,20/4/13 9:12,"ulcerative colitis pts.Precaution,Main danger,...",negative,fear,Oncology doctor,-
9,-,Uveitis associated with ulcerative colitis is ...,19/10/18 12:30,"ulcerative colitis,differential diagnosis,clea...",neutral,sadness,Neurology doctor,-


Processing 18 rows:

Finished Processing

Saving locally to configtest-cleaned.csv



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Optional: for debugging, list 5 random results
Run again and again to view different random rows

In [34]:
#@title
# np.random.choice samples row POSITIONS (0 .. number of rows - 1), so rows are
# fetched with .iloc. Label-based .loc breaks once duplicates have been dropped,
# because the index labels then have gaps.
for idx in np.random.choice(df.shape[0], replace = True, size = 5):
    row = df.iloc[idx]
    print('----------')
    print(row[configs['col_in']])
    print('-')
    print(row[configs['col_out']])
    print('-')
    print(row[configs['col_lem']])
    print()

----------
You're invited to the next HealtheVoices for HCPs event, The Patient Story: Amplifying Their Voices on Social Media , on Tuesday, February 2, 7:00-9:00 pm ET. Now, more than ever before, patients are taking to social media to expand their knowledge on health, share their experiences and build communities. HealtheVoices continues to empower health advocates to use social media effectively, ensuring conversations and timely information is accessible to patients and caregivers in need. Click HERE to register for the next session, where top HCP influencers will focus on the patient story and how HCPs can help elevate their digital voice. As with previous sessions, the agenda is packed with engaging and informative conversations with HCPs and patient advocates, including: Pam Ressler, RN, MS, HNB and Mariah Leach, Rheumatoid Arthritis advocate, on what HCPs can learn from patients through their stories. Aline Charabaty, M.D. and Olivia Fulton, Respiratory advocate, on translating


# About the Normalization function

### Here we define the steps to normalize the text:

- Convert Unicode to ASCII and then back again
    - This removes all emojis and accents and other garbage
    - Convert back to unicode because later operations expect it
- Expand Contractions
    - For consistent grammar, expand it's to it is, etc.
    - Also, future removal of punctuation would change contractions to nonsense
- Remove URLs
    - They are not words
- Remove 'RT ' from the start
    - Many tweets begin with "RT " for retweet
- Remove '@name: ' from start
    - Even tweets that aren't retweets begin with "@somename: " which is garbage
- Convert to lower case and remove punctuation
    - For consistency 
- Change all spaces to single space and remove all leading and trailing spaces
 