<a href="https://colab.research.google.com/github/Arteric-Jeff-Knight/collabs/blob/master/tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
import ipywidgets as widgets
import io, re, string, unicodedata                          # Import Regex, string and unicodedata.
from itertools import dropwhile
import numpy as np
import pandas as pd                                     # Import pandas.
from collections import Counter
!pip install spacy==2.3.5  # need 2.3.5 for textacy
import spacy
!pip install textacy
import textacy
import textacy.ke.textrank

# functions used several times
def drop_fewer_than(counter_dict: Counter, threshold: int = 1):
  """Remove, in place, every key whose count is below `threshold`.

  Args:
    counter_dict: the Counter to prune; it is modified directly.
    threshold: smallest count that is kept (keys with count >= threshold stay).

  Returns:
    None. The caller's Counter is mutated.
  """
  # Collect the keys first so the Counter is not resized while it is iterated.
  rare_keys = [key for key, count in counter_dict.items() if count < threshold]
  for rare_key in rare_keys:
    del counter_dict[rare_key]

#print(spacy.__version__)
# Pre-compiled pattern matching runs of word characters; the final cell uses
# count_words.findall(...) to count the words in each token.
count_words = re.compile(r'\w+')

# Disabled UI: the bare string literal below is never executed as code. It holds
# a dropdown prompt for choosing between the sm / md / lg English spaCy models.
"""
print('\n\nSelect the spaCy language model:')
ddopts = [('Fast - No vectors, 13 MB : en_core_web_sm', 'en_core_web_sm'), 
	('685k keys, 20k unique vectors, 44 MB : en_core_web_md', 'en_core_web_md'), 
	('Slow - 685k keys, 685k unique vectors, 742 MB : en_core_web_lg', 'en_core_web_lg')]

load_lib = widgets.Dropdown(options=ddopts)
display(load_lib)
"""

# Disabled UI: noun-cluster threshold dropdown. `nc_count` is therefore never
# defined; the only references to it in this notebook are commented out.
"""
print('\nHow many times must a Noun Cluster appear to be counted (zero to not run Noun Clusters):')
nc_count = widgets.Dropdown(options=range(10), value=3)
display(nc_count)
"""
# Disabled UI: TextRank threshold dropdown. `tr_count` is likewise never defined
# and is only referenced from a commented-out line.
"""
print('\nHow many times must a TextRank Phrase appear to be counted (zero to not run TextRank):')
tr_count = widgets.Dropdown(options=range(10), value=2)
display(tr_count)
"""

# Each choice pairs a label with the n-gram sizes handed to the bag-of-terms
# call in the final cell; the falsy 0 makes that cell skip n-grams entirely.
ngram_size_choices = [
  ('No n-grams', 0),
  ('2-grams', (1, 2)),
  ('3-grams', (1, 2, 3)),
  ('4-grams', (1, 2, 3, 4)),
  ('5-grams', (1, 2, 3, 4, 5)),
  ('6-grams', (1, 2, 3, 4, 5, 6)),
]
ng_threshold = widgets.Dropdown(options=ngram_size_choices, value=(1, 2, 3, 4))
# Occurrence threshold applied to the aggregated n-gram counts in the final cell.
ng_count = widgets.Dropdown(options=range(1,10), value=3)

# Show each prompt immediately followed by its dropdown.
for prompt_text, dropdown in (
  ('\nWhat is the maximum size of the n-grams (zero to not run n-grams):', ng_threshold),
  ('How many times must the n-gram appear to be counted:', ng_count),
):
  print(prompt_text)
  display(dropdown)

In [None]:
# Download the small English model and load it through textacy.
spacy.cli.download('en_core_web_sm')
nlp = textacy.load_spacy_lang('en_core_web_sm')

# Ask the user for a CSV file; `uploaded` maps each file name to its raw bytes.
uploaded = files.upload()
if not uploaded:
  # Nothing to index into below, so fail with an actionable message instead
  # of a bare IndexError.
  raise ValueError('No file was uploaded; re-run this cell and choose a CSV file.')

# Only the first uploaded file is used.
filename = next(iter(uploaded))
print('uploaded file: ',filename)
text_df = pd.read_csv(io.BytesIO(uploaded[filename]))
text_df.head(5)

In [None]:
#@title
# Offer every column of the uploaded CSV as a candidate text column.
col_list = list(text_df.columns)
# Pre-select 'lemmatized' if present, otherwise 'clean_text'; when neither
# exists None is passed to the dropdown as its initial value.
default_col = next(
  (name for name in ('lemmatized', 'clean_text') if name in col_list),
  None,
)

col_in = widgets.Dropdown(options=col_list, value=default_col)

print('\nSelect the column that contains the text to be tokenized:')
display(col_in)

print('\nName the file to return:')
# Strip only a trailing '.csv' (any letter case) before adding the suffix, so
# the suggested name always differs from the uploaded file's name, even for
# uploads such as 'DATA.CSV' or files without a .csv extension.
output_stem = re.sub(r'\.csv$', '', filename, flags=re.IGNORECASE)
output_filename = widgets.Text(value=output_stem + '-tokenized.csv')
display(output_filename)


In [None]:
def _aggregate_phrases(phrases, agg_spec):
  """Collapse per-document phrase records into one row per token.

  Args:
    phrases: list of dicts, each holding a 'token' key plus every key of agg_spec.
    agg_spec: mapping of column name -> pandas aggregation name ('sum', 'max', ...).

  Returns:
    A DataFrame with a 'token' column and one column per agg_spec key. For an
    empty `phrases` list the frame has no rows but still carries all of those
    columns, so the merges and column selections further down keep working.
  """
  if not phrases:
    return pd.DataFrame([], columns=['token'] + list(agg_spec))
  return pd.DataFrame(phrases).groupby('token', as_index=False).agg(agg_spec)


if col_in.value is None:
  raise ValueError('Select the column that contains the text before running this cell.')

# Keep only the column chosen in the dropdown.
text_df = text_df[[col_in.value]]

cluster_phrases = []
textrank_phrases = []
yake_phrases = []
scake_phrases = []
sgrank_phrases = []
ngram_phrases = []

# (column prefix, extractor, list receiving its records) for every keyterm
# algorithm; all four are run the same way inside the document loop.
keyterm_extractors = [
  ('textrank', textacy.ke.textrank, textrank_phrases),
  ('yake', textacy.ke.yake, yake_phrases),
  ('scake', textacy.ke.scake, scake_phrases),
  ('sgrank', textacy.ke.sgrank, sgrank_phrases),
]
# Number of documents on which each extractor raised, reported after the loop.
extraction_failures = Counter()

# Iterate the Series itself: squeeze() would turn a one-row column into a bare
# scalar. Missing values are skipped so they are not tokenized as the text 'nan'.
for docrow in text_df[col_in.value].dropna():
  # Reuse the pipeline already loaded into `nlp` rather than resolving the
  # 'en' shortcut name a second time.
  doc = textacy.make_spacy_doc(str(docrow), lang=nlp)
  word_count = len(doc)

  # Count multi-token noun chunks within this document.
  chunks = Counter(nc.text for nc in doc.noun_chunks if len(nc) > 1)
  # One record per distinct chunk preserves both the in-document frequency and,
  # once summed, the number of documents containing the chunk.
  for token, frequency in chunks.items():
    cluster_phrases.append({'token': token, 'noun_cluster_frequency': frequency, 'noun_cluster_documents': 1})

  # Run every extractor exactly once per document. Extraction stays best-effort:
  # a failure skips that extractor for this document but is counted, not hidden.
  for prefix, extractor, phrases in keyterm_extractors:
    try:
      for token, rank in extractor(doc=doc):
        phrases.append({'token': token, prefix + '_frequency': 1, prefix + '_documents': 1, prefix: rank})
    except Exception:
      extraction_failures[prefix] += 1

  if ng_threshold.value:
    bot = doc._.to_bag_of_terms(ngrams=ng_threshold.value, entities=True, weighting="count", as_strings=True)
    for term in bot:
      # Ignore single-character terms.
      if len(term) > 1:
        ngram_phrases.append({'token': term, 'ngram_frequency': bot[term], 'ngram_documents': 1, 'tf': bot[term]/word_count})

if extraction_failures:
  print('Keyterm extraction failed on some documents (extractor: documents skipped):', dict(extraction_failures))

# Noun clusters: one row per token.
ncdf = _aggregate_phrases(cluster_phrases, {'noun_cluster_frequency': 'sum', 'noun_cluster_documents': 'sum'})

# Keyterm extractors: aggregate each independently so that one extractor
# returning nothing cannot break or discard the results of the others.
trdf = _aggregate_phrases(textrank_phrases, {'textrank_frequency': 'sum', 'textrank_documents': 'sum', 'textrank': 'max'})
ykdf = _aggregate_phrases(yake_phrases, {'yake_frequency': 'sum', 'yake_documents': 'sum', 'yake': 'max'})
scdf = _aggregate_phrases(scake_phrases, {'scake_frequency': 'sum', 'scake_documents': 'sum', 'scake': 'max'})
sgdf = _aggregate_phrases(sgrank_phrases, {'sgrank_frequency': 'sum', 'sgrank_documents': 'sum', 'sgrank': 'max'})
m1 = pd.merge(left=trdf, right=ykdf, on='token', how='outer')
m2 = pd.merge(left=scdf, right=sgdf, on='token', how='outer')
trm = pd.merge(left=m1, right=m2, on='token', how='outer')

# n-grams: one row per token, then apply the occurrence threshold.
ngdf = _aggregate_phrases(ngram_phrases, {'ngram_frequency': 'sum', 'ngram_documents': 'sum'})
if ngram_phrases:
  # ng_count is "how many times must the n-gram appear", so an n-gram seen
  # exactly that many times is kept.
  ngdf = ngdf[ngdf.ngram_frequency >= ng_count.value]

merged = pd.merge(left=ncdf, right=ngdf, on='token', how='outer')
merged = pd.merge(left=merged, right=trm, on='token', how='outer')
# Tokens missing from a source get 0 for that source's columns.
merged = merged.fillna(0)
cols = ['noun_cluster_frequency','noun_cluster_documents',
        'textrank_frequency','textrank_documents',
        'yake_frequency','yake_documents',
        'scake_frequency','scake_documents',
        'sgrank_frequency','sgrank_documents',
        'ngram_frequency','ngram_documents'
        ]
merged[cols] = merged[cols].astype(int)

# Add word count (mapping the column also works when `merged` has no rows).
merged['words'] = merged['token'].map(lambda token: len(count_words.findall(token)))

merged.to_csv(output_filename.value,index=False)
files.download(output_filename.value)

merged

# [An explanation of PyTextRank: the algorithm](https://derwen.ai/docs/ptr/explain_algo/)