<a href="https://colab.research.google.com/github/Arteric-Jeff-Knight/collabs/blob/master/ScattertextTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install scattertext pytextrank

from google.colab import files
import ipywidgets as widgets
import io, re, string, unicodedata                          # Import Regex, string and unicodedata.
import numpy as np
import pandas as pd                                     # Import pandas.
import spacy
import pytextrank
import scattertext as st
from pprint import pprint

# Fetch the small English model through spaCy's own downloader
# (this is how to do it in colab), then load it as the shared pipeline.
SPACY_MODEL_NAME = 'en_core_web_sm'
spacy.cli.download(SPACY_MODEL_NAME)
nlp = spacy.load(SPACY_MODEL_NAME)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [10]:
# Alternative sample data set (disabled):
# text_df = st.SampleCorpora.ConventionData2012.get_data()

# 25s to load "shorter" w/ gpu
# 1m 3s to load "shorter" w/o gpu

# Ask for a CSV through Colab's upload dialog. The result is used below as a
# mapping of file name -> raw file bytes.
uploaded = files.upload()

# Nothing uploaded (e.g. the dialog was cancelled): fail with a clear message
# rather than an IndexError from indexing an empty list of keys.
if not uploaded:
  raise ValueError('No file was uploaded; re-run this cell and choose a CSV file.')

# Only the first uploaded file is analyzed.
filename = next(iter(uploaded))
print('uploaded file: ', filename)

# Parse the in-memory bytes as CSV and preview the first rows.
text_df = pd.read_csv(io.BytesIO(uploaded[filename]))
text_df.head(5)

Saving shorter-Zeposia-Twitter-Analysis-cleaned.csv to shorter-Zeposia-Twitter-Analysis-cleaned (3).csv
uploded file:  shorter-Zeposia-Twitter-Analysis-cleaned.csv


Unnamed: 0,date,visibility,potential impressions,actual impressions,no. of followers,no. of friends,gender,no. of comments,no. of likes,no. of shares,no. of retweets,user screen name,Patient / HCP,Persona Segment,post subtype,clean_text,lemmatized
0,20/8/2 4:49,134.68,944183,103860,944183,913,male,0,553,0,0,hankgreen,Patient,Journaling Champions,quote,refer media networks content platforms think l...,refer medium network content platform think lo...
1,20/8/2 6:27,33.73,2821,310,2821,586,unknown,0,0,0,0,DanSharpIBD,Patient,Community Creators,retweet,surgeons not think tweeting intraop photos lig...,surgeon not think tweet intraop photo lighthea...
2,20/8/2 10:50,43.89,8227,904,8227,1744,male,0,0,0,0,ibddoctor,HCP,Pipeline Treatment Enthusiast,reply,not need one,not need one
3,20/8/2 11:20,28.9,1675,184,1675,873,unknown,0,0,0,0,FITWITMD,HCP,Wellness Warriors,retweet,miss networking years annual meeting great tal...,miss networking year annual meet great talk li...
4,20/8/2 12:10,48.16,7061,776,7061,989,female,0,0,0,1,DCharabaty,HCP,Community Creators,reply,tagging friend help,tag friend help


In [70]:
# Split the columns into "data" columns (many distinct values, candidates for
# the text to analyze) and "category" columns (few distinct values).
MAX_CATEGORY_VALUES = 9  # columns with more distinct values than this are treated as data

col_data = text_df.nunique().to_dict()
data_columns = [name for name, n_unique in col_data.items() if n_unique > MAX_CATEGORY_VALUES]
category_columns = [name for name, n_unique in col_data.items() if n_unique <= MAX_CATEGORY_VALUES]

# Prefer the pre-processed text columns when the upload has them.
default_col = next(
    (name for name in ('lemmatized', 'clean_text') if name in data_columns),
    None,
)

# Only pass `value` when a preferred column exists. Passing value=None
# explicitly would leave the dropdown with no selection, so text_col.value
# would be None in the following cells; omitting it lets the widget choose its
# own default.
if default_col is None:
  text_col = widgets.Dropdown(options=data_columns)
else:
  text_col = widgets.Dropdown(options=data_columns, value=default_col)
category_col = widgets.Dropdown(options=category_columns)

print('\nSelect the column that contains the text to be analyzed:')
display(text_col)

print('\nSelect the column that contains the category:')
display(category_col)


Select the column that contains the text to be analyzed:


Dropdown(index=9, options=('date', 'visibility', 'potential impressions', 'actual impressions', 'no. of follow…


Select the column that contains the category:


Dropdown(options=('gender', 'no. of comments', 'no. of shares', 'no. of retweets', 'Patient / HCP', 'Persona S…

In [74]:
# Distinct values of the column chosen in the category dropdown, in the order
# pandas reports them; later cells use the first two entries.
selected_values = text_df[category_col.value].unique()
category_list = selected_values.tolist()
print(f'Unique Values for Column "{category_col.value}"')
category_list

Unique Values for Column "Patient / HCP"


['Patient', 'HCP']

In [75]:
# Turn the category column name into a file-name prefix: strip every character
# that is neither a word character nor whitespace, collapse runs of spaces,
# then join the words with hyphens.
name_without_punctuation = re.sub(r'[^\w\s]', '', category_col.value)
name_single_spaced = re.sub(' +', ' ', name_without_punctuation)
category_filename = name_single_spaced.replace(' ', '-')

# 1m 26s to build "shorter" w/ gpu
# 1m 39s to build "shorter" w/o gpu

# Build the corpus from the selected text and category columns.
corpus_factory = st.CorpusFromPandas(
    text_df,
    category_col=category_col.value,
    text_col=text_col.value,
    nlp=nlp,
)
corpus = corpus_factory.build()

# Show the first ten index entries of the scaled F-score-vs-background table.
print('Scaled F Scores vs Background:')
background_scores = corpus.get_scaled_f_scores_vs_background()
print(list(background_scores.index[:10]))

Scaled F Scores vs Background:
['facewithtearsofjoy', 'covid', 'ibd', 'mondaynightibd', 'ridetheshotwave', 'gitwitter', 'medtwitter', 'crohns', 'congrat', 'redheart']


In [76]:
# Term frequency table for the corpus built in the previous cell.
term_freq_df = corpus.get_term_freq_df()

# Add one scaled F-score column per category, then list the ten terms with the
# highest score in that column.
for category in category_list:
  score_title = f"{category} score"
  term_freq_df[score_title] = corpus.get_scaled_f_scores(category)
  print(f"\nTerms most associated with {category}")
  print("----------")
  top_terms = term_freq_df.sort_values(by=score_title, ascending=False).index[:10]
  pprint(list(top_terms))




Terms most assoicated with Patient
----------
['adult chronic',
 'purpleheart',
 'creator',
 'coram',
 'hb',
 'podcast app',
 'young adult',
 'ibdchat',
 'disability',
 'aboutibdpodcast']

Terms most assoicated with HCP
----------
['womeninmedicine',
 'polyp',
 'microbiota',
 'rte',
 'hertimeisnow',
 'foldedhandsmedium',
 'amplify',
 'rte donate',
 'thank ridetheshotwave',
 'mede']


In [77]:
# Render the interactive term-association explorer. The first unique category
# value is the focus category; the second one only supplies the display name
# for the non-focus side.
html = st.produce_scattertext_explorer(
    corpus,
    category=category_list[0],
    category_name=category_list[0],
    not_category_name=category_list[1],
    width_in_pixels=1000,
)

filename = f"{category_filename}-Term-Associations.html"

# Write through a context manager so the handle is flushed and closed as soon
# as the page has been written, instead of being left open.
with open(filename, 'wb') as html_file:
  bytes_written = html_file.write(html.encode('utf-8'))

# Keep the number of bytes written as the cell's displayed result.
bytes_written

1801506

In [78]:
# Phrase-level explorer: parse every document with a TextRank-enabled pipeline,
# build a corpus whose features are PyTextRank phrases, and plot them.

# Fresh pipeline with the "textrank" component appended (presumably registered
# by the notebook's `import pytextrank`).
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("textrank", last=True)

# nlp hates NAN -- but only the text and category columns are used below, so
# rows that merely lack a value in an unrelated column are kept.
new_df = text_df.dropna(subset=[text_col.value, category_col.value])

# Parse each document once and keep the parsed result in a 'parse' column.
new_df = new_df.assign(
    parse=lambda df: df[text_col.value].apply(nlp)
)

corpus = st.CorpusFromParsedDocuments(
    new_df, category_col=category_col.value,
    parsed_col='parse',
    feats_from_spacy_doc=st.PyTextRankPhrases()
).build(
).compact(
    st.AssociationCompactor(2000, use_non_text_features=True)
)

# Per-phrase score table with one column per corpus category.
term_category_scores = corpus.get_metadata_freq_df('')

# Double argsort of the negated scores gives each phrase a 1-based rank within
# every category column (highest score -> rank 1).
term_ranks = np.argsort(np.argsort(-term_category_scores, axis=0), axis=0) + 1
metadata_descriptions = {
    term: '<br/>' + '<br/>'.join(
        '<b>%s</b> TextRank score rank: %s/%s' % (cat, term_ranks.loc[term, cat], corpus.get_num_metadata())
        for cat in corpus.get_categories())
    for term in corpus.get_metadata()
}

# Signed prominence: positive when the focus category scores higher, otherwise
# the negated score of the other category. Columns are selected by category
# label so the sign always agrees with the `category` passed to the explorer;
# positional row access (r[0], r[1]) relied on the table's column order and is
# deprecated in pandas for label-indexed rows.
focus_category, other_category = category_list[0], category_list[1]
category_specific_prominence = term_category_scores.apply(
    lambda row: (
        row[focus_category]
        if row[focus_category] > row[other_category]
        else -row[other_category]
    ),
    axis=1
)

html = st.produce_scattertext_explorer(
    corpus,
    category=focus_category,
    not_category_name=other_category,
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1200,
    transform=st.dense_rank,
    scores=category_specific_prominence,
    sort_by_dist=False,
    use_non_text_features=True,
    topic_model_term_lists={term: [term] for term in corpus.get_metadata()},
    topic_model_preview_size=0,
    metadata_descriptions=metadata_descriptions,
    use_full_doc=True
)

filename = f"{category_filename}-Phrase-Associations.html"

# Context manager closes the handle once the page has been written.
with open(filename, 'wb') as html_file:
  bytes_written = html_file.write(html.encode('utf-8'))

# Keep the number of bytes written as the cell's displayed result.
bytes_written

  precision = (cat_word_counts * 1. / (cat_word_counts + not_cat_word_counts))
  precision = (cat_word_counts * 1. / (cat_word_counts + not_cat_word_counts))


2260562

2260562