In [8]:
def scattertextplot(convention_df):
    #librairies
    %matplotlib inline
    import scattertext as st
    import re, io
    from pprint import pprint
    import pandas as pd
    import numpy as np
    import spacy
    from scipy.stats import rankdata, hmean, norm
    import os, pkgutil, json, urllib
    from urllib.request import urlopen
    from IPython.display import IFrame
    from IPython.core.display import display, HTML
    from scattertext import CorpusFromPandas, produce_scattertext_explorer
    display(HTML("<style>.container { width:98% !important; }</style>"))
    
    #NLP
    from spacy.lang.en import English

    raw_text = 'Hello, world. Here are two sentences.'
    nlp = English()
    nlp.add_pipe('sentencizer')
    doc = nlp(raw_text)
    
    convention_df.groupby('comments').apply(lambda x: x.comments.apply(lambda x: len(x.split())).sum())
    convention_df['parsed'] = convention_df.title.apply(nlp)
    
    #Corpus
    corpus = st.CorpusFromParsedDocuments(convention_df, category_col='title', parsed_col='parsed').build()
    
    #Stats
    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['cure_precision'] = term_freq_df['CANCER FREE!! freq'] * 1./(term_freq_df['It’s over freq'] + term_freq_df['Goodbye my sweet angel. I Lost my 5 year old daughter last night to complications from the treatment for stage IV alveolar rhabdomyosarcoma. No more tubes, no more pokes, no more drugs making her feel sick. No more pain. freq'])
    term_freq_df['cure_recall'] = term_freq_df['Officially 12 months cancer free freq'] * 1./term_freq_df['My initial prognosis was, "a few months". I recently celebrated my 3rd post diagnosis birthday! freq'].sum()
    term_freq_df['cure_f_score'] = term_freq_df.apply(lambda x: (hmean([x['cure_precision'], x['cure_recall']])
                                                                       if x['cure_precision'] > 0 and x['cure_recall'] > 0 
                                                                       else 0), axis=1)     
    #precision and recall
    term_freq_df['cure_precision_pctl'] = rankdata(term_freq_df['cure_precision'])*1./len(term_freq_df)
    term_freq_df['cure_recall_pctl'] = rankdata(term_freq_df['cure_recall'])*1./len(term_freq_df)
    
    #Normalizing 
    def normcdf(x):
        return norm.cdf(x, x.mean(), x.std())
    
    #calc
    term_freq_df['cure_precision_normcdf'] = normcdf(term_freq_df['cure_precision'])
    term_freq_df['cure_recall_normcdf'] = normcdf(term_freq_df['cure_recall'])
    
    #Override
    term_freq_df['cure_precision_normcdf'].fillna(5)
    
    #Cure
    term_freq_df['dem_corner_score'] = corpus.get_rudder_scores('cure')
    
    #HTML
    html = produce_scattertext_explorer(corpus,
                                        category='cure',
                                        category_name='cure',
                                        not_category_name='disease',
                                        width_in_pixels=1000,
                                        minimum_term_frequency=5,

                                        pmi_filter_thresold=4,
                                        metadata=convention_df['comments'],
                                        term_significance = st.LogOddsRatioUninformativeDirichletPrior())
    file_name = 'Conventions2012ScattertextScale.html'
    open(file_name, 'wb').write(html.encode('utf-8'))
    page = IFrame(src=file_name, width = 1200, height=700)
    
    return page

In [9]:
import pandas as pd

# Read the CSV (path is relative to the notebook's working directory) into
# the frame that is handed to scattertextplot in the next cell.
cancer = pd.read_csv(filepath_or_buffer='dataset/cancer.csv')

In [10]:
# Last expression of the cell, so the returned IFrame is rendered as output.
scattertextplot(convention_df=cancer)