## Import packages

In [None]:
import scattertext as st
import pandas as pd
from nltk.corpus import stopwords

## Scattertext 

### Install

It's very important to install scattertext using pip:

`pip install scattertext`

The conda and conda-forge versions are out of date and currently don't work due to some version conflicts!

### Built-in example: 2012 US political convention speeches

First, we'll work through a modified version of the built-in example from scattertext:

In [36]:

# English stop words from NLTK; they are removed from the corpus below so the
# plot is not dominated by function words.
eng_stopwords = set(stopwords.words('english'))

# Load the bundled 2012 convention speeches and add a 'parse' column holding
# each speech processed by scattertext's whitespace-based parser.
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda speeches: speeches.text.apply(st.whitespace_nlp_with_sentences)
)

# Build a corpus categorised by party, drop stop words, keep unigrams only and
# compact the vocabulary with AssociationCompactor(2000).
corpus = (
    st.CorpusFromParsedDocuments(df, category_col='party', parsed_col='parse')
    .build()
    .remove_terms(eng_stopwords, ignore_absences=True)
    .get_unigram_corpus()
    .compact(st.AssociationCompactor(2000))
)

# Render the interactive explorer as one HTML string.
html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    include_gradient=True,
    left_gradient_term='More Republican',
    middle_gradient_term='Metric: Dense Rank Difference',
    right_gradient_term='More Democratic',
)

# Write with an explicit UTF-8 encoding and a context manager: the platform
# default encoding (e.g. cp1252 on Windows) can fail on non-ASCII characters,
# and the bare open(...).write(...) form never explicitly closes the file.
with open('./scattertext0.html', 'w', encoding='utf-8') as html_file:
    n_chars_written = html_file.write(html)

# Show the number of characters written as the cell output.
n_chars_written

1687281

To view the result, open the generated `scattertext0.html` file (saved in the notebook's working directory) in a web browser.

### Scattertext with Reddit data

Now let's apply the same approach to our Reddit data, comparing posts from two subreddits: GPT3 and MachineLearning.

In [33]:
# Read each subreddit's export and tag every row with the subreddit it came from.
gpt3 = pd.read_csv("gpt3_data.csv").assign(subreddit="gpt3")
ml = pd.read_csv("MachineLearning_data.csv").assign(subreddit="MachineLearning")

# Stack both frames, keep only rows that have content and the two columns used
# downstream, then add a 'parse' column with scattertext's whitespace parser.
reddit = (
    pd.concat([gpt3, ml], ignore_index=True)
    .loc[lambda posts: posts["content"].notnull(), ["content", "subreddit"]]
    .assign(
        parse=lambda posts: posts["content"].apply(st.whitespace_nlp_with_sentences)
    )
)

In [32]:
# Build a corpus from the parsed Reddit posts, categorised by subreddit.
# `reddit` and `eng_stopwords` are defined in earlier cells of this notebook.
# Stop words are dropped, only unigrams are kept and the vocabulary is
# compacted with AssociationCompactor(2000).
corpus = (
    st.CorpusFromParsedDocuments(reddit, category_col='subreddit', parsed_col='parse')
    .build()
    .remove_terms(eng_stopwords, ignore_absences=True)
    .get_unigram_corpus()
    .compact(st.AssociationCompactor(2000))
)

# Render the interactive explorer as one HTML string, gpt3 vs. MachineLearning.
html = st.produce_scattertext_explorer(
    corpus,
    category='gpt3',
    category_name='gpt3',
    not_category_name='MachineLearning',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    transform=st.Scalers.dense_rank,
    include_gradient=True,
    left_gradient_term='More MachineLearning',
    middle_gradient_term='Metric: Dense Rank Difference',
    right_gradient_term='More gpt3',
)

# Reddit text frequently contains emoji and other non-ASCII characters, so write
# with an explicit UTF-8 encoding rather than the platform default, and use a
# context manager so the file is closed deterministically.
with open('./scattertext_reddit.html', 'w', encoding='utf-8') as html_file:
    n_chars_written = html_file.write(html)

# Show the number of characters written as the cell output.
n_chars_written

2075402