Created by: [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [GitHub](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [ri.gov](http://www.governor.ri.gov/newsroom/speeches/index.php).
<hr>

# State of the State Wordcloud Generator

This notebook contains the code used to create word clouds from the text of a State of the State address. Each word cloud shows the 250 most frequent words after stopwords and other common words have been removed. All words are converted to lowercase, and punctuation and special characters are stripped.

In [1]:
import re
import numpy as np
import pandas as pd

from PIL import Image
from wordcloud import WordCloud, STOPWORDS

In [2]:
def collect_words(df, year, stop=False):
    """Expand one year's word counts into a single space-separated string.

    Each word is repeated as many times as its count so that a
    frequency-based consumer sees the original weighting.

    Args:
        df: DataFrame containing the columns 'words', 'stopword' and `year`.
        year: Name of the column holding the counts to expand.
        stop: When truthy, rows whose 'stopword' value equals 'Stopword'
            are dropped before expanding.

    Returns:
        The words joined by single spaces, each repeated `int(count)` times.
        Rows whose count is not greater than zero (including missing
        counts) contribute nothing; an empty selection yields ''.
    """
    subset = df[['words', 'stopword', year]]
    # Comparison with NaN is False, so missing counts are filtered out here.
    subset = subset[subset[year] > 0]

    if stop:
        subset = subset[subset['stopword'] != 'Stopword']

    # Iterate the two needed columns directly instead of building a Series
    # per row with iterrows().
    repeated = []
    for word, count in zip(subset['words'], subset[year]):
        repeated.extend([word] * int(count))

    return ' '.join(repeated)

In [3]:
def generate_wordcloud(text, title, stop_words=None):
    """Render `text` as a masked word cloud and save it as a PNG.

    Args:
        text: Space-separated words; repetition determines word size.
        title: Prefix of the output file, written to
            '../output/wordcloud/{title}_output.png'.
        stop_words: Optional iterable of extra words to exclude in addition
            to the wordcloud package's built-in STOPWORDS.

    Returns:
        None. The image is written to disk as a side effect.
    """
    # Start from the library defaults and add any caller-supplied words.
    stopwords = set(STOPWORDS)
    if stop_words is not None:
        stopwords.update(stop_words)

    # Load the mask inside a context manager so the file handle is released
    # as soon as the pixels have been copied into the array.
    with Image.open('../files/mask/rhode_island.png') as mask_image:
        mask = np.array(mask_image)

    # setting wordcloud params
    wc = WordCloud(
        font_path='../files/font/trebuc.ttf',
        background_color='#1b1b1b',
        max_font_size=1000,
        mask=mask,
        max_words=250,
        colormap='Spectral',
        stopwords=stopwords,
        normalize_plurals=True,
        random_state=1,  # fixed seed so repeated runs give the same layout
        collocations=False,
    )

    # generate() lays out the cloud on the instance; then write it to disk.
    wc.generate(text)
    wc.to_file(f"../output/wordcloud/{title}_output.png")

In [4]:
# Read the combined per-year word counts.
df = pd.read_csv('../output/word_count/clean/word_counts_combined.csv')

# Expand the 2022 counts into text (rows flagged as stopwords dropped),
# then render the word cloud with "state" excluded as well.
words = collect_words(df, year='2022', stop=True)
generate_wordcloud(text=words, title='2022_sots', stop_words=['state'])