Created by: [SmirkyGraphs](http://smirkygraphs.github.io/). Code: [Github](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [insideairbnb.com](http://insideairbnb.com/get-the-data.html).
<hr>

# Rhode Island Airbnb WordClouds

This notebook contains the code used to create wordclouds from reviews of Airbnb locations around Rhode Island. It looks at both positive and negative reviews for the entire state. Additionally, I will group reviews into separate wordclouds for the top 5 cities with the most locations to rent.

<hr>

In [1]:
import re
import numpy as np
import pandas as pd

from PIL import Image
from wordcloud import WordCloud, STOPWORDS

In [2]:
def collect_words(df):
    """Build a single normalized text corpus from the ``comments`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``comments`` column holding the review texts.

    Returns
    -------
    str
        All reviews joined by single spaces, lowercased, containing only
        the letters a-z and single spaces. Empty string if there is no text.
    """
    # getting all reviews into a list
    reviews = df['comments'].tolist()

    # dropping empty and non-text entries; a missing review comes through
    # as a float NaN, which str.join cannot concatenate
    reviews = [x for x in reviews if isinstance(x, str) and x]

    # joining all text into a corpus
    text = ' '.join(reviews)

    # turning separator characters (\ - _ / +) into spaces so that words
    # like "well-kept" stay two words; the hyphen is escaped so it is a
    # literal character and not a range inside the character class
    text = re.sub(r'[\\\-_/+]+', ' ', text)

    # removing non a-z characters, then collapsing spaces and lowercasing
    text = re.sub(r'[^A-Za-z ]+', '', text)
    text = re.sub(' +', ' ', text).strip().lower()

    return text

def generate_wordcloud(text, title, stop_words=None):
    """Render ``text`` as a masked wordcloud and save it as a PNG.

    The mask is read from ``./wordcloud/mask/{title}.png`` and the result is
    written to ``./wordcloud/output/{title}_output.png``.

    Parameters
    ----------
    text : str
        Corpus to build the wordcloud from.
    title : str
        Base name used for both the mask file and the output file.
    stop_words : iterable of str, optional
        Extra words to exclude, added on top of the wordcloud ``STOPWORDS``.
    """
    # adding stopwords (copying so the library's STOPWORDS set is not mutated)
    stopwords = set(STOPWORDS)

    if stop_words is not None:
        stopwords.update(stop_words)

    # setting image mask; the with-block releases the file handle once the
    # pixel data has been copied into the array
    with Image.open(f'./wordcloud/mask/{title}.png') as mask_image:
        mask = np.array(mask_image)

    # setting wordcloud params
    wc = WordCloud(
        font_path='./wordcloud/fonts/trebuc.ttf',
        background_color='#1b1b1b',
        max_font_size=1000,
        mask=mask,
        max_words=200,
        colormap='Spectral',
        stopwords=stopwords,
        normalize_plurals=True,
        random_state=1,  # fixed seed
        collocations=False
    )

    # generating wordcloud
    wc = wc.generate(text)

    # saving image
    wc.to_file(f"./wordcloud/output/{title}_output.png")

In [3]:
# reading the cleaned reviews file into a dataframe
df = pd.read_csv('./data/clean/reviews_clean.csv')

# extra words to leave out of the wordclouds
# (passed to generate_wordcloud as stop_words)
airbnb_stopwords = [
    'place', 'stay', 'room', 'location', 'house',
    'us', 'airbnb', 'apartment', 'home', 'host',
    'hosts', 'rhode', 'island', 'great', 'clean',
]

In [4]:
def _city_corpus(city):
    """Return the collected review text for rows whose
    ``neighbourhood_cleansed`` equals ``city``."""
    in_city = df['neighbourhood_cleansed'] == city
    return collect_words(df[in_city])


# building one corpus per city
sk = _city_corpus('South Kingstown')
pvd = _city_corpus('Providence')
mid = _city_corpus('Middletown')
new = _city_corpus('Newport')
nar = _city_corpus('Narragansett')

# rendering one wordcloud per city; the title selects the mask and output file
generate_wordcloud(sk, 'south_kingstown', airbnb_stopwords)
generate_wordcloud(pvd, 'providence', airbnb_stopwords)
generate_wordcloud(mid, 'middletown', airbnb_stopwords)
generate_wordcloud(new, 'newport', airbnb_stopwords)
generate_wordcloud(nar, 'narragansett', airbnb_stopwords)