In [None]:
# DataPrep
import re
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import joblib

# Modeling
#from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LogisticRegression
#from sklearn.naive_bayes import GaussianNB
#import lightgbm as lgb

In [None]:
# Standard libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from matplotlib.gridspec import GridSpec
pd.set_option('display.max_columns', 100)
import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
from wordcloud import WordCloud
from collections import Counter




In [None]:
# Load the raw reviews export into a DataFrame.
# NOTE(review): the file name contains a space before ".csv" - confirm it matches the file on disk.
df=pd.read_csv("BCA_Airbnb_reviews .csv")
# Display the loaded frame
df

In [None]:
# Keep only the free-text review column for the text pipeline
df_comments = df.loc[:, ['Reviews']]
# Alternative selection when the data also has a ratings column named 'score':
#df_comments = df.loc[:, ['Reviews','score']]
#df_comments = df_comments.dropna(subset=['Review Description'])
df_comments = df_comments.reset_index(drop=True)
print(f'Dataset shape: {df_comments.shape}')
# Rename to the column name used by the rest of the notebook
df_comments.columns = ['comment']
#df_comments.columns = ['comment','score']
df_comments.head()

In [None]:
# Column dtypes of the working frame
df_comments.dtypes

# Data Pre-Processing

In [None]:
def find_patterns(re_pattern, text_list):
    """
    Locate every occurrence of a regular expression in a list of texts.

    Args:
    ---------
    re_pattern: regular expression pattern to be used on search [type: string]
    text_list: list with text strings [type: list]

    Returns:
    positions_dict: python dictionary holding only the texts with at least one match,
    with key-value pairs as below:
        'Text idx <i>': [(start_1, end_1), (start_2, end_2), ... (start_n, end_n)]
    """

    # Compile once; the same pattern is reused for every text
    compiled = re.compile(re_pattern)

    positions_dict = {}
    for idx, text in enumerate(text_list):
        spans = [match.span() for match in compiled.finditer(text)]
        # Texts without any match are left out of the result
        if spans:
            positions_dict[f'Text idx {idx}'] = spans

    # Usage sketch (kept as comments; it used to live here as a dead string literal):
    #   p = r'[R]{0,1}\$[ ]{0,}\d+(,|\.)\d+'
    #   pattern_dict = find_patterns(p, reviews_breakline)
    #   print(len(pattern_dict))
    #   for idx in [int(c.split(' ')[-1]) for c in pattern_dict]:
    #       print(f'{reviews_breakline[idx]}\n')

    return positions_dict

def print_step_result(text_list_before, text_list_after, idx_list):
    """
    Print selected texts before and after a transformation step.

    Args:
    ----------
    text_list_before: list object with text content before transformation [type: list]
    text_list_after: list object with text content after transformation [type: list]
    idx_list: list object with indexes to be printed [type: list]
    """

    # The printed counter is 1-based and independent from the actual text index
    for counter, position in enumerate(idx_list, start=1):
        print(f'--- Text {counter} ---\n')
        print(f'Before: \n{text_list_before[position]}\n')
        print(f'After: \n{text_list_after[position]}\n')

In [None]:
def re_breakline(text_list):
    """
    Replace every line-feed / carriage-return character by a single space.

    Args:
    ----------
    text_list: list object with text content to be prepared [type: list]

    Returns:
    list with one cleaned string per input element; elements are passed through
    str() first, so non-string entries come back as their string form
    """

    cleaned = []
    for entry in text_list:
        cleaned.append(re.sub('[\n\r]', ' ', str(entry)))
    return cleaned

In [None]:
# Creating a list with the raw review texts
reviews = list(df_comments['comment'].values)

# Replacing line breaks and keeping the result as a new column
reviews_breakline = re_breakline(reviews)
df_comments['re_breakline'] = reviews_breakline

# Verifying results on a hard-coded example index (assumes the dataset has at least 5 rows)
print_step_result(reviews, reviews_breakline, idx_list=[4])

In [None]:
def re_hiperlinks(text_list):
    """
    Replace every http/https URL by the placeholder token ' link '.

    Args:
    ----------
    text_list: list object with text content to be prepared [type: list]

    Returns:
    list with one string per input element, URLs replaced by ' link '
    """

    # Raw string: the pattern holds regex escapes that are not valid Python string escapes
    pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return [re.sub(pattern, ' link ', r) for r in text_list]

In [None]:
# Replacing hyperlinks and keeping the result as a new column
reviews_hiperlinks = re_hiperlinks(reviews_breakline)
df_comments['re_hiperlinks'] = reviews_hiperlinks

# Verifying results on a hard-coded example index (assumes the dataset has at least 8 rows)
print_step_result(reviews_breakline, reviews_hiperlinks, idx_list=[7])

In [None]:
def re_dates(text_list):
    """
    Replace day/month/year dates (separated by '/' or '.') by the placeholder token ' data '.

    Args:
    ----------
    text_list: list object with text content to be prepared [type: list]

    Returns:
    list with one string per input element, dates replaced by ' data '
    (the placeholder text is kept unchanged for backward compatibility)
    """

    # Raw string: the pattern holds regex escapes that are not valid Python string escapes
    pattern = r'([0-2][0-9]|(3)[0-1])(\/|\.)(((0)[0-9])|((1)[0-2]))(\/|\.)\d{2,4}'
    return [re.sub(pattern, ' data ', r) for r in text_list]

In [None]:
# Replacing dates and keeping the result as a new column
reviews_dates = re_dates(reviews_hiperlinks)
df_comments['re_dates'] = reviews_dates

# Verifying results on a hard-coded example index (assumes the dataset has at least 8 rows)
print_step_result(reviews_hiperlinks, reviews_dates, idx_list=[7])

In [None]:
# Special Characters

In [None]:
def re_special_chars(text_list):
    """
    Replace every non-word character by a single space.

    Args:
    ----------
    text_list: list object with text content to be prepared [type: list]

    Returns:
    list with one string per input element, each non-word character replaced by ' '
    """

    # Raw string so the regex class escape is not read as a Python string escape
    return [re.sub(r'\W', ' ', r) for r in text_list]

In [None]:
# Replacing special characters and keeping the result as a new column
reviews_special_chars = re_special_chars(reviews_dates)
df_comments['re_special_chars'] = reviews_special_chars

# Verifying results on a hard-coded example index (assumes the dataset has at least 5 rows)
print_step_result(reviews_dates, reviews_special_chars, idx_list=[4])

In [None]:
def re_whitespaces(text_list):
    """
    Collapse runs of whitespace into one space and strip trailing whitespace.

    Args:
    ----------
    text_list: list object with text content to be prepared [type: list]

    Returns:
    list with one cleaned string per input element; a leading whitespace run is
    collapsed to a single space but not removed
    """

    # Raw strings so the regex escapes are not read as Python string escapes
    white_spaces = [re.sub(r'\s+', ' ', r) for r in text_list]
    white_spaces_end = [re.sub(r'[ \t]+$', '', r) for r in white_spaces]
    return white_spaces_end

In [None]:
# Normalising whitespace and keeping the result as a new column
reviews_whitespaces = re_whitespaces(reviews_special_chars)
df_comments['re_whitespaces'] = reviews_whitespaces

# Verifying results on hard-coded example indexes (the last review included)
print_step_result(reviews_special_chars, reviews_whitespaces, idx_list=[3, 4, -1])

In [None]:
import nltk
# Fetch the nltk stopword corpus used by stopwords.words(...) in the following cells
nltk.download('stopwords')

In [None]:
# Examples of some english stopwords
# NOTE: despite the 'pt_' prefix this list holds the ENGLISH stopwords; the name is
# reused by the vectorizer cells further down.
pt_stopwords = stopwords.words('english')
print(f'Total english stopwords in the nltk.corpous module: {len(pt_stopwords)}')
pt_stopwords[:10]

In [None]:
# Defining a function to remove the stopwords and to lower the comments
def stopwords_removal(text, cached_stopwords=stopwords.words('english')):
    """
    Lower-case a text and drop its stopwords.

    Args:
    ----------
    text: text whose stopwords will be removed [type: string]
    cached_stopwords: stopwords to be applied on the process [type: list, default: stopwords.words('english')]

    Returns:
    list with the lower-cased, whitespace-separated tokens of `text` that are not stopwords
    """

    # A set gives constant-time membership tests instead of scanning the list for every token
    stopword_set = set(cached_stopwords)
    lowered_tokens = (token.lower() for token in text.split())
    return [token for token in lowered_tokens if token not in stopword_set]

In [None]:
# Removing stopwords (tokens are re-joined with single spaces) and looking at some examples
reviews_stopwords = [' '.join(stopwords_removal(review)) for review in reviews_whitespaces]
df_comments['stopwords_removed'] = reviews_stopwords

# Hard-coded example indexes (assumes the dataset has at least 46 rows)
print_step_result(reviews_whitespaces, reviews_stopwords, idx_list=[0, 45])

In [None]:
import nltk
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer instance.
# NOTE(review): this variable is not referenced by the cells visible in this notebook;
# stemming_process builds its own default stemmer.
snowBallStemmer = SnowballStemmer("english")

In [None]:
# Defining a function to stem every token of a comment
def stemming_process(text, stemmer=SnowballStemmer("english")):
    """
    Split a text on whitespace and stem each token.

    Args:
    ----------
    text: text whose tokens will be stemmed [type: string]
    stemmer: stemmer instance to be applied [type: object, default: SnowballStemmer("english")]

    Returns:
    list with the stemmed tokens, in their original order
    """

    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return stems

In [None]:
# Applying stemming to the stopword-free reviews and looking at some examples
reviews_stemmer = [' '.join(stemming_process(review)) for review in reviews_stopwords]
df_comments['stemming'] = reviews_stemmer

# Hard-coded example indexes (assumes the dataset has at least 46 rows)
print_step_result(reviews_stopwords, reviews_stemmer, idx_list=[0, 45, -1])

In [None]:
def extract_features_from_corpus(corpus, vectorizer, df=False):
    """
    Fit a vectorizer on a corpus and return the resulting document-term matrix.

    Args
    ------------
    corpus: collection of text documents to be transformed into a document-term matrix
    vectorizer: engine to be used in the transformation; it must offer fit_transform()
        and, when df=True, get_feature_names_out() or the legacy get_feature_names() [type: object]
    df: when True, also build a DataFrame with one column per feature [type: bool, default: False]

    Returns
    ------------
    corpus_features: dense array produced by fit_transform(corpus).toarray()
    df_corpus_features: DataFrame version of corpus_features, or None when df is False
    """

    # Extracting features
    corpus_features = vectorizer.fit_transform(corpus).toarray()

    # Transforming into a dataframe to give interpretability to the process
    df_corpus_features = None
    if df:
        # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
        # and fall back to the legacy name for older versions.
        if hasattr(vectorizer, 'get_feature_names_out'):
            features_names = list(vectorizer.get_feature_names_out())
        else:
            features_names = vectorizer.get_feature_names()
        df_corpus_features = pd.DataFrame(corpus_features, columns=features_names)

    return corpus_features, df_corpus_features

In [None]:
# Creating an object for the CountVectorizer class
# (pt_stopwords holds the english stopword list built earlier)
count_vectorizer = CountVectorizer(max_features=300, min_df=2, max_df=0.8, stop_words=pt_stopwords)

# Extracting bag-of-words features for the stemmed corpus
countv_features, df_countv_features = extract_features_from_corpus(reviews_stemmer, count_vectorizer, df=True)
print(f'Shape of countv_features matrix: {countv_features.shape}\n')
print(f'Example of DataFrame of corpus features:')
df_countv_features.head()

In [None]:
# Creating an object for the TfidfVectorizer class
# (pt_stopwords holds the english stopword list built earlier)
tfidf_vectorizer = TfidfVectorizer(max_features=300, min_df=2, max_df=0.8, stop_words=pt_stopwords)

# Extracting TF-IDF features for the stemmed corpus
tfidf_features, df_tfidf_features = extract_features_from_corpus(reviews_stemmer, tfidf_vectorizer, df=True)
print(f'Shape of tfidf_features matrix: {tfidf_features.shape}\n')
print(f'Example of DataFrame of corpus features:')
df_tfidf_features.head()

In [None]:
# Distribution of the ratings. df_comments is built from the 'Reviews' column only,
# so the plot is drawn only when a 'score' column was actually loaded.
import seaborn as sns
if 'score' in df_comments.columns:
    sns.set_theme(style="darkgrid")
    sns.countplot(x='score', data=df_comments)
else:
    print("No 'score' column in df_comments - skipping the ratings countplot")

In [None]:
# Labelling data from the ratings: 1-3 -> negative, 4-5 -> positive
score_map = {
    1: 'negative',
    2: 'negative',
    3: 'negative',
    4: 'positive',
    5: 'positive'
}
# df_comments is built from the 'Reviews' column only, so the rating-based labels
# can be created only when a 'score' column was actually loaded.
if 'score' in df_comments.columns:
    df_comments['sentiment_label'] = df_comments['score'].map(score_map)
else:
    print("No 'score' column in df_comments - skipping the rating-based labels")


In [None]:
# Class balance of the rating-based labels; they exist only when a 'score' column was mapped
if 'sentiment_label' in df_comments.columns:
    sent = df_comments.groupby('sentiment_label')
    print(sent.size())
else:
    print("No 'sentiment_label' column yet - it is derived from the TextBlob polarity further below")

In [None]:
# Display the working frame with every intermediate text column
df_comments

In [None]:
from textblob import TextBlob
# Subjectivity helper built on TextBlob
def getSubjectivity(text):
    """Return the subjectivity value TextBlob computes for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

# Polarity helper built on TextBlob
def getPolarity(text):
    """Return the polarity value TextBlob computes for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity


# Create two new columns 'Subjectivity' & 'Polarity' from the stopword-free reviews
df_comments['Subjectivity'] = df_comments['stopwords_removed'].apply(getSubjectivity)
df_comments['Polarity'] = df_comments['stopwords_removed'].apply(getPolarity)

# Show the new dataframe with columns 'Subjectivity' & 'Polarity'
df_comments

In [None]:
# Map a polarity score to one of two labels: 'negative' (score < 0) or 'positive' (everything else)
def getAnalysis(score):
    """
    Turn a polarity score into a sentiment label.

    Only two classes are produced: scores below zero are 'negative'; every other
    score, zero included, is 'positive' (there is no neutral class).
    """
    return 'negative' if score < 0 else 'positive'
# Label every review from its polarity; this (re)writes the 'sentiment_label' column
df_comments['sentiment_label'] = df_comments['Polarity'].apply(getAnalysis)
# Show the dataframe
df_comments

In [None]:
# Number of reviews per polarity-based sentiment label
sent = df_comments.groupby('sentiment_label')
print(sent.size())

# N-grams Plotly

# Uni-gram

In [None]:
def get_top_n_words(corpus, n=None):
    """
    Rank the unigrams of a corpus by their total number of occurrences.

    Args:
    ----------
    corpus: iterable with the text documents to be counted
    n: number of top entries to keep; None keeps every entry [default: None]

    Returns:
    list of (word, count) tuples sorted by descending count
    """
    vec = CountVectorizer(stop_words='english')
    # fit_transform learns the vocabulary and counts in one pass over the corpus
    bag_of_words = vec.fit_transform(corpus)
    # Column-wise sum = total occurrences of each vocabulary entry
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Top 30 unigrams over all stopword-free reviews
common_words = get_top_n_words(df_comments['stopwords_removed'], 30)
df2 = pd.DataFrame(common_words, columns = ['unigram' , 'count'])

# Bar chart of the unigram counts
fig = go.Figure([go.Bar(x=df2['unigram'], y=df2['count'])])
fig.update_layout(title=go.layout.Title(text="Top 30 unigrams in the Reviews"))
fig.show()

In [None]:
# Persist the unigram table (the DataFrame index is written as the first column)
df2.to_csv('unigram.csv')

In [None]:
# Inspect the unigram table
df2

# Bi-gram

In [None]:
def get_top_n_bigram(corpus, n=None):
    """
    Rank the bigrams of a corpus by their total number of occurrences.

    Args:
    ----------
    corpus: iterable with the text documents to be counted
    n: number of top entries to keep; None keeps every entry [default: None]

    Returns:
    list of (bigram, count) tuples sorted by descending count
    """
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    # fit_transform learns the vocabulary and counts in one pass over the corpus
    bag_of_words = vec.fit_transform(corpus)
    # Column-wise sum = total occurrences of each vocabulary entry
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Top 20 bigrams over all stopword-free reviews
common_words = get_top_n_bigram(df_comments['stopwords_removed'], 20)
df3 = pd.DataFrame(common_words, columns = ['bigram' , 'count'])

# Bar chart of the bigram counts
fig = go.Figure([go.Bar(x=df3['bigram'], y=df3['count'])])
fig.update_layout(title=go.layout.Title(text="Top 20 bigrams in the Reviews"))
fig.show()

In [None]:
# Inspect the bigram table
df3

In [None]:
# Persist the bigram table (the DataFrame index is written as the first column)
df3.to_csv("bigrams.csv")

# Tri-gram

In [None]:
def get_top_n_trigram(corpus, n=None):
    """
    Rank the trigrams of a corpus by their total number of occurrences.

    Args:
    ----------
    corpus: iterable with the text documents to be counted
    n: number of top entries to keep; None keeps every entry [default: None]

    Returns:
    list of (trigram, count) tuples sorted by descending count
    """
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english')
    # fit_transform learns the vocabulary and counts in one pass over the corpus
    bag_of_words = vec.fit_transform(corpus)
    # Column-wise sum = total occurrences of each vocabulary entry
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Top 20 trigrams over all stopword-free reviews
common_words = get_top_n_trigram(df_comments['stopwords_removed'], 20)
df4 = pd.DataFrame(common_words, columns = ['trigram' , 'count'])

# Bar chart of the trigram counts
fig = go.Figure([go.Bar(x=df4['trigram'], y=df4['count'])])
fig.update_layout(title=go.layout.Title(text="Top 20 trigrams in the Reviews"))
fig.show()

In [None]:
# Inspect the trigram table
df4

In [None]:
# Persist the trigram table (the DataFrame index is written as the first column)
df4.to_csv("trigrams.csv")

In [None]:
# Column names of the trigram table
df4.columns

# Word Cloud General (includes for all comments)

In [None]:
# background_color = https://www.google.com/search?q=hex+color+picker&oq=hex+color+&aqs=chrome.2.69i57j35i39j0i20i263i433i512j0i512l3j69i60l2.5061j0j7&sourceid=chrome&ie=UTF-8
# colormap = https://matplotlib.org/stable/tutorials/colors/colormaps.html

The `WordCloud` arguments used in the following cells are described below:

width/height: we can change the dimension of the canvas using these arguments. Here we assign width as 3000 and height as 2000.
    
random_state: an int seed for the random choices made while drawing the cloud, so that re-running the cell produces the same image.
    
background_color: It is used for the background color of the word cloud image. 
    
colormap: using this argument we can change each word color. Matplotlib colormaps provide awesome colors.
    
collocations: this argument is set to False so that the word cloud does not include bigrams (collocations of two words), which would otherwise make some words appear twice.
    
stopwords: stop words are words that are very common in the English language, such as ‘we’, ‘the’, ‘a’, ‘an’, etc., so they are excluded from the cloud. The `STOPWORDS` set is imported from the `wordcloud` library in the next cell.

In [None]:
from wordcloud import STOPWORDS

In [None]:
# Import package
import matplotlib.pyplot as plt
# Define a function to plot word cloud
def plot_cloud(wordcloud):
    """Draw a word cloud image on a new 40x30 matplotlib figure with the axes hidden."""
    plt.figure(figsize=(40, 30))  # new figure sized for the cloud
    plt.imshow(wordcloud)         # render the cloud as an image
    plt.axis("off")               # hide ticks and frame

# Generate the word cloud from every stopword-free review joined into one string
allWords = ' '.join(df_comments['stopwords_removed'])
wordcloud = WordCloud(
    width=3000,
    height=2000,
    random_state=1,
    background_color='black',
    colormap='Set2',
    collocations=False,
    stopwords=STOPWORDS,
).generate(allWords)
# Plot
plot_cloud(wordcloud)

# Positive N-grams Plotly

In [None]:
# get_top_n_words is already defined in the "Uni-gram" section above; it is reused here
# instead of being redefined.
df_positive = df_comments[df_comments["sentiment_label"] == "positive"]
common_words = get_top_n_words(df_positive['stopwords_removed'], 30)
dfp1 = pd.DataFrame(common_words, columns=['unigram', 'count'])

# The title names the subset so this chart is not confused with the all-reviews one
fig = go.Figure([go.Bar(x=dfp1['unigram'], y=dfp1['count'])])
fig.update_layout(title=go.layout.Title(text="Top 30 unigrams in the Positive Reviews"))
fig.show()

In [None]:
# Persist the positive-review unigram table (the DataFrame index is written as the first column)
dfp1.to_csv("positive_unigrams.csv")

In [None]:
# get_top_n_bigram is already defined in the "Bi-gram" section above; it is reused here
# instead of being redefined.
df_positive = df_comments[df_comments["sentiment_label"] == "positive"]
common_words = get_top_n_bigram(df_positive['stopwords_removed'], 20)
dfp2 = pd.DataFrame(common_words, columns=['bigram', 'count'])

# The title names the subset so this chart is not confused with the all-reviews one
fig = go.Figure([go.Bar(x=dfp2['bigram'], y=dfp2['count'])])
fig.update_layout(title=go.layout.Title(text="Top 20 bigrams in the Positive Reviews"))
fig.show()

In [None]:
# Persist the positive-review bigram table (the DataFrame index is written as the first column)
dfp2.to_csv("positive_bigrams.csv")

In [None]:
# get_top_n_trigram is already defined in the "Tri-gram" section above; it is reused here
# instead of being redefined.
df_positive = df_comments[df_comments["sentiment_label"] == "positive"]
common_words = get_top_n_trigram(df_positive['stopwords_removed'], 20)
dfp4 = pd.DataFrame(common_words, columns=['trigram', 'count'])

# The title names the subset so this chart is not confused with the all-reviews one
fig = go.Figure([go.Bar(x=dfp4['trigram'], y=dfp4['count'])])
fig.update_layout(title=go.layout.Title(text="Top 20 trigrams in the Positive Reviews"))
fig.show()

In [None]:
# Persist the positive-review trigram table (the DataFrame index is written as the first column)
dfp4.to_csv("positive_trigrams.csv")

# POSITIVE WORD CLOUD

In [None]:
# Import package
import matplotlib.pyplot as plt
# plot_cloud is already defined in the general word-cloud section above; it is reused
# here instead of being redefined.

# Generate the word cloud from the positive reviews only
df_positive = df_comments[df_comments["sentiment_label"] == "positive"]
allWords = ' '.join(df_positive['stopwords_removed'])
wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='black', colormap='Set2', collocations=False, stopwords = STOPWORDS).generate(allWords)
# Plot
plot_cloud(wordcloud)

In [None]:
#df_positive.to_csv("positive_reviews.csv")

# Negative n-grams Plotly

In [None]:
# get_top_n_words is already defined in the "Uni-gram" section above; it is reused here
# instead of being redefined.
# The negative subset gets its own name so that df_positive is not overwritten with
# negative reviews.
df_negative = df_comments[df_comments["sentiment_label"] == "negative"]
common_words = get_top_n_words(df_negative['stopwords_removed'], 30)
dfn1 = pd.DataFrame(common_words, columns=['unigram', 'count'])

# The title names the subset so this chart is not confused with the all-reviews one
fig = go.Figure([go.Bar(x=dfn1['unigram'], y=dfn1['count'])])
fig.update_layout(title=go.layout.Title(text="Top 30 unigrams in the Negative Reviews"))
fig.show()

In [None]:
# Persist the negative-review unigram table (the DataFrame index is written as the first column)
dfn1.to_csv("negative_unigrams.csv")

In [None]:
# get_top_n_bigram is already defined in the "Bi-gram" section above; it is reused here
# instead of being redefined.
# The negative subset gets its own name so that df_positive is not overwritten with
# negative reviews.
df_negative = df_comments[df_comments["sentiment_label"] == "negative"]
common_words = get_top_n_bigram(df_negative['stopwords_removed'], 20)
dfn2 = pd.DataFrame(common_words, columns=['bigram', 'count'])

# The title names the subset so this chart is not confused with the all-reviews one
fig = go.Figure([go.Bar(x=dfn2['bigram'], y=dfn2['count'])])
fig.update_layout(title=go.layout.Title(text="Top 20 bigrams in the Negative Reviews"))
fig.show()

In [None]:
# Persist the negative-review bigram table (the DataFrame index is written as the first column)
dfn2.to_csv("negative_bigrams.csv")

In [None]:
# get_top_n_trigram is already defined in the "Tri-gram" section above; it is reused here
# instead of being redefined.
# The negative subset gets its own name so that df_positive is not overwritten with
# negative reviews.
df_negative = df_comments[df_comments["sentiment_label"] == "negative"]
common_words = get_top_n_trigram(df_negative['stopwords_removed'], 20)
dfn3 = pd.DataFrame(common_words, columns=['trigram', 'count'])

# The title names the subset so this chart is not confused with the all-reviews one
fig = go.Figure([go.Bar(x=dfn3['trigram'], y=dfn3['count'])])
fig.update_layout(title=go.layout.Title(text="Top 20 trigrams in the Negative Reviews"))
fig.show()

In [None]:
# Persist the negative-review trigram table (the DataFrame index is written as the first column)
dfn3.to_csv("negative_trigrams.csv")

# NEGATIVE WORD CLOUD

In [None]:
# Import package
import matplotlib.pyplot as plt
# plot_cloud is already defined in the general word-cloud section above; it is reused
# here instead of being redefined.

# Generate the word cloud from the negative reviews only
df_negative = df_comments[df_comments["sentiment_label"] == "negative"]
allWords = ' '.join(df_negative['stopwords_removed'])
wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='black', colormap='Set2', collocations=False, stopwords = STOPWORDS).generate(allWords)
# Plot
plot_cloud(wordcloud)