<div style="margin: 2em 5em 0 0;">
    
<p>
   <span style="margin-left: 17%; vertical-align: text-bottom;">&nbsp;&bullet;&nbsp; Social Media Analytics for Business Intelligence - DAT-9731 - SFO1 &nbsp;&bullet;&nbsp; Hult International Business School</span>
</p>
<br>
<hr style="height: 1px; margin-top: -.2em">


<center>
   <br>
   <span style="font-size: 155%; font-weight: bold; font-family: 'Times New Roman', Times, serif;">
      Facebook page activity tracking | "Australis Cosmetics" Company
   </span>
   <br>
   <br>
   <div style="font-size: 107%; margin-top: .3em">
      <b>Instructors: Beau Giannini, Pavel Paramonov</b>
      <br style="margin-bottom: .5em">
      <b>Ohyoung Kim </b><br>
      <br>
   </div>
</center>

Import libraries and dataset

In [None]:
# Import libraries

## Facebook page analysis starter ##
import html
import string
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd

## Additional package ##
import seaborn as sns    # enhanced data visualization

In [None]:
# Facebook page posts of the Australian cosmetics company "Australis Cosmetics",
# read as CSV straight from GitHub (requires network access)
posts_csv_url = "https://raw.githubusercontent.com/multidis/hult-social-media-analytics/main/data/brand_posts.csv"
df = pd.read_csv(posts_csv_url)
df

Follow the analysis steps:

Step 1. Identify what types of posts are in the dataset and how many entries of each post type are present

In [None]:
# Number of posts of each type, listed in order of first appearance
post_types = df["type"]
for post_type in post_types.unique():
    # A boolean mask sums to the number of matching rows
    n_posts = int((post_types == post_type).sum())
    print(f"Type {post_type} occurs {n_posts} times")

Step 2. List the average number of shares for each post type

In [None]:
# Mean number of shares per post type, listed in order of first appearance
for post_type in df["type"].unique():
    shares_of_type = df.loc[df["type"] == post_type, "shares_count"]
    mean_shares = shares_of_type.mean().round(decimals = 2)
    print(f"Type {post_type} shared {mean_shares} times on average")

Step 3. Analyze the most common keywords occurring throughout the posts and visualize the results

In [None]:
# Text cleaning helpers
## English stop words, held in a set for fast membership tests ##
stop_words = set(stopwords.words('english'))

def text_cleanup(s):
    """Reduce a raw post message to a space-separated string of keywords.

    Steps, in order: collapse newlines to spaces, strip ``http...`` links,
    decode HTML entities, drop every non-ASCII character (emoji included),
    lowercase, tokenize, then keep only alphanumeric tokens that are not
    English stop words.

    Parameters
    ----------
    s : object
        The raw message. Anything that is not a ``str`` (e.g. a missing
        value) is treated as an empty message.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` for
        non-string input.
    """
    # Non-string entries carry no text
    if not isinstance(s, str):
        return ''

    single_line = re.sub('\n+', ' ', s)
    without_links = re.sub(r"http\S+", "", single_line)
    unescaped = html.unescape(without_links)
    # Encoding to ASCII with errors ignored discards emoji and other non-ASCII symbols
    ascii_only = unescaped.encode('ascii', 'ignore').decode('ascii')

    kept_tokens = []
    for token in word_tokenize(ascii_only.lower()):
        if token in stop_words:
            continue
        if token in string.punctuation:
            continue
        if not token.isalnum():
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

In [None]:
# Run text_cleanup over every entry of the 'message' column and
# store the result next to the raw text as 'message_clean'
cleaned_messages = df['message'].map(text_cleanup)
df['message_clean'] = cleaned_messages
df

In [None]:
# Merge every cleaned post into one space-separated corpus
text_all = ' '.join(df['message_clean'])

# Word cloud of single keywords across all posts (two-word collocations disabled)
wc = WordCloud(
    width=1200,
    height=800,
    max_font_size=110,
    collocations=False,
).generate(text_all)

# Show the rendered cloud as an image without axis ticks or frame
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Store the words used to create WordCloud as kwords
kwords = WordCloud().process_text(text_all)

# Transform that dictionary into a pandas DataFrame
df_kwords = pd.DataFrame(list(kwords.items()), columns=['keyword', 'count']).set_index('keyword')

# Plot a bar chart with the top keywords
%matplotlib inline
df_kwords.sort_values(by='count', ascending=False).head(20).plot.bar()

Step 4. Explore the times when posts were created for the most shared entries

In [None]:
# Creation timestamps of the 20 most-shared posts, most shared first
(
    df.sort_values(by='shares_count', ascending=False)
      .loc[:, 'created_time']
      .iloc[:20]
)

Step 5. Identify the top-20 posts that received the most shares and visualize their most common keywords

In [None]:
# Cleaned text of the 20 most-shared posts, joined into one string
top_shared_messages = df.sort_values(by='shares_count', ascending=False)['message_clean'].head(20)
text_shared = ' '.join(top_shared_messages)

# Word cloud of single keywords for those posts (two-word collocations disabled)
wc = WordCloud(
    width=1200,
    height=800,
    max_font_size=110,
    collocations=False,
).generate(text_shared)

# Show the rendered cloud as an image without axis ticks or frame
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Store the words used to create WordCloud as kwords1
kwords1 = WordCloud().process_text(text_shared)

# Transform that dictionary into a pandas DataFrame
df_kwords1 = pd.DataFrame(list(kwords1.items()), columns=['keyword', 'count']).set_index('keyword')

# Plot a bar chart with the top keywords
%matplotlib inline
df_kwords1.sort_values(by='count', ascending=False).head(20).plot.bar()

Additional Step. Analyze the relationships between the numeric count variables

In [None]:
# Pearson correlation matrix, rounded to two decimals.
# Only numeric/bool columns are correlated: df also holds text columns (for
# example 'message_clean'), and pandas >= 2.0 raises a ValueError instead of
# silently dropping them when DataFrame.corr() is called on the whole frame.
df_numeric = df.select_dtypes(include=['number', 'bool'])
df_corr = df_numeric.corr(method = 'pearson').round(decimals = 2)

# Larger square canvas so the annotated cells stay readable
fig, ax = plt.subplots(figsize=(12,12))

# Annotated heatmap drawn explicitly on the axes created above
sns.heatmap(data       = df_corr,       # the correlation matrix
            cmap       = 'inferno',     # colour map
            square     = True,          # keep every cell square
            annot      = True,          # print the coefficient inside each cell
            linecolor  = 'black',       # colour of the lines between cells
            linewidths = 0.5,           # thickness of the lines between cells
            ax         = ax)            # target axes (instead of the implicit current axes)

# Title (the surrounding newlines add vertical padding) and display
ax.set_title("""
Linear Correlation Heatmap for each count
""")

plt.show()