# Preamble

## Imports

Standard imports

In [14]:
import sys, os, re, time, copy
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import unidecode
import html

Language libraries:

In [15]:
from textblob import TextBlob

In [16]:
## Importing spacy takes a few seconds

import spacy
## Load two English pipelines by package name; both packages must already be installed for these calls to succeed.
## NOTE(review): neither `nlp` nor `nlp_big` is referenced in the part of the notebook visible here -- confirm both loads are still needed.
nlp = spacy.load("en_core_web_sm")
nlp_big = spacy.load("en_core_web_lg")

### Settings

Disable copy warnings in Pandas

In [17]:
## Turn off pandas' chained-assignment check (the option's default is 'warn')
pd.set_option("mode.chained_assignment", None)

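Disable deprecation warnings (note: the cell below silences *every* warning raised through `warnings.warn`, not only deprecation warnings)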

In [18]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


## From here on, calls that go through `warnings.warn(...)` hit the no-op above and emit nothing
warnings.warn = warn

-----
# Part 1: Inspect the data
-----

## Load the data

You can also download the Excel sheet directly by clicking here: [download](https://github.com/TiesdeKok/eaa_2023_nlp_workshop/raw/main/facebook_review_dataset.xlsx)

In [19]:
## Read the review workbook; the path is relative, so the file must sit in the current working directory
review_df = pd.read_excel('facebook_review_dataset.xlsx')

## Inspect the data

### Dataset

In [20]:
## Size and date range of the dataset, followed by the first two reviews shown with columns as rows
print(
    f"The dataset contains {len(review_df)} reviews. \n"
    f"The oldest review is from {review_df['date'].min()} \n"
    f"The newest review is from {review_df['date'].max()}."
)
review_df.head(2).T

The dataset contains 2290 reviews. 
The oldest review is from 2014-03-17 00:00:00 
The newest review is from 2021-12-28 00:00:00.


Unnamed: 0,0,1
date,2021-11-09 00:00:00,2021-12-25 00:00:00
year,2021,2021
rating,5,5
rating_wlb,5,5
rating_culture,5,5
rating_dei,5,4
rating_comp,5,4
job_title,Software Engineer,Data Scientist
location,"Seattle, WA","San Francisco, CA"
pros,"I’m still a bit new, but I have very good impr...","Its all you hear about, fun and diverse and su..."


### Number of reviews by year

In [21]:
## Number of reviews per value of `year`, listed from the most recent year down
review_df.value_counts('year').sort_index(ascending=False)

year
2021    1123
2020     572
2019     226
2018     239
2017     128
2016       1
2014       1
dtype: int64

------
## Evaluate the textual data that we are working with. 
-----

### Show random review

**Note:** Every time you run the code below it will show a different review in the dataset.

In [22]:
## Draw one random review and flatten that row into a plain {column: value} dict
review_row = review_df.sample(1).iloc[0].to_dict()

print(f"Review by a {review_row['job_title']} in {review_row['location']} on {review_row['date']}")
## Same layout for both free-text fields: heading, blank line, stripped text
for label, column in (("Pros", "pros"), ("Cons", "cons")):
    print(f"\n{label}: \n\n{review_row[column].strip()}")

Review by a Data Scientist in San Francisco, CA on 2021-12-25 00:00:00

Pros: 

Its all you hear about, fun and diverse and super engaging. Full of opportunities

Cons: 

You are evaluated nased on individual accomplishments, which is not always ideal to create an environment for contribution


## 

------


# Part 2: Clean the data
-----

### Cleaning function

In [23]:
def clean_text(input_str):
    """Normalize one raw review text into plain ASCII sentences.

    Steps, in order: unescape HTML entities, transliterate to ASCII, replace
    Excel's ``_x000D_`` artefact with a space, drop every character that is not
    a letter, whitespace or one of ``. ! ? ,``, collapse horizontal whitespace,
    turn line breaks into sentence breaks, collapse repeated dots, strip, and
    make sure the text ends with a dot.

    Parameters
    ----------
    input_str : str, None or float NaN
        Raw review text. ``None`` and NaN (an empty spreadsheet cell) are
        treated as empty text.

    Returns
    -------
    str
        The cleaned text ending in ``'.'``, or ``''`` when nothing is left
        after cleaning (missing, empty, or made only of removed characters).
    """
    ## Missing cells (None / NaN) have no text to clean
    if input_str is None or (isinstance(input_str, float) and input_str != input_str):
        return ''

    ## Unescape HTML entities first, so characters they encode (e.g. &eacute;) are still transliterated below
    clean_str = html.unescape(input_str)

    ## Transliterate special characters to their closest ASCII equivalent
    clean_str = unidecode.unidecode(clean_str)

    ## Replace Excel newline encoding errors (must happen before digits and underscores are stripped)
    clean_str = clean_str.replace("_x000D_", " ")

    ## Remove any non-letter characters except for whitespace and regular sentence punctuation
    clean_str = re.sub(r'[^a-zA-Z\s.!?,]', '', clean_str)

    ## Replace every run of whitespace other than \n with a single space
    clean_str = re.sub(r'[ \t\x0B\f\r]+', ' ', clean_str)

    ## Replace a newline (and the whitespace around it) with a dot and a space
    clean_str = re.sub(r'\s*\n\s*', '. ', clean_str)

    ## Replace duplicate dots
    clean_str = re.sub(r'\.+', '.', clean_str)

    ## Remove any leading or trailing spaces
    clean_str = clean_str.strip()

    ## Nothing survived cleaning: return the empty text instead of indexing into it
    if not clean_str:
        return ''

    ## Add trailing dot if not exists
    if not clean_str.endswith('.'):
        clean_str += '.'

    return clean_str

### Run

In [24]:
## Clean both free-text columns; results go to `pros_clean` and `cons_clean`
for raw_col in ('pros', 'cons'):
    review_df[f'{raw_col}_clean'] = review_df[raw_col].apply(clean_text)

#### Add lower case version

In [25]:
## Lower-cased copies of the cleaned columns, stored under the same name plus an `_l` suffix
for clean_col in ('pros_clean', 'cons_clean'):
    review_df[f'{clean_col}_l'] = review_df[clean_col].str.lower()

### Inspect

In [26]:
## One random review, shown before and after cleaning
review_row = review_df.sample(1).iloc[0].to_dict()
rule = '-' * 15

print(f"Review by a {review_row['job_title']} in {review_row['location']} on {review_row['date']}")
## The raw columns are `pros`/`cons`; the cleaned ones carry the `_clean` suffix
for heading, suffix in (("Before cleaning:", ""), ("After cleaning:", "_clean")):
    print(f"\n{rule}\n{heading}\n{rule}")
    for label in ("Pros", "Cons"):
        print(f"\n{label}: \n\n{review_row[label.lower() + suffix].strip()}")

Review by a Software Engineer in Menlo Park, CA on 2020-09-07 00:00:00

---------------
Before cleaning:
---------------

Pros: 

- The engineering culture is still very bottom-up: teams generally have a lot of power and responsibility to determine what they work on._x000D_
- Internal developer tools are a huge focus. Anything which slows developers down is fixed if possible._x000D_
- Very good internal mobility: bootcamp means you have a good deal of leeway in finding a team, and, after a year internal mobility is encouraged._x000D_
- Tons of opportunities to make millions of peoples' lives just a little bit better. Like it or not, Facebook's products are used by over a billion people worldwide. There are few places where improvements and fixes can have such a broad reach._x000D_
- Lots of intellectual challenges: There are many types of challenges in building products for billions of people: technical (scaling and perf), product (how do you make it useful for everyone), regulatory (h

------
# Part 3: Analyze the text
----

## Let's quickly add a column that contains the full review text

In [27]:
## Join pros and cons with a single space, once for the cased text ("") and once for the lower-cased text ("_l")
for suffix in ("", "_l"):
    review_df[f"full_review{suffix}"] = review_df[f"pros_clean{suffix}"] + " " + review_df[f"cons_clean{suffix}"]

------
## Part 3a: Keyword counts
----

### Count frequency of compensation keywords

In [28]:
keywords_of_interest = ['salary', 'pay', 'benefits', 'bonus']

## One `count_<keyword>` column per keyword: how often it occurs in the lower-cased full review.
## Matching is on substrings, so e.g. 'pay' is also counted inside 'payment'.
count_columns = []
for keyword in keywords_of_interest:
    ## `str.count` interprets its argument as a regular expression; escape the keyword so it always matches literally
    review_df[f'count_{keyword}'] = review_df['full_review_l'].str.count(re.escape(keyword))
    count_columns.append(f'count_{keyword}')

### Evaluate

In [29]:
## Minimum, mean and maximum of each keyword-count column (one row per column after the transpose)
review_df[count_columns].describe().T[['min', 'mean', 'max']]

Unnamed: 0,min,mean,max
count_salary,0.0,0.064629,8.0
count_pay,0.0,0.141048,2.0
count_benefits,0.0,0.29607,4.0
count_bonus,0.0,0.016157,2.0


In [30]:
## One random review together with its keyword counts
review_row = review_df.sample(1).iloc[0].to_dict()

print(f"Review by a {review_row['job_title']} in {review_row['location']} on {review_row['date']}")
print(f"\nReview: \n\n{review_row['full_review'].strip()}")
print("\nCounts:\n")
for col in count_columns:
    ## Show the bare keyword (column name without its `count_` prefix), left-aligned, then the count
    keyword_label = col.replace("count_", "")
    print(f"{keyword_label:<10} -> {review_row[col]:>3}")

Review by a Software Engineer in Dallas, TX on 2021-02-21 00:00:00

Review: 

Salary Salary. Salary Salary. Salary Salary. Salary Salary. Work life balnce. Work life balnce. Work life balnce.

Counts:

salary     ->   8
pay        ->   0
benefits   ->   0
bonus      ->   0


------
## Part 3b: Classification - sentiment score
----

### Calculate sentiment score

In [31]:
def add_sentiment(input_str):
    """Return the TextBlob sentiment polarity of ``input_str``."""
    blob = TextBlob(input_str)
    return blob.sentiment.polarity

In [32]:
## Score the pros, the cons and the combined review text separately
for source_col, target_col in (
    ('pros_clean', 'pros_sentiment'),
    ('cons_clean', 'cons_sentiment'),
    ('full_review', 'review_sentiment'),
):
    review_df[target_col] = review_df[source_col].apply(add_sentiment)

### Evaluate

In [33]:
## Names of the three sentiment-score columns, reused by the evaluation cells
sentiment_cols = ["pros_sentiment", "cons_sentiment", "review_sentiment"]

In [34]:
## Summary statistics of the three sentiment scores
review_df[sentiment_cols].describe()

Unnamed: 0,pros_sentiment,cons_sentiment,review_sentiment
count,2290.0,2290.0,2290.0
mean,0.429562,0.045984,0.28749
std,0.280192,0.280253,0.243551
min,-1.0,-1.0,-1.0
25%,0.233333,-0.057244,0.1375
50%,0.45,0.0,0.271825
75%,0.65,0.2,0.433333
max,1.0,1.0,1.0


In [35]:
## One random review together with its sentiment scores
review_row = review_df.sample(1).iloc[0].to_dict()

print(f"Review by a {review_row['job_title']} in {review_row['location']} on {review_row['date']}")
for label in ("Pros", "Cons"):
    print(f"\n{label}: \n\n{review_row[label.lower() + '_clean'].strip()}")
print("\nSentiment:\n")
for col in sentiment_cols:
    ## Column name left-aligned, score right-aligned with two decimals
    print(f"{col:<18} -> {review_row[col]:>6.2f}")

Review by a Technical Sourcer in Menlo Park, CA on 2018-10-15 00:00:00

Pros: 

Thorough interview. Good blend of past experience an sourcing knowledge. Conversational but professional. Able to get a strong understanding of team dynamic and measures of success.

Cons: 

Unclear what the conversion to full time process is, how long it takes.

Sentiment:

pros_sentiment     ->   0.25
cons_sentiment     ->   0.15
review_sentiment   ->   0.23


------
## Part 3b: Classification - supervised ML
----

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

## Training data

In [37]:
## Flip to True to (re)create the spreadsheet that is labelled by hand
define_training_sample = False
if define_training_sample:
    ## 200 randomly drawn reviews (fixed seed), text column only, plus an empty column for the manual label
    training_df = (
        review_df
        .sample(200, random_state=12345)[["pros_clean"]]
        .assign(mentions_compensation="")
    )
    training_df.to_excel("training_data_before.xlsx", index=False)

In [38]:
## Load back in after we are done
## (presumably the exported `training_data_before.xlsx` with the `mentions_compensation` column filled in by hand -- confirm)
training_df = pd.read_excel("training_data_after.xlsx")

### Evaluate

In [39]:
## Two random rows of the labelled data
training_df.sample(2)

Unnamed: 0,pros_clean,mentions_compensation
159,Free food and fun activities.,0
187,interesting projects and lots of autonomy.,0


In [40]:
## Class balance of the labels: absolute count and share of all labelled rows.
## `training_df` is referenced directly; binding it to `_` would overwrite IPython's last-output variable.
training_df.mentions_compensation.value_counts().apply(lambda x: f"{x} - {x/len(training_df):.0%}")

0    106 - 53%
1     94 - 47%
Name: mentions_compensation, dtype: object

## Represent numerically

In [41]:
## Bag-of-words counts of the labelled texts, with English stop words removed
transformer = CountVectorizer(stop_words='english')
## Learns the vocabulary from the labelled sample and returns its document-term count matrix
tf = transformer.fit_transform(training_df["pros_clean"])

### Evaluate

In [42]:
## (number of labelled texts, number of vocabulary terms kept by the vectorizer)
tf.shape ## We have 740 unique words

(200, 740)

In [43]:
## Dense copy of the count matrix with one named column per vocabulary term (for inspection)
tf_df = pd.DataFrame(tf.toarray(), columns = transformer.get_feature_names_out())

In [44]:
## Random slice: 10 texts (columns after the transpose) by 10 terms (rows)
tf_df.sample(10).T.sample(10)

Unnamed: 0,25,175,156,58,123,76,16,56,162,179
jobs,0,0,0,0,0,0,0,0,0,0
unnecessary,0,0,0,0,0,0,0,0,0,0
shop,0,0,0,0,0,0,0,0,0,0
super,0,0,0,0,0,0,0,0,0,0
just,0,0,0,0,0,2,0,0,0,0
learn,0,0,0,0,0,0,0,0,0,0
view,0,0,0,0,0,0,0,0,0,0
celebrated,0,0,0,0,0,0,0,0,0,0
ideas,0,0,0,0,0,0,0,0,0,0
prove,0,0,0,0,0,1,0,0,0,0


In [45]:
## Text of the labelled row at position 83
training_df.iloc[83].pros_clean

'Good managers with experience in hand. Lot to learn.'

## Train

### Let's create a train - test split

In [46]:
## Hold out 25% of the labelled rows for testing; stratify so both splits keep the label balance
labels = training_df["mentions_compensation"]
X_train, X_test, y_train, y_test = train_test_split(
    tf, labels, test_size=0.25, random_state=0, stratify=labels
)

In [47]:
## (rows in the training split, vocabulary terms)
X_train.shape

(150, 740)

In [48]:
## (rows in the held-out test split, vocabulary terms)
X_test.shape

(50, 740)

### Train a logistic regression

For simplicity's sake, I am skipping over several things here (e.g., hyper-parameter optimization, pipelines, and N-fold cross-validation). For a more comprehensive overview, see the scikit-learn documentation:

https://scikit-learn.org/stable/supervised_learning.html#supervised-learning

In [49]:
## Fit a logistic regression on the training split; only `random_state` is set explicitly
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

In [50]:
## Precision / recall / F1 per class on the held-out test split
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.96      0.85        27
           1       0.94      0.65      0.77        23

    accuracy                           0.82        50
   macro avg       0.85      0.81      0.81        50
weighted avg       0.84      0.82      0.81        50



## Make predictions

### Identify training predictions

In [51]:
## Flag every review whose cleaned 'pros' text also appears in the labelled sample
review_df["in_training"] = review_df["pros_clean"].isin(training_df["pros_clean"])

In [52]:
## How many reviews are / are not part of the labelled sample
review_df.in_training.value_counts()

False    2090
True      200
Name: in_training, dtype: int64

In [53]:
## Attach the hand-coded labels to the reviews; reviews outside the labelled sample get NaN.
## - Dropping a previously merged label column first keeps the cell idempotent
##   (re-running it would otherwise produce `mentions_compensation_x` / `_y`).
## - De-duplicating the labelled texts (first occurrence wins) guarantees the left join
##   can never multiply review rows, which would misalign later per-row assignments.
review_df = pd.merge(
    review_df.drop(columns=["mentions_compensation"], errors="ignore"),
    training_df.drop_duplicates(subset="pros_clean"),
    on="pros_clean",
    how="left",
)

### Create predictions 

In [54]:
## Vectorize the cleaned 'pros' text of every review with the vectorizer fitted on the labelled sample
inpt_vector = transformer.transform(review_df.pros_clean)

In [55]:
## Predicted label for every review; this includes rows flagged `in_training`, whose predictions are in-sample
review_df["prediction"] = clf.predict(inpt_vector)

### Evaluate

In [56]:
## One random review with its training flag, manual label (NaN when unlabelled) and model prediction
review_row = review_df.sample(1).iloc[0].to_dict()

print(
    f"Review by a {review_row['job_title']} in {review_row['location']} on {review_row['date']}",
    f"\nPros: \n\n{review_row['pros_clean'].strip()}",
    sep="\n",
)
print("\nPrediction:\n")
for col in ("in_training", "mentions_compensation", "prediction"):
    print(f"{col:<23} -> {review_row[col]:>5.0f}")

Review by a Software Engineer in Seattle, WA on 2018-02-01 00:00:00

Pros: 

If youre smart and good at software engineering, this place is the best for your career development. Excellent people, excellent food, excellent pay.

Prediction:

in_training             ->     0
mentions_compensation   ->   nan
prediction              ->     0


------
## Part 3c: Unsupervised topic modeling
----

In [57]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

## Prepare the data

In [58]:
## Lower-cased, cleaned 'cons' texts as a plain Python list (input for the topic model)
clean_cons_l = review_df['cons_clean_l'].tolist()

In [59]:
## Count vectorizer for the 'cons' texts: drops English stop words, terms that appear in more
## than 95% of the texts (max_df) and terms that appear in fewer than 2 texts (min_df)
vec_cons = CountVectorizer(stop_words='english', max_df=0.95, min_df=2)
tf_cons = vec_cons.fit_transform(clean_cons_l)

## Train the models

In [66]:
n_topics = 5
## LDA starts from a random initialisation; pin the seed so that Restart & Run All
## reproduces the same topics instead of a different set on every run.
lda_cons = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=10,
    learning_method='online',
    random_state=0,
    n_jobs=1,
)
## Document-topic matrix: one row per review, one column per topic
lda_f_cons = lda_cons.fit_transform(tf_cons)

### Show most common keywords per cluster

In [67]:
feature_names = vec_cons.get_feature_names_out()
n_top_words = 10
## For every topic, print (topic number, its highest-weighted terms in descending order of weight)
for topic_number, topic in enumerate(lda_cons.components_, start=1):
    top_indices = topic.argsort()[::-1][:n_top_words]
    top_terms = ", ".join(feature_names[i] for i in top_indices)
    print((topic_number, top_terms))

(1, 'lot, cons, long, hours, working, culture, process, company, environment, stressful')
(2, 'company, facebook, management, people, like, really, teams, time, big, manager')
(3, 'work, balance, life, high, bad, worklife, pressure, hard, great, good')
(4, 'youre, people, company, dont, impact, means, make, bit, getting, team')
(5, 'fast, think, growth, things, company, growing, focus, lots, lot, leadership')


## Visualize
PyLDAvis is a nice package but it often does not work and can create issues with Jupyter. 

In [68]:
## `display` is imported explicitly so the cell does not depend on the notebook injecting it as a builtin
from IPython.display import HTML, display
## CSS overrides injected into the notebook page before the visualization is rendered
css_str = '''<style> 
   .jp-icon-warn0 path {fill: var(--jp-warn-color0);} 
   .bp3-button-text path {    fill: var(--jp-inverse-layout-color3);} 
   .jp-icon-brand0 path { fill: var(--jp-brand-color0);} 
   text.terms { fill: #616161;} 
</style>'''
display(HTML(css_str))

import pyLDAvis
try:
    ## Newer pyLDAvis releases ship the scikit-learn adapter as `pyLDAvis.lda_model`
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    ## Older releases expose the same `prepare` function as `pyLDAvis.sklearn`
    import pyLDAvis.sklearn as pyldavis_sklearn
pyLDAvis.enable_notebook()

## Last expression of the cell: the prepared visualization is rendered as the cell output
pyldavis_sklearn.prepare(lda_cons, tf_cons, vec_cons, n_jobs=1)