In [1]:
import pandas as pd
import os
import pandas as pd
import spacy
from multiprocessing import Pool, cpu_count
from string import punctuation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

In [2]:
pwd = os.getcwd()
df = pd.read_csv(pwd + "/data/reviews_snap.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,date,reviewNo,employeeType,position,summary,pro,con,advice,overallStar,workLifeStar,cultureStar,careerOppStar,comBenefitsStar,srManagementStar,reviewLink
0,1,"Nov 8, 2017",empReview_17803574,Current Employee - Contractor QA Test Engineer,,Amazing time so far,"Morale is high, and they take care of you. Fre...",Work/life balance can be hectic. People enjoy ...,,5.0,5.0,5.0,5.0,5.0,5.0,https://www.glassdoor.com/Reviews/Snap-Reviews...
1,2,"Nov 18, 2017",empReview_17970248,Former Employee - Research Scientist,I worked at Snap full-time (Less than a year),research intern,working environment is good. The pay is satisf...,not enough people working on research,,4.0,4.0,4.0,4.0,4.0,4.0,https://www.glassdoor.com/Reviews/Snap-Reviews...
2,3,"Nov 8, 2017",empReview_17797905,Former Employee - Anonymous Employee,I worked at Snap full-time,Snap Review,"Great people, office, and benefits","Loose structure, company is under-performing",,3.0,3.0,3.0,3.0,3.0,3.0,https://www.glassdoor.com/Reviews/Snap-Reviews...
3,4,"Oct 20, 2017",empReview_17470285,Current Employee - Anonymous Employee,I have been working at Snap full-time (More th...,Manager,Great benefits. Perfect mix of people and resu...,Fast paced. You have to be your own advocate. ...,,5.0,5.0,5.0,5.0,5.0,5.0,https://www.glassdoor.com/Reviews/Snap-Reviews...
4,5,"Oct 13, 2017",empReview_17342809,Current Employee - Anonymous Employee,I have been working at Snap full-time,I love Snap!,"Seriously, this is the best place I have ever ...","Still very ""start up"" like; however, this open...",,5.0,5.0,5.0,5.0,5.0,5.0,https://www.glassdoor.com/Reviews/Snap-Reviews...


# To Do:
For Pros and Cons of the reviews, process the data for NLP analysis by 
- removing Stop Words 
- Stemming/Lemmatizing
- Analyzing the corpus using TF-IDF to find top words in each category
- Use word cloud to look at most common topics
- Sentiment analysis on the reviews
- Can sentiments be correlated to rating? How would you weight Pros & Cons to arrive at Review Rating??
- Can we build a model to predict rating from reviews??
- Does the review depend on type of job?
- How is the rating trend over time?

In [4]:
df.rename(columns={"Unnamed: 0":'reviewNum'}, inplace=True)
df.columns

Index(['reviewNum', 'date', 'reviewNo', 'employeeType', 'position', 'summary',
       'pro', 'con', 'advice', 'overallStar', 'workLifeStar', 'cultureStar',
       'careerOppStar', 'comBenefitsStar', 'srManagementStar', 'reviewLink'],
      dtype='object')

In [5]:
# clean_text
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
STOPLIST = set(["n't", "'s", "'m", "ca", "'", "'re", "i've", 'poor', '-',
                'worst', 'place', 'make', 'thing', 'hour', 'low', 'high', 'bos',
                'good', 'great', 'awesome', 'excellent', 'job', 'best', 'lot',
                'wonderful', 'awful', 'work', 'amazing', 'suck', 'nice',
                'really', 'free', 'like', 'love', 'bad', 'terrible', 'care',
                'horrible', 'company', 'employee', 'staff', 'time', 'day',
                'week', 'month', 'year', 'need', 'better', 'just', 'decent'] +
               list(ENGLISH_STOP_WORDS))

In [6]:
KEEP_POS = {'ADJ', 'ADP', 'ADV', 'NOUN', 'VERB'}
nlp = spacy.load('en')


In [7]:
def multi_scrub_text(reviews):
    '''
    Function to lemmatize text - utilizes multiprocessing for parallelization
    INPUT:
        reviews: array-like, pandas DataFrame column containing review texts
    OUTPUT:
        lemmatized: pandas DataFrame column with cleaned texts
    '''
    lemmatized = []
    cpus = cpu_count() - 1
    pool = Pool(processes=cpus)
    lemmatized = pool.map(lemmatize_text, reviews)
    pool.close()
    pool.join()
    return lemmatized


def lemmatize_text(text, stop_words=STOPLIST, keep_pos=KEEP_POS):
    '''
    Function to lemmatize a single document of the corpus
    INPUT:
        text: string, text of review
        stop_words: words to remove from text, default STOPLIST defined above
        keep_pos: parts of speech to keep in text, default KEEP_POS def above
    OUTPUT:
        lemmatized text
    '''
    x = nlp(text)
    words = [tok.lemma_.strip(punctuation) for tok in x if (
        tok.pos_ in keep_pos) and (tok.lemma_.strip(punctuation) not in STOPLIST)]
    words.extend(['boss' for tok in x if tok.lemma_ == 'bos'])
    return ' '.join(words)

In [9]:
pros_df = pd.DataFrame()
cons_df = pd.DataFrame()
print(multi_scrub_text(df['pro']))
pros_df['lemmatized_text'] = multi_scrub_text(df['pro'])

cons_df['lemmatized_text'] = multi_scrub_text(df['con'])

['morale food learn opportunity people team environment approachable', 'environment pay satisfactory', 'people office benefit', 'benefit perfect mix people result focus awake night lifetime opportunity PRON idea grow global business', 'seriously extremely creative drive produce product customer experience PRON user individual ton responsibility able truly impact business culture bfast lunch dinner want perk activity', 'culture benefit opportunity growth', 'remote flexible', 'beach unbeatable lunch dinner brand exciting feel benefit', 'competitive perks competitive salary', 'benefit intelligent people fast paced environment', 'people creative atmosphere food benefit', 'small team responsibility smart kind people competitive compensation stock vesting backload encourage', 'people interesting task collaborative college', 'office think enjoy environment people smart', 'learn fun', 'people voice constantly cool stuff idea valid listen', 'fun cut edge product million user team attitude possi

In [12]:
df['lem_pro'] = multi_scrub_text(df['pro'])
df['lem_con'] = multi_scrub_text(df['con'])

In [13]:
df.head()

Unnamed: 0,reviewNum,date,reviewNo,employeeType,position,summary,pro,con,advice,overallStar,workLifeStar,cultureStar,careerOppStar,comBenefitsStar,srManagementStar,reviewLink,lem_pro,lem_con
0,1,"Nov 8, 2017",empReview_17803574,Current Employee - Contractor QA Test Engineer,,Amazing time so far,"Morale is high, and they take care of you. Fre...",Work/life balance can be hectic. People enjoy ...,,5.0,5.0,5.0,5.0,5.0,5.0,https://www.glassdoor.com/Reviews/Snap-Reviews...,morale food learn opportunity people team envi...,life balance hectic people enjoy come later me...
1,2,"Nov 18, 2017",empReview_17970248,Former Employee - Research Scientist,I worked at Snap full-time (Less than a year),research intern,working environment is good. The pay is satisf...,not enough people working on research,,4.0,4.0,4.0,4.0,4.0,4.0,https://www.glassdoor.com/Reviews/Snap-Reviews...,environment pay satisfactory,people research
2,3,"Nov 8, 2017",empReview_17797905,Former Employee - Anonymous Employee,I worked at Snap full-time,Snap Review,"Great people, office, and benefits","Loose structure, company is under-performing",,3.0,3.0,3.0,3.0,3.0,3.0,https://www.glassdoor.com/Reviews/Snap-Reviews...,people office benefit,loose structure perform
3,4,"Oct 20, 2017",empReview_17470285,Current Employee - Anonymous Employee,I have been working at Snap full-time (More th...,Manager,Great benefits. Perfect mix of people and resu...,Fast paced. You have to be your own advocate. ...,,5.0,5.0,5.0,5.0,5.0,5.0,https://www.glassdoor.com/Reviews/Snap-Reviews...,benefit perfect mix people result focus awake ...,fast pace PRON advocate people snap
4,5,"Oct 13, 2017",empReview_17342809,Current Employee - Anonymous Employee,I have been working at Snap full-time,I love Snap!,"Seriously, this is the best place I have ever ...","Still very ""start up"" like; however, this open...",,5.0,5.0,5.0,5.0,5.0,5.0,https://www.glassdoor.com/Reviews/Snap-Reviews...,seriously extremely creative drive produce pro...,start open opportunity individual contributor ...
