In [None]:
import numpy as np 
import pandas as pd 
import re #Importing regular expression operations
import string
from wordcloud import WordCloud # to represent the text data (visually) in which the size of each word indicates its frequency or importance.
#from textblob import TextBlob #It performs different operations on textual data such as noun phrase extraction, sentiment analysis, classification, translation, etc.
from sklearn.feature_extraction import text 
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
warnings.filterwarnings("ignore")

In [None]:
# Load the reviews file (latin-1 encoded); the first CSV column becomes the row index.
df=pd.read_csv("employee_reviews.csv",encoding='latin-1',index_col=0)
# Print column names, non-null counts and dtypes.
df.info()

In [None]:
# Preview the first three rows.
df.head(3)

In [None]:
# (rows, columns) of the loaded frame.
print(df.shape)

In [None]:
# Number of missing values in each column.
print(df.isnull().sum())

In [None]:
# Summary statistics as produced by pandas' default describe().
df.describe()

#### Segregating the data company wise to find their individual shape

In [None]:
def _rows_for_company(frame, company):
    """Return the rows of `frame` whose 'company' column equals `company`.

    A single boolean row mask replaces the former column-by-column
    `DataFrame.apply`, which re-applied the same mask to every column.
    """
    return frame[frame['company'].isin([company])]


gog_df = _rows_for_company(df, 'google')
print("Shape of Google : ", gog_df.shape)

amz_df = _rows_for_company(df, 'amazon')
print("Shape of Amazon : ", amz_df.shape)

fb_df = _rows_for_company(df, 'facebook')
print("Shape of Facebook : ", fb_df.shape)

net_df = _rows_for_company(df, 'netflix')
print("Shape of Netflix : ", net_df.shape)

apl_df = _rows_for_company(df, 'apple')
print("Shape of Apple : ", apl_df.shape)

mcf_df = _rows_for_company(df, 'microsoft')
print("Shape of Microsoft : ", mcf_df.shape)

In [None]:
# Join the four free-text fields into one review string per row.
# In pandas, str + NaN is NaN, so one missing field would blank the whole
# combined review; missing fields are treated as empty text instead.
_review_text = df[['summary', 'advice-to-mgmt', 'pros', 'cons']].fillna('')
df['combined_reviews'] = (
    _review_text['summary'] + '. '
    + _review_text['advice-to-mgmt'] + '. '
    + _review_text['pros'] + '. '
    + _review_text['cons']
)
df['combined_reviews']

### Creating a new dataframe which would contain only the needed column attributes.

In [None]:
# Working frame with only the columns used by the analysis.
# Later cells assign new columns to `dtf`, so take an explicit copy rather than
# a subset of `df` (avoids chained-assignment ambiguity / SettingWithCopyWarning).
dtf = df[['company','job-title', 'combined_reviews',
        'overall-ratings', 'work-balance-stars',
       'culture-values-stars', 'carrer-opportunities-stars',
       'comp-benefit-stars', 'senior-mangemnet-stars']].copy()

### Data Cleaning

In [None]:
# The six rating columns, cleaned separately and written back to `dtf` later.
# The following cells overwrite these columns, so work on an explicit copy.
ratings = dtf[['overall-ratings',
       'work-balance-stars', 'culture-values-stars',
       'carrer-opportunities-stars', 'comp-benefit-stars',
       'senior-mangemnet-stars']].copy()

In [None]:
# Column dtypes before cleaning.
ratings.dtypes

In [None]:
# Print the distinct values found in each rating column.
# Each pair is (label used in the message, column name).
_labelled_rating_columns = [
    ('Overall rating', 'overall-ratings'),
    ('work-balance-stars', 'work-balance-stars'),
    ('culture-values-stars', 'culture-values-stars'),
    ('carrer-opportunities-stars', 'carrer-opportunities-stars'),
    ('comp-benefit-stars', 'comp-benefit-stars'),
    ('senior-mangemnet-stars', 'senior-mangemnet-stars'),
]
for _label, _column in _labelled_rating_columns:
    print("Unique Values in " + _label + " column : ", ratings[_column].unique())

In [None]:
#step 1
# Turn the textual missing-value markers into real NaN, one rating column at a
# time and one marker at a time (same order as before).
_rating_columns = ['overall-ratings', 'work-balance-stars', 'culture-values-stars',
                   'carrer-opportunities-stars', 'comp-benefit-stars',
                   'senior-mangemnet-stars']
for _column in _rating_columns:
    for _marker in ('None', 'Nan', 'none'):
        ratings[_column] = ratings[_column].replace(_marker, np.nan)

In [None]:
# Cast the five star-rating columns to float ('overall-ratings' is left as loaded).
for _column in ['work-balance-stars', 'culture-values-stars',
                'carrer-opportunities-stars', 'comp-benefit-stars',
                'senior-mangemnet-stars']:
    ratings[_column] = ratings[_column].astype(float)

In [None]:
#step2
# Impute every missing rating with the mean of its own column (NaN ignored).
for _column in ['overall-ratings', 'work-balance-stars', 'culture-values-stars',
                'carrer-opportunities-stars', 'comp-benefit-stars',
                'senior-mangemnet-stars']:
    _column_mean = ratings[_column].mean(skipna=True)
    ratings[_column] = ratings[_column].fillna(_column_mean)


In [None]:
# Column dtypes after the float cast and mean imputation.
ratings.dtypes

In [None]:
# Display the cleaned ratings frame.
ratings

In [None]:
# Copy the cleaned rating columns back into the working frame.
_cleaned_columns = ['overall-ratings',
                    'work-balance-stars', 'culture-values-stars',
                    'carrer-opportunities-stars', 'comp-benefit-stars',
                    'senior-mangemnet-stars']
dtf[_cleaned_columns] = ratings[_cleaned_columns]

### Text Preprocessing

In [None]:
def clean_txt(text):
    """Normalise one review string for corpus building and sentiment scoring.

    The value is coerced with ``str`` (so ``None`` becomes the word 'none'),
    digits are dropped, the text is lower-cased, and bracketed spans, ASCII
    punctuation, curly quotes/ellipses and any remaining non-ASCII characters
    are removed. Whitespace is left untouched, so removed tokens can leave
    runs of spaces behind.

    Parameters
    ----------
    text : object
        Value to clean; anything accepted by ``str``.

    Returns
    -------
    str
        The cleaned, lower-case, ASCII-only text.
    """
    text = str(text)
    for n in range(10):
        text = text.replace(str(n), '')  # strip the ASCII digits 0-9
    text = text.lower()
    text = text.replace("(", "")  # drop opening parentheses
    text = text.replace(")", "")  # drop closing parentheses
    # Regex patterns are raw strings so that \[ , \w and \d reach the regex
    # engine as written instead of relying on invalid string escapes.
    text = re.sub(r'\[.*?\]', '', text)  # remove [bracketed] spans (non-greedy)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # remove ASCII punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # remove any word that still contains a digit
    text = re.sub('[‘’“”…]', '', text)  # remove curly quotes and ellipsis characters
    text = re.sub(r'[^\x00-\x7f]', '', text)  # remove (not space-replace) non-ASCII characters
    text = text.replace(r" \ ", "")  # remove a space-delimited backslash
    text = text.replace("/", "")  # remove forward slashes

    return text

In [None]:
# Clean every review summary (digits, punctuation, special characters removed).
df['summary'] = df['summary'].apply(clean_txt)
summary_corpus = df[["summary","company"]]

# One corpus string per company: every cleaned summary preceded by a single
# space. str.join builds each corpus in one pass instead of re-copying a
# growing string for every review.
combined_smry_dict = {
    comp: "".join(" " + summary for summary in df['summary'][df['company'] == comp])
    for comp in df['company'].unique()
}

# Reshape to one row per company: index = company, column = its corpus.
df_summary = pd.DataFrame(data=combined_smry_dict , index = [0])
df_summary = pd.DataFrame.copy(df_summary.T)
df_summary.columns = ["summary_corpus"]
df_summary['company'] = df_summary.index

### Sentiment Analysis using VADER Package
In this method, we will use the Sentiment Intensity Analyzer, which uses the VADER lexicon. VADER stands for Valence Aware Dictionary and sEntiment Reasoner, a rule-based sentiment analysis tool. VADER scores the sentiment of a text and determines whether it is positive, neutral, or negative. The analyzer produces four output scores: positive, negative, neutral, and compound.
Here, we will make use of the compound score. The compound score is the sum of the valence scores of the words in the text, adjusted by VADER's rules and normalized to lie between -1 and 1.

In [None]:
!pip install VaderSentiment

dtf['combined_reviews'] = dtf['combined_reviews'].apply(lambda x : clean_txt(x)) #applying clean function in each line of summary
#removing latent characters, numbers, special characters - cleaning

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
#from nltk.sentiment.vader import SentimentIntensityAnalyzer
sent = SentimentIntensityAnalyzer()
polarity = [round(sent.polarity_scores(i)['compound'], 2) for i in dtf['combined_reviews']]
dtf['vader_sentiment_score'] = polarity

# function to analyse 
def vader_analysis(compound):
    if compound >= 0:
        return 'Positive'
    elif compound <= 0 :
        return 'Negative'
    else:
        return 'Neutral'
    
dtf['vader_Analysis'] = dtf['vader_sentiment_score'].apply(vader_analysis)

dtf.head()

In [None]:
vader_counts = dtf['vader_Analysis'].value_counts()
vader_counts

In [None]:
pol_total = dtf.groupby('company')['vader_Analysis'].value_counts(normalize=False).unstack()
pol_total

To find the overall positive and negative polarity percentage count of given reviews from the dataset.

In [None]:
vader_counts= dtf['vader_Analysis'].value_counts()

sns.set(rc={'axes.facecolor':'white'})
fig, ax = plt.subplots(figsize=(10, 7))
plt.rcParams["font.size"] = "20"
colors = sns.color_palette("gist_rainbow")
plt.pie(vader_counts.values,labels = vader_counts.index, explode = (0.1, 0.1), autopct='%1.1f%%', shadow=False, colors = colors)
leg = ax.legend(prop={"size":15})


In [None]:
# Bucket each compound score into an ordered satisfaction grade.
# Compound scores are normalized to [-1, 1], so the lowest bin starts at -1
# (inclusive); with a lower edge of -0.9, strongly negative reviews fell
# outside every bin and were graded NaN.
category = pd.cut(
    dtf['vader_sentiment_score'],
    bins=[-1.0, -0.5, 0.1, 0.5, 0.9, 1.0],
    labels=["not very satisfied", "not satisfied", "neutral", "satisfied", "very well satisfied"],
    include_lowest=True,
)
# Plain assignment appends the column (the same position insert(11, ...) gave
# on the 11-column frame) and, unlike insert, does not fail when the cell is re-run.
dtf['polarity_grade'] = category
dtf.sample(5)

In [None]:
# Count of reviews per polarity grade for each company (companies as rows, grades as columns).
category_total = dtf.groupby('company')['polarity_grade'].value_counts(normalize=False).unstack()
category_total

### Exploratory Data Analysis

To find the correlation between the variables

In [None]:
# Pairwise correlations of the numeric columns only. `dtf` also holds text and
# categorical columns, which DataFrame.corr() rejects on pandas >= 2.0.
corr = dtf.select_dtypes(include='number').corr()

sns.set(rc={'axes.facecolor':'white'})

sns.set(font_scale = 1.5)

# Annotated heatmap of the correlation matrix.
sns_plot =  sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap ="gist_rainbow")


To find the company that has the highest overall rating distribution

In [None]:
# One violin plot of 'overall-ratings' per company, laid out on a 3x2 grid.
plt.figure(1, figsize=(15, 9))
sns.set(rc={'axes.facecolor': 'white'})
sns.set(font_scale=1.5)
for n, company in enumerate(dtf['company'].unique(), start=1):
    sns.set_style("whitegrid")
    plt.subplot(3, 2, n)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    # Rows of other companies are masked to NaN rather than dropped.
    company_rows = dtf.where(dtf['company'] == company)
    sns.violinplot(x='overall-ratings', data=company_rows, color="lawngreen")
    plt.xlabel('')
    plt.ylabel(company)
plt.show()


To find the average rating points of the job features

In [None]:
features = ['work-balance-stars' , 'culture-values-stars' , 'carrer-opportunities-stars',
                     'comp-benefit-stars','senior-mangemnet-stars']
index_companies = ['google' , 'amazon' , 'facebook' , 'netflix' , 'apple' , 'microsoft']

# Mean of every feature per company, with rows ordered as in index_companies
# (a company absent from the data yields a NaN row).
mean_rating_dtf = (
    dtf.groupby('company')[features].mean()
    .reindex(index_companies)
    .rename_axis(None)
)

plt.figure(1 , figsize = (15 , 6))
sns.set(rc={'axes.facecolor':'white'})
sns.set(font_scale = 1.5)
colors = sns.color_palette("gist_rainbow")

sns.set_style("whitegrid")
# x positions follow the feature list instead of a hard-coded count.
x_positions = np.arange(len(features))
for comp, c in zip(mean_rating_dtf.index, colors):
    company_means = mean_rating_dtf.loc[comp].values
    # `color=` (not `c=`): an RGB tuple passed as `c` can be mistaken for
    # per-point values when its length matches the number of points.
    plt.scatter(x = x_positions , y = company_means , s = 200 , color = c , label = comp)
    plt.plot(x_positions , company_means , '-' , color = c , alpha = 0.2)
plt.xticks(x_positions , features)
plt.legend(bbox_to_anchor=(1.00, 1), loc=2, borderaxespad=0.)
plt.show()

To find the distribution of polarity scores for the combined reviews 

In [None]:
# Figure 1: histogram of the compound score over every review.
plt.figure(1, figsize=(15, 4))
sns.set(rc={'axes.facecolor': 'white'})
fig1 = plt.hist(dtf['vader_sentiment_score'], bins=50)
plt.title('Polarity of Combined Reviews')
sns.set(rc={'axes.facecolor': 'white'})
colors = sns.color_palette("gist_rainbow")

# Figure 2: one histogram per company on a 2x3 grid, each in its own colour.
plt.figure(2, figsize=(15, 7))
for n, (comp, c) in enumerate(zip(index_companies, colors), start=1):
    plt.subplot(2, 3, n)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    company_scores = dtf['vader_sentiment_score'][dtf['company'] == comp]
    fig2 = plt.hist(company_scores, bins=50, color=c)
    plt.title(comp)
plt.show()


To find which companies have the most reviews on Glassdoor

In [None]:
# Horizontal bar chart of review counts per company, most-reviewed first.
plt.style.use('fivethirtyeight')
sns.set(rc={'axes.facecolor': 'white'})
sns.set(font_scale=1.5)
plt.figure(1, figsize=(15, 7))
colors = sns.color_palette("gist_rainbow")
company_order = dtf['company'].value_counts().index
ax = sns.countplot(y='company', data=dtf, order=company_order)
# Write each bar's count at its right-hand end; bars sit at y = 0, 1, 2, ...
for bar_position, p in enumerate(ax.patches):
    ax.text(p.get_width(), bar_position + p.get_height()/8, '{:1.0f}'.format(p.get_width()))

plt.show()

To find which employee types and job titles have posted the most reviews on Glassdoor

In [None]:
# The eleven most frequent job titles. most_common(11) simply returns fewer
# entries when the data has fewer distinct titles, where indexing a fixed
# range(11) would raise IndexError.
common_job_titles = [title for title, _count in Counter(dtf['job-title']).most_common(11)]
# Rows belonging to those titles, computed once and reused for data and order.
top_title_rows = dtf[dtf['job-title'].isin(common_job_titles)]

plt.figure(1 , figsize = (15 , 8))
sns.set(rc={'axes.facecolor':'white'})
sns.set(font_scale = 1.5)
sns.set_style("whitegrid")
ax = sns.countplot(y = 'job-title' , data = top_title_rows ,
              palette = 'gist_rainbow' ,
              order = top_title_rows['job-title'].value_counts().index)

# Write each bar's count at its right-hand end; bars sit at y = 0, 1, 2, ...
for bar_position, p in enumerate(ax.patches):
    ax.text(p.get_width(), bar_position + p.get_height()/8, '{:1.0f}'.format(p.get_width()))
plt.show()

Word cloud of overall reviews

In [None]:
# Word cloud over the summaries of all companies together.
wc = WordCloud(
    stopwords=text.ENGLISH_STOP_WORDS,
    background_color="white",
    colormap="Dark2",
    max_font_size=150,
    random_state=42,
)
# Concatenate every company corpus, each preceded by a single space.
corpus = ''.join(' ' + corp for corp in df_summary['summary_corpus'].values)
wc.generate(corpus)
sns.set(rc={'axes.facecolor': 'white'})
plt.figure(1, figsize=(15, 8))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

Word cloud of each company

In [None]:
# Company names are excluded so they do not dominate their own cloud.
# 'microsoft' is included so all six companies are treated alike.
company_stopwords = text.ENGLISH_STOP_WORDS.union(
    ['apple' , 'amazon','netflix' , 'google', 'facebook', 'microsoft'])
wc = WordCloud(stopwords = company_stopwords,
               background_color = "white" ,
               colormap = "Dark2" ,
               max_font_size = 150 ,
               random_state = 42)


plt.figure(1 , figsize = (15 , 9))
sns.set(rc={'axes.facecolor':'white'})
# df_summary is indexed by company, so its index supplies the panel titles;
# the index of `summary_corpus` holds review row labels, not company names.
# head(6) keeps the 3x2 grid limit.
for i, (company, corpus) in enumerate(df_summary['summary_corpus'].head(6).items()):
    wc.generate(corpus)
    plt.subplot(3 , 2 , i + 1)
    plt.imshow(wc , interpolation="bilinear")
    plt.axis("off")
    plt.title(company)
plt.show()


### Findings and Analysis - Machine Learning Algorithms 

To measure how well the numeric ratings given by individual employees agree with the sentiment of their text reviews.

In [None]:
# Features: the six numeric ratings. Target: the VADER compound score.
_feature_columns = [
    'overall-ratings',
    'work-balance-stars',
    'culture-values-stars',
    'carrer-opportunities-stars',
    'comp-benefit-stars',
    'senior-mangemnet-stars',
]
X = dtf[_feature_columns]
y = dtf['vader_sentiment_score']

In [None]:
# `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24;
# the builtin gives the same dtype on every NumPy version.
# NOTE(review): the cast truncates toward zero, so every compound score strictly
# between -1 and 1 becomes class 0 — confirm this is the intended target.
y = y.astype(int)

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=40)

In [None]:
# Shapes of the four splits; the last line previously printed y_test under a
# 'y_train' label.
print('X_train dimension= ', X_train.shape)
print('X_test dimension= ', X_test.shape)
print('y_train dimension= ', y_train.shape)
print('y_test dimension= ', y_test.shape)

Random Forest

In [None]:
# Random forest classifier from scikit-learn's ensemble module.
from sklearn.ensemble import RandomForestClassifier
# 100 trees; a fixed random_state makes the reported accuracy repeatable,
# as the AdaBoost cell already does.
RF = RandomForestClassifier(n_estimators = 100, random_state = 0)
# Train on the training split.
RF.fit(X_train, y_train)

# Predictions for the held-out rows, kept in a variable instead of discarded.
RF_predictions = RF.predict(X_test)

# score() reports accuracy on the test split.
RF_result = RF.score(X_test, y_test)

print("ACCURACY OF THE MODEL: ", RF_result)

AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

# AdaBoost with 100 estimators and a fixed seed; fit() returns the fitted model.
ABC = AdaBoostClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

# Run the model over the held-out rows (result not kept).
ABC.predict(X_test)

# score() reports accuracy on the test split.
ABC_result = ABC.score(X_test, y_test)

print("ACCURACY OF THE MODEL: ", ABC_result)

Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import make_classification

# Gaussian naive Bayes with default settings; fit() returns the fitted model.
GNB = GaussianNB().fit(X_train, y_train)

# Run the model over the held-out rows (result not kept).
GNB.predict(X_test)

# score() reports accuracy on the test split.
GNB_result = GNB.score(X_test, y_test)

print("ACCURACY OF THE MODEL: ", GNB_result)

Support Vector Machine

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

# Standardise the features, then fit a support vector classifier.
_scaler = StandardScaler()
_classifier = SVC(gamma='auto')
SVM = make_pipeline(_scaler, _classifier)
SVM.fit(X_train, y_train)

# Run the pipeline over the held-out rows (result not kept).
SVM.predict(X_test)

# score() reports accuracy on the test split.
SVM_result = SVM.score(X_test, y_test)

print("ACCURACY OF THE MODEL: ", SVM_result)


In [None]:
# Export of the scored frame, left disabled.
# NOTE(review): a later cell reads "grade_final.csv", so a fresh run needs that
# file to exist already — confirm whether this line should be re-enabled.
#dtf.to_csv("grade_final.csv")

Multinomial Logistic Regression 

In [None]:
# Load the saved file (latin-1 encoded); the first CSV column becomes the row index.
# NOTE(review): presumably this is the export of `dtf` from the disabled to_csv cell — confirm.
dt =pd.read_csv("grade_final.csv",encoding='latin-1',index_col=0)
dt.info()

In [None]:
# Import label encoder
from sklearn import preprocessing
 
# LabelEncoder maps each distinct label to an integer code.
label_encoder = preprocessing.LabelEncoder()
 
# Replace the 'polarity_grade' labels with their integer codes.
dt['polarity_grade']= label_encoder.fit_transform(dt['polarity_grade'])

In [None]:
from sklearn.model_selection import train_test_split
# Features: the six ratings plus the compound score. Target: the encoded polarity grade.
# NOTE(review): if 'polarity_grade' in this file was binned from
# 'vader_sentiment_score', that feature leaks the target — confirm it should stay in X.
X = dt[['overall-ratings','work-balance-stars','culture-values-stars','carrer-opportunities-stars','comp-benefit-stars','senior-mangemnet-stars','vader_sentiment_score']]
Y = dt['polarity_grade']

# Fixed random_state (same value as the earlier split) so the reported accuracy is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=40)

In [None]:
from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression fitted with the lbfgs solver; fit() returns the fitted model.
LR = LogisticRegression(multi_class='multinomial', solver='lbfgs').fit(X_train, Y_train)

# score() reports accuracy on the test split.
LR_result = LR.score(X_test, Y_test)

print("ACCURACY OF THE MODEL: ", LR_result)

To find the positive and negative polarities of current and former employees individually

In [None]:
# Reviews whose job title contains 'Former Employee'. na=False treats a missing
# job title as "no match"; without it a NaN in the mask makes the row
# selection raise.
former = dtf[dtf['job-title'].str.contains('Former Employee', na=False)]
# Sentiment label counts among former employees.
former_counts = former['vader_Analysis'].value_counts()
former_counts

In [None]:
# Reviews whose job title contains 'Current Employee'. na=False treats a
# missing job title as "no match"; without it a NaN in the mask makes the row
# selection raise.
current = dtf[dtf['job-title'].str.contains('Current Employee', na=False)]
# Sentiment label counts among current employees.
current_counts = current['vader_Analysis'].value_counts()
current_counts

### MANOVA
To test whether the reviews differ significantly between current and former employees


In [None]:
#! pip install statsmodels
import statsmodels 
from statsmodels.multivariate.manova import MANOVA

In [None]:
# Numeric measures (six ratings plus the compound score) and the job-title column.
_measure_columns = [
    'overall-ratings',
    'work-balance-stars',
    'culture-values-stars',
    'carrer-opportunities-stars',
    'comp-benefit-stars',
    'senior-mangemnet-stars',
    'vader_sentiment_score',
]
x = dt[_measure_columns]
y = dt['job-title']

In [None]:

# Google rows only, selected with one boolean row mask instead of a
# column-by-column apply.
gog_dt = dt[dt['company'].isin(['google'])]

# `jx` / `jy` are looked up by name from the formula string below.
jx = gog_dt[['overall-ratings','work-balance-stars','culture-values-stars', 'carrer-opportunities-stars','comp-benefit-stars','senior-mangemnet-stars','vader_sentiment_score' ]]
jy = gog_dt['job-title']

# fit manova
# NOTE(review): the formula puts job title on the left (outcome) and the
# ratings on the right (predictors); a MANOVA of ratings by employee group
# would have them the other way round — confirm the intended direction.
manova_result = MANOVA.from_formula('jy ~jx', gog_dt)
print(manova_result.mv_test())

In [None]:
# Facebook rows only, selected with one boolean row mask instead of a
# column-by-column apply.
fb_dt = dt[dt['company'].isin(['facebook'])]

# `jx` / `jy` are looked up by name from the formula string below.
jx = fb_dt[['overall-ratings','work-balance-stars','culture-values-stars', 'carrer-opportunities-stars','comp-benefit-stars','senior-mangemnet-stars','vader_sentiment_score' ]]
jy = fb_dt['job-title']

# fit manova
# NOTE(review): the formula puts job title on the left (outcome) and the
# ratings on the right (predictors); a MANOVA of ratings by employee group
# would have them the other way round — confirm the intended direction.
manova_result = MANOVA.from_formula('jy ~jx', fb_dt)
print(manova_result.mv_test())

In [None]:
# Apple rows only, selected with one boolean row mask instead of a
# column-by-column apply.
apl_dt = dt[dt['company'].isin(['apple'])]

# `jx` / `jy` are looked up by name from the formula string below.
jx = apl_dt[['overall-ratings','work-balance-stars','culture-values-stars', 'carrer-opportunities-stars','comp-benefit-stars','senior-mangemnet-stars','vader_sentiment_score' ]]
jy = apl_dt['job-title']

# fit manova
# NOTE(review): the formula puts job title on the left (outcome) and the
# ratings on the right (predictors); a MANOVA of ratings by employee group
# would have them the other way round — confirm the intended direction.
manova_result = MANOVA.from_formula('jy ~jx', apl_dt)
print(manova_result.mv_test())

In [None]:
# Netflix rows only, selected with one boolean row mask instead of a
# column-by-column apply.
net_dt = dt[dt['company'].isin(['netflix'])]

# `jx` / `jy` are looked up by name from the formula string below.
jx = net_dt[['overall-ratings','work-balance-stars','culture-values-stars', 'carrer-opportunities-stars','comp-benefit-stars','senior-mangemnet-stars','vader_sentiment_score' ]]
jy = net_dt['job-title']

# fit manova
# NOTE(review): the formula puts job title on the left (outcome) and the
# ratings on the right (predictors); a MANOVA of ratings by employee group
# would have them the other way round — confirm the intended direction.
manova_result = MANOVA.from_formula('jy ~jx', net_dt)
print(manova_result.mv_test())

In [None]:
# Microsoft rows only, selected with one boolean row mask instead of a
# column-by-column apply.
mic_dt = dt[dt['company'].isin(['microsoft'])]

# `jx` / `jy` are looked up by name from the formula string below.
jx = mic_dt[['overall-ratings','work-balance-stars','culture-values-stars', 'carrer-opportunities-stars','comp-benefit-stars','senior-mangemnet-stars','vader_sentiment_score' ]]
jy = mic_dt['job-title']

# fit manova
# NOTE(review): the formula puts job title on the left (outcome) and the
# ratings on the right (predictors); a MANOVA of ratings by employee group
# would have them the other way round — confirm the intended direction.
manova_result = MANOVA.from_formula('jy ~jx', mic_dt)
print(manova_result.mv_test())