In [36]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [37]:
df = pd.read_csv("all_processed.csv")
df.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,company,location,dates,job-title,summary,pros,cons,advice-to-mgmt,overall-ratings,work-balance-stars,culture-values-stars,carrer-opportunities-stars,comp-benefit-stars,senior-mangemnet-stars,helpful-count,link
0,0,1,google,none,"Dec 11, 2018",Current Employee - Anonymous Employee,best compani work,peopl smart friendli,bureaucraci slow thing,none,5.0,4.0,5.0,5.0,4.0,5.0,0,https://www.glassdoor.com/Reviews/Google-Revie...
1,1,2,google,"Mountain View, CA","Jun 21, 2013",Former Employee - Program Manager,move speed light burn inevit,food food food cafe main campu mtv alon mini k...,work life balanc balanc perk benefit illus kee...,1) Don't dismiss emotional intelligence and ad...,4.0,2.0,3.0,3.0,5.0,3.0,2094,https://www.glassdoor.com/Reviews/Google-Revie...


### At this point there were still rows left with None or NaN values in summary, pros or cons. We went and removed these rows

In [38]:
df.count()[0]

67519

In [39]:
df = df[df["summary"].notna()]
df = df[df["pros"].notna()]
df = df[df["cons"].notna()]

Only 1 row was removed.

In [40]:
df.count()[0]

67118

### In order to make it easier to work with the data we added a new column containing the text for summary, pros and cons.

In [41]:
df["text"] = df["summary"].str.cat(df["pros"].str.cat(df["cons"],sep=" "),sep=" ")

In [42]:
df[["summary","pros","cons","text"]].head(3)

Unnamed: 0,summary,pros,cons,text
0,best compani work,peopl smart friendli,bureaucraci slow thing,best compani work peopl smart friendli bureauc...
1,move speed light burn inevit,food food food cafe main campu mtv alon mini k...,work life balanc balanc perk benefit illus kee...,move speed light burn inevit food food food ca...
2,great balanc big compani secur fun fast move p...,softwar engin among king hill googl engin driv...,becom larger come grow pain bureaucraci slow r...,great balanc big compani secur fun fast move p...


Mean text length in characters

In [43]:
np.mean(df["text"].str.len())

198.47529723770077

### We needed a binary label to be able to predict whether a review was positive or negative. We decided to define positive reviews as a 4-star rating or leaving the rest as negative.

In [44]:
df["label"] = df.apply(lambda row: 1 if row["overall-ratings"] >= 4 else 0, axis=1)

### Now with the label column we can see that the dataset is quite unbalanced

In [45]:
df_pos = df[df["label"] == 1]
df_neg = df[df["label"] == 0]
print("Positive: {0}, Negative: {1}".format(df_pos.count()[0], df_neg.count()[0]))

Positive: 45483, Negative: 21635


### We did a resampling in order to balance out the dataset

In [46]:
from sklearn.utils import resample

In [47]:
df_pos_res = resample(df_pos, 
                   n_samples=df_neg.count()[0], 
                   random_state=23)

In [48]:
df_pos_res.count()[0]

21635

In [49]:
df = df_neg.append(df_pos_res)

In [50]:
y = df["label"]

### We used the TfidVectorizer from sci-kit learn to transform the strings into word vectors. We chose to only vectorize the 3000 most common words for simplicity sake.

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
no_of_words = 3000
cv = TfidfVectorizer(max_features=no_of_words, ngram_range=(1, 2))

In [52]:
X = cv.fit_transform(df["text"].tolist())
X.shape

(43270, 3000)

### We devided the data into training and testing data

In [53]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=22)

### In order to reduce the dimensionality of the input data we created an autoencoder

In [54]:
import keras.layers as layers
import keras.models as models

encoding_dim = 32

input_layer = layers.Input(shape=(no_of_words,))
encoded = layers.Dense(encoding_dim, activation='relu')(input_layer)
decoded = layers.Dense(no_of_words, activation='sigmoid')(encoded)

In [55]:
autoencoder = models.Model(input_layer, decoded)

In [56]:
encoder = models.Model(input_layer, encoded)

In [57]:
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

In [58]:
autoencoder.fit(X_train, X_train,
                epochs=10,
                batch_size=256,
                shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f090aa5b668>

In [59]:
X_train_encoded = encoder.predict(X_train)
X_train_encoded.shape

(34616, 32)

In [60]:
y_train.shape

(34616,)

### We created a function to easily run cross_val_score on the model with the correct data and print out the results

In [None]:
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

kf = KFold(n_splits=10, shuffle=True, random_state=4)
def do_cross_val(model):
    acc_score = cross_val_score(model, X_train.toarray(), y_train, cv=kf)
    print(acc_score)
    print(np.mean(acc_score))

### We ran with various different models trying to tweak the settings for best result

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

pipe_knn = make_pipeline(StandardScaler(with_mean=False), KNeighborsClassifier(n_neighbors=10))
do_cross_val(pipe_knn)

In [None]:
from sklearn.linear_model import LogisticRegression

pipe_lr = make_pipeline(StandardScaler(with_mean=False), LogisticRegression(random_state=21, solver='liblinear', C=1))
do_cross_val(pipe_lr)

In [None]:
from sklearn.tree import DecisionTreeClassifier

pipe_dt = make_pipeline(StandardScaler(with_mean=False), DecisionTreeClassifier(max_depth=45, random_state=21))
do_cross_val(pipe_dt)

In [None]:
from sklearn.naive_bayes import GaussianNB

pipe_nb = make_pipeline(StandardScaler(with_mean=False), GaussianNB())
do_cross_val(pipe_nb)

Since the SVM classifier was very slow at training the data we trained on only 1000 samples so we could easily tweak the settings and retrain the model

In [87]:
Xs = encoder.predict(X_test[:1000])
ys = y_test[:1000]

In [99]:
from sklearn.svm import SVC

pipe_svm = make_pipeline(StandardScaler(with_mean=False), SVC(C=.1, random_state=21, kernel="rbf", gamma="auto"))
acc_score = cross_val_score(pipe_svm, Xs, ys, cv=kf)
print(acc_score)
print(np.mean(acc_score))

[0.58 0.54 0.52 0.43 0.53 0.51 0.5  0.59 0.57 0.52]
0.5290000000000001


Of all of the ML models we used the D-Tree seemed to perform the best with an accuracy of almost 60%

## Conclusion

Did we manage to create a model that could answer the business question? With 60%-ish accuracy, yes.
The accuracy of the model is not utterly impressive. We might be able to improve the predictions by using a neural network but that will be for some other time.
However we learned a lot about Natural Language Processing and we think we will be able to do better in the future.