In [1]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the pre-processed review dataset and preview its first two rows.
reviews_path = "all_processed.csv"
df = pd.read_csv(reviews_path)
df.head(n=2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,company,location,dates,job-title,summary,pros,cons,advice-to-mgmt,overall-ratings,work-balance-stars,culture-values-stars,carrer-opportunities-stars,comp-benefit-stars,senior-mangemnet-stars,helpful-count,link
0,0,1,google,none,"Dec 11, 2018",Current Employee - Anonymous Employee,best compani work,peopl smart friendli,bureaucraci slow thing,none,5.0,4.0,5.0,5.0,4.0,5.0,0,https://www.glassdoor.com/Reviews/Google-Revie...
1,1,2,google,"Mountain View, CA","Jun 21, 2013",Former Employee - Program Manager,move speed light burn inevit,food food food cafe main campu mtv alon mini k...,work life balanc balanc perk benefit illus kee...,1) Don't dismiss emotional intelligence and ad...,4.0,2.0,3.0,3.0,5.0,3.0,2094,https://www.glassdoor.com/Reviews/Google-Revie...


### At this point there were still rows with None or NaN values in summary, pros, or cons. We removed these rows.

In [3]:
# Number of rows before cleaning. len() counts rows directly, whereas
# df.count()[0] counted non-null values of the first column and relied on
# positional indexing into a label-indexed Series.
len(df)

67519

In [4]:
# Keep only the reviews in which summary, pros and cons are all present.
required_text_cols = ["summary", "pros", "cons"]
df = df.dropna(subset=required_text_cols)

This removed 401 rows (67,519 → 67,118).

In [5]:
# Number of rows after dropping reviews with missing text. len() counts rows
# directly instead of positionally indexing the per-column non-null counts.
len(df)

67118

### In order to make it easier to work with the data we added a new column containing the text for summary, pros and cons.

In [6]:
# Build one combined text field per review: summary, pros and cons joined
# by single spaces, in that order.
text_parts = [df["pros"], df["cons"]]
df["text"] = df["summary"].str.cat(text_parts, sep=" ")

In [7]:
# Show the three source columns next to the combined text for the first rows.
preview_cols = ["summary", "pros", "cons", "text"]
df.loc[:, preview_cols].head(3)

Unnamed: 0,summary,pros,cons,text
0,best compani work,peopl smart friendli,bureaucraci slow thing,best compani work peopl smart friendli bureauc...
1,move speed light burn inevit,food food food cafe main campu mtv alon mini k...,work life balanc balanc perk benefit illus kee...,move speed light burn inevit food food food ca...
2,great balanc big compani secur fun fast move p...,softwar engin among king hill googl engin driv...,becom larger come grow pain bureaucraci slow r...,great balanc big compani secur fun fast move p...


Mean text length in characters

In [8]:
# Average number of characters in the combined review text.
df["text"].str.len().mean()

198.47529723770077

### The value we wanted to predict was whether a review was positive or negative. We defined a positive review as one with an overall rating of 4 or 5. Reviews with a rating of 3 or less were considered negative.

In [9]:
# Binary sentiment label: 1 when the overall rating is at least 4, else 0.
# The vectorized comparison replaces a Python-level apply over every row and
# also behaves sensibly on an empty frame. A missing rating compares as False
# and is therefore labelled 0, exactly as the row-wise version did.
df["label"] = (df["overall-ratings"] >= 4).astype("int64")

In [10]:
# Split the reviews by sentiment label and report the class sizes.
df_pos = df[df["label"] == 1]
df_neg = df[df["label"] == 0]
# len() gives the row count directly; count()[0] depended on positional
# indexing into a label-indexed Series and on the first column having no NaNs.
print("Positive: {0}, Negative: {1}".format(len(df_pos), len(df_neg)))

Positive: 45483, Negative: 21635


In [15]:
from sklearn.utils import resample

In [16]:
# Undersample the positive class down to the size of the negative class.
# replace=False is required here: resample() draws WITH replacement by
# default, which would put duplicate positive reviews in the sample.
# Sampling without replacement needs len(df_neg) <= len(df_pos); resample()
# raises ValueError otherwise.
df_pos_res = resample(df_pos,
                      replace=False,
                      n_samples=len(df_neg),
                      random_state=23)

In [17]:
# Size of the undersampled positive class (should equal the negative count).
len(df_pos_res)

21635

In [None]:
# Combine the undersampled positives with all negatives into one balanced,
# shuffled frame. The previous expression accessed a non-existent
# `imbalance` attribute on the DataFrame and raised AttributeError.
# NOTE(review): the cells below still build y and X from `df`; use
# `df_balanced` there if the models should be trained on balanced classes.
df_balanced = (
    pd.concat([df_pos_res, df_neg])
    .sample(frac=1, random_state=23)
    .reset_index(drop=True)
)
df_balanced["label"].value_counts()

In [None]:
# Number of negative reviews in the full frame. Summing the boolean mask
# counts matching rows without positional indexing into count().
int((df["label"] == 0).sum())

In [None]:
# Target vector: the binary sentiment label of every row in df.
y = df["label"]

### We used the TfidfVectorizer from scikit-learn to transform the strings into word vectors. We chose to vectorize only the 3000 most common words for simplicity's sake.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vocabulary size cap; also reused below as the autoencoder's input width.
no_of_words = 3000
# TF-IDF vectorizer limited to no_of_words features.
cv = TfidfVectorizer(max_features=no_of_words)

In [None]:
# Fit the vectorizer on every review text and transform it in one step,
# then show the resulting matrix dimensions.
# NOTE(review): the vocabulary is fitted on all rows, before any
# cross-validation split — confirm this is acceptable for the evaluation.
X = cv.fit_transform(df["text"].tolist())
X.shape

In [None]:
# Inspect the vectorized form of the first review.
X[0]

In [None]:
from sklearn.model_selection import KFold

# Shared 10-fold splitter with shuffling; the fixed random_state keeps the
# folds identical across all classifiers evaluated below.
kf = KFold(n_splits=10, shuffle=True, random_state=4)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

#pipe_knn = make_pipeline(StandardScaler(with_mean=False), KNeighborsClassifier(n_neighbors=5))

In [None]:
import keras.layers as layers
import keras.models as models

# Width of the bottleneck layer, i.e. the size of the compressed representation.
encoding_dim = 32


# Single-hidden-layer autoencoder: no_of_words inputs -> encoding_dim (ReLU)
# -> no_of_words outputs (sigmoid).
input_layer = layers.Input(shape=(no_of_words,))
encoded = layers.Dense(encoding_dim, activation='relu')(input_layer)
decoded = layers.Dense(no_of_words, activation='sigmoid')(encoded)

In [None]:
# Full model mapping the input to its reconstruction; this is the one that gets trained.
autoencoder = models.Model(input_layer, decoded)

In [None]:
# Encoder half only: maps the input to the bottleneck activations. It is built
# from the same layer objects as the autoencoder.
encoder = models.Model(input_layer, encoded)

In [None]:
# Configure training: Adadelta optimizer, binary cross-entropy reconstruction loss.
# NOTE(review): some Keras versions give Adadelta a very small default learning
# rate, which could leave the model barely trained after 10 epochs — confirm
# that the training loss actually decreases.
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

In [None]:
# Train the autoencoder to reconstruct its own input (X is both input and target).
# NOTE(review): X comes from TfidfVectorizer.fit_transform and is presumably a
# SciPy sparse matrix — confirm this Keras version accepts it, or densify first.
# NOTE(review): training uses every row, so rows later held out during
# cross-validation still influence the learned encoding — confirm acceptable.
autoencoder.fit(X, X,
                epochs=10,
                batch_size=256,
                shuffle=True)

In [None]:
# Compress every review vector with the encoder, then show the dimensions of
# the compressed feature matrix used by the classifiers below.
Xs = encoder.predict(X)
Xs.shape

In [None]:
# Inspect the first row of the TF-IDF matrix.
# NOTE(review): this repeats an earlier cell; since Xs was just computed,
# possibly Xs[0] was intended here — TODO confirm.
X[0]

In [None]:
# Labels paired with the encoded features Xs (same object as y, not a copy).
ys = y
ys.shape

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# SVM baseline on the encoded features: scale without centering, then fit an
# SVC with default settings; scored over the shared folds in kf.
svm_steps = (StandardScaler(with_mean=False), SVC())
pipe_svm = make_pipeline(*svm_steps)
cross_val_score(estimator=pipe_svm, X=Xs, y=ys, cv=kf)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# k-nearest-neighbours baseline (k = 5) on the encoded features, scaled
# without centering; scored over the shared folds in kf.
knn_steps = (StandardScaler(with_mean=False), KNeighborsClassifier(n_neighbors=5))
pipe_knn = make_pipeline(*knn_steps)
cross_val_score(estimator=pipe_knn, X=Xs, y=ys, cv=kf)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline (liblinear solver, fixed seed) on the encoded
# features, scaled without centering; scored over the shared folds in kf.
lr_model = LogisticRegression(random_state=0, solver='liblinear')
pipe_lr = make_pipeline(StandardScaler(with_mean=False), lr_model)
cross_val_score(estimator=pipe_lr, X=Xs, y=ys, cv=kf)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree baseline (depth capped at 5) on the encoded features, scored
# over the shared folds in kf.
# random_state is fixed so repeated runs give the same scores, matching the
# seeded logistic-regression pipeline and the seeded KFold splitter.
pipe_dt = make_pipeline(
    StandardScaler(with_mean=False),
    DecisionTreeClassifier(max_depth=5, random_state=0),
)
cross_val_score(pipe_dt, Xs, ys, cv=kf)