In [1]:
%pylab inline
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.utils import resample
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import VotingClassifier

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv("all_processed2.csv")
df.head(3)
df.columns.tolist()

['Unnamed: 0',
 'Unnamed: 0.1',
 'company',
 'location',
 'dates',
 'job-title',
 'summary',
 'pros',
 'cons',
 'advice-to-mgmt',
 'overall-ratings',
 'work-balance-stars',
 'culture-values-stars',
 'carrer-opportunities-stars',
 'comp-benefit-stars',
 'senior-mangemnet-stars',
 'helpful-count',
 'link',
 'summary_processed',
 'summary_char_length',
 'summary_word_count',
 'summary_stopword_count',
 'summary_stopword_freq',
 'pros_processed',
 'pros_char_length',
 'pros_word_count',
 'pros_stopword_count',
 'pros_stopword_freq',
 'cons_processed',
 'cons_char_length',
 'cons_word_count',
 'cons_stopword_count',
 'cons_stopword_freq',
 'text',
 'text_processed',
 'text_char_length',
 'text_word_count',
 'text_stopword_count',
 'text_stopword_freq']

### Converting the label into a binary value: 1 if rating is 4-5, 0 if rating is 1-3.

In [3]:
df["label"] = df.apply(lambda row: 1 if row["overall-ratings"] >= 4 else 0, axis=1)

### Balancing out the dataset

Almost more than twice as many positive as negative reviews

In [4]:
df_pos = df[df["label"] == 1]
df_neg = df[df["label"] == 0]
print("Positive: {0}, Negative: {1}".format(df_pos.count()[0], df_neg.count()[0]))

Positive: 45607, Negative: 21792


Undersampling of the data.

In [5]:
df_pos_res = resample(df_pos, 
                   n_samples=df_neg.count()[0], 
                   random_state=23)

In [6]:
df_pos_res.count()[0]

21792

In [7]:
df = df_neg.append(df_pos_res)

### Extracting the culomns containing measurements about the text like number of words/stopwords, stopwords frequency or length in characters.

In [8]:
y = df["label"]

In [9]:
txt_measures_indices = [
    "pros_word_count", "pros_stopword_freq", "pros_char_length",
    "cons_word_count", "cons_stopword_freq", "cons_char_length",
    "summary_word_count", "summary_stopword_freq", "summary_char_length"
]

X_txt_measures = df[txt_measures_indices]
X_txt_measures.head()

Unnamed: 0,pros_word_count,pros_stopword_freq,pros_char_length,cons_word_count,cons_stopword_freq,cons_char_length,summary_word_count,summary_stopword_freq,summary_char_length
30,38.0,0.315789,228.0,13.0,0.153846,104.0,2.0,0.0,15.0
37,31.0,0.354839,178.0,67.0,0.402985,377.0,6.0,0.0,34.0
41,6.0,0.5,30.0,12.0,0.583333,76.0,2.0,0.0,8.0
48,9.0,0.111111,47.0,57.0,0.315789,344.0,1.0,0.0,3.0
54,18.0,0.333333,111.0,41.0,0.365854,247.0,7.0,0.714286,34.0


### We tested the text measurements' preditcion value with a couple of different models using Scikit-Learns cross_val_score function for cross validation.

In [10]:
kf = KFold(n_splits=10, shuffle=True, random_state=4)

def do_cross_val(model):
    start_time = time.time()
    acc_score = cross_val_score(model, X_txt_measures, y, cv=kf)
    print(acc_score)
    print(np.mean(acc_score))
    print("It took a total of {0} minutes"
          .format((time.time() - start_time)/60))

In [11]:
pipe_knn = make_pipeline(StandardScaler(with_mean=False), KNeighborsClassifier(n_neighbors=10))
acc = do_cross_val(pipe_knn)

[0.660702   0.65290204 0.6673549  0.66551961 0.66727857 0.65144562
 0.6608536  0.65442864 0.65993575 0.67370353]
0.661412425916702
It took a total of 0.04740280310312907 minutes


In [12]:
pipe_lr = make_pipeline(StandardScaler(with_mean=False), LogisticRegression(random_state=21, solver='liblinear', C=1))
do_cross_val(pipe_lr)

[0.68570773 0.68203716 0.68891948 0.69694884 0.69573199 0.68815971
 0.69274897 0.68150528 0.67875172 0.69710877]
0.6887619639005498
It took a total of 0.026148144404093424 minutes


In [13]:
pipe_dt = make_pipeline(StandardScaler(with_mean=False), DecisionTreeClassifier(max_depth=45, random_state=21))
do_cross_val(pipe_dt)

[0.68111952 0.67882542 0.69098417 0.68869007 0.68081689 0.69412575
 0.69137219 0.68701239 0.68540615 0.70330427]
0.6881656810651972
It took a total of 0.03066873550415039 minutes


In [14]:
pipe_nb = make_pipeline(StandardScaler(with_mean=False), GaussianNB())
do_cross_val(pipe_nb)

[0.59233769 0.5815554  0.58729066 0.60082588 0.60578247 0.58834328
 0.58352455 0.59637448 0.58834328 0.60256999]
0.5926947680212199
It took a total of 0.002381602923075358 minutes


### We used the TfidVectorizer from sci-kit learn to transform the strings into word vectors. We chose to only vectorize the 3000 most common words for simplicity sake. In total there was around 21,000 unique words in all the texts.

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
no_of_words = 3000
tfidf = TfidfVectorizer(max_features=no_of_words, ngram_range=(1,3))

In [16]:
X_txt_vecs = tfidf.fit_transform(df["text"].tolist())
X_txt_vecs.shape

(43584, 3000)

In [17]:
type(X_txt_vecs)

scipy.sparse.csr.csr_matrix

### Now comes the task of combining the text vectors with the text measurements.

In [18]:
X_txt_measures.columns.tolist()

['pros_word_count',
 'pros_stopword_freq',
 'pros_char_length',
 'cons_word_count',
 'cons_stopword_freq',
 'cons_char_length',
 'summary_word_count',
 'summary_stopword_freq',
 'summary_char_length']

In [19]:
X_txt_measures.index

Int64Index([   30,    37,    41,    48,    54,    61,    66,    83,    90,
               92,
            ...
             1665,   861, 36104, 41078, 45657, 58115, 12708, 61485,  8717,
            18851],
           dtype='int64', length=43584)

In [20]:
X_full = pd.concat([X_txt_measures,pd.DataFrame(
            index = X_txt_measures.index, 
            data = X_txt_vecs.toarray()
        )], axis=1)

In [21]:
X_full.head(3)

Unnamed: 0,pros_word_count,pros_stopword_freq,pros_char_length,cons_word_count,cons_stopword_freq,cons_char_length,summary_word_count,summary_stopword_freq,summary_char_length,0,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
30,38.0,0.315789,228.0,13.0,0.153846,104.0,2.0,0.0,15.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,31.0,0.354839,178.0,67.0,0.402985,377.0,6.0,0.0,34.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115659
41,6.0,0.5,30.0,12.0,0.583333,76.0,2.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### We devided the data into training and testing data but then got away from it aftwards since we are using cross_val_score. However we kept the code to be easily implemented by just commenting out 2 lines.

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_full, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=22)
X_train = X_full
y_train = y

### We created a function to easily run cross_val_score on the model with the correct data and print out the accuracy.

In [23]:
kf = KFold(n_splits=10, shuffle=True, random_state=4)

def do_cross_val(model):
    start_time = time.time()
    acc_score = cross_val_score(model, X_train, y_train, cv=kf)
    print(acc_score)
    print(np.mean(acc_score))
    print("It took a total of {0} minutes"
          .format((time.time() - start_time)/60))

### We ran with various different models trying to tweak the settings for best result. The 2 best performing models ended up being Logistic Regression and Naive Bayes.

The logistic regression was ultimately the best of for our data. Logistic Regression performed best with normalized data and with the n_grams in the range of 1.

In [24]:
pipe_lr = make_pipeline(StandardScaler(with_mean=False), LogisticRegression(random_state=21, solver='liblinear', C=1))
do_cross_val(pipe_lr)

[0.77632485 0.76554256 0.77448956 0.7687543  0.76686553 0.77214318
 0.77374943 0.76755392 0.76916017 0.76961909]
0.7704202590347854
It took a total of 1.2042159597078959 minutes


The naives bayes performed best with unscaled data and with the n_grams in the range of 1-3.

In [25]:
pipe_nb = make_pipeline(GaussianNB())
do_cross_val(pipe_nb)

[0.73411333 0.72746043 0.73044276 0.73503097 0.73634695 0.732905
 0.73588802 0.72785682 0.73129876 0.73474071]
0.7326083743118872
It took a total of 0.5234104514122009 minutes


ngram_1,1 3000 mixed standardscaler

### We wanted to see if we could improve the result using ensambled learning to combine the results of the 2 models.

We first tried splitting the data into text vectors for the Naive Bayes and text measurements for the Logistic Regression.

In [26]:
def select_txt_measures(X):
    return X[:, :9]

def select_txt_vectors(X):
    return X[:, 9:]

pipe_lr = make_pipeline(
    FunctionTransformer(select_txt_measures, validate = True),
    StandardScaler(with_mean=False), 
    LogisticRegression(random_state=21, solver='liblinear', C=1)
)
pipe_nb = make_pipeline(
    FunctionTransformer(select_txt_vectors, validate = True),
    GaussianNB()
)

voting_c = VotingClassifier(
    estimators=[
        ('lr', pipe_lr), 
        ('rf', pipe_nb),
    ],
    voting='soft'
)

In [27]:
do_cross_val(voting_c)

[0.72356045 0.71690755 0.71025465 0.71621932 0.72303809 0.71844883
 0.72418541 0.72028453 0.720514   0.71294172]
0.718635453374044
It took a total of 0.5847811381022135 minutes


We then tried feeding both pipelines the same data.

In [28]:
pipe_lr = make_pipeline(
    StandardScaler(with_mean=False), 
    LogisticRegression(random_state=21, solver='liblinear', C=1)
)
pipe_nb = make_pipeline(
    GaussianNB()
)

voting_c = VotingClassifier(
    estimators=[
        ('lr', pipe_lr), 
        ('rf', pipe_nb),
    ],
    voting='soft'
)

In [29]:
do_cross_val(voting_c)

[0.73411333 0.73021335 0.73067217 0.73617802 0.73749426 0.73474071
 0.7372648  0.72923359 0.732905   0.73818265]
0.7340997894246115
It took a total of 1.6498730023701986 minutes


Oddly enough the voting classifier seems to follow the Naive Bayes model accuracy quite closely. It did not give us a better result than when running with the Logistic Regression alone.

## Conclusion

We managed to reach an accuracy of close to 80% so we are quite satisfied with the result. We might be able to improve the predictions by using a neural network but that will be for some other time.
However we learned a lot about Natural Language Processing and we think we will be able to do better in the future. Maybe the result could also be improved by extracting extra features like special character count etc.