In [1]:
%pylab inline
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.utils import resample

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv("all_processed2.csv")
df.head(3)
df.columns.tolist()

['Unnamed: 0',
 'Unnamed: 0.1',
 'company',
 'location',
 'dates',
 'job-title',
 'summary',
 'pros',
 'cons',
 'advice-to-mgmt',
 'overall-ratings',
 'work-balance-stars',
 'culture-values-stars',
 'carrer-opportunities-stars',
 'comp-benefit-stars',
 'senior-mangemnet-stars',
 'helpful-count',
 'link',
 'summary_processed',
 'summary_char_length',
 'summary_word_count',
 'summary_stopword_count',
 'summary_stopword_freq',
 'pros_processed',
 'pros_char_length',
 'pros_word_count',
 'pros_stopword_count',
 'pros_stopword_freq',
 'cons_processed',
 'cons_char_length',
 'cons_word_count',
 'cons_stopword_count',
 'cons_stopword_freq',
 'text',
 'text_processed',
 'text_char_length',
 'text_word_count',
 'text_stopword_count',
 'text_stopword_freq']

In [3]:
df["label"] = df.apply(lambda row: 1 if row["overall-ratings"] >= 4 else 0, axis=1)

In [4]:
df_pos = df[df["label"] == 1]
df_neg = df[df["label"] == 0]
print("Positive: {0}, Negative: {1}".format(df_pos.count()[0], df_neg.count()[0]))

Positive: 45607, Negative: 21792


In [5]:
df_pos_res = resample(df_pos, 
                   n_samples=df_neg.count()[0], 
                   random_state=23)

In [6]:
df_pos_res.count()[0]

21792

In [7]:
df = df_neg.append(df_pos_res)

In [8]:
y = df["label"]

### We needed a binary label to be able to predict whether a review was positive or negative. We decided to define positive reviews as a 4-star rating or leaving the rest as negative.

In [9]:
txt_measures_indices = [
    "pros_word_count", "pros_stopword_freq", "pros_char_length",
    "cons_word_count", "cons_stopword_freq", "cons_char_length",
    "summary_word_count", "summary_stopword_freq", "summary_char_length"
]

df2 = df[txt_measures_indices]
#df2 = df2[:1000]
df2.head()

Unnamed: 0,pros_word_count,pros_stopword_freq,pros_char_length,cons_word_count,cons_stopword_freq,cons_char_length,summary_word_count,summary_stopword_freq,summary_char_length
30,38.0,0.315789,228.0,13.0,0.153846,104.0,2.0,0.0,15.0
37,31.0,0.354839,178.0,67.0,0.402985,377.0,6.0,0.0,34.0
41,6.0,0.5,30.0,12.0,0.583333,76.0,2.0,0.0,8.0
48,9.0,0.111111,47.0,57.0,0.315789,344.0,1.0,0.0,3.0
54,18.0,0.333333,111.0,41.0,0.365854,247.0,7.0,0.714286,34.0


In [10]:
X = df2

kf = KFold(n_splits=10, shuffle=True, random_state=4)

def do_cross_val(model):
    start_time = time.time()
    acc_score = cross_val_score(model, X, y, cv=kf)
    print("It took a total of {0} minutes"
          .format((time.time() - start_time)/60))
    print(acc_score)
    print(np.mean(acc_score))

In [11]:
pipe_knn = make_pipeline(StandardScaler(with_mean=False), KNeighborsClassifier(n_neighbors=10))
#acc = do_cross_val(pipe_knn)

In [12]:
pipe_lr = make_pipeline(StandardScaler(with_mean=False), LogisticRegression(random_state=21, solver='liblinear', C=1))
#do_cross_val(pipe_lr)

In [13]:
pipe_dt = make_pipeline(StandardScaler(with_mean=False), DecisionTreeClassifier(max_depth=45, random_state=21))
#do_cross_val(pipe_dt)

In [14]:
pipe_nb = make_pipeline(StandardScaler(with_mean=False), GaussianNB())
#do_cross_val(pipe_nb)

In [15]:
pipe_svm = make_pipeline(StandardScaler(with_mean=False), SVC(C=.1, random_state=21, kernel="poly", gamma="auto"))
#do_cross_val(pipe_svm)

In [16]:
pipe_svm = make_pipeline(StandardScaler(with_mean=False), SVC(C=.1, random_state=21, kernel="linear", gamma="auto"))
#do_cross_val(pipe_svm)

In [17]:
pipe_svm = make_pipeline(StandardScaler(with_mean=False), SVC(C=.1, random_state=21, kernel="rbf", gamma="auto"))
#do_cross_val(pipe_svm)

### We used the TfidVectorizer from sci-kit learn to transform the strings into word vectors. We chose to only vectorize the 3000 most common words for simplicity sake.

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
no_of_words = 3000
tfidf = TfidfVectorizer(max_features=no_of_words, ngram_range=(1,3))

In [19]:
X = tfidf.fit_transform(df["text"].tolist())
X.shape

(43584, 3000)

In [20]:
type(X)

scipy.sparse.csr.csr_matrix

In [21]:
#X_pd = pd.DataFrame(X.toarray())

In [22]:
df2.columns.tolist()

['pros_word_count',
 'pros_stopword_freq',
 'pros_char_length',
 'cons_word_count',
 'cons_stopword_freq',
 'cons_char_length',
 'summary_word_count',
 'summary_stopword_freq',
 'summary_char_length']

In [23]:
df2.index

Int64Index([   30,    37,    41,    48,    54,    61,    66,    83,    90,
               92,
            ...
             1665,   861, 36104, 41078, 45657, 58115, 12708, 61485,  8717,
            18851],
           dtype='int64', length=43584)

In [24]:
X_full = pd.concat([df2,pd.DataFrame(index = df2.index, data = X.toarray())], axis=1)

In [25]:
X_full.head()

Unnamed: 0,pros_word_count,pros_stopword_freq,pros_char_length,cons_word_count,cons_stopword_freq,cons_char_length,summary_word_count,summary_stopword_freq,summary_char_length,0,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
30,38.0,0.315789,228.0,13.0,0.153846,104.0,2.0,0.0,15.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,31.0,0.354839,178.0,67.0,0.402985,377.0,6.0,0.0,34.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115659
41,6.0,0.5,30.0,12.0,0.583333,76.0,2.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,9.0,0.111111,47.0,57.0,0.315789,344.0,1.0,0.0,3.0,0.096456,...,0.227332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,18.0,0.333333,111.0,41.0,0.365854,247.0,7.0,0.714286,34.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### We devided the data into training and testing data

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_full, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=22)

### In order to reduce the dimensionality of the input data we created an autoencoder

### We created a function to easily run cross_val_score on the model with the correct data and print out the results

In [27]:
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

kf = KFold(n_splits=10, shuffle=True, random_state=4)
def do_cross_val(model):
    start_time = time.time()
    acc_score = cross_val_score(model, X_train, y_train, cv=kf)
    print("It took a total of {0} minutes"
          .format((time.time() - start_time)/60))
    print(acc_score)
    print(np.mean(acc_score))

### We ran with various different models trying to tweak the settings for best result

In [28]:
from sklearn.linear_model import LogisticRegression

pipe_lr = make_pipeline(StandardScaler(with_mean=False), LogisticRegression(random_state=21, solver='liblinear', C=1))
do_cross_val(pipe_lr)

It took a total of 1.0395310282707215 minutes
[0.77688557 0.76914253 0.75336966 0.76570118 0.7611127  0.76484084
 0.75738457 0.7541595  0.75272519 0.76821572]
0.7623537453513508


In [29]:
from sklearn.tree import DecisionTreeClassifier

pipe_dt = make_pipeline(DecisionTreeClassifier(max_depth=45, random_state=21))
do_cross_val(pipe_dt)

It took a total of 3.0553784092267353 minutes
[0.70834528 0.70605105 0.70203613 0.72813307 0.70863206 0.70318325
 0.71866934 0.70682731 0.71342513 0.72145726]
0.7116759882333217


In [30]:
from sklearn.naive_bayes import GaussianNB

pipe_nb = make_pipeline(GaussianNB())
do_cross_val(pipe_nb)

It took a total of 0.41800847053527834 minutes
[0.73788357 0.73272154 0.72096358 0.73415543 0.73128764 0.72985374
 0.73300832 0.72576018 0.72776822 0.74842226]
0.7321824476816686


ngram_1,1 3000 mixed standardscaler

In [31]:
X_train.head()

Unnamed: 0,pros_word_count,pros_stopword_freq,pros_char_length,cons_word_count,cons_stopword_freq,cons_char_length,summary_word_count,summary_stopword_freq,summary_char_length,0,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
6457,28.0,0.321429,162.0,31.0,0.709677,150.0,1.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15293,14.0,0.428571,86.0,33.0,0.30303,211.0,23.0,0.521739,97.0,0.099945,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10640,29.0,0.62069,167.0,27.0,0.444444,160.0,1.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.153686,0.0,0.0,0.0,0.0,0.149457,0.0
25298,7.0,0.428571,39.0,110.0,0.436364,702.0,12.0,0.416667,58.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56508,28.0,0.321429,204.0,68.0,0.529412,377.0,5.0,0.6,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import VotingClassifier

text_vec_indices = [str(i) for i in range(no_of_words)]
txt_measures_indices

def select_txt_measures(X):
    return X[:, :9]

def select_txt_vectors(X):
    return X[:, 9:]

pipe_lr = make_pipeline(
    #FunctionTransformer(select_txt_measures, validate = True),
    StandardScaler(with_mean=False), 
    LogisticRegression(random_state=21, solver='liblinear', C=1)
)
pipe_nb = make_pipeline(
    #FunctionTransformer(select_txt_vectors, validate = True),
    GaussianNB()
)

voting_c = VotingClassifier(
    estimators=[
        ('lr', pipe_lr), 
        ('rf', pipe_nb),
    ],
    voting='soft'
)


In [39]:
do_cross_val(voting_c)

It took a total of 1.3889941533406576 minutes
[0.73903069 0.73358188 0.72125036 0.73616289 0.73415543 0.73386866
 0.7332951  0.72920252 0.72977625 0.75043029]
0.734075406052906


Since the SVM classifier was very slow at training the data we trained on only 1000 samples so we could easily tweak the settings and retrain the model

Of all of the ML models we used the D-Tree seemed to perform the best with an accuracy of almost 60%

## Conclusion

Did we manage to create a model that could answer the business question? With 60%-ish accuracy, yes.
The accuracy of the model is not utterly impressive. We might be able to improve the predictions by using a neural network but that will be for some other time.
However we learned a lot about Natural Language Processing and we think we will be able to do better in the future.