This file contains the necessary code to get the counts and metrics for each company and then runs it through the various ML models

In [30]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LinearRegression, LogisticRegression
import numpy as np
import csv
import os
import pandas as pd
import math
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow
import tflearn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.svm import NuSVR

In [2]:
dir = 'C:/Users/Chaitu Konjeti/socweb-glassdoor-project/REVIEWS/'

In [3]:
all_metrics = {}

metrics_file = pd.read_csv('metrics.csv')

# get all the proper metrics from the metrics file
for row in metrics_file.index:
    #print(int(metrics_file['Data Year - Fiscal'][row]))
    if metrics_file['Company Name'][row] not in all_metrics.keys():
        all_metrics[metrics_file['Company Name'][row]] = []
        if not math.isnan(metrics_file['Gross Profit (Loss)'][row]) and not math.isnan(int(metrics_file['Data Year - Fiscal'][row])) and not int(metrics_file['Data Year - Fiscal'][row]) != '0':
            all_metrics[metrics_file['Company Name'][row]].append((int(metrics_file['Gross Profit (Loss)'][row]), str(int(metrics_file['Data Year - Fiscal'][row]))))
    else:
        if not math.isnan(metrics_file['Gross Profit (Loss)'][row]) and not math.isnan(int(metrics_file['Data Year - Fiscal'][row])):
            all_metrics[metrics_file['Company Name'][row]].append((int(metrics_file['Gross Profit (Loss)'][row]), str(int(metrics_file['Data Year - Fiscal'][row]))))

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
def get_sentiment(sentences):
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(sentences)
#     print("{:-<65} {}".format(sentences, str(vs['compound'])))
    return vs['compound']

In [5]:
def get_counts_and_metrics(dir):

    metrics = []
    list_of_posts = []
    X = []
    sentiments = []

    #iterate through all files in the directory
    for filename in os.listdir(dir):
        print(filename)
        df = pd.read_csv(dir + filename, header=0)

        #get dates and pros columns from data file
        dates = df['date']
        pros = df['pros']

        #get the list of years for all the posts in the file and create a dictionary with years as the keys
        years = [date.split(' ')[3] for date in dates]
        corpus = {key: '' for key in set(years)}

        #add the post to the proper year in the dictionary
        for date, pro in zip(years, pros):
            corpus[date] += pro + ' '

        #sort the keys in order
        keys = sorted(corpus)

        company_name = filename.split('.')[0]

        metric_years = [val[1] for val in set(all_metrics[company_name]) if val[1] in years]

        metrics.extend([val[0] for val in set(all_metrics[company_name]) if val[1] in years])

        #only use the metrics from years that present in the datafile
        for key in keys:
            if key in metric_years:
                sentiments.append(get_sentiment(corpus[key]))
                list_of_posts.append(corpus[key])
    #creates count vectorizer for posts
    count_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 3))
    counts = count_vectorizer.fit_transform(list_of_posts)

    #creates Tfidf vectorizer for posts
    tfid_vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='word')
    tfid = tfid_vectorizer.fit_transform(list_of_posts).toarray()
    
    i = 0
    for sentiment in sentiments:
        X.append(np.append(tfid[i], [sentiment]))
        
    X = np.asarray(X)
    metrics = np.asarray(metrics)
    
    return  X, metrics

In [6]:
X, y = get_counts_and_metrics(dir)

AMERICAN AIRLINES GROUP INC.csv
APPLE INC.csv
AVNET INC.csv
BUTLER NATIONAL CORP.csv
CPI CORP.csv
CVD EQUIPMENT CORP.csv
DELPHAX TECHNOLOGIES INC.csv
DELUXE CORP.csv
KONARED CORP.csv
MATERION CORP.csv


In [7]:
print(X.shape)

(61, 14032)


In [8]:
print(y.shape)

(61,)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [12]:
rff_clf = RandomForestClassifier(max_depth=5)
rff_clf.fit(X_train, y_train)
rff_clf.score(X_train, y_train)



0.6458333333333334

In [14]:
pred = (rff_clf.predict(X_test))
pred

array([    0,     0, 14485, 45640,    21, 15574,   224, 15574,     0,
       13945, 45640,  1059,    16])

In [16]:
rmse = sqrt(mean_squared_error(y_test, pred))
rmse

18628.239280519985

In [17]:
lr_clf = LinearRegression()
lr_clf.fit(X_train, y_train)
lr_clf.score(X_train, y_train)

0.04552800896937259

In [18]:
pred = lr_clf.predict(X_test)
pred

array([13765.78771973, 21002.88098145, 20936.61132812, 20703.52539062,
       19933.42724609,  1547.05151367, 20360.75170898, -1791.56335449,
       20820.06835938, 20529.85339355, 20625.82995605, 21057.72473145,
       20879.48242188])

In [19]:
rmse = sqrt(mean_squared_error(y_test, pred))
rmse

17674.598844952823

In [20]:
gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)
gnb_clf.score(X_train, y_train)

0.8333333333333334

In [21]:
pred = gnb_clf.predict(X_test)
pred

array([   18,     0,     0,     0,     0,    16, 71061,    14,     0,
           0,     0,  1059,     0])

In [24]:
rmse = sqrt(mean_squared_error(y_test, pred))
rmse

7767.347299878066

In [25]:
logr_clf = LinearRegression()
logr_clf.fit(X_train, y_train)
logr_clf.score(X_train, y_train)

0.04552800896937259

In [28]:
pred = logr_clf.predict(X_test)
pred

array([13765.78771973, 21002.88098145, 20936.61132812, 20703.52539062,
       19933.42724609,  1547.05151367, 20360.75170898, -1791.56335449,
       20820.06835938, 20529.85339355, 20625.82995605, 21057.72473145,
       20879.48242188])

In [29]:
rmse = sqrt(mean_squared_error(y_test, pred))
rmse

17674.598844952823

In [35]:
svm_clf = NuSVR(C=1.0, nu=0.1)
svm_clf.fit(X_train, y_train)
svm_clf.score(X_train, y_train)



-0.9801802867189416

In [36]:
pred = svm_clf.predict(X_test)
pred

array([48792.99998667, 48792.99999992, 48792.9999998 , 48792.99999937,
       48792.99999796, 48792.99996431, 48792.99999874, 48792.9999582 ,
       48792.99999958, 48792.99999905, 48792.99999923, 48793.00000002,
       48792.99999969])

In [37]:
rmse = sqrt(mean_squared_error(y_test, pred))
rmse

47711.871593786935