In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

In [18]:
def openFile(path):
    """Load the CSV file at ``path`` into a pandas DataFrame.

    The first row of the file is taken as the column header and the file is
    decoded as UTF-8.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    return pd.read_csv(path, encoding='utf-8', header=0)

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

In [6]:
from sklearn.model_selection import cross_validate


def get_classifier_cv_score(model, X, y, scoring='accuracy', cv=7):
    """Cross-validate ``model`` and return its mean training and validation scores.

    Parameters
    ----------
    model : estimator
        Estimator passed straight to ``cross_validate``.
    X, y :
        Features and targets passed straight to ``cross_validate``.
    scoring : str, default 'accuracy'
        Scoring metric forwarded to ``cross_validate``.
    cv : int, default 7
        Cross-validation setting forwarded to ``cross_validate``.

    Returns
    -------
    list
        ``[mean_train_score, mean_validation_score]`` averaged over the folds.
    """
    # Train scores are only present in the results when explicitly requested.
    results = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=True)
    return [results['train_score'].mean(), results['test_score'].mean()]

In [9]:
def print_grid_search_result(grid_search):
    """Print the best parameters of a grid search and the scores of that candidate.

    Reads ``best_params_``, ``best_index_`` and ``cv_results_`` from
    ``grid_search``. The ``'mean_train_score'`` entry must be present in
    ``cv_results_``, otherwise a ``KeyError`` is raised after the parameters
    have been printed.

    Parameters
    ----------
    grid_search :
        Object exposing ``best_params_``, ``best_index_`` and ``cv_results_``.
    """
    print(grid_search.best_params_)

    best = grid_search.best_index_
    train_score = grid_search.cv_results_['mean_train_score'][best]
    validation_score = grid_search.cv_results_['mean_test_score'][best]
    print("training score= {:.3f}; validation score={:.3f}".format(train_score, validation_score))
    

In [4]:
def getCorpus(df):
    """Return the values of the ``'processed_sentence'`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'processed_sentence'`` column.

    Returns
    -------
    array-like
        The column's ``.values``, one entry per row of ``df``.
    """
    sentences = df['processed_sentence']
    return sentences.values

In [23]:
def wordVector(corpus):
    """Encode ``corpus`` with a default-configured ``CountVectorizer``.

    The vectorizer is fitted on ``corpus`` itself and is not kept; only the
    transformed result is returned.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorize.

    Returns
    -------
    The result of ``CountVectorizer().fit_transform(corpus)``.
    """
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(corpus)

In [27]:
def splitData(word_bag, df):
    """Split features and ``'Sentiment'`` labels into train and test parts.

    Parameters
    ----------
    word_bag :
        Feature matrix, one row per row of ``df``.
    df : pandas.DataFrame
        Frame whose ``'Sentiment'`` column holds the labels.

    Returns
    -------
    The result of ``train_test_split`` called with ``test_size=0.3`` and
    ``random_state=42``; callers unpack it as
    ``X_train, X_test, y_train, y_test``.
    """
    labels = df['Sentiment']
    return train_test_split(word_bag, labels, test_size=0.3, random_state=42)

In [57]:
def algoSelection(word_bag, df, X_train, X_test, y_train, y_test):
    """Fit each candidate classifier and report its train/test performance.

    For every model the function prints the model, its training score, its
    test score and a classification report for the test predictions.

    Parameters
    ----------
    word_bag, df :
        Not used by this function; kept so existing callers keep working.
    X_train, X_test :
        Training and test feature matrices.
    y_train, y_test :
        Training and test labels.

    Returns
    -------
    dict
        Maps each model's class name to ``[training_score, test_score]``, so
        callers that do ``scores = algoSelection(...)`` get the numbers that
        were printed instead of ``None``.
    """
    models = [
        RandomForestClassifier(random_state=42, max_depth=5),
        LogisticRegression(random_state=42),
        GradientBoostingClassifier(random_state=42),
        BernoulliNB(),
    ]
    scores = {}
    for model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(model)
        train_score = model.score(X_train, y_train)
        print("Training score: {0:.3f}".format(train_score))
        test_score = model.score(X_test, y_test)
        print("Test score: {0:.3f}".format(test_score))
        print(classification_report(y_test, y_pred))
        print()
        scores[type(model).__name__] = [train_score, test_score]
    return scores
        

In [53]:
from sklearn.model_selection import GridSearchCV
def gridSearchLogReg(word_bag, df, X_train, y_train):
    """Grid-search LogisticRegression hyper-parameters and print the best result.

    The search covers three solvers, the ``'l2'`` penalty and five values of
    ``C``, scored with ``"f1_macro"`` using ``cv=7``. Train scores are
    requested because ``print_grid_search_result`` reads them.

    Parameters
    ----------
    word_bag, df :
        Not used by this function.
    X_train, y_train :
        Training features and labels the search is fitted on.
    """
    param_grid = {
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'penalty': ['l2'],
        'C': [100, 10, 1.0, 0.1, 0.01],
    }

    search = GridSearchCV(
        LogisticRegression(random_state=42),
        param_grid=param_grid,
        scoring="f1_macro",
        cv=7,
        return_train_score=True,
    )
    search.fit(X_train, y_train)
    print_grid_search_result(search)

In [19]:
# Load the labelled sentences; later cells read the 'Sentiment' and
# 'processed_sentence' columns of this frame.
df= openFile('../labeling/raw_labeled_500_notickers.csv')

In [61]:
# Show the loaded frame as the cell output.
df

Unnamed: 0.1,Unnamed: 0,Sentiment,processed_sentence
0,0,Buy,long dollarsign twtr dollarsign
1,1,Hold,sneak peek minussign tomorrow follow minussign...
2,2,Hold,dollarsign cook taking home
3,3,Buy,long
4,4,Hold,also minussign normally buy type stuff online ...
...,...,...,...
517,517,Hold,advice bagholders
518,518,Sell,minussign numberplaceholder share
519,519,Buy,call
520,520,Buy,used call


In [25]:
# Bag-of-words matrix built from every sentence in df; the vectorizer is
# fitted here, before any train/test split is made.
word_bag = wordVector(getCorpus(df))

<font size="25">Bag-of-Words</font>

In [58]:
# Split the bag-of-words features and labels (30% held out, see splitData),
# then fit and report every candidate model on that split.
X_train, X_test, y_train, y_test= splitData(word_bag, df)
scores = algoSelection(word_bag, df, X_train, X_test, y_train, y_test)
print(scores)

RandomForestClassifier(max_depth=5, random_state=42)
Training score: 0.704
Test score: 0.707
              precision    recall  f1-score   support

         Buy       0.64      1.00      0.78        73
        Hold       1.00      0.30      0.46        47
        Sell       0.83      0.65      0.73        37

    accuracy                           0.71       157
   macro avg       0.82      0.65      0.66       157
weighted avg       0.79      0.71      0.67       157


LogisticRegression(random_state=42)
Training score: 0.951
Test score: 0.854
              precision    recall  f1-score   support

         Buy       0.89      0.93      0.91        73
        Hold       0.94      0.66      0.78        47
        Sell       0.73      0.95      0.82        37

    accuracy                           0.85       157
   macro avg       0.85      0.85      0.84       157
weighted avg       0.87      0.85      0.85       157


GradientBoostingClassifier(random_state=42)
Training score: 0.981
T

In [None]:
# Tune LogisticRegression on the bag-of-words training split created above.
# gridSearchLogReg requires the training features and labels as its third and
# fourth arguments; calling it with only (word_bag, df) raises a TypeError.
gridSearchLogReg(word_bag, df, X_train, y_train)

<font size="25"> TF-IDF</font>

In [62]:
# Split the raw sentences (not vectors) so the TF-IDF vectorizer can be fitted
# on the training part only. This rebinds X_train/X_test/y_train/y_test.
X_train, X_test, y_train, y_test = train_test_split(df['processed_sentence'], df['Sentiment'],test_size = 0.3, random_state=42)

In [63]:
# Fit the TF-IDF vectorizer on the training sentences only, then apply the
# fitted vectorizer to the held-out test sentences.
tfidf_vectorizer = TfidfVectorizer() 
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)


In [64]:

# Evaluate the same candidate models on the TF-IDF features.
scores = algoSelection(word_bag, df, tfidf_train_vectors, tfidf_test_vectors, y_train, y_test)
print(scores)

RandomForestClassifier(max_depth=5, random_state=42)
Training score: 0.668
Test score: 0.650
              precision    recall  f1-score   support

         Buy       0.61      0.97      0.75        73
        Hold       1.00      0.26      0.41        47
        Sell       0.66      0.51      0.58        37

    accuracy                           0.65       157
   macro avg       0.76      0.58      0.58       157
weighted avg       0.74      0.65      0.61       157


LogisticRegression(random_state=42)
Training score: 0.937
Test score: 0.860
              precision    recall  f1-score   support

         Buy       0.94      0.92      0.93        73
        Hold       0.89      0.72      0.80        47
        Sell       0.71      0.92      0.80        37

    accuracy                           0.86       157
   macro avg       0.85      0.85      0.84       157
weighted avg       0.87      0.86      0.86       157


GradientBoostingClassifier(random_state=42)
Training score: 0.995
T

<font size="25">N-grams</font>

In [65]:
# Count bigrams only; the token pattern matches any run of word characters,
# including single-character tokens.
bigram_vectorizer = CountVectorizer(ngram_range=(2,2), token_pattern=r'\b\w+\b', min_df=1)
# Adjust ngram_range to switch between bigrams (2, 2) and trigrams (3, 3).
# The vocabulary is fitted on the whole corpus, before the train/test split.
bg = bigram_vectorizer.fit_transform(getCorpus(df))

In [66]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() (converted to a plain list of str) and fall back to
# the old method on versions that do not provide it.
if hasattr(bigram_vectorizer, 'get_feature_names_out'):
    bg_features = bigram_vectorizer.get_feature_names_out().tolist()
else:
    bg_features = bigram_vectorizer.get_feature_names()
bg_features



['able buy',
 'acquisition cooperation',
 'actually call',
 'adding numberplaceholder',
 'advice bagholders',
 'advisor financial',
 'afraid buy',
 'ahrendts depart',
 'aircraft effort',
 'airline adding',
 'align business',
 'almost numberplaceholder',
 'alphabet earnings',
 'also looking',
 'also minussign',
 'also scared',
 'always bagholders',
 'amazon earnings',
 'amazon michael',
 'amazon new',
 'amc moon',
 'amc mvis',
 'amc wild',
 'ameritrade tda',
 'anaus fucking',
 'angela ahrendts',
 'anime gambling',
 'announce hodling',
 'announced new',
 'antitrust regulator',
 'anyone hodling',
 'anyone thinking',
 'anyone want',
 'anything believe',
 'apart oil',
 'ape diamond',
 'ape gemstoneemoji',
 'ape like',
 'appl snappl',
 'apple ceo',
 'apple earnings',
 'appointed chairman',
 'appointed critic',
 'approve usd',
 'april sell',
 'april year',
 'arconic engineering',
 'around dollarsign',
 'art inc',
 'as use',
 'asia pacific',
 'asset buy',
 'asshole motherfucker',
 'associate i

In [67]:
# Split the bigram count matrix with the same test_size and random_state as
# the earlier experiments. This rebinds X_train/X_test/y_train/y_test.
X_train, X_test, y_train, y_test = train_test_split(bg, df['Sentiment'],test_size = 0.3, random_state=42)

In [69]:

# Evaluate the same candidate models on the bigram features.
scores = algoSelection(word_bag, df, X_train, X_test, y_train, y_test)
print(scores)

RandomForestClassifier(max_depth=5, random_state=42)
Training score: 0.518
Test score: 0.529
              precision    recall  f1-score   support

         Buy       0.50      1.00      0.66        73
        Hold       1.00      0.09      0.16        47
        Sell       1.00      0.16      0.28        37

    accuracy                           0.53       157
   macro avg       0.83      0.42      0.37       157
weighted avg       0.77      0.53      0.42       157


LogisticRegression(random_state=42)
Training score: 0.959
Test score: 0.643
              precision    recall  f1-score   support

         Buy       0.57      0.99      0.72        73
        Hold       0.92      0.23      0.37        47
        Sell       1.00      0.49      0.65        37

    accuracy                           0.64       157
   macro avg       0.83      0.57      0.58       157
weighted avg       0.77      0.64      0.60       157


GradientBoostingClassifier(random_state=42)
Training score: 0.951
T