# Sentiment Analysis
- 

In [258]:
#core
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import glob
import nasdaq
from bs4 import BeautifulSoup
from pathlib import Path

#NLP
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.corpus import reuters
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

from gensim.summarization import summarize
from gensim.summarization import keywords

#ML
from sklearn import preprocessing
from sklearn import model_selection

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

In [259]:
# params
ipo_performance_csv = 'IPO Performance.csv'

# Load NASDAQ IPO Performace

In [260]:
#load and sort by date priced
df = pd.read_csv(ipo_performance_csv, index_col='Symbol')
df['Date Priced'] = pd.to_datetime(df['Date Priced'], format='%Y-%m-%d')
df.sort_values(by='Date Priced', inplace=True)

### Preprocessing

In [261]:
#transform company name 
df.insert(0, 'Name Length', df['Company Name'].str.len())
df.insert(0, 'Name Words', df['Company Name'].str.split(' ').map(lambda x: len(x)))
df.drop(['Company Name'], axis=1, inplace=True)

#add quarter
df.insert(0, 'Q', df['Date Priced'].map(lambda x: pd.Period(x,'Q').quarter))

#add month
df.insert(0, 'Month', df['Date Priced'].dt.month)

df.drop(['Date Priced'], axis=1, inplace=True)

In [262]:
#encode market
le = preprocessing.LabelEncoder()
df['Market'] = le.fit_transform(df['Market'])

In [263]:
#to one hot encoding
df = pd.concat([pd.get_dummies(df['Market']).add_prefix('MKT'), df], axis=1)
df = pd.concat([pd.get_dummies(df['Month']).add_prefix('M'), df], axis=1)
df = pd.concat([pd.get_dummies(df['Q']).add_prefix('Q'), df], axis=1)
df.drop(['Market', 'Month', 'Q'], axis=1, inplace=True)

In [264]:
#standardize
def standardize(df):
    return (df-df.mean())/df.std()

cols_to_standardize = ['Name Words', 'Name Length', 'Offer Amount', 'Price', 'Shares']
df[cols_to_standardize] = standardize(df[cols_to_standardize])

In [265]:
df.head(3)

Unnamed: 0_level_0,Q1,Q2,Q3,Q4,M1,M2,M3,M4,M5,M6,...,MKT12,Name Words,Name Length,Offer Amount,Price,Shares,1D,1W,1M,3M
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
WQNI,1,0,0,0,0,1,0,0,0,0,...,0,-1.183553,-1.6113,-0.284726,-0.340497,-0.386568,-0.129856,-0.296122,0.088818,-0.667432
BBGI,1,0,0,0,0,1,0,0,0,0,...,0,0.950208,0.701675,-0.206002,-0.038392,-0.261148,-0.058333,-0.133333,-0.175,-0.25
UTSI,1,0,0,0,0,0,1,0,0,0,...,0,-0.116672,0.31618,-0.123477,0.263712,-0.164788,0.512195,0.448171,1.042683,-0.134146


# Baseline

- No NLP or fancy models
- Raw IPO listing data

In [270]:
def run_ml_flow(df):
    '''Runs Machine Learning flow, returns evaluation DataFrame'''
    
    targets = ['1D', '1W', '1M', '3M']
    evaluation = pd.DataFrame(columns=['AUC', 'f1', 'log loss'])

    for target in targets:

        #split
        X_train, X_test, y_train, y_test = model_selection.train_test_split(df.values[:,:-4], df[target].map(lambda x: 1 if x > 0 else 0).values, test_size=0.2, shuffle=False)
        #print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

        #classifiers
        clfs = {
            'RF' : RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1),
            'LR' : LogisticRegression(random_state=1),
            #'Vote' : VotingClassifier(estimators=[('lr', LogisticRegression(random_state=1)), ('rf', RandomForestClassifier(n_estimators=50, max_depth=5, random_state=1))], voting='soft')
        }

        #fit
        for k, clf in clfs.items():
            clf.fit(X_train, y_train)

        #evaluate
        for k, clf in clfs.items():
            #print(k)
            predictions = clf.predict(X_test)
            probas = clf.predict_proba(X_test)

            auc = roc_auc_score(y_test, predictions)
            f1 = f1_score(y_test, predictions)
            ll = log_loss(y_test, probas)
            #print('AUC:', auc)
            #print('f1:', f1)
            #print('log loss:', ll)
            #print(confusion_matrix(y_test, predictions))
            #print('\n')

        #save
        evaluation.loc[target] = [auc, f1, ll]
    
    return evaluation.T

In [271]:
#as expected the results are poor
run_ml_flow(df)

Unnamed: 0,1D,1W,1M,3M
AUC,0.514154,0.468991,0.482075,0.483895
f1,0.417722,0.541284,0.635294,0.567568
log loss,0.697682,0.713322,0.719962,0.723912


# Integrating IPO Raw Data

- First attempt to add raw IPO filings data
- The art of Feature Enginnering

In [272]:
def get_sentiment_df(text):
    '''returns Sentiment Analysis'''
    sid = SentimentIntensityAnalyzer()
    ps_list = []
    sentences = sent_tokenize(text)
    
    for sentence in sentences:
        ps = sid.polarity_scores(sentence)
        ps['sent'] = sentence
        ps_list.append(ps)
    
    df = pd.DataFrame(ps_list)

    return df

In [281]:
def add_sentiment_features(df_sentiment, symbol):
    '''Adds Sentiment Snalysis features, returns DataFrame'''
    
    #filter frames
    df_tmp = pd.DataFrame()
    pos = df_sentiment[df_sentiment['pos'] > df_sentiment['pos'].quantile(0.95)]
    neg = df_sentiment[df_sentiment['neg'] > df_sentiment['neg'].quantile(0.95)]

    #sentences features
    df_tmp.at[symbol, 'Mean Sent Len'] = df_sentiment['sent'].str.len().mean()
    df_tmp.at[symbol, 'Sent Count'] = df_sentiment.shape[0]

    #pos sentiment
    df_tmp.at[symbol, 'Pos Mean Sent Len'] = pos['sent'].str.len().mean()
    df_tmp.at[symbol, 'Pos Sent Count'] = pos.shape[0]
    pos = pos[['compound', 'neg', 'neu', 'pos']]
    df_tmp = pd.concat([pd.DataFrame(pos.mean(), columns=[symbol]).T.add_prefix('Pos Sent Signal '), df_tmp], axis=1)

    #pos sentiment
    df_tmp.at[symbol, 'Neg Mean Sent Len'] = neg['sent'].str.len().mean()
    df_tmp.at[symbol, 'Neg Sent Count'] = neg.shape[0]
    neg = neg[['compound', 'neg', 'neu', 'pos']]
    df_tmp = pd.concat([pd.DataFrame(neg.mean(), columns=[symbol]).T.add_prefix('Neg Sent Signal '), df_tmp], axis=1)
    
    return df_tmp

In [289]:
#add sentiment features
df_sentiment_features = pd.DataFrame()
counter = 0    

for x in df.index:
    try:
        counter += 1
        print('\n( ' + str(counter) + ' / ' + str(df.shape[0]) + ' ) ' + x)

        #check if raw data is available
        file_name = "./Data/" + x + ".htm"
        if Path(file_name).is_file():
            #load raw IPO filing
            with open(file_name, "r", encoding="utf-8") as file:
                html = file.read()
                soup = BeautifulSoup(html,"html5lib")
                text = soup.get_text(strip=True)

                #get sentiment
                df_sentiment = get_sentiment_df(text)
                df_sentiment_features = pd.concat([df_sentiment_features, add_sentiment_features(df_sentiment, x)], axis=0)
        else:
            print('no S-1 for ', x)
    except Exception as e:
        print(x, e)


( 1 / 951 ) WQNI
no S-1 for  WQNI

( 2 / 951 ) BBGI

( 3 / 951 ) UTSI
no S-1 for  UTSI

( 4 / 951 ) SLAB
no S-1 for  SLAB

( 5 / 951 ) WBSN
no S-1 for  WBSN

( 6 / 951 ) ALTH
no S-1 for  ALTH

( 7 / 951 ) MET
no S-1 for  MET

( 8 / 951 ) LPSN
no S-1 for  LPSN

( 9 / 951 ) HSTM
no S-1 for  HSTM

( 10 / 951 ) PXLW
no S-1 for  PXLW

( 11 / 951 ) CYH
no S-1 for  CYH

( 12 / 951 ) QBAK
no S-1 for  QBAK

( 13 / 951 ) CRL
no S-1 for  CRL

( 14 / 951 ) ACLS
no S-1 for  ACLS

( 15 / 951 ) SOHU
no S-1 for  SOHU

( 16 / 951 ) SRTI
no S-1 for  SRTI

( 17 / 951 ) PTIE
no S-1 for  PTIE

( 18 / 951 ) SPRT
no S-1 for  SPRT

( 19 / 951 ) SMTX

( 20 / 951 ) ARNA
no S-1 for  ARNA

( 21 / 951 ) ILMN
no S-1 for  ILMN

( 22 / 951 ) CAMT
no S-1 for  CAMT

( 23 / 951 ) EVC

( 24 / 951 ) DGEN
no S-1 for  DGEN

( 25 / 951 ) ERMS
no S-1 for  ERMS

( 26 / 951 ) LTRX
no S-1 for  LTRX

( 27 / 951 ) MDCO
no S-1 for  MDCO

( 28 / 951 ) EQIX
no S-1 for  EQIX

( 29 / 951 ) DRRX
no S-1 for  DRRX

( 30 / 951 ) MON

( 31


( 334 / 951 ) WSR
no S-1 for  WSR

( 335 / 951 ) COR

( 336 / 951 ) CCIH
no S-1 for  CCIH

( 337 / 951 ) STND

( 338 / 951 ) DQ
no S-1 for  DQ

( 339 / 951 ) TOWR

( 340 / 951 ) BBRG

( 341 / 951 ) VRA

( 342 / 951 ) GLTR

( 343 / 951 ) PACB

( 344 / 951 ) PSLV
no S-1 for  PSLV

( 345 / 951 ) SODA
no S-1 for  SODA

( 346 / 951 ) CMRE
no S-1 for  CMRE

( 347 / 951 ) PRMW

( 348 / 951 ) NOAH
no S-1 for  NOAH

( 349 / 951 ) LPLA

( 350 / 951 ) GM

( 351 / 951 ) WITE

( 352 / 951 ) TRGP

( 353 / 951 ) GCAP

( 354 / 951 ) RNET

( 355 / 951 ) WD

( 356 / 951 ) OSN
no S-1 for  OSN

( 357 / 951 ) AAT

( 358 / 951 ) BKU

( 359 / 951 ) INXN
no S-1 for  INXN

( 360 / 951 ) BCDS
no S-1 for  BCDS

( 361 / 951 ) NAGS

( 362 / 951 ) NPTN
no S-1 for  NPTN

( 363 / 951 ) PCRX

( 364 / 951 ) ECYT

( 365 / 951 ) BGMD

( 366 / 951 ) GEVO

( 367 / 951 ) INN

( 368 / 951 ) ACRX

( 369 / 951 ) CRUD

( 370 / 951 ) HCA
no S-1 for  HCA

( 371 / 951 ) MX

( 372 / 951 ) CSOD

( 373 / 951 ) SREV

( 374 / 951 ) AP


( 685 / 951 ) FGEN

( 686 / 951 ) LMRK

( 687 / 951 ) SKIS

( 688 / 951 ) HSGX

( 689 / 951 ) LC
no S-1 for  LC

( 690 / 951 ) NEWR

( 691 / 951 ) WK

( 692 / 951 ) ONDK

( 693 / 951 ) BLCM

( 694 / 951 ) ZSAN

( 695 / 951 ) ASND
no S-1 for  ASND

( 696 / 951 ) TCON

( 697 / 951 ) AVGR

( 698 / 951 ) SHAK

( 699 / 951 ) ONCE

( 700 / 951 ) DEA

( 701 / 951 ) INOV

( 702 / 951 ) BLPH

( 703 / 951 ) AJX

( 704 / 951 ) TONS

( 705 / 951 ) SMMT
no S-1 for  SMMT

( 706 / 951 ) NCOM
no S-1 for  NCOM

( 707 / 951 ) STDY

( 708 / 951 ) TANH
no S-1 for  TANH

( 709 / 951 ) JCAP

( 710 / 951 ) KRNT
no S-1 for  KRNT

( 711 / 951 ) ADRO

( 712 / 951 ) VIRT

( 713 / 951 ) EVA

( 714 / 951 ) BPMC

( 715 / 951 ) OPGN

( 716 / 951 ) HTGM

( 717 / 951 ) ADAP
no S-1 for  ADAP

( 718 / 951 ) TEGP

( 719 / 951 ) COLL

( 720 / 951 ) BOJA

( 721 / 951 ) RKDA

( 722 / 951 ) WING

( 723 / 951 ) CYAD
no S-1 for  CYAD

( 724 / 951 ) GNRT

( 725 / 951 ) GKOS

( 726 / 951 ) MCRN

( 727 / 951 ) LNTH

( 728 / 951 

In [290]:
df_sentiment_features

Unnamed: 0,Neg Sent Signal compound,Neg Sent Signal neg,Neg Sent Signal neu,Neg Sent Signal pos,Pos Sent Signal compound,Pos Sent Signal neg,Pos Sent Signal neu,Pos Sent Signal pos,Mean Sent Len,Sent Count,Pos Mean Sent Len,Pos Sent Count,Neg Mean Sent Len,Neg Sent Count
BBGI,-0.354494,0.184306,0.759959,0.055755,0.782736,0.008043,0.657511,0.334383,305.887755,980.0,203.829787,47.0,179.081633,49.0
SMTX,-0.370009,0.224787,0.728090,0.047135,0.741109,0.007967,0.624322,0.367733,299.065359,1836.0,133.400000,90.0,137.853933,89.0
EVC,-0.366012,0.268825,0.692212,0.038963,0.658900,0.011837,0.586438,0.401744,229.678974,3196.0,101.631250,160.0,101.668750,160.0
MON,-0.406499,0.270042,0.656162,0.073803,0.659116,0.012580,0.565909,0.421510,206.227928,2843.0,106.685315,143.0,135.985915,142.0
OIS,-0.340341,0.251949,0.688312,0.059745,0.674713,0.016340,0.592949,0.390692,266.261371,3122.0,115.423077,156.0,117.025478,157.0
MDTH,-0.512816,0.291607,0.664713,0.043656,0.714346,0.013092,0.645708,0.341200,250.950658,2432.0,139.558333,120.0,133.508197,122.0
WTW,-0.269083,0.235030,0.684465,0.080564,0.753865,0.006440,0.595780,0.397770,290.149802,2016.0,128.910000,100.0,137.603960,101.0
SYNA,-0.478926,0.220971,0.723232,0.055826,0.716404,0.007265,0.661059,0.331676,307.769847,1373.0,152.088235,68.0,173.594203,69.0
MANT,-0.384308,0.275740,0.677973,0.046315,0.790352,0.007205,0.651726,0.341096,279.299315,1460.0,170.315068,73.0,140.328767,73.0
JBLU,-0.555012,0.335000,0.624133,0.040840,0.754570,0.018684,0.654632,0.326671,270.488449,1515.0,171.407895,76.0,157.800000,75.0


In [291]:
df_sentiment_features.to_csv('/Users/Void/Desktop/df_sentiment_features.csv', index=False)

In [292]:
df1 = pd.concat([standardize(df_sentiment_features), df], axis=1).dropna()

In [293]:
df1

Unnamed: 0,Neg Sent Signal compound,Neg Sent Signal neg,Neg Sent Signal neu,Neg Sent Signal pos,Pos Sent Signal compound,Pos Sent Signal neg,Pos Sent Signal neu,Pos Sent Signal pos,Mean Sent Len,Sent Count,...,MKT12,Name Words,Name Length,Offer Amount,Price,Shares,1D,1W,1M,3M
AACC,0.750307,0.188291,-0.188526,-0.088264,0.824884,-0.047711,-2.180596,2.130875,-0.229744,-0.879370,...,0,0.950208,0.958673,-0.207315,-0.098813,-0.256559,0.002401,0.026411,0.073229,0.121248
AAT,0.393542,-0.595995,0.415479,0.748544,1.031044,-0.191374,0.296492,-0.236425,0.468211,1.945344,...,0,0.950208,0.701675,0.305495,0.565817,0.370542,-0.017185,-0.010683,-0.019508,-0.008825
ABR,2.255889,-0.648139,0.047402,1.835012,0.378669,0.187877,0.194437,-0.238990,0.029447,0.494261,...,0,0.950208,0.059182,-0.184455,0.505396,-0.278814,0.022444,-0.030923,-0.022444,-0.003990
ABTX,0.371174,-0.206152,-0.139155,0.980488,0.387190,0.206714,-2.462221,2.337360,-0.126546,0.458952,...,0,-0.116672,0.701675,-0.263654,0.626238,-0.391157,0.027938,-0.008869,0.054989,0.004878
ACAD,-0.056967,0.067109,-0.039826,-0.099553,-0.135393,-0.211326,0.163203,-0.102851,-0.825017,-0.743829,...,0,-0.116672,0.573177,-0.285564,-1.065549,-0.317740,-0.075269,-0.112903,-0.134409,-0.233871
ACFC,0.552216,0.287444,-0.384181,0.111178,-3.876276,0.948891,-3.870263,3.507450,-0.742610,1.072872,...,0,0.950208,0.701675,-0.259641,-0.703023,-0.292686,0.036060,0.093228,0.130167,0.218118
ACIA,0.506249,-0.451198,0.276213,0.658924,0.380346,0.654197,0.595426,-0.752431,0.268327,-0.249504,...,0,-0.116672,0.701675,-0.208992,0.867922,-0.333035,0.068621,-0.011379,0.282759,1.258621
ACMR,0.093448,2.153342,-1.721236,-2.118655,-0.841036,0.580083,1.010444,-1.138132,0.273143,-0.125353,...,0,-0.116672,-0.454812,-0.312169,-1.234727,-0.409511,-0.264382,-0.308446,-0.168911,-0.351285
ACOR,-0.231773,-0.427949,0.367901,0.350238,-0.849538,1.442002,1.728598,-2.063863,-0.463895,-1.800819,...,0,-0.116672,0.187681,-0.287800,-1.186391,-0.302445,0.095624,0.051864,0.009724,-0.181524
ACRX,-1.379747,1.162361,-1.886861,1.301824,-2.114880,0.691616,-1.246354,1.025535,-0.418076,-0.788250,...,0,-0.116672,0.573177,-0.279975,-1.307233,-0.225969,-0.094000,-0.238000,-0.362000,-0.342000


In [294]:
df1.to_csv('/Users/Void/Desktop/df1.csv', index=False)

In [300]:
#now run ML flow with sentiment features
run_ml_flow(df1)

Unnamed: 0,1D,1W,1M,3M
AUC,0.531046,0.526413,0.496314,0.50841
f1,0.606061,0.610778,0.638298,0.677083
log loss,0.707982,0.740833,0.692042,0.667049


### Load Raw IPO

In [267]:
ipo = {}

for x in glob.glob("./Data/*.htm")[:5]:
    with open(x, "r", encoding="utf-8") as file:
        html = file.read()
        soup = BeautifulSoup(html,"html5lib")
        text = soup.get_text(strip=True)
        ipo[x.split('\\')[1].split('.')[0]] = text

In [268]:
ipo.keys()

dict_keys(['AAC', 'AACC', 'AACQU', 'AAHC', 'AAOI'])

# Summarization

In [301]:
import requests
import urllib.request

#response = urllib.request.urlopen('http://rare-technologies.com/the_matrix_synopsis.txt')
response = urllib.request.urlopen('https://www.sec.gov/Archives/edgar/data/1467623/000119312518055809/d451946ds1.htm')
html = response.read()
soup = BeautifulSoup(html,"html5lib")
text = soup.get_text(strip=True)

print('Summary:')
print(summarize(text, word_count=1000))

print('\nKeywords:')
print(keywords(text, words=100))

Summary:
These risks include, but are not limited to, the following:•Our business depends on our ability to retain and upgrade paying users, and any decline in renewals or upgrades could adversely affect our future results of operations.•Our future growth could be harmed if we fail to attract new users or convert registered users to paying users.•Our revenue growth rate has declined in recent periods and may continue to slow in the future.•We have a history of net losses, we anticipate increasing expenses in the future, and we may not be able to achieve or maintain profitability.•Our business could be damaged, and we could be subject to liability if there is any unauthorized access to our data or our users’ content, including through privacy and data security breaches.•Our business could be harmed by any significant disruption of service on our platform or loss of content.•We generate revenue from sales of subscriptions to our platform, and any decline in demand for our platform or for