In [75]:
import string
from collections import Counter
import os
import pickle

import numpy as np
import pandas as pd
import spacy

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

#
# Domain specific libraries to handle text
#
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline

from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn import metrics

%matplotlib inline
plt.rcParams['figure.figsize'] = (12.0, 10.0) # set default size of plots

In [35]:
# Directory holding the two Excel exports. It can be overridden through the
# FUND_ARTICLES_DATA_DIR environment variable so the notebook is not tied to
# one machine; the default is the original location.
DATA_DIR = os.environ.get(
    'FUND_ARTICLES_DATA_DIR',
    '/Users/victor/Documents/GitHub/SP-Global-Ops-Consulting/data',
)

unhealthy = pd.read_excel(os.path.join(DATA_DIR, 'Fund Articles_Unhealthy_New.xlsx'))
# The 'Unnamed: 1' column of the healthy sheet is not used and is dropped on load.
healthy = pd.read_excel(os.path.join(DATA_DIR, 'Funds Articles_Healthy_New.xlsx')).drop('Unnamed: 1', axis=1)


In [36]:
# Give both frames a common 'article' column: the text is copied under the new
# name first, then the sheet-specific header is removed.
healthy = healthy.assign(article=healthy['Healthy Fund Articles']).drop('Healthy Fund Articles', axis=1)
unhealthy = unhealthy.assign(article=unhealthy['Unhealthy Fund Articles']).drop('Unhealthy Fund Articles', axis=1)


In [37]:
# Binary target: 1 for the healthy-fund articles, 0 for the unhealthy-fund ones.
for frame, flag in ((healthy, 1), (unhealthy, 0)):
    frame['label'] = flag

In [38]:
# Stack both classes into one frame with a fresh 0..n-1 index.
df = pd.concat([healthy, unhealthy], axis=0, ignore_index=True)

In [39]:
# Quick look at the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,article,label
0,Mitel Returns To Profit In The Second Quarter ...,1
1,TriplePoint Venture Growth BDC Corp. Announces...,1
2,"@ M&A wrap: Sycamore, Staples, Leonardo DiCapr...",1
3,GTIS Partners LP Broadens Business Model; Anno...,1
4,@ Cinven to buy Dublin-headquartered Axa Life ...,1


In [40]:
# Target vector: the 0/1 label of every article.
y = df.loc[:, 'label']

In [41]:
# Hold out 33% of the articles for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['article'],
    y,
    test_size=0.33,
    random_state=53,
)

Preprocessing

In [54]:
# Tokens ignored by the vectorizers: NLTK's English stop words, every ASCII
# punctuation character, and the `` / '' tokens
# (presumably the quote tokens produced by word_tokenize -- confirm).
punctuation = [char for char in string.punctuation]
stop_words = list(stopwords.words("english"))
stop_words.extend(punctuation)
stop_words.extend(['``', "''"])

In [55]:
def lemma_tokenizer(text):
    """Tokenize ``text`` and return the WordNet lemma of each token.

    Apostrophes are replaced by spaces before the text is tokenized.
    Returns a list of strings, one per token, in document order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    cleaned = text.replace("'", " ")
    lemmas = []
    for word in word_tokenize(cleaned):
        lemmas.append(lemmatize(word))
    return lemmas

In [56]:
def stem_tokenizer(text):
    """Tokenize ``text`` and return the Porter stem of each token.

    Apostrophes are replaced by spaces before the text is tokenized.
    Returns a list of strings, one per token, in document order.
    """
    stem = PorterStemmer().stem
    cleaned = text.replace("'", " ")
    return list(map(stem, word_tokenize(cleaned)))

Word Count Representation

In [57]:
# Bag-of-words counts. The vocabulary is learned on the training articles
# only and then reused unchanged to encode the test articles.
countVectorizer = CountVectorizer(
    input='content',
    tokenizer=lemma_tokenizer,
    stop_words=stop_words,
)
count_train = countVectorizer.fit_transform(X_train.values)
count_test = countVectorizer.transform(X_test)

TFIDF Representation

In [58]:
# TF-IDF weights with the same tokenizer and stop-word list as the count
# representation. The vectorizer is fitted on the training articles only.
tfidfVectorizer = TfidfVectorizer(
    input="content",
    tokenizer=lemma_tokenizer,
    stop_words=stop_words,
)
tfidf_train = tfidfVectorizer.fit_transform(X_train.values)
tfidf_test = tfidfVectorizer.transform(X_test)

# Machine Learning

Naive Bayes

In [59]:
# Multinomial Naive Bayes on the raw word counts.
# alpha (the smoothing strength) is passed by keyword: scikit-learn >= 1.0
# makes estimator constructor parameters keyword-only, so MultinomialNB(alpha)
# raises a TypeError there.
alpha=1
model=MultinomialNB(alpha=alpha)
model.fit(count_train,y_train)
Y_pred=model.predict(count_test)
# Accuracy: share of test articles whose predicted label equals the true label.
np.average(Y_pred==y_test)

0.83569217160548426

In [63]:
# Multinomial Naive Bayes on the TF-IDF representation.
# alpha (the smoothing strength) is passed by keyword: scikit-learn >= 1.0
# makes estimator constructor parameters keyword-only, so MultinomialNB(alpha)
# raises a TypeError there.
alpha=1
model=MultinomialNB(alpha=alpha)
model.fit(tfidf_train,y_train)
Y_pred=model.predict(tfidf_test)
# Accuracy: share of test articles whose predicted label equals the true label.
np.average(Y_pred==y_test)

0.73109243697478987

Logistic Regression

In [66]:
# Logistic regression on the word-count features; fit() returns the estimator
# itself, so construction and training are chained.
clf = LogisticRegression(C=1.0).fit(count_train, y_train)
Y_pred = clf.predict(count_test)
# Test accuracy (fraction of matching labels).
np.average(Y_pred == y_test)

0.9230429013710747

In [67]:
# Logistic regression on the TF-IDF features; fit() returns the estimator
# itself, so construction and training are chained.
clf = LogisticRegression(C=1.0).fit(tfidf_train, y_train)
Y_pred = clf.predict(tfidf_test)
# Test accuracy (fraction of matching labels).
np.average(Y_pred == y_test)

0.91862007961079173

SVD-SVM

In [73]:

# Reduce the TF-IDF matrix to 200 components. TruncatedSVD's default solver is
# randomized, so the seed is pinned (same value as the train/test split) to
# make the projection reproducible across kernel restarts.
svd = TruncatedSVD(n_components=200, random_state=53)
svd.fit(tfidf_train)
xtrain_svd = svd.transform(tfidf_train)
xtest_svd = svd.transform(tfidf_test)

# Standardize the SVD features; the scaling statistics come from the training
# set only. The scaled arrays get new names so the unscaled ones stay available.
scl = preprocessing.StandardScaler()
xtrain_svd_scl = scl.fit_transform(xtrain_svd)
xtest_svd_scl = scl.transform(xtest_svd)

In [77]:
# Support-vector classifier on the scaled SVD features; fit() returns the
# estimator itself, so construction and training are chained.
clf = SVC(C=1.0).fit(xtrain_svd_scl, y_train)
Y_pred = clf.predict(xtest_svd_scl)
# Test accuracy (fraction of matching labels).
np.average(Y_pred == y_test)

0.91729323308270672

## Optimization of Classifier

`GridSearchCV` runs a cross-validation for every combination of parameters in the associated dictionaries.
This search takes a long time to run.

In [None]:
from sklearn.model_selection import GridSearchCV

# First grid (kept for reference; the search below uses parameters2).
parameters1 = [{'C': [1, 10, 100], 'kernel': ['linear']},
              {'C': [1, 10, 100], 'kernel': ['rbf'], 'gamma': [0.1, 0.3, 0.5, 0.7, 0.9]}]
# Author's recorded best result for parameters1: C=10, gamma=0.9, kernel='rbf'

# Second grid, with narrower ranges for C and gamma plus a polynomial kernel.
parameters2 = [{'C': [1,3,5], 'kernel': ['linear']},
        {'C': [2, 5, 7], 'kernel': ['rbf'], 'gamma': [0.85, 0.9, 0.95]},
        {'C': [1,3,5], 'kernel': ['poly'], 'degree': [2,3,4,5,6]}]
# Author's recorded best result for parameters2: C=2, gamma=0.85, kernel='rbf'

# The estimator is an explicit, fresh SVC rather than whatever `clf` currently
# holds: `clf` is rebound to different models in earlier cells, and the
# kernel/gamma/degree grid is only valid for an SVC.
grid_search = GridSearchCV(estimator = SVC(),
                           param_grid = parameters2,
                           scoring = 'accuracy',
                           cv = 10,        # 10-fold cross-validation per candidate
                           n_jobs = -1)    # use all available cores
grid_search = grid_search.fit(tfidf_train, y_train)

In [None]:
# Best mean cross-validated accuracy found by the grid search.
best_accuracy = grid_search.best_score_

In [None]:
# Parameter combination that gave the best cross-validated score in the grid search.
best_parameters = grid_search.best_params_

## NER with spacy

The following code builds a matrix in which each row gives the number of entities of each NER type ('PERSON', 'NORP', ...) found in the corresponding document.

You will need to download the spaCy model 'en_core_web_sm' (for example with `python -m spacy download en_core_web_sm`).
documentation : https://spacy.io/usage/linguistic-features#section-named-entities

In [1]:
import spacy

# English spaCy pipeline; its named-entity annotations (doc.ents) are used below.
nlp = spacy.load('en_core_web_sm')

# Entity labels that are kept as features.
NER_needed = [
    'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC',
    'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE',
    'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY',
    'ORDINAL', 'CARDINAL',
]

In [None]:
# Turn every training article into a space-separated string of its entity
# labels (e.g. "ORG ORG DATE"), then count the labels per article.
# Only labels listed in NER_needed are kept, exactly as in the X_test cell,
# so train and test features are built by the same rule.
# nlp.pipe streams the texts through spaCy in batches instead of calling nlp()
# once per article; the documents come back in input order.
corpus_train_NER=[]
for i, doc in enumerate(nlp.pipe(X_train)):

    # Progress indicator: print the position every 100 articles.
    if i%100 == 0:
        print (i)

    tokens_NER = [ent.label_ for ent in doc.ents if ent.label_ in NER_needed]
    corpus_train_NER.append(' '.join(tokens_NER))

# Fit the label vocabulary on the training corpus; cv2 is reused for X_test.
cv2 =CountVectorizer()
count_train_NER = cv2.fit_transform(corpus_train_NER).toarray()

Concretely, the for loop replaces each document with the sequence of NER labels of the entities found in it. We then use `CountVectorizer` to count the number of occurrences of each label.
You can also use `TfidfVectorizer` instead.

This step takes a lot of time. I tried to improve it by counting the labels directly in a loop, but finding the NER label for every word in the documents still takes a lot of time.

The next cell applies the same processing to X_test:

In [None]:
# Turn every test article into a space-separated string of its entity labels,
# keeping only the labels listed in NER_needed.
# nlp.pipe streams the texts through spaCy in batches instead of calling nlp()
# once per article; the documents come back in input order.
corpus_test_NER=[]
for i, doc in enumerate(nlp.pipe(X_test)):
    # Progress indicator: print the position every 100 articles.
    if i%100 == 0:
        print (i)
    tokens_NER = [ent.label_ for ent in doc.ents if ent.label_ in NER_needed]
    corpus_test_NER.append(' '.join(tokens_NER))

# Encode with the vocabulary already fitted on the training corpus (cv2).
count_test_NER = cv2.transform(corpus_test_NER).toarray()


Next is an optional scaler, if needed:

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the NER count features. The mean and variance are learned on the
# training matrix only and then applied to both matrices.
sc_X = StandardScaler().fit(count_train_NER)
count_scaled_train_NER = sc_X.transform(count_train_NER)
count_scaled_test_NER = sc_X.transform(count_test_NER)