## Classifier with NER

### Import data and libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import AgglomerativeClustering
import nltk



# Load the two article spreadsheets (unhealthy first, then healthy).
unhealthy = pd.read_excel('C:/Users/Alu/Desktop/S and P Global/Fund Articles_Unhealthy_New.xlsx')
healthy = pd.read_excel('C:/Users/Alu/Desktop/S and P Global/Funds Articles_Healthy_New.xlsx')
healthy = healthy.drop(columns=['Unnamed: 1'])

# Give both frames a common 'article' column: the sheet-specific text column
# is removed and its values are re-attached under the shared name.
healthy['article'] = healthy.pop('Healthy Fund Articles')
unhealthy['article'] = unhealthy.pop('Unhealthy Fund Articles')

# Dependent variable: 1 for healthy-fund articles, 0 for unhealthy-fund ones.
for frame, flag in ((healthy, 1), (unhealthy, 0)):
    frame['label'] = flag

# Stack healthy rows on top of unhealthy rows with a fresh 0..n-1 index.
df = pd.concat([healthy, unhealthy], axis=0, sort=False, ignore_index=True)

# Dependent-variable vector, row-aligned with df.
y = df['label']



In [None]:
# Hold out 10% of the articles (and their labels) for testing; the fixed
# random_state makes the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['article'],
    y,
    test_size=0.10,
    random_state=0,
)

### Create corpus of NER

If you already have the saved corpus files (`corpus_train_NER.npy` and `corpus_test_NER.npy`), skip this part.

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
# Entity labels that are kept as features; any other label is dropped.
NER_needed=['PERSON','NORP','FAC','ORG','GPE','LOC','PRODUCT','EVENT','WORK_OF_ART','LAW','LANGUAGE','DATE','TIME','PERCENT','MONEY','QUANTITY','ORDINAL','CARDINAL']


def _ner_label_corpus(texts, progress_every=100):
    """Return one space-joined string of entity labels per text.

    Each text is run through the spaCy pipeline ``nlp``. The labels of the
    entities it finds are kept, in document order, when they are listed in
    ``NER_needed``, and joined with single spaces. A text with no retained
    entity yields an empty string.

    The same filter is applied to every corpus built with this helper, so
    the train and test corpora are guaranteed to be built the same way.

    Args:
        texts: Iterable of article strings.
        progress_every: Print the running index every this many texts.

    Returns:
        A NumPy array holding one label string per input text.
    """
    allowed = frozenset(NER_needed)
    corpus = []
    for i, text in enumerate(texts):
        if i % progress_every == 0:
            print(i)
        labels = [ent.label_ for ent in nlp(text) if ent.label_ in allowed]
        corpus.append(' '.join(labels))
    return np.array(corpus)


corpus_train_NER = _ner_label_corpus(X_train)
# np.save appends '.npy', so this writes 'corpus_train_NER.npy'.
np.save('corpus_train_NER', corpus_train_NER)

corpus_test_NER = _ner_label_corpus(X_test)
# Likewise written to 'corpus_test_NER.npy'.
np.save('corpus_test_NER', corpus_test_NER)


### Create Bag of Words

In [None]:
# np.save appends '.npy' to a file name that lacks it, whereas np.load opens
# the path exactly as written, so the extension has to be spelled out here.
corpus_train_NER = np.load('corpus_train_NER.npy')
corpus_test_NER = np.load('corpus_test_NER.npy')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

# Bag of words over the entity-label strings. The vocabulary is learned from
# the training corpus only; the test corpus is mapped onto that same
# vocabulary so both matrices share the same columns in the same order.
cv2 = CountVectorizer()
count_train_NER = cv2.fit_transform(corpus_train_NER).toarray()
count_test_NER = cv2.transform(corpus_test_NER).toarray()

# Standardise each count column using statistics estimated on the training
# matrix, then apply the same scaling to the test matrix.
sc_X = StandardScaler()
count_scaled_train_NER = sc_X.fit_transform(count_train_NER)
count_scaled_test_NER = sc_X.transform(count_test_NER)


### Fit Model

In [None]:
# Linear-kernel SVM trained on the scaled entity-label counts.
classifierSVC_NER = SVC(kernel='linear')
classifierSVC_NER.fit(count_scaled_train_NER, y_train)
y_predSVC_NER = classifierSVC_NER.predict(count_scaled_test_NER)
# Fraction of test articles classified correctly. Stored and printed because
# a bare expression in the middle of a cell is never displayed.
accuracySVC_NER = np.mean(y_predSVC_NER == y_test)
print('SVC accuracy:', accuracySVC_NER)

# Random forest on the same features. random_state is fixed, as it is for
# the train/test split, so repeated runs give the same model.
classifierRF_NER_count = RandomForestClassifier(
    n_estimators=30,
    criterion='entropy',
    random_state=0,
)
classifierRF_NER_count.fit(count_scaled_train_NER, y_train)
y_predRF_NER_count = classifierRF_NER_count.predict(count_scaled_test_NER)
accuracyRF_NER_count = np.mean(y_predRF_NER_count == y_test)
print('Random forest accuracy:', accuracyRF_NER_count)

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_predRF_NER_count)
print(cm)

### Grid

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates for the SVC: an RBF-kernel grid and a
# polynomial-kernel grid.
parameters = [{'C': [1, 10, 100], 'kernel': ['rbf'], 'gamma': [0.1, 0.3, 0.5, 0.7, 0.9]},
              {'C': [1,3,5], 'kernel': ['poly'], 'degree': [2,3,4,5,6]}]
# Random-forest candidates are kept under their own name: these keys are not
# SVC parameters, so this grid may only be searched with
# classifierRF_NER_count as the estimator.
parameters_RF = [{'n_estimators': [10, 20, 30], 'criterion': ['gini','entropy']}]

# 10-fold cross-validated accuracy search over the SVC grid, using all cores.
grid_search = GridSearchCV(estimator=classifierSVC_NER,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10,
                           n_jobs=-1)
grid_search.fit(count_scaled_train_NER, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

