In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from simpletransformers.classification import ClassificationModel, ClassificationArgs


In [2]:

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import string
import re
from collections import Counter

In [3]:
data_path = './data/'
df = pd.read_csv(data_path + 'FinancialPB.csv',header=None ,encoding='ISO-8859-1')
df.columns =['sentiment' ,'statement']
df.head()

Unnamed: 0,sentiment,statement
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [4]:
df.drop_duplicates(subset=['statement'],keep='first',inplace=True)
df['sentiment_encoded'] = LabelEncoder().fit_transform(df['sentiment'])


In [5]:
#remove punctuations
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

def remove_stopwords(text, STOPWORDS):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

def stem_words(text, stemmer):
    return " ".join([stemmer.stem(word) for word in text.split()])

def remove_freqwords(text, FREQWORDS):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

def lemmatize_words(text, lemmatizer, wordnet_map ):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

def clean_review(text):
    clean_text = []
    for w in word_tokenize(text):
        if w.lower() not in stop:
            pos = pos_tag([w])
            new_w = lemmatizer.lemmatize(w, pos=get_simple_pos(pos[0][1]))
            clean_text.append(new_w)
    return clean_text

def join_text(text):
    return " ".join(text)

In [6]:
df['statement'] = df['statement'].apply(lambda x: remove_punct(x))
STOPWORDS = set(", ".join(stopwords.words('english')))
df['non_stop_statement'] = df['statement'].apply(lambda x: remove_stopwords(x, STOPWORDS))

In [7]:
df['stemmed_statement'] = df['non_stop_statement'].apply(lambda x: stem_words(x,PorterStemmer()))
#remove frequent words 
fre_count = Counter()
for phrase in df["stemmed_statement"].values:
    for word in phrase.split():
        fre_count[word] += 1
        
fre_count.most_common(10)

[('the', 6059),
 ('of', 3199),
 ('in', 2747),
 ('and', 2587),
 ('to', 2493),
 ('eur', 1310),
 ('for', 1150),
 ('it', 999),
 ('compani', 967),
 ('is', 920)]

In [8]:
fre_words = set([word for (word, count) in fre_count.most_common(10)])
df["non_freq_statement"] = df["stemmed_statement"].apply(lambda x: remove_freqwords(x, fre_words))



In [9]:
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
df["lemmatized_statesment"] = df["non_freq_statement"].apply(lambda x: lemmatize_words(x, WordNetLemmatizer(), wordnet_map))
df.head()

Unnamed: 0,sentiment,statement,sentiment_encoded,non_stop_statement,stemmed_statement,non_freq_statement,lemmatized_statesment
0,neutral,According to Gran the company has no plans to...,1,According to Gran the company has no plans to ...,accord to gran the compani ha no plan to move ...,accord gran ha no plan move all product russia...,accord gran ha no plan move all product russia...
1,neutral,Technopolis plans to develop in stages an area...,1,Technopolis plans to develop in stages an area...,technopoli plan to develop in stage an area of...,technopoli plan develop stage an area no less ...,technopoli plan develop stage an area no less ...
2,negative,The international electronic industry company ...,0,The international electronic industry company ...,the intern electron industri compani elcoteq h...,intern electron industri elcoteq ha laid off t...,intern electron industri elcoteq ha lay off te...
3,positive,With the new production plant the company woul...,2,With the new production plant the company woul...,with the new product plant the compani would i...,with new product plant would increas capac mee...,with new product plant would increas capac mee...
4,positive,According to the company s updated strategy fo...,2,According to the company updated strategy for ...,accord to the compani updat strategi for the y...,accord updat strategi year baswar target longt...,accord updat strategi year baswar target longt...


In [10]:
df1 = df.drop(['sentiment','statement','non_stop_statement','stemmed_statement','non_freq_statement'],axis =1)
df1.head()

Unnamed: 0,sentiment_encoded,lemmatized_statesment
0,1,accord gran ha no plan move all product russia...
1,1,technopoli plan develop stage an area no less ...
2,0,intern electron industri elcoteq ha lay off te...
3,2,with new product plant would increas capac mee...
4,2,accord updat strategi year baswar target longt...


In [11]:
df2 = df.drop(['sentiment','lemmatized_statesment','non_stop_statement','stemmed_statement','non_freq_statement'],axis =1)
df2.head()

Unnamed: 0,statement,sentiment_encoded
0,According to Gran the company has no plans to...,1
1,Technopolis plans to develop in stages an area...,1
2,The international electronic industry company ...,0
3,With the new production plant the company woul...,2
4,According to the company s updated strategy fo...,2


In [12]:

X_train, x_test = train_test_split(df2,test_size = 0.2)

In [13]:
bert_model = ClassificationModel('bert', 'bert-base-uncased', num_labels=3, args={'reprocess_input_data': True, 'overwrite_output_dir': True},use_cuda=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [14]:
train_df = pd.DataFrame({
    'text': X_train['statement'].replace(r'\n', ' ', regex=True),
    'label': X_train['sentiment_encoded']
})

test_df = pd.DataFrame({
    'text': x_test['statement'].replace(r'\n', ' ', regex=True),
    'label': x_test['sentiment_encoded']
})

In [15]:
train_df.head()

Unnamed: 0,text,label
289,The company expects its net sales in the first...,2
1624,The contractor of the shopping center China S...,1
796,Stonesoft sees great promise in the future of...,2
1887,We are happy to be working with Rapala to enc...,2
1544,In the video above Marimekko s design manager ...,1


In [16]:
bert_model.train_model(train_df)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=3870.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=484.0, style=ProgressStyle(des…





(484, 0.5138317020379068)

In [17]:
result, model_outputs, _ = bert_model.eval_model(test_df)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=968.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=121.0, style=ProgressStyle(descr…




In [18]:
print(result,model_outputs)


{'mcc': 0.713797516156028, 'eval_loss': 0.4409747516105244} [[-2.06378794  3.84194374 -0.81492472]
 [ 2.38291192 -0.01707289 -1.23327172]
 [-2.85778904 -0.55751455  2.67488575]
 ...
 [-1.98733783 -1.34787297  2.84714603]
 [-2.90120864  3.05218101  0.53292215]
 [-2.39053321  3.90761876 -0.61282998]]


In [19]:
pred = []
for result_i in model_outputs:
    pred.append(np.argmax(result_i))


In [20]:
flag = test_df['label'].tolist()

In [21]:
len(test_df)

968

In [22]:
print(classification_report(flag,pred,target_names=['positive','neutral','negative']))


              precision    recall  f1-score   support

    positive       0.79      0.87      0.83       126
     neutral       0.89      0.85      0.87       567
    negative       0.76      0.79      0.78       275

    accuracy                           0.84       968
   macro avg       0.81      0.84      0.82       968
weighted avg       0.84      0.84      0.84       968



In [23]:
roberta_model = ClassificationModel('roberta', 'roberta-base', \
                                   num_labels=3, \
                                   args={'reprocess_input_data': True, 'overwrite_output_dir': True},use_cuda=False)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [24]:
train_df.head()

Unnamed: 0,text,label
289,The company expects its net sales in the first...,2
1624,The contractor of the shopping center China S...,1
796,Stonesoft sees great promise in the future of...,2
1887,We are happy to be working with Rapala to enc...,2
1544,In the video above Marimekko s design manager ...,1


In [25]:

roberta_model.train_model(train_df)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=3870.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=484.0, style=ProgressStyle(des…





(484, 0.800007300058857)

In [26]:
result_roberta, model_outputs_roberta, roberta_wrong = roberta_model.eval_model(test_df)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=968.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=121.0, style=ProgressStyle(descr…




In [27]:
print(result_roberta,model_outputs_roberta)

{'mcc': 0.49241451591954233, 'eval_loss': 0.6886919137736982} [[-1.93238139  2.84693956 -0.66488546]
 [-0.1780528  -0.94979125  0.98829556]
 [-0.20478076 -0.90307075  0.96687096]
 ...
 [-0.07242433 -1.16601181  1.08972108]
 [-1.78974569  2.15475821 -0.32540339]
 [-1.92918897  2.87153792 -0.71992493]]


In [28]:
pred_ro = []
for result_i in model_outputs_roberta:
    pred_ro.append(np.argmax(result_i))



In [29]:
print(classification_report(flag,pred_ro,target_names=['positive','neutral','negative']))



              precision    recall  f1-score   support

    positive       0.00      0.00      0.00       126
     neutral       0.88      0.81      0.84       567
    negative       0.51      0.82      0.63       275

    accuracy                           0.71       968
   macro avg       0.46      0.54      0.49       968
weighted avg       0.66      0.71      0.67       968



  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
xlnet_model = ClassificationModel('xlnet', 'xlnet-base-cased', \
                                   num_labels=3, \
                                   args={'reprocess_input_data': True, 'overwrite_output_dir': True},use_cuda=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1382015.0, style=ProgressStyle(descript…




In [68]:
xlnet_model.train_model(train_df)

HBox(children=(FloatProgress(value=0.0, max=3846.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=481.0, style=ProgressStyle(des…





(481, 0.7842893079387918)

In [72]:
result_xlnet, model_outputs_xlnet, _ = xlnet_model.eval_model(test_df)

HBox(children=(FloatProgress(value=0.0, max=962.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=121.0, style=ProgressStyle(descr…




In [73]:
print(result_xlnet,model_outputs_xlnet)

{'mcc': 0.5923441171380157, 'eval_loss': 0.5672913150600165} [[ 2.1099813  -1.85769784  0.11798   ]
 [-1.15270138 -1.2006073   2.59851885]
 [-2.15761399 -0.27710459  2.09578753]
 ...
 [-2.5214417   1.77293277 -0.55886918]
 [-1.82726216  1.23620474 -0.16409685]
 [-2.86081362  0.7579965   1.1962477 ]]


In [74]:
pred_xlnet = []
for result_i in model_outputs_xlnet:
    pred_xlm.append(np.argmax(result_i))
   
print(classification_report(flag,pred_xlnet,target_names=['positive','neutral','negative']))


              precision    recall  f1-score   support

    positive       0.67      0.73      0.70       124
     neutral       0.84      0.83      0.83       574
    negative       0.68      0.67      0.68       264

    accuracy                           0.77       962
   macro avg       0.73      0.74      0.74       962
weighted avg       0.77      0.77      0.77       962



In [90]:
accuracy_score(flag,pred_xlnet)

0.7733887733887734

In [31]:
def get_result_BERT(statement):
    result = bert_model.predict([statement])
    pos = np.where(result[1][0] == np.amax(result[1][0]))
    pos = int(pos[0])
    sentiment_dict = {0:'positive',1:'negative',2:'neutral'}
    print(sentiment_dict[pos])
    return

In [32]:
get_result_BERT('Apple supplier Foxconn warns that component shortages will last until 2022')

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


positive


In [35]:
get_result_BERT('Dollar heads for third weekly gain as payrolls data looms')

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


neutral


In [36]:
get_result_BERT('Japan stocks jump more than 1.5% as other major markets close for Good Friday')

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


neutral


In [37]:
get_result_BERT('Gartner Stock Gives Every Indication Of Being Modestly Overvalued')

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


negative


In [38]:
get_result_BERT('Air Canada dropped its takeover of vacation operator Transat AT Inc. ')

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


positive


In [40]:
get_result_BERT('Lazard Freres advised Brookfield Property’s special committee and gave a fair market value of US$14 to US$18.50 per unit')

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


negative
