# Queen's University Alternative Assets Fund
#### Learning and Development Session - Sentiment Analysis for Financial News


* Prepared by Robert Davis for QUAAF
* May 20, 2021
* To be run in Google Colab


## Setup

#### Load required packages


In [None]:
# Note that the simpletransformers installation requires a runtime restart
!pip install simpletransformers

import pandas as pd

### Load Data

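Load the data from the Kaggle dataset located at: https://www.kaggle.com/ankurzing/sentiment-analysis-for-financial-news. The cell below reads a CSV copy of it hosted in the QUAAF Learning-and-Development GitHub repository.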

In [None]:
import pandas as pd


In [None]:
# Location of the CSV copy of the dataset in the QUAAF Learning-and-Development repository
DATA_URL = 'https://raw.githubusercontent.com/QueensU-Alternative-Asset-Fund/Learning-and-Development/master/data/FinancialSentiment.csv'

# header=None: the first line of the file is read as data, not as column names
df = pd.read_csv(DATA_URL, header=None, encoding='latin-1')

### Inspect Data

In [None]:
# Inspect the dataframe exactly as loaded. It was read with header=None,
# so the columns have not been given names yet.
df

In [None]:
# Clean up the dataframe:
#   1. give the two columns titles,
#   2. remove any rows where the sentiment is neutral,
#   3. encode the labels: 'negative' -> 0, 'positive' -> 1.

df.columns = ['Sentiment', 'Text']

# Filter and renumber in one expression so `df` is a fresh frame rather than
# a filtered slice that is then mutated in place.
df = df[df['Sentiment'] != 'neutral'].reset_index(drop=True)

# Encode only the label column. A frame-wide replace would also rewrite any
# Text cell that happened to be exactly 'negative' or 'positive'.
# Values that are not in the mapping (including labels already encoded by a
# previous run of this cell) are passed through unchanged, so the cell can be
# re-run safely.
label_codes = {'negative': 0, 'positive': 1}
df['Sentiment'] = df['Sentiment'].map(lambda label: label_codes.get(label, label))


In [None]:
# Inspect the updated dataframe: columns are now named, neutral rows are gone
# and the sentiment labels have been converted to 0/1
df

In [None]:
# Pull out a single example by position and show its label and text

row = 400
selected = df.iloc[row]
sentiment = selected['Sentiment']
text = selected['Text']

print(
    f'Row selected = {row}',
    f'Sentiment: {sentiment}',
    f'Text: {text}',
    sep='\n',
)

### Data Quality
Note that most real-world datasets require significant data cleaning before they can be used.
This dataset has already been cleaned, which allows us to skip that step.
Data cleaning/engineering often represents upwards of 80% of the work required for this type of analysis.


### Train Test Split


In [None]:
# Hold out part of the data for validation

from sklearn.model_selection import train_test_split

# Features are the raw text, targets are the 0/1 sentiment labels.
# random_state is fixed so the same split is produced on every run.
X, y = df['Text'], df['Sentiment']

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

## Sentiment Analysis with Transformers

### Data Prep

In [None]:
# SimpleTransformers requires the input to be in one dataframe per split, but
# we currently have X and y stored separately.
# The columns are named 'text' and 'labels' because those are the headers
# SimpleTransformers looks for; with any other names it warns and falls back
# to treating column 0 as the text and column 1 as the label.
# Building each frame from the matching X/y Series keeps their shared index.

X_train_transformers = pd.DataFrame({'text': X_train, 'labels': y_train})

X_val_transformers = pd.DataFrame({'text': X_val, 'labels': y_val})


### Model Setup

In [None]:
import torch
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# Optional model configuration
# manual_seed fixes the training randomness so results are reproducible
# (same value as the random_state used for the train/validation split).
model_args = ClassificationArgs(
    num_train_epochs=5,
    sliding_window=False,
    overwrite_output_dir=True,
    save_model_every_epoch=False,
    max_seq_length=420,
    manual_seed=42,
)

# Create a ClassificationModel
# Use the GPU when the runtime has one and fall back to CPU otherwise;
# a hard-coded use_cuda=True raises an error on a CPU-only runtime.
model = ClassificationModel(
    "xlnet",
    "xlnet-base-cased",
    args=model_args,
    use_cuda=torch.cuda.is_available(),
)

### Train the model

In [None]:
# Train the model on the training dataframe, using the settings passed to the
# model via model_args (e.g. num_train_epochs=5)
model.train_model(X_train_transformers)

### Evaluate the model

In [None]:
# Import the metrics submodule explicitly: a bare `import sklearn` does not
# guarantee that `sklearn.metrics` is loaded, so `sklearn.metrics.f1_score`
# would only resolve if some earlier import had already pulled it in.
import sklearn.metrics

# Extra keyword arguments to eval_model are additional metrics to compute;
# here the F1 score is requested under the name 'f1'.
result, model_outputs, wrong_predictions = model.eval_model(
    X_val_transformers,
    f1=sklearn.metrics.f1_score,
)
result

### Predict a new sentence

In [None]:
# Example headlines to run through the trained model
input_text = [
    'APPLE SHARES DOWN ABOUT 6% PREMARKET AFTER CO FORECASTS Q4 PROFIT BELOW ESTIMATES',
    '$TSLA IS STUCK WITH OVER 10,000 CARS ON FACTORY HOLD, RESULTING IN A LOGISTICAL NIGHTMARE - ELECTREK',
]

In [None]:

# Run the model on the example sentences. The first element of the returned
# value is indexed per input sentence to get that sentence's prediction.
predictions = model.predict(input_text)

for idx, sentence in enumerate(input_text):
  print(f'Sentence: {sentence}')
  print(f'Prediction: {predictions[0][idx]}')

## Named Entity Recognition


In [None]:
import spacy

# Load English tokenizer, tagger, parser and NER
# ('en_core_web_sm' must already be installed in the runtime for this to succeed)
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run the first example sentence through the spaCy pipeline
doc = nlp(input_text[0])

# Analyze syntax: noun chunks first, then the lemma of every token tagged as a verb
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
print("Noun phrases:", noun_phrases)

verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
print("Verbs:", verb_lemmas)

# Find named entities, phrases and concepts: one line per entity, text then label
for entity in doc.ents:
    entity_text, entity_label = entity.text, entity.label_
    print(entity_text, entity_label)

