In [None]:
import pandas as pd
import numpy as np 

%pip install matplotlib
import matplotlib.pyplot as plt

%pip install seaborn
import seaborn as sns
sns.set_style("whitegrid")

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import regex as re
from textblob import TextBlob

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC 
from sklearn.svm import LinearSVC

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.max_rows', 25)

In [None]:
# Read the pickle file of the cleaned headlines dataset created in Appendix A, into a dataframe
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
# NOTE(review): later cells read a 'Headlines' column and obtain an 'RIC' column via reset_index();
# presumably the frame is indexed by RIC - confirm against Appendix A.

df = pd.read_pickle("Cleaned headlines dataset")

# View the DataFrame
df

# Sentiment analysis using VADER


In [None]:
# Applying sentiment analysis using VADER on the cleaned Headlines column
 
analyzer = SentimentIntensityAnalyzer()
df['Scores'] = df['Headlines'].apply(analyzer.polarity_scores)


In [None]:
# Converting the scores to string for further processing

scores_str = df['Scores'].to_string()

In [None]:
# Splitting the results

x = scores_str.split(",")

In [None]:
# Removing neg, neu, pos, compound from the dataset

stopwords = ['neg', 'neu', 'pos', 'compound']

new_words = [word for word in x if word not in stopwords]

In [None]:
# Placing the results into a dataframe

df_1 = pd.DataFrame(new_words)


In [None]:
# Clearing the column from words and leaving only numbers

for col in df_1:
    df_1[0] = [''.join(re.findall("\d+\.\d+", item)) for item in df_1[0]]

In [None]:
# Adding a sentiment columm to clarify the numbers above

L = ['Negative', 'Neutral', 'Positive']

df_1['Sentiment'] =  (df_1.index % len(L)).map(dict(enumerate(L)))

In [None]:
# Adding an index

df_1.reset_index(inplace = True)

In [None]:
# Creating an ad hoc index, I can merge the 2 df together later

l = [i for i in range(0,1057) for _ in range(3)]


In [None]:
# Placing the index into a dataframe

l = pd.DataFrame(l)

In [None]:
# Placing the index into index list object

Index_list = l[0]

In [None]:
# Concatenating index and sentiment scores

Concat_df = pd.concat([Index_list, (df_1.apply(pd.Series))], axis=1) # Working

In [None]:
# Renaming columns

Concate_New = Concat_df.rename({ 0 : 'Index'}, axis='columns')

In [None]:
# Dropping the second index column

Concate_dropped = Concate_New.drop(['index'], axis=1)

In [None]:
# Renaming the columns

Concate_dropped.columns = ['Index', 'Scores','Sentiment']

In [None]:
# Finalizing my sentiment df

Sentiment_df = Concate_dropped.pivot(index='Index', columns='Sentiment', values= 'Scores')

print(Sentiment_df)

In [None]:
# Placing the sentiment scores into a dataframe

Sentiment_df = pd.DataFrame(Sentiment_df)

In [None]:
# Filtering sentiment scores for the columns needed

Sentiment_df = Sentiment_df.iloc[1:]

In [None]:
# Counting the scores for better visualization

Counted = Sentiment_df.value_counts()

In [None]:
# Setting the correct column type

Sentiment_df["Negative"] = Sentiment_df["Negative"].astype(str).astype(float)

In [None]:
Sentiment_df["Neutral"] = Sentiment_df["Neutral"].astype(str).astype(float)

In [None]:
Sentiment_df["Positive"] = Sentiment_df["Positive"].astype(str).astype(float)

In [None]:
# Plotting a bar chart of the sentiment distribution

Sentiment_melted = Sentiment_df.melt(var_name='cols', value_name='vals')

Sentiment_melted

In [None]:
# Adjusting the column type

Sentiment_melted["vals"] = Sentiment_melted["vals"].astype(str).astype(float)

In [None]:
# Dropping zero's

Sentiment_melted_cleaned = Sentiment_melted[Sentiment_melted.vals != 0]

In [None]:
# Plotting the sentiment scores distribution

g = sns.barplot(x=Sentiment_melted_cleaned.index, y="vals", hue='cols', data=Sentiment_melted_cleaned, linewidth = 0.1)
plt.legend(title = 'Sentiment', bbox_to_anchor = (1, 1))
plt.xlabel('Sentiment', fontsize=10)
plt.ylabel('Sentiment Scores', fontsize=10)
plt.title('Sentiment scores distribution', fontsize=20)
g.set(xticklabels=[]) 
g.tick_params(bottom=False)




In [None]:
# Getting a count of the sentiment values

Data = Sentiment_melted_cleaned['cols'].value_counts()

In [None]:
# Placing the result into a dataframe

Data = pd.DataFrame(Data)

In [None]:
# Plotting a bar graph of the sentiment counts

g = sns.barplot(x=Data.index, y='cols', hue='cols', data=Data, linewidth = 0.1)
g.get_legend().remove()
plt.ylabel('Value counts', fontsize=10)
plt.title('Sentiment value counts', fontsize=20)


In [None]:
# Dropping the scores column from the initial dataframe for further processing

df_dropped = df.drop(['Scores'], axis=1)

In [None]:
# Resetting the index

df_dropped.reset_index(inplace = True)

In [None]:
# Merging this newly created Sentiment table to the initial dataset

Headline_Sentiment = pd.concat([df_dropped, Sentiment_df], axis=1)
Headline_Sentiment.head()

In [None]:
# Getting the VADER coumpound scores, which will be utilized later when training the Ml algorithms

scores = []

# Declaring the variable for the compound scores

compound_list = []

for i in range(df['Headlines'].shape[0]):

# Creating the scores and appending the results into a separate dataframa columns

    compound = analyzer.polarity_scores(df['Headlines'][i])["compound"]

    scores.append({"Compound": compound})

In [None]:
# Results of the above:

Compound_score = pd.DataFrame.from_dict(scores)
Compound_score

In [None]:
# This is the sentiment dataset, inclusive of the compound scores

df_comp = Headline_Sentiment.join(Compound_score)
df_comp

In [None]:
# Dropping Negative column as I will not need it

Headlines_Vader = df_comp.drop(['Negative'], axis=1)

In [None]:
# Dropping Neutral column as I will not need it

Headlines_Vader_1 = Headlines_Vader.drop(['Neutral'], axis=1)

In [None]:
# Dropping Positive column as I will not need it

Headlines_Vader_ok = Headlines_Vader_1.drop(['Positive'], axis=1)

In [None]:
# Final VADER sentiment Compound scores dataset, ready for further analysis

Headlines_Vader_ok

In [None]:
# Kernel density estimate of the VADER compound scores

ax = Compound_score.plot.kde()
ax.set_xlabel('Compound score', fontsize=10)
ax.set_ylabel('Compound density', fontsize=10)
ax.set_title('VADER Compound scores', fontsize=20)


# Sentiment analysis using TEXTBLOB

In [None]:
# Adjusting the Headlines column type

df['Headlines']  = df['Headlines'].astype(str)

In [None]:
# Calcolating TextBlob sentiment scores. Placing the results into a dataframe

df['Polarity'] = np.nan
df['Subjectivity'] = np.nan

pd.options.mode.chained_assignment = None

for idx, Headlines in enumerate(df['Headlines'].values):  # for each row in our df dataFrame
        if Headlines:
            sentA = TextBlob(Headlines) # pass the text only article to TextBlob to analyse
            df['Polarity'].iloc[idx] = sentA.sentiment.polarity # write sentiment polarity back to df
            df['Subjectivity'].iloc[idx] = sentA.sentiment.subjectivity # write sentiment subjectivity score back to df

df.head()


In [None]:
# Removing the Scores column

df.drop(['Scores'], axis=1, inplace = True)



In [None]:
# Plotting the TextBlob Polarity and Subjectivity scores

df.plot(kind='kde')
plt.xlabel('Scores', fontsize=10)
plt.ylabel('Density', fontsize=10)
plt.title('TEXTBLOB Polarity and Subjectivity scores', fontsize=20)

In [None]:
# Dropping the unneccessary Headlines column

df.drop(['Headlines'], axis=1, inplace = True)

In [None]:
df.reset_index(inplace = True)
df

# APPROACH A: Share price with sentiment scores

# VADER

In [None]:
# Reading the pickle file from Appendix A (full financial dataset)

num_RIC_features = pd.read_pickle("Merged numerical financial dataset")

#view DataFrame
num_RIC_features

In [None]:
# Renaming column Instrument to RIC, for further processing

num_RIC_features.rename(columns = { 'Instrument' :'RIC'}, inplace = True)

In [None]:
# Removing duplicated in column RIC

num_RIC_features.drop_duplicates(subset=['RIC'])

In [None]:
# Merging the VADER scores dataset together with the financial datasite on column RIC

Vader_num_df = num_RIC_features.merge(Headlines_Vader_ok[['Compound', 'RIC']], on = 'RIC')

In [None]:
# Final dataset merged on the single values of the RIC column. Is a 1883 rows × 24 columns dataframe

Vader_num_df

In [None]:
# Preparing share price dataset
# Remove all columns between column index 1 to 3

Vader_share_price = Vader_num_df.drop(Vader_num_df.iloc[:, 2:11], axis=1)


In [None]:
# Dropping uneccessary columns

Vader_share_price.drop(Vader_share_price.iloc[:, 2:5], inplace = True, axis=1)

In [None]:
# Dropping uneccessary columns

Vader_share_price.drop(Vader_share_price.iloc[:, 5:10], inplace = True, axis=1)

In [None]:
# Dropping uneccessary columns

Vader_share_price.drop(['Index','Current Ratio','Cash & Cash Equivalents - Total'], inplace = True, axis=1)


In [None]:
# Dropping uneccessary columns

Vader_share_price

# Applying ML on Share price dataset with VADER compound scores

In [None]:
# Choosing a subset of the data to split in between train and test:

X = Vader_share_price
y = Vader_share_price['Target/Non-Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3, random_state=11)

In [None]:
# BASELINE MODEL logistic regression

# Baseline performance 2: Logistic regression classifier

lr = LogisticRegression()

# Applying the model to the training data:

lr.fit(X_train,y_train)


# Predict the test model:
labels_lr = lr.predict(X_test)


In [None]:
# Let's evaluate the results with accuracy:

print('Logistic Regression Test Accuracy:', accuracy_score(y_test, labels_lr))
print('Logistic Regression Train Accuracy:', accuracy_score(y_train, lr.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_lr))
print(classification_report(y_train, lr.predict(X_train)))

# Confusion matrix:

mat_lr = confusion_matrix(y_test,labels_lr)
sns.heatmap(mat_lr, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')

In [None]:
# Precision

precision = precision_score(y_test, labels_lr, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_lr, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_lr, average=None)
print(f_score)

In [None]:
# Random Forest on feature selected dataset with Target

# Create regressor object
    
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
  
# Applying the model to the training data:

regressor.fit(X_train,y_train)

# Predict the test model:

labels_regressor = regressor.predict(X_test)



In [None]:
# Let's evalueate the results with accuracy:

print('Random Forest Test Accuracy:', accuracy_score(y_test, labels_regressor))
print('Random Forest Train Accuracy:', accuracy_score(y_train, regressor.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_regressor))
print(classification_report(y_train, regressor.predict(X_train)))

# Confusion matrix:

mat_regressor = confusion_matrix(y_test,labels_regressor)
sns.heatmap(mat_regressor, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')

In [None]:
# Precision

precision = precision_score(y_test, labels_regressor, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_regressor, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_regressor, average=None)
print(f_score)

In [None]:
# Neural Network NN
# Building the classifier

mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)

# Applying the model to the training data:

mlp.fit(X_train,y_train)

# Predict the test model:

labels_mlp = mlp.predict(X_test)


In [None]:
# Let's evalueate the results with accuracy:

print('NN Test Accuracy:', accuracy_score(y_test, labels_mlp))
print('NN Train Accuracy:', accuracy_score(y_train, mlp.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_mlp))
print(classification_report(y_train, mlp.predict(X_train)))

# Confusion matrix:

mat_mlp = confusion_matrix(y_test,labels_mlp)
sns.heatmap(mat_mlp, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')


In [None]:
# Applying nested cross-validation check:
scores_mlp = cross_val_score(mlp, X, y, cv=10, scoring='accuracy')
print(scores_mlp)
print("%0.4f accuracy with a standard deviation of %0.4f" % (scores_mlp.mean(), scores_mlp.std()))

In [None]:
# Precision

precision = precision_score(y_test, labels_mlp, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_mlp, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_mlp, average=None)
print(f_score)

In [None]:
# Decision Tree

dt = DecisionTreeClassifier(criterion='entropy')

# Applying the model to the training data:

dt.fit(X_train,y_train)

# Predict the test model:

labels_dt = dt.predict(X_test)


In [None]:
# Let's evalueate the results with accuracy:

print('Decision Tree Test Accuracy:', accuracy_score(y_test, labels_dt))
print('Decision Tree Train Accuracy:', accuracy_score(y_train, dt.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_dt))
print(classification_report(y_train, dt.predict(X_train)))

# Confusion matrix:

mat_dt = confusion_matrix(y_test,labels_dt)
sns.heatmap(mat_dt, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')


In [None]:
# Precision

precision = precision_score(y_test, labels_dt, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_dt, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_dt, average=None)
print(f_score)

In [None]:
# Support Vector Machine (SVM)

# Building the linear Support Vector Machine Classifier

Svm = LinearSVC(dual = False, random_state = 0, penalty = 'l1',tol = 1e-5)

Svm.fit(X_train,y_train) 

# Predict the test model:

labels_svm = Svm.predict(X_test)


In [None]:
# Let's evalueate the results with accuracy:

print('SVM Test Accuracy:', accuracy_score(y_test, labels_svm))
print('SVM Train Accuracy:', accuracy_score(y_train, Svm.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_svm))
print(classification_report(y_train, Svm.predict(X_train)))

# Confusion matrix:

mat_svm = confusion_matrix(y_test,labels_svm)
sns.heatmap(mat_svm, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Terget', 'Terget'], yticklabels=['Non-Terget', 'Terget'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')


In [None]:
# Precision

precision = precision_score(y_test, labels_svm, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_svm, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_svm, average=None)
print(f_score)

# TEXTBLOB

In [None]:
# Merging the TEXTBLOB scores dataset together with the financial datasite on column RIC

Textblob_num_df = num_RIC_features.merge(df[['Polarity','Subjectivity','RIC']], on = 'RIC')

In [None]:
Textblob_num_df

In [None]:
# Preparing share price dataset
# Remove all columns between column index 1 to 3

Textblob_share_price = Textblob_num_df.drop(Textblob_num_df.iloc[:, 2:11], axis=1)


In [None]:
# Dropping uneccessary columns

Textblob_share_price.drop(Textblob_share_price.iloc[:, 2:5], inplace = True, axis=1)

In [None]:
# Dropping uneccessary columns

Textblob_share_price.drop(Textblob_share_price.iloc[:, 5:10], inplace = True, axis=1)

In [None]:
# Dropping uneccessary columns

Textblob_share_price.drop(['Index','Current Ratio','Cash & Cash Equivalents - Total'], inplace = True, axis=1)


In [None]:
Textblob_share_price

# Applying ML on Share price dataset with TEXTBLOB Polarity and Subjectivity scores

In [None]:
# Choosing a subset of the data to split in between train and test:

X = Textblob_share_price
y = Textblob_share_price['Target/Non-Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3, random_state=11)

In [None]:
# BASELINE MODEL logistic regression

# Baseline performance 2: Logistic regression classifier

lr = LogisticRegression()

# Applying the model to the training data:

lr.fit(X_train,y_train)


# Predict the test model:
labels_lr = lr.predict(X_test)


In [None]:
# Let's evaluate the results with accuracy:

print('Logistic Regression Test Accuracy:', accuracy_score(y_test, labels_lr))
print('Logistic Regression Train Accuracy:', accuracy_score(y_train, lr.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_lr))
print(classification_report(y_train, lr.predict(X_train)))

# Confusion matrix:

mat_lr = confusion_matrix(y_test,labels_lr)
sns.heatmap(mat_lr, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')

In [None]:
# Precision

precision = precision_score(y_test, labels_lr, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_lr, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_lr, average=None)
print(f_score)

In [None]:
# Random Forest on feature selected dataset with Target

# Create regressor object
    
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
  
# Applying the model to the training data:

regressor.fit(X_train,y_train)

# Predict the test model:

labels_regressor = regressor.predict(X_test)



In [None]:
# Let's evalueate the results with accuracy:

print('Random Forest Test Accuracy:', accuracy_score(y_test, labels_regressor))
print('Random Forest Train Accuracy:', accuracy_score(y_train, regressor.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_regressor))
print(classification_report(y_train, regressor.predict(X_train)))

# Confusion matrix:

mat_regressor = confusion_matrix(y_test,labels_regressor)
sns.heatmap(mat_regressor, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')

In [None]:
# Precision

precision = precision_score(y_test, labels_regressor, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_regressor, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_regressor, average=None)
print(f_score)

In [None]:
# Neural Network NN
# Building the classifier

mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)

# Applying the model to the training data:

mlp.fit(X_train,y_train)

# Predict the test model:

labels_mlp = mlp.predict(X_test)


In [None]:
# Let's evalueate the results with accuracy:

print('NN Test Accuracy:', accuracy_score(y_test, labels_mlp))
print('NN Train Accuracy:', accuracy_score(y_train, mlp.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_mlp))
print(classification_report(y_train, mlp.predict(X_train)))

# Confusion matrix:

mat_mlp = confusion_matrix(y_test,labels_mlp)
sns.heatmap(mat_mlp, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')


In [None]:
# Applying nested cross-validation check:
scores_mlp = cross_val_score(mlp, X, y, cv=10, scoring='accuracy')
print(scores_mlp)
print("%0.4f accuracy with a standard deviation of %0.4f" % (scores_mlp.mean(), scores_mlp.std()))

In [None]:
# Precision

precision = precision_score(y_test, labels_mlp, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_mlp, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_mlp, average=None)
print(f_score)

In [None]:
# Decision Tree

dt = DecisionTreeClassifier(criterion='entropy')

# Applying the model to the training data:

dt.fit(X_train,y_train)

# Predict the test model:

labels_dt = dt.predict(X_test)


In [None]:
# Let's evalueate the results with accuracy:

print('Decision Tree Test Accuracy:', accuracy_score(y_test, labels_dt))
print('Decision Tree Train Accuracy:', accuracy_score(y_train, dt.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_dt))
print(classification_report(y_train, dt.predict(X_train)))

# Confusion matrix:

mat_dt = confusion_matrix(y_test,labels_dt)
sns.heatmap(mat_dt, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')


In [None]:
# Precision

precision = precision_score(y_test, labels_dt, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_dt, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_dt, average=None)
print(f_score)

In [None]:
# Support Vector Machine (SVM)

# Building the linear Support Vector Machine Classifier

Svm = LinearSVC(dual = False, random_state = 0, penalty = 'l1',tol = 1e-5)

Svm.fit(X_train,y_train) 

# Predict the test model:

labels_svm = Svm.predict(X_test)


In [None]:
# Let's evalueate the results with accuracy:

print('SVM Test Accuracy:', accuracy_score(y_test, labels_svm))
print('SVM Train Accuracy:', accuracy_score(y_train, Svm.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_svm))
print(classification_report(y_train, Svm.predict(X_train)))

# Confusion matrix:

mat_svm = confusion_matrix(y_test,labels_svm)
sns.heatmap(mat_svm, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Terget', 'Terget'], yticklabels=['Non-Terget', 'Terget'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')


In [None]:
# Precision

precision = precision_score(y_test, labels_svm, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_svm, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_svm, average=None)
print(f_score)

# Approach B: Financial dataset with sentiment scores

In [None]:
# Importing the Variable selected financial dataset created in Appendix B

# Read the pickle file of the cleaned headlines dataset created in Appendix A, into a dataframe

Variable_financial = pd.read_pickle("Variable selected financial dataset")

# Renaming columns

Variable_financial.rename(columns = {"Instrument": "RIC"}, inplace = True)


In [None]:
# Droping unneccessary index column

Variable_financial.drop(['Index'],inplace = True, axis=1)


In [None]:
Variable_financial

# Applying ML on full dataset with VADER compound scores

# VADER

In [None]:
# Merging the VADER scores dataset together with the financial datasite on column RIC

Vader_num_full_dataset = Variable_financial.merge(Headlines_Vader_ok[['Compound', 'RIC']], on = 'RIC')

In [None]:
# Removing duplicated in column RIC

Vader_num_full_dataset.drop_duplicates(subset=['RIC'], inplace = True)

In [None]:
# Droping unneccessary RIC column

Vader_num_full_dataset.drop(['RIC'],inplace = True, axis=1)


In [None]:
Vader_num_full_dataset

In [None]:
# Choosing a subset of the data to split in between train and test:

X = Vader_num_full_dataset
y = Vader_num_full_dataset['Target/Non-Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3, random_state=11)

In [None]:
# BASELINE MODEL logistic regression

# Baseline performance 2: Logistic regression classifier

lr = LogisticRegression()

# Applying the model to the training data:

lr.fit(X_train,y_train)


# Predict the test model:
labels_lr = lr.predict(X_test)


In [None]:
# Let's evaluate the results with accuracy:

print('Logistic Regression Test Accuracy:', accuracy_score(y_test, labels_lr))
print('Logistic Regression Train Accuracy:', accuracy_score(y_train, lr.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_lr))
print(classification_report(y_train, lr.predict(X_train)))

# Confusion matrix:

mat_lr = confusion_matrix(y_test,labels_lr)
sns.heatmap(mat_lr, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')

In [None]:
# Precision

precision = precision_score(y_test, labels_lr, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_lr, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_lr, average=None)
print(f_score)

In [None]:
# Applying nested cross-validation check:
scores_lr = cross_val_score(lr, X, y, cv=10, scoring='accuracy')
print(scores_lr)
print("%0.4f accuracy with a standard deviation of %0.4f" % (scores_lr.mean(), scores_lr.std()))

In [None]:
# Random Forest on feature selected dataset with Target

# Create regressor object
    
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
  
# Applying the model to the training data:

regressor.fit(X_train,y_train)

# Predict the test model:

labels_regressor = regressor.predict(X_test)



In [None]:
# Let's evalueate the results with accuracy:

print('Random Forest Test Accuracy:', accuracy_score(y_test, labels_regressor))
print('Random Forest Train Accuracy:', accuracy_score(y_train, regressor.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_regressor))
print(classification_report(y_train, regressor.predict(X_train)))

# Confusion matrix:

mat_regressor = confusion_matrix(y_test,labels_regressor)
sns.heatmap(mat_regressor, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')

In [None]:
# Precision

precision = precision_score(y_test, labels_regressor, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_regressor, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_regressor, average=None)
print(f_score)

In [None]:
# Neural Network NN
# Building the classifier

mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)

# Applying the model to the training data:

mlp.fit(X_train,y_train)

# Predict the test model:

labels_mlp = mlp.predict(X_test)


In [None]:
# Let's evalueate the results with accuracy:

print('NN Test Accuracy:', accuracy_score(y_test, labels_mlp))
print('NN Train Accuracy:', accuracy_score(y_train, mlp.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_mlp))
print(classification_report(y_train, mlp.predict(X_train)))

# Confusion matrix:

mat_mlp = confusion_matrix(y_test,labels_mlp)
sns.heatmap(mat_mlp, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')


In [None]:
# Applying nested cross-validation check:
scores_mlp = cross_val_score(mlp, X, y, cv=10, scoring='accuracy')
print(scores_mlp)
print("%0.4f accuracy with a standard deviation of %0.4f" % (scores_mlp.mean(), scores_mlp.std()))

In [None]:
# Precision

precision = precision_score(y_test, labels_mlp, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_mlp, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_mlp, average=None)
print(f_score)

In [None]:
# Decision Tree

dt = DecisionTreeClassifier(criterion='entropy')

# Applying the model to the training data:

dt.fit(X_train,y_train)

# Predict the test model:

labels_dt = dt.predict(X_test)


In [None]:
# Let's evalueate the results with accuracy:

print('Decision Tree Test Accuracy:', accuracy_score(y_test, labels_dt))
print('Decision Tree Train Accuracy:', accuracy_score(y_train, dt.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_dt))
print(classification_report(y_train, dt.predict(X_train)))

# Confusion matrix:

mat_dt = confusion_matrix(y_test,labels_dt)
sns.heatmap(mat_dt, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')


In [None]:
# Precision

precision = precision_score(y_test, labels_dt, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_dt, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_dt, average=None)
print(f_score)

In [None]:
# Support Vector Machine (SVM)

# Building the linear Support Vector Machine Classifier

Svm = LinearSVC(dual = False, random_state = 0, penalty = 'l1',tol = 1e-5)

Svm.fit(X_train,y_train) 

# Predict the test model:

labels_svm = Svm.predict(X_test)


In [None]:
# Let's evalueate the results with accuracy:

print('SVM Test Accuracy:', accuracy_score(y_test, labels_svm))
print('SVM Train Accuracy:', accuracy_score(y_train, Svm.predict(X_train)))

# Recall - but also precision, f1-score and support:

print(classification_report(y_test, labels_svm))
print(classification_report(y_train, Svm.predict(X_train)))

# Confusion matrix:

mat_svm = confusion_matrix(y_test,labels_svm)
sns.heatmap(mat_svm, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Terget', 'Terget'], yticklabels=['Non-Terget', 'Terget'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')


In [None]:
# Precision

precision = precision_score(y_test, labels_svm, average=None)
print(precision)

# Recall

recall = recall_score(y_test, labels_svm, average=None)
print(recall)

# F-score
f_score = f1_score(y_test, labels_svm, average=None)
print(f_score)

# Applying ML on full dataset with TEXTBLOB polarity and subjectivity scores

# TEXTBLOB

In [None]:
# Merging the TEXTBLOB scores dataset together with the financial datasite on column RIC

Textblob_num_full_dataset = Variable_financial.merge(df[['Polarity','Subjectivity','RIC']], on = 'RIC')

In [None]:
# Removing duplicated in column RIC

Textblob_num_full_dataset.drop_duplicates(subset=['RIC'], inplace = True)

In [None]:
# Droping unneccessary RIC column

Textblob_num_full_dataset.drop(['RIC'],inplace = True, axis=1)


In [None]:
Textblob_num_full_dataset

In [None]:
# Choosing a subset of the data to split in between train and test:

X = Textblob_num_full_dataset
y = Textblob_num_full_dataset['Target/Non-Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3, random_state=11)

In [None]:
# Baseline model 2: logistic regression classifier with default settings.
# fit() returns the estimator itself, so construction and training are chained.

lr = LogisticRegression().fit(X_train, y_train)

# Class predictions for the held-out test split
labels_lr = lr.predict(X_test)


In [None]:
# Accuracy of the logistic regression on the test split and on the training split

print('Logistic Regression Test Accuracy:', accuracy_score(y_true=y_test, y_pred=labels_lr))
print('Logistic Regression Train Accuracy:', accuracy_score(y_true=y_train, y_pred=lr.predict(X_train)))

# Precision, recall, f1-score and support per class: test report first, train report second

print(
    classification_report(y_test, labels_lr),
    classification_report(y_train, lr.predict(X_train)),
    sep='\n',
)

# Confusion matrix of the test predictions, drawn as an annotated heatmap

mat_lr = confusion_matrix(y_true=y_test, y_pred=labels_lr)
sns.heatmap(
    mat_lr,
    annot=True,
    fmt="d",
    square=True,
    cbar=False,
    xticklabels=['Non-Target', 'Target'],
    yticklabels=['Non-Target', 'Target'],
)

plt.xlabel('Predicted Label')
plt.ylabel('Label')

In [None]:
# Per-class test-set metrics for the logistic regression (average=None keeps one value per class).

precision = precision_score(y_test, labels_lr, average=None)
recall = recall_score(y_test, labels_lr, average=None)
f_score = f1_score(y_test, labels_lr, average=None)

# Printed one per line, in the order: precision, recall, F-score
print(precision, recall, f_score, sep='\n')

In [None]:
# 10-fold cross-validation of the logistic regression on the full X, y.
# (This is a single cross-validation loop, not a nested one.)
scores_lr = cross_val_score(estimator=lr, X=X, y=y, scoring='accuracy', cv=10)
print(scores_lr)

# Mean and spread of the ten fold accuracies
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores_lr), np.std(scores_lr)))

In [None]:
# Random Forest on the full dataset with the Target label
#
# The label is a binary class, so a classifier is needed here. The previous
# RandomForestRegressor returns continuous scores, and the evaluation cells that
# follow (accuracy_score, classification_report, confusion_matrix, precision/recall/f1)
# are classification metrics that expect discrete class predictions.
from sklearn.ensemble import RandomForestClassifier

# The variable is still called `regressor` because the evaluation cells below call
# regressor.predict(...); only the estimator type changes.
regressor = RandomForestClassifier(n_estimators = 100, random_state = 0)

# Applying the model to the training data:

regressor.fit(X_train,y_train)

# Predict class labels for the test split:

labels_regressor = regressor.predict(X_test)



In [None]:
# Accuracy of the random forest on the test split and on the training split

print('Random Forest Test Accuracy:', accuracy_score(y_true=y_test, y_pred=labels_regressor))
print('Random Forest Train Accuracy:', accuracy_score(y_true=y_train, y_pred=regressor.predict(X_train)))

# Precision, recall, f1-score and support per class: test report first, train report second

print(
    classification_report(y_test, labels_regressor),
    classification_report(y_train, regressor.predict(X_train)),
    sep='\n',
)

# Confusion matrix of the test predictions, drawn as an annotated heatmap

mat_regressor = confusion_matrix(y_true=y_test, y_pred=labels_regressor)
sns.heatmap(
    mat_regressor,
    annot=True,
    fmt="d",
    square=True,
    cbar=False,
    xticklabels=['Non-Target', 'Target'],
    yticklabels=['Non-Target', 'Target'],
)

plt.xlabel('Predicted Label')
plt.ylabel('Label')

In [None]:
# Per-class test-set metrics for the random forest (average=None keeps one value per class).

precision = precision_score(y_test, labels_regressor, average=None)
recall = recall_score(y_test, labels_regressor, average=None)
f_score = f1_score(y_test, labels_regressor, average=None)

# Printed one per line, in the order: precision, recall, F-score
print(precision, recall, f_score, sep='\n')

In [None]:
# Neural Network (multi-layer perceptron) with three hidden layers of 13 units each.
# random_state is fixed so that weight initialisation — and therefore every score
# reported below — is reproducible on a fresh kernel, like the seeded train/test
# split and the other seeded models in this notebook.

mlp = MLPClassifier(hidden_layer_sizes=(13,13,13), max_iter=500, random_state=0)

# Applying the model to the training data:

mlp.fit(X_train,y_train)

# Predict class labels for the test split:

labels_mlp = mlp.predict(X_test)


In [None]:
# Accuracy of the neural network on the test split and on the training split

print('NN Test Accuracy:', accuracy_score(y_true=y_test, y_pred=labels_mlp))
print('NN Train Accuracy:', accuracy_score(y_true=y_train, y_pred=mlp.predict(X_train)))

# Precision, recall, f1-score and support per class: test report first, train report second

print(
    classification_report(y_test, labels_mlp),
    classification_report(y_train, mlp.predict(X_train)),
    sep='\n',
)

# Confusion matrix of the test predictions, drawn as an annotated heatmap

mat_mlp = confusion_matrix(y_true=y_test, y_pred=labels_mlp)
sns.heatmap(
    mat_mlp,
    annot=True,
    fmt="d",
    square=True,
    cbar=False,
    xticklabels=['Non-Target', 'Target'],
    yticklabels=['Non-Target', 'Target'],
)

plt.xlabel('Predicted Label')
plt.ylabel('Label')


In [None]:
# 10-fold cross-validation of the neural network on the full X, y.
# (This is a single cross-validation loop, not a nested one.)
scores_mlp = cross_val_score(estimator=mlp, X=X, y=y, scoring='accuracy', cv=10)
print(scores_mlp)

# Mean and spread of the ten fold accuracies
print("%0.4f accuracy with a standard deviation of %0.4f" % (np.mean(scores_mlp), np.std(scores_mlp)))

In [None]:
# Per-class test-set metrics for the neural network (average=None keeps one value per class).

precision = precision_score(y_test, labels_mlp, average=None)
recall = recall_score(y_test, labels_mlp, average=None)
f_score = f1_score(y_test, labels_mlp, average=None)

# Printed one per line, in the order: precision, recall, F-score
print(precision, recall, f_score, sep='\n')

In [None]:
# Decision Tree using the entropy (information gain) split criterion.
# random_state is fixed so that the fitted tree and the scores reported below are
# reproducible on a fresh kernel, like the seeded train/test split and the other
# seeded models in this notebook.

dt = DecisionTreeClassifier(criterion='entropy', random_state=0)

# Applying the model to the training data:

dt.fit(X_train,y_train)

# Predict class labels for the test split:

labels_dt = dt.predict(X_test)


In [None]:
# Accuracy of the decision tree on the test split and on the training split

print('Decision Tree Test Accuracy:', accuracy_score(y_true=y_test, y_pred=labels_dt))
print('Decision Tree Train Accuracy:', accuracy_score(y_true=y_train, y_pred=dt.predict(X_train)))

# Precision, recall, f1-score and support per class: test report first, train report second

print(
    classification_report(y_test, labels_dt),
    classification_report(y_train, dt.predict(X_train)),
    sep='\n',
)

# Confusion matrix of the test predictions, drawn as an annotated heatmap

mat_dt = confusion_matrix(y_true=y_test, y_pred=labels_dt)
sns.heatmap(
    mat_dt,
    annot=True,
    fmt="d",
    square=True,
    cbar=False,
    xticklabels=['Non-Target', 'Target'],
    yticklabels=['Non-Target', 'Target'],
)

plt.xlabel('Predicted Label')
plt.ylabel('Label')


In [None]:
# Per-class test-set metrics for the decision tree (average=None keeps one value per class).

precision = precision_score(y_test, labels_dt, average=None)
recall = recall_score(y_test, labels_dt, average=None)
f_score = f1_score(y_test, labels_dt, average=None)

# Printed one per line, in the order: precision, recall, F-score
print(precision, recall, f_score, sep='\n')

In [None]:
# Support Vector Machine (SVM)

# Linear SVM classifier with an L1 penalty, solved in the primal form (dual=False).
# fit() returns the estimator itself, so construction and training are chained.

Svm = LinearSVC(
    penalty='l1',
    dual=False,
    tol=1e-5,
    random_state=0,
).fit(X_train, y_train)

# Class predictions for the held-out test split

labels_svm = Svm.predict(X_test)


In [None]:
# Evaluate the SVM: accuracy on the held-out test split and on the training split
# (a large gap between the two would point to over-fitting).

print('SVM Test Accuracy:', accuracy_score(y_test, labels_svm))
print('SVM Train Accuracy:', accuracy_score(y_train, Svm.predict(X_train)))

# Per-class precision, recall, f1-score and support (test split first, then train):

print(classification_report(y_test, labels_svm))
print(classification_report(y_train, Svm.predict(X_train)))

# Confusion matrix of the test predictions.
# Tick labels corrected from the misspelled 'Terget' so this figure is labelled
# like the heatmaps of the other models in this notebook.

mat_svm = confusion_matrix(y_test,labels_svm)
sns.heatmap(mat_svm, square=True, annot=True, fmt="d", cbar=False,
           xticklabels=['Non-Target', 'Target'], yticklabels=['Non-Target', 'Target'])

plt.xlabel('Predicted Label')
plt.ylabel('Label')


In [None]:
# Per-class test-set metrics for the SVM (average=None keeps one value per class).

precision = precision_score(y_test, labels_svm, average=None)
recall = recall_score(y_test, labels_svm, average=None)
f_score = f1_score(y_test, labels_svm, average=None)

# Printed one per line, in the order: precision, recall, F-score
print(precision, recall, f_score, sep='\n')

# Hypothesis testing: High share prices, together with no or very low positive news sentiment, are an indication of an imminent M&A announcement.

# VADER

In [None]:
# Keep only the Target companies (rows where 'Target/Non-Target' equals 1)

Vader_share_target = Vader_share_price.loc[lambda frame: frame['Target/Non-Target'] == 1]

In [None]:
# Min-max scale every column of the target-only VADER frame so the series
# share a common range when plotted together

scaler = MinMaxScaler()
scaled = scaler.fit_transform(Vader_share_target)
scaled_df = pd.DataFrame(data=scaled, columns=Vader_share_target.columns)

scaled_df

In [None]:
# Density plot for Price To Sales Per Share (Daily Time Series Ratio) on the scaled VADER data.
# NOTE(review): with kind="kde" pandas appears to use `x` only as the index, so the curve is
# the density of the y column and "Compound" itself is not drawn — confirm the x-axis label.

scaled_df.plot.kde(
    x="Compound",
    y=["Price To Sales Per Share (Daily Time Series Ratio)"],
    figsize=(9, 8),
)

plt.xlabel(xlabel='Compound scores', fontsize=10)
plt.ylabel(ylabel='Density', fontsize=10)
plt.title(label='VADER Compound scores and Price To Sales Per Share (Daily Time Series Ratio)', fontsize=20)
 


In [None]:
# Density plot for Price To Book Value Per Share (Daily Time Series Ratio) on the scaled VADER data.
# NOTE(review): with kind="kde" pandas appears to use `x` only as the index, so the curve is
# the density of the y column and "Compound" itself is not drawn — confirm the x-axis label.

scaled_df.plot.kde(
    x="Compound",
    y=["Price To Book Value Per Share (Daily Time Series Ratio)"],
    figsize=(9, 8),
)

plt.xlabel(xlabel='Compound scores', fontsize=10)
plt.ylabel(ylabel='Density', fontsize=10)
plt.title(label='VADER Compound scores and Price To Book Value Per Share (Daily Time Series Ratio)', fontsize=20)

# Place the legend inside the axes, anchored at (0.5, 0.9)
plt.legend(bbox_to_anchor=(0.5, 0.9))

# TEXTBLOB

In [None]:
# Keep only the Target companies (rows where 'Target/Non-Target' equals 1)

Textblob_share_target = Textblob_share_price.loc[lambda frame: frame['Target/Non-Target'] == 1]

In [None]:
# Min-max scale every column of the target-only TextBlob frame so the series
# share a common range when plotted together

scaler = MinMaxScaler()
scaled = scaler.fit_transform(Textblob_share_target)
scaled_df = pd.DataFrame(data=scaled, columns=Textblob_share_target.columns)

scaled_df

In [None]:
# Density plot of Subjectivity and Price To Sales Per Share (Daily Time Series Ratio)
# on the scaled TextBlob data.
# NOTE(review): with kind="kde" pandas appears to use `x` only as the index, so "Polarity"
# itself is not drawn — confirm the x-axis label and title.

scaled_df.plot.kde(
    x="Polarity",
    y=['Subjectivity', "Price To Sales Per Share (Daily Time Series Ratio)"],
    figsize=(9, 8),
)

plt.xlabel(xlabel='Polarity scores', fontsize=10)
plt.ylabel(ylabel='Density', fontsize=10)
plt.title(label='TEXTBLOB Polarity and Subjectivity scores and Price To Sales Per Share (Daily Time Series Ratio)', fontsize=20)
 

In [None]:
# Plotting TEXTBLOB Polarity and Subjectivity scores together with Price To Book Value Per Share (Daily Time Series Ratio)
#
# The y column is now the Price To Book series. It previously repeated the Price To Sales
# column from the cell above, so the figure did not match its own title.
# NOTE(review): assumes Textblob_share_price has this column, as the VADER frame does — confirm.
# NOTE(review): with kind="kde" pandas appears to use `x` only as the index, so "Polarity"
# itself is not drawn — confirm the x-axis label and title.

scaled_df.plot(x="Polarity", y=['Subjectivity', "Price To Book Value Per Share (Daily Time Series Ratio)"], kind="kde", figsize=(9, 8))

plt.xlabel('Polarity scores', fontsize=10)
plt.ylabel('Density', fontsize=10)
plt.title('TEXTBLOB Polarity and Subjectivity scores and Price To Book Value Per Share (Daily Time Series Ratio)' , fontsize=20)
 