In [2]:
## Libraries and packages

# data processing packages
import numpy as np
import pandas as pd
import re

# ml packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

## Loading the data

In [3]:
# Read the train and test CSVs (in that order) and bind them to the
# notebook-level frames used by the cells below.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/clean_train_data.csv', '../data/clean_test_data.csv')
)

In [4]:
# Render both frames, train first and then test, in one display call.
display(train_data, test_data)

Unnamed: 0.1,Unnamed: 0,text,label
0,0,Here are Thursday's biggest analyst calls: App...,0
1,1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,3,"Analysts react to Tesla's latest earnings, bre...",0
4,4,Netflix and its peers are set for a ‘return to...,0
...,...,...,...
16985,16985,KfW credit line for Uniper could be raised to ...,3
16986,16986,KfW credit line for Uniper could be raised to ...,3
16987,16987,Russian sells 1 bln roubles at one-year repo...,3
16988,16988,Global ESG bond issuance posts H1 dip as supra...,3


Unnamed: 0.1,Unnamed: 0,text,label
0,0,Analyst call of the day for @CNBCPro subscribe...,0
1,1,"Loop upgrades CSX to buy, says it's a good pla...",0
2,2,BofA believes we're already in a recession — a...,0
3,3,JPMorgan sees these derivative plays as best w...,0
4,4,Morgan Stanley's Huberty sees Apple earnings m...,0
...,...,...,...
4112,4112,Dollar bonds of Chinese developers fall as str...,3
4113,4113,Longer maturity Treasury yields have scope to ...,3
4114,4114,Pimco buys €1bn of Apollo buyout loans from ba...,3
4115,4115,Analysis: Banks' snubbing of junk-rated loan f...,3


In [16]:
## Label list
# Human-readable class names. The position of a name in this list is the
# integer stored in the `label` column (e.g. 0 -> "Analyst Update").
labels = [
    "Analyst Update",
    "Fed | Central Banks",
    "Company | Product News",
    "Treasuries | Corporate Debt",
    "Dividend",
    "Earnings",
    "Energy | Oil",
    "Financials",
    "Currencies",
    "General News | Opinion",
    "Gold | Metals | Materials",
    "IPO",
    "Legal | Regulation",
    "M&A | Investments",
    "Macro",
    "Markets",
    "Politics",
    "Personnel Change",
    "Stock Commentary",
    "Stock Movement",
]

## Viewing Samples

In [6]:
## FUNCTION FOR SAMPLING DATA AND VIEWING RESULTS

def preview_random_sample(df, sample_num = 5, label_names = None, random_state = None):
    """Print a random handful of records with their text and readable label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column and an integer ``label`` column.
    sample_num : int, default 5
        Number of records to show. Capped at ``len(df)`` so a frame with
        fewer rows than requested is shown in full instead of making
        ``DataFrame.sample`` raise.
    label_names : sequence of str, optional
        Names indexed by the integer label. When omitted, the notebook-level
        ``labels`` list is used, so that list must already be defined.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``; pass a value to get the same
        records on every run.

    Returns
    -------
    str
        The literal ``'SAMPLING COMPLETE'``.
    """
    if label_names is None:
        # Fall back to the notebook-level label list.
        label_names = labels

    # Never ask for more rows than the frame holds (sampling is without
    # replacement). ``None`` is passed through untouched.
    if sample_num is not None:
        sample_num = min(sample_num, len(df))

    # generating random sample
    random_sample = df.sample(sample_num, random_state=random_state)

    # looping over the sample and displaying results
    records = zip(random_sample['text'], random_sample['label'])
    for i, (text, label) in enumerate(records, start=1):
        print(f'RECORD {i}')
        print(f'Text: {text}')
        print(f'Label: {label_names[label]}\n')

    return 'SAMPLING COMPLETE'

#### Previewing data samples

In [7]:
# Banner for the train preview: a title line followed by a dashed rule
# (the dash string repeated five times).
print('Train data sample:\n' + '---------------'*5)
preview_random_sample(train_data)

# Same banner and preview for the test split. This call is the last
# expression in the cell, so its return value is echoed as the cell output.
print('Test data sample:\n' + '---------------'*5)
preview_random_sample(test_data)

Train data sample:
---------------------------------------------------------------------------
RECORD 1
Text: Despite its reputation for overpowering strength and bitterness, robusta coffee is finding new followers, and Brazilian farmers are stepping in to supply it  
Label: General News | Opinion

RECORD 2
Text: $NRDY - Dialing In On Nerdy.   #markets #economy #business
Label: Stock Commentary

RECORD 3
Text: $XFOR - X4 Pharma to focus resources on lead drug candidate, cut workforce by about 20%  
Label: Company | Product News

RECORD 4
Text: $CDUAF - Canadian Utilities declares CAD 0.4442 dividend  
Label: Dividend

RECORD 5
Text: @Pres_Elect_Matt careful, right up against ⚓️VWAP from 3/29 high
Label: Stock Commentary

Test data sample:
---------------------------------------------------------------------------
RECORD 1
Text: Asbury Automotive Group Schedules Release of Second Quarter 2022 Financial Results    
Label: Earnings

RECORD 2
Text: Myers Industries Releases Inaugural Envir

'SAMPLING COMPLETE'

#### Generate training/testing features and labels

In [8]:
## Function for selecting/encoding features and labels

def model_data_labels(train_df, test_df, features = 'text', labels = 'label'):
    """Select the text/label columns of both frames and TF-IDF encode the text.

    Parameters
    ----------
    train_df, test_df : dataframes holding the train and test splits
    features : name of the column that holds the raw text
    labels : name of the column that holds the target labels

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``, where the ``x_*`` entries are
        the TF-IDF encoded matrices and the ``y_*`` entries are the label
        columns taken unchanged from the input frames.
    """
    # Pull the raw text and target columns out of each split.
    train_text, train_targets = train_df[features], train_df[labels]
    test_text, test_targets = test_df[features], test_df[labels]

    # One vectorizer for both splits: it is fitted on the training text only
    # and then applied to the test text.
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(train_text)
    test_matrix = tfidf.transform(test_text)

    # Shape summary of everything that is returned.
    shape_report = (
        f'x_train shape:\t\t\t\t{train_matrix.shape}\n'
        f'y_train shape:\t\t\t\t{train_targets.shape}\n'
        f'x_test shape:\t\t\t\t{test_matrix.shape}\n'
        f'y_test shape:\t\t\t\t{test_targets.shape}\n'
    )
    print(shape_report)

    return train_matrix, test_matrix, train_targets, test_targets





In [9]:
# Build the encoded feature matrices and label vectors used by the model cells below.
x_train, x_test, y_train, y_test = model_data_labels(train_df=train_data, test_df=test_data)

x_train shape:				(16990, 23072)
y_train shape:				(16990,)
x_test shape:				(4117, 23072)
y_test shape:				(4117,)



## Baselines

In [10]:
# Creating Linear SVM model for baseline testing.
# The seed is pinned because LinearSVC's solver uses a random number generator
# while fitting; without it the baseline can differ between kernel restarts.
lin_svc = LinearSVC(random_state=42)
lin_svc.fit(x_train, y_train)


In [11]:
# Evaluating baseline model on the test matrix and test labels.
# `test_accuracy` keeps the unrounded score; rounding to 4 decimals is applied
# only to the printed value.
test_accuracy = lin_svc.score(x_test, y_test)
print(f"{str(lin_svc)}\nTest Accuracy: {np.round(test_accuracy, decimals=4)}\n")

LinearSVC()
Test Accuracy: 0.8438



In [17]:
# Computing per-class precision / recall / F1 scores
svc_pred = lin_svc.predict(x_test)
# Pass the class ids explicitly so every entry of `labels` stays tied to its
# integer id. With `target_names` alone, a class missing from both y_test and
# svc_pred makes the number of names disagree with the classes found.
class_ids = list(range(len(labels)))
print(classification_report(y_test, svc_pred, labels=class_ids,
                            target_names=labels, digits=4))



                             precision    recall  f1-score   support

             Analyst Update     0.8833    0.7260    0.7970        73
        Fed | Central Banks     0.8565    0.8364    0.8463       214
     Company | Product News     0.8207    0.8862    0.8521       852
Treasuries | Corporate Debt     0.9180    0.7273    0.8116        77
                   Dividend     0.9320    0.9897    0.9600        97
                   Earnings     0.9538    0.9380    0.9458       242
               Energy | Oil     0.8056    0.7945    0.8000       146
                 Financials     0.8718    0.8500    0.8608       160
                 Currencies     0.8519    0.7188    0.7797        32
     General News | Opinion     0.8000    0.7738    0.7867       336
  Gold | Metals | Materials     0.4667    0.5385    0.5000        13
                        IPO     0.8667    0.9286    0.8966        14
         Legal | Regulation     0.9327    0.8151    0.8700       119
          M&A | Investments     0