# Getting Insights from Text Data

### Use NLP to extract context from Financial news headlines

### 1. Importing Libraries

In [43]:
import pandas as pd
import numpy as np

#Libraries for Text Pre-processing
import re, string
string.punctuation
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from pprint import pprint

#Libraries for Model Building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Libraries for Bag of Words Model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Libraries for Word Embedding
import gensim
from gensim.models import Word2Vec


In [56]:
import warnings

def fxn():
    """Emit a single DeprecationWarning carrying the message "deprecated"."""
    warnings.warn(message="deprecated", category=DeprecationWarning)

# Call fxn() with every warning filtered out. The "ignore" filter is installed
# inside catch_warnings(), which restores the previous filters when the block
# exits.
# NOTE(review): because of that scoping, this cell does not silence warnings
# raised by later cells — confirm whether a notebook-wide filter was intended.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

### 2. Load & Transform Dataset

In [2]:
# Read the full headline dataset from the working directory.
# NOTE(review): parse_dates=True only asks pandas to parse the index, so the
# 'date' column stays as object dtype (visible in the info() output);
# parse_dates=['date'] would be needed to get datetimes — confirm intent.
data = pd.read_csv('World_Economy.csv',parse_dates=True)

# Print column dtypes / non-null counts, then display the first five rows.
data.info()
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   date     9993 non-null   object
 1   group    9861 non-null   object
 2   Class    9994 non-null   object
 3   ClassID  9994 non-null   int64 
 4   title    9886 non-null   object
 5   URL      9886 non-null   object
dtypes: int64(1), object(5)
memory usage: 468.6+ KB


Unnamed: 0,date,group,Class,ClassID,title,URL
0,October 19 2021,Bank of England,Finance,1,Traders fear policymakers will move too aggres...,https://www.ft.com/content/45febe54-ca7a-40ff-...
1,October 18 2021,The Road to Recovery,Finance,1,Hopes damped over extra gas from Russia; Europ...,https://www.ft.com/content/0ba98a96-a4fc-4f35-...
2,October 18 2021,Sheikh Hasina,Finance,1,My country is investing for a zero-carbon futu...,https://www.ft.com/content/67b17114-5503-4db6-...
3,October 18 2021,UK inflation,Finance,1,"Damn, or be damned.",https://www.ft.com/content/5a94cb7b-f918-40ee-...
4,October 18 2021,Gideon Rachman,Finance,1,The US president’s domestic problems are hobbl...,https://www.ft.com/content/76ad8f97-3927-4a55-...


In [3]:
# (rows, columns) of the raw dataset loaded above.
data.shape

(9994, 6)

In [4]:
# Load the pre-built training and test splits from the working directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv')
)

In [23]:
#convert to lowercase, strip and remove punctuations
# The title column of both frames is cast to str first so that preprocess()
# can call string methods on every row.
# NOTE(review): missing titles presumably turn into the literal text 'nan'
# after this cast — confirm, and consider fillna('') beforehand.
df_train.title=df_train.title.astype(str)
df_test.title=df_test.title.astype(str)
# Patterns are compiled once here instead of on every call to preprocess().
_HTML_TAG_RE = re.compile(r'<.*?>')
_ASCII_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text):
    """Normalise a headline to lowercase, digit-free, single-spaced text.

    Steps, in order:
      1. lowercase and strip surrounding whitespace;
      2. delete anything that looks like an HTML tag (``<...>``);
      3. replace every ASCII punctuation character with a space;
      4. delete any remaining non-word, non-space character (e.g. typographic
         quotes and dashes) without inserting a space;
      5. replace every digit with a space;
      6. collapse whitespace runs to one space and strip the result.

    Compared with the previous version, the output never carries leading or
    trailing spaces (digit removal used to leave them behind), all patterns
    are raw strings, and the bracketed-number substitution was dropped because
    it ran after the brackets had already been removed and could never match.

    Args:
        text: the raw headline as a ``str``.

    Returns:
        The cleaned string; empty if nothing but punctuation/digits was given.
    """
    text = text.lower().strip()
    text = _HTML_TAG_RE.sub('', text)
    text = _ASCII_PUNCT_RE.sub(' ', text)
    # Deleted rather than spaced out, so "country’s" becomes "countrys".
    text = _NON_WORD_RE.sub('', text)
    text = _DIGIT_RE.sub(' ', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

In [24]:
# STOPWORD REMOVAL
# Filled on first use so the corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def stopword(string):
    """Drop English stop words from a whitespace-separated string.

    The previous version called ``stopwords.words('english')`` inside the
    comprehension condition, i.e. once per token, and tested membership
    against a list. The stop-word set is now built once and reused.

    Args:
        string: text whose tokens are separated by whitespace. (The parameter
            name shadows the ``string`` module inside this function; it is kept
            for interface compatibility.)

    Returns:
        The remaining tokens joined by single spaces ('' if none remain).
    """
    stop_set = _english_stopwords()
    return ' '.join(token for token in string.split() if token not in stop_set)

In [25]:
#LEMMATIZATION
# Initialize the lemmatizer
# A single shared instance; lemmatizer() reads it as the module-level name `wl`.
wl = WordNetLemmatizer()
 
def get_wordnet_pos(tag):
    """Map an NLTK part-of-speech tag to the corresponding WordNet POS constant.

    The decision is made on the tag's prefix: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.

    Args:
        tag: the POS tag string produced by ``nltk.pos_tag``.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN``, ``wordnet.ADV``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN
def lemmatizer(string):
    """Lemmatize each token of `string` according to its part-of-speech tag.

    The text is tokenized with ``word_tokenize``, tagged with ``nltk.pos_tag``,
    and every token is lemmatized by the shared ``wl`` lemmatizer using the
    WordNet POS returned by ``get_wordnet_pos`` for its tag.

    Args:
        string: the text to lemmatize.

    Returns:
        The lemmas joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = []
    for word, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(word, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

In [26]:
def finalpreprocess(string):
    """Run the whole cleaning pipeline on one piece of text.

    Applies, in order, ``preprocess``, then ``stopword``, then ``lemmatizer``,
    and returns the output of the last stage.
    """
    normalised = preprocess(string)
    without_stopwords = stopword(normalised)
    return lemmatizer(without_stopwords)
# Store the cleaned version of every training headline in a new column,
# then show the first rows to check the result.
df_train['clean_text'] = df_train['title'].apply(finalpreprocess)
df_train.head()

Unnamed: 0,date,group,Class,ClassID,title,URL,clean_text
0,October 14 2021,Ngozi Okonjo-Iweala,Finance,1,A common approach to the cost of polluting is ...,https://www.ft.com/content/b0bcc93c-c6d6-475e-...,common approach cost pollute fair straightforw...
1,October 14 2021,Martin Sandbu,Finance,1,IEA report shows current pledges fall short — ...,https://www.ft.com/content/9918f2da-131f-4f2a-...,iea report show current pledge fall short poin...
2,October 14 2021,Turkish economy,Fintech,0,President removes two deputy governors in new ...,https://www.ft.com/content/88cbe503-e5a0-40d4-...,president remove two deputy governor new round...
3,October 14 2021,Chinese economy,Fintech,0,Producer price index climbs 10.7% cent in Sept...,https://www.ft.com/content/75871eb7-360e-40b5-...,producer price index climb cent september year...
4,October 14 2021,FT Magazine,Fintech,0,The country’s supply issues have been exacerba...,https://www.ft.com/content/ff650169-34bc-4ac9-...,countrys supply issue exacerbate brexit thats ...


### Vectorization

In [27]:
#SPLITTING THE TRAINING DATASET INTO TRAIN AND TEST
# 80/20 shuffled split of the cleaned headlines and their ClassID labels.
# random_state is fixed so the split — and every metric computed from it —
# is the same on each Restart & Run All; previously it changed on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_train["clean_text"],
    df_train["ClassID"],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
#Word2Vec
# Word2Vec runs on tokenized sentences
X_train_tok = [nltk.word_tokenize(i) for i in X_train]
X_test_tok = [nltk.word_tokenize(i) for i in X_test]

### Using Bag-of-Words (with TF-IDF) and Word2Vec

In [None]:
#Tf-Idf
# The vectorizer is fitted on the training split only and then reused,
# unchanged, to transform the held-out split.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

class MeanEmbeddingVectorizer(object):
    """Represent each tokenised text by the mean of its word embeddings.

    ``fit`` and ``transform`` were previously dedented out of the class body,
    which made them module-level functions and left instances without either
    method; they are now proper methods.

    Args:
        word2vec: non-empty mapping from word to its embedding vector. All
            vectors are expected to share one length, taken from the first
            entry.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """No-op: there is nothing to learn. Returns ``self``."""
        return self

    def transform(self, X):
        """Average the embeddings of the known words of every text in ``X``.

        Args:
            X: iterable of token lists.

        Returns:
            ``np.ndarray`` with one row per text. Words missing from the
            mapping are skipped; a text with no known word yields a zero row.
        """
        return np.array([
            # An empty list is falsy, so `or` substitutes a single zero vector.
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
# Tokenise the cleaned training headlines first: the column must exist before
# Word2Vec is trained on it (it used to be created after the model was built,
# and on an undefined frame named `df`).
df_train['clean_text_tok'] = [nltk.word_tokenize(i) for i in df_train['clean_text']]
model = Word2Vec(df_train['clean_text_tok'].tolist(), min_count=1)

# gensim 4 renamed `index2word` to `index_to_key` and dropped `syn0` in favour
# of `vectors`; fall back to the old vocabulary attribute on older releases.
wv = model.wv
vocab_words = wv.index_to_key if hasattr(wv, 'index_to_key') else wv.index2word
w2v = dict(zip(vocab_words, wv.vectors))
modelw = MeanEmbeddingVectorizer(w2v)

# converting text to numerical data using Word2Vec
X_train_vectors_w2v = modelw.transform(X_train_tok)
X_test_vectors_w2v = modelw.transform(X_test_tok)
# Same array under its earlier name, so code written against either name runs.
X_val_vectors_w2v = X_test_vectors_w2v

## Machine Learning Algorithms

In [None]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)
# L2-regularised logistic regression trained on the TF-IDF features.
lr_tfidf = LogisticRegression(penalty='l2', C=10, solver='liblinear')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)

# Hard labels, and the probability of the class in column 1, for the held-out split.
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:, 1]

print(classification_report(y_test, y_predict))
print('Confusion Matrix:', confusion_matrix(y_test, y_predict))

# Area under the ROC curve, computed from the column-1 probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

In [None]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression (W2v)
# L2-regularised logistic regression trained on the mean Word2Vec embeddings.
lr_w2v = LogisticRegression(solver='liblinear', C=10, penalty='l2')
lr_w2v.fit(X_train_vectors_w2v, y_train)  #model

#Predict y value for test dataset
# The held-out Word2Vec features are stored as X_val_vectors_w2v by the
# vectorization cell; this cell used to read X_test_vectors_w2v, a name that
# was never assigned, and failed with NameError.
y_predict = lr_w2v.predict(X_val_vectors_w2v)
y_prob = lr_w2v.predict_proba(X_val_vectors_w2v)[:, 1]
print(classification_report(y_test, y_predict))
print('Confusion Matrix:', confusion_matrix(y_test, y_predict))

# Area under the ROC curve, computed from the column-1 probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

In [33]:
#FITTING THE CLASSIFICATION MODEL using Naive Bayes(tf-idf)
# Multinomial Naive Bayes with default settings, trained on the TF-IDF features.
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_vectors_tfidf, y_train)

# Hard labels, and the probability of the class in column 1, for the held-out split.
y_predict = nb_tfidf.predict(X_test_vectors_tfidf)
y_prob = nb_tfidf.predict_proba(X_test_vectors_tfidf)[:, 1]

print(classification_report(y_test, y_predict))
print('Confusion Matrix:', confusion_matrix(y_test, y_predict))

# Area under the ROC curve, computed from the column-1 probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1176
           1       0.00      0.00      0.00         1

    accuracy                           1.00      1177
   macro avg       0.50      0.50      0.50      1177
weighted avg       1.00      1.00      1.00      1177

Confusion Matrix: [[1176    0]
 [   1    0]]
AUC: 0.9481292517006802


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Best Model Selection

In [34]:
#Pre-processing the new dataset
# Clean the unseen headlines with the same pipeline used for training.
df_test['clean_text'] = df_test['title'].apply(finalpreprocess)
# NOTE(review): this rebinds X_test, which previously held the validation split.
X_test = df_test['clean_text']

#converting words to numerical data using tf-idf
# Reuses the already-fitted vectorizer; nothing is re-fitted here.
X_vector = tfidf_vectorizer.transform(X_test)

#use the best model to predict 'target' value for the new dataset
y_predict = lr_tfidf.predict(X_vector)
y_prob = lr_tfidf.predict_proba(X_vector)[:, 1]
df_test['predict_prob'] = y_prob
df_test['target'] = y_predict

# Keep only the cleaned text and the predicted label for the final table.
final = df_test[['clean_text', 'target']].reset_index(drop=True)
print(final.head())

                                          clean_text  target
0  move save taxpayer billion pound year hit pens...       0
1  kremlin critic say sue sanction breach credit ...       0
2  beggar thy neighbour battle business benefit p...       0
3  eu ombudsman say decision expose weakness comm...       0
4  uk bank warn vulnerability economic effect mul...       0
