In [1]:
import sys
import os
import pandas as pd
import seaborn as sns
from database import database
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis
from pprint import pprint
import gensim.corpora as corpora
import re
from nltk.stem import WordNetLemmatizer 
import gensim
from gensim.utils import simple_preprocess
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.corpus import stopwords

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
# Risk-event keywords and phrases, kept exactly as authored (mixed case included).
# Line breaks inside the brackets need no backslash continuation.
keywords = [
    "Ownership change", "Change of control", "Acceleration", "accelerate", "Default",
    "Insolvency", "Insolvent", "Delay", "Late", "Failure", "fail", "Dispute", "Liquidation",
    "Liquidator", "Margin call", "Haircut", "Bank run", "Termination", "Moratorium", "Suspension",
    "Suspend", "Fraud", "misrepresentation", "Fine", "sanction", "Breach", "Reschedule", "Restructuring",
    "Restructure", "Credit event", "Losses", "Loss", "Bailout", "Bailin", "Bankrupt", "Receivership",
    "Receiver", "Judicial Management", "Judicial Manager", "Administration", "Administrator", "Sequestrate",
    "Sequestration", "Support", "Capital call", "Liquidity event", "Negative trends", "Price changes",
    "Board infighting", "Corruption", "Inappropriate or ultra vires dealings", "Negative working capital",
    "Acquisition", "LBO", "Qualified audit opinion", "Regulatory breach", "Non-performing assets",
    "Provisions", "Force majeur", "Distress", "Frozen", "Delisted", "Sued", "Suit", "Arrested",
    "Disappeared", "Uncontactable",
]

In [3]:
class LDAModel:
    '''Empty placeholder class: it declares no attributes or methods.'''

In [4]:
def lemmatize (sentence):
    '''
    Lemmatize every token of `sentence` and return them joined by single spaces.

    The text is split with `nltk.word_tokenize`; each token is then passed to
    `WordNetLemmatizer.lemmatize` with its default arguments.
    '''
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in nltk.word_tokenize(sentence):
        lemmas.append(wordnet.lemmatize(token))
    return ' '.join(lemmas)

def sent_to_words(sentences):
    '''
    Lazily tokenize each item of `sentences` with gensim's `simple_preprocess`.

    Each item is coerced with `str` first. `deacc=True` is forwarded, which
    per the gensim documentation strips accent marks from the tokens.
    Yields one list of tokens per input item.
    '''
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

In [5]:
def preprocess_dataframe (news_df, counterparty: str, sample_size: int = 100):
    '''
    Turn raw news rows into filtered token lists for one counterparty.

    Steps:
      1. drop the columns that are not used here
      2. draw a random sample of at most `sample_size` rows from the whole frame
      3. keep only the sampled rows whose "counterparty" equals `counterparty`
      4. strip the characters , . ! ? from headline and summary and lowercase them
      5. join headline and summary with a space and tokenize via `sent_to_words`
      6. drop English stopwords and words of four characters or fewer

    NOTE: a later cell of this notebook defines another function with the same
    name and a different return value; whichever cell ran last wins.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Needs the columns "headline", "summary" and "counterparty".
    counterparty : str
        Value matched against the "counterparty" column.
    sample_size : int, default 100
        Upper bound on the number of rows sampled *before* the counterparty
        filter is applied, so the result can hold far fewer articles.

    Returns
    -------
    list[list[str]]
        One token list per retained article (empty list when `news_df` is empty).
    '''

    if news_df.empty:
        return []

    # errors='ignore': these columns are only dropped to slim the frame, so a
    # collection that lacks one of them should not abort preprocessing.
    news_df = news_df.drop(columns=['_id', 'url', 'image', 'source', 'api'], errors='ignore')

    # Cap the sample size: DataFrame.sample raises ValueError when asked for
    # more rows than exist (sampling is without replacement).
    news_df = news_df.sample(min(sample_size, len(news_df))).reset_index(drop=True)

    counterparty_rows = news_df[news_df["counterparty"] == counterparty]

    def _clean(column):
        # Missing text becomes "" so the string operations cannot hit a float NaN.
        text = counterparty_rows[column].fillna('').astype(str)
        return text.str.replace(r'[,\.!?]', '', regex=True).str.lower()

    # The explicit space keeps the last headline word and the first summary
    # word from fusing into a single bogus token.
    counterparty_news = (_clean("headline") + " " + _clean("summary")).tolist()

    # A set gives O(1) membership tests; the NLTK list would be scanned per word.
    stop_words = set(stopwords.words('english'))

    # sent_to_words already yields clean lowercase tokens, so they are filtered
    # directly instead of being stringified and tokenized a second time.
    tokenized_counterparty_news = [
        [word for word in tokens if word not in stop_words and len(word) > 4]
        for tokens in sent_to_words(counterparty_news)
    ]

    return tokenized_counterparty_news
    
    
def get_news_dataframe (counterparty_name: str):
    '''
    Load every document of `database.news_collection` into a DataFrame and
    hand it to `preprocess_dataframe` for the given counterparty.

    Returns whatever `preprocess_dataframe` returns.
    '''
    all_docs = list(database.news_collection.find({}))
    return preprocess_dataframe(pd.DataFrame(all_docs), counterparty_name)

In [7]:
# tesla_news = get_news_dataframe('TSLA')

In [8]:
def lda_model (counterparty: str, counterparty_news, num_topics, random_state=None):
    '''
    Fit a gensim LDA model on tokenized articles and pickle its pyLDAvis data.

    Side effects: pretty-prints the topics, switches pyLDAvis into notebook
    mode, and writes the prepared visualisation to
    './lda_results/ldavis_<num_topics><counterparty>' (the directory is
    created when it does not exist yet).

    Parameters
    ----------
    counterparty : str
        Used only to build the output file name.
    counterparty_news : list[list[str]]
        One token list per article.
    num_topics : int
        Number of topics to fit.
    random_state : optional
        Forwarded to `LdaMulticore`; pass a fixed value to make runs more
        repeatable. The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    gensim.models.LdaMulticore
        The fitted model.

    Raises
    ------
    ValueError
        If the articles contain no tokens at all.
    '''
    id2word = corpora.Dictionary(counterparty_news)  # token -> integer id
    corpus = [id2word.doc2bow(article) for article in counterparty_news]  # bag-of-words per article

    # Fail early with a message that names the counterparty instead of
    # letting the model constructor choke on an empty vocabulary.
    if len(id2word) == 0:
        raise ValueError(
            "No tokens available to fit an LDA model for counterparty {!r}".format(counterparty))

    # Named `model`, not `lda_model`, so the local does not shadow this function.
    model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=random_state)

    pprint(model.print_topics())  # show the top keywords of every topic

    # Will point to the database in future; for now results go to a local folder.
    output_dir = './lda_results'
    os.makedirs(output_dir, exist_ok=True)  # open() below would fail on a missing folder
    LDAvis_data_filepath = os.path.join(output_dir, 'ldavis_' + str(num_topics) + counterparty)

    pyLDAvis.enable_notebook()
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(model, corpus, id2word, mds='mmds')
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

    return model

In [9]:
# lda_model ('TSLA', tesla_news, 5)

## Advanced Techniques

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split

In [11]:
def tokenize (articles):
    '''
    Tokenize `articles` with `nltk.word_tokenize`, keep the tokens longer than
    four characters, and return their WordNet lemmas as a list.
    '''
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(articles)
        if len(token) > 4
    ]

def preprocess_dataframe (news_df, counterparty_name: str):
    '''
    Clean the news rows of one counterparty and split them into train/test frames.

    Steps:
      1. drop the columns that are not used here
      2. keep only the rows whose "counterparty" equals `counterparty_name`
      3. strip the characters , . ! ? from headline and summary and lowercase them
      4. split with `train_test_split(test_size=0.7, random_state=4201)`, i.e.
         roughly 30% of the rows go to train and 70% to test

    No tokenization or stopword removal happens here.

    NOTE: this redefines the `preprocess_dataframe` from an earlier cell of the
    notebook, which returns token lists instead of a pair of frames.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Needs the columns "headline", "summary" and "counterparty".
    counterparty_name : str
        Value matched against the "counterparty" column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The train and test frames; their index labels are the row positions
        in the full (unfiltered) frame.

    Raises
    ------
    ValueError
        Propagated from `train_test_split` when there are too few matching
        rows to leave a non-empty training set.
    '''

    # errors='ignore': the columns are only dropped to slim the frame, so a
    # collection that lacks one of them should not abort preprocessing.
    news_df = news_df.drop(columns=['_id', 'url', 'image', 'source', 'api'], errors='ignore')
    news_df = news_df.reset_index(drop=True)

    # Filter first so only the relevant rows are cleaned, and so a malformed
    # article of another counterparty cannot break this one.
    counterparty_df = news_df[news_df["counterparty"] == counterparty_name]

    def _clean(column):
        # Missing text becomes "" so the string operations cannot hit a float NaN.
        text = counterparty_df[column].fillna('').astype(str)
        return text.str.replace(r'[,\.!?]', '', regex=True).str.lower()

    # assign() builds a new frame rather than writing into a filtered slice.
    counterparty_df = counterparty_df.assign(headline=_clean("headline"),
                                             summary=_clean("summary"))

    X_train, X_test = train_test_split(counterparty_df, test_size=0.7, random_state=4201)

    return X_train, X_test

def get_news_dataframe (counterparty_name: str):
    '''
    Load every document of `database.news_collection` into a DataFrame and
    return the (train, test) pair produced by `preprocess_dataframe` for the
    given counterparty.
    '''
    all_docs = list(database.news_collection.find({}))
    train_and_test = preprocess_dataframe(pd.DataFrame(all_docs), counterparty_name)
    train_news_df, test_news_df = train_and_test
    return train_news_df, test_news_df

In [12]:
def _doc_topic_frame (doc_topic_weights, n_topics, n_docs):
    '''Build a table of rounded document-topic weights plus a 'dominant_topic' column.'''
    weights = np.asarray(doc_topic_weights)
    colnames = ["Topic" + str(i) for i in range(n_topics)]
    docnames = ["Doc" + str(i) for i in range(n_docs)]
    frame = pd.DataFrame(np.round(weights, 2), columns=colnames, index=docnames)
    # Take the argmax on the unrounded weights: after rounding to two decimals
    # near-ties such as 0.496 / 0.504 both read 0.50 and argmax would return
    # the first column instead of the genuinely larger one.
    frame['dominant_topic'] = np.argmax(weights, axis=1)
    return frame


def topic_modelled (lda, train_news_df, test_news_df, vectorizer_tf, W1, H1):
    '''
    Build per-document topic tables for the train and the test articles.

    Parameters
    ----------
    lda :
        Fitted model exposing `n_components` and `transform`.
    train_news_df, test_news_df : pandas.DataFrame
        Article frames; "headline" and "summary" of the test frame are read.
    vectorizer_tf :
        Fitted vectorizer exposing `transform`.
    W1 : array-like
        Document-topic weights of the training articles, one row per article.
    H1 :
        Unused; accepted only so existing callers keep working.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test tables. Rows are "Doc<i>", columns are "Topic<j>"
        (weights rounded to two decimals) followed by "dominant_topic", the
        index of the topic with the largest weight.
    '''

    df_doc_topic_train = _doc_topic_frame(W1, lda.n_components, len(train_news_df))

    # Headline and summary are concatenated without a separator on purpose:
    # the input must be built the same way as the text the vectorizer was
    # fitted on in advanced_lda_model.
    test_text = test_news_df.headline + test_news_df.summary
    WHold = lda.transform(vectorizer_tf.transform(test_text))

    df_doc_topic_test = _doc_topic_frame(WHold, lda.n_components, len(test_news_df))

    return df_doc_topic_train, df_doc_topic_test

In [13]:
def advanced_lda_model (counterparty_name: str, num_topic_words = 10, num_topics = 10):
    '''
    Fit a scikit-learn LDA topic model on one counterparty's training articles.

    Parameters
    ----------
    counterparty_name : str
        Passed to `get_news_dataframe` to obtain the train/test frames.
    num_topic_words : int, default 10
        How many top-weighted words describe each topic.
    num_topics : int, default 10
        Number of LDA components.

    Returns
    -------
    tuple
        (topics, df_doc_topic_train, df_doc_topic_test, train_news_df, test_news_df)
        where `topics` is a list with one space-joined string of top words per
        topic and the two `df_doc_topic_*` frames come from `topic_modelled`.
    '''
    train_news_df, test_news_df = get_news_dataframe(counterparty_name)

    # use_idf=False together with norm=None switches off idf re-weighting and
    # vector normalisation in the TfidfVectorizer.
    vectorizer_tf = TfidfVectorizer(tokenizer=tokenize, stop_words='english',
                                    max_features=1000, use_idf=False, norm=None)

    # NOTE(review): headline and summary are joined without a separator, so the
    # last headline word fuses with the first summary word. topic_modelled
    # builds the test text the same way; change both together or not at all.
    tf_vectors = vectorizer_tf.fit_transform(train_news_df.headline + train_news_df.summary)

    lda = decomposition.LatentDirichletAllocation(n_components=num_topics,
                                                  max_iter=3, learning_method='online',
                                                  learning_offset=50, n_jobs=-1,
                                                  random_state=4201)

    W1 = lda.fit_transform(tf_vectors)  # document-topic weights
    H1 = lda.components_                # topic-word weights

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer_tf, "get_feature_names_out"):
        feature_names = vectorizer_tf.get_feature_names_out()
    else:
        feature_names = vectorizer_tf.get_feature_names()
    vocab = np.array(feature_names)

    def top_words(topic_weights):
        # Indices of the `num_topic_words` largest weights, heaviest first.
        ranked = np.argsort(topic_weights)[::-1][:num_topic_words]
        return [vocab[i] for i in ranked]

    topics = [' '.join(top_words(topic_weights)) for topic_weights in H1]

    df_doc_topic_train, df_doc_topic_test = topic_modelled(
        lda, train_news_df, test_news_df, vectorizer_tf, W1, H1)

    return topics, df_doc_topic_train, df_doc_topic_test, train_news_df, test_news_df

In [14]:
def define_topic_per_article (counterparty_name: str, num_topic_words = 10, num_topics = 10):
    '''
    Label every article of a counterparty with its dominant topic.

    Runs `advanced_lda_model`, attaches the 'dominant_topic' of each train and
    test article to its row, and stacks the test rows followed by the train
    rows into one frame with a fresh 0..n-1 index.

    Parameters
    ----------
    counterparty_name : str
    num_topic_words : int, default 10
    num_topics : int, default 10
        All three are forwarded unchanged to `advanced_lda_model`.

    Returns
    -------
    (list[str], pandas.DataFrame)
        The topic strings and the combined article frame.
    '''
    topics, df_doc_topic_train, df_doc_topic_test, train_news_df, test_news_df = advanced_lda_model(
        counterparty_name, num_topic_words, num_topics)

    # assign() returns new frames, so the split frames are not mutated in place
    # and pandas has no reason to emit SettingWithCopyWarning. list() discards
    # the "Doc<i>" index so the values are attached purely by position.
    test_labelled = test_news_df.assign(dominant_topic=list(df_doc_topic_test['dominant_topic']))
    train_labelled = train_news_df.assign(dominant_topic=list(df_doc_topic_train['dominant_topic']))

    # ignore_index=True renumbers the stacked rows, which makes separate
    # reset_index calls on the parts unnecessary.
    counterparty_news = pd.concat([test_labelled, train_labelled], ignore_index=True)

    return topics, counterparty_news

In [None]:
# Run the topic pipeline for counterparty 'TSLA' with num_topic_words=8 and num_topics=8.
topics, counterparty_news = define_topic_per_article('TSLA', 8, 8)

In [None]:
# Display the list of topic strings (one space-joined string of top words per topic).
topics

In [None]:
# Display the combined article frame with its 'dominant_topic' column.
# NOTE(review): this renders the whole frame; consider .head() to keep the output small.
counterparty_news

In [None]:
def graph():
    '''
    Plot how many 'TSLA' articles fall under each dominant topic.

    Runs `define_topic_per_article('TSLA', 8, 8)`, counts the articles per
    topic index and draws the counts as a line plot on a 20x3 inch figure.
    Returns None; the figure is shown via `plt.show()`.
    '''
    topics, counterparty_news = define_topic_per_article('TSLA', 8, 8)
    num_topics = len(topics)

    dominant_topic = counterparty_news["dominant_topic"]

    # Count per topic index. Topics that no article maps to are kept with a
    # count of 0 so the x axis always covers every topic.
    topic_ids = list(range(num_topics))
    article_counts = [int((dominant_topic == topic_id).sum()) for topic_id in topic_ids]

    # Size this figure only, instead of changing the global rcParams default
    # for every later plot in the notebook.
    plt.figure(figsize=(20, 3))
    plt.plot(topic_ids, article_counts)
    plt.xticks(rotation='vertical')
    plt.xlabel("Dominant topic")
    plt.ylabel("Number of articles")
    plt.show()