---
# Scrapping Twitter for Company Sentiment
---

## Steps for analyzing Twitter data to determine sentiment towards companies

* Scrape Tweets over N-period
* Add relevance of financial institutions
* Use LDA to extract relevant topics from the Tweets
* LSTM (deep learning) for sentiment extraction
* NER tagger (Stanford) used for enittyu recognition (Organization, Location, Names)

## Scrape Tweet Data
---

In [1]:
import pandas as pd
import warnings
import re
import numpy as np

# NTLK functions
import nltk
from nltk.corpus import stopwords
from nltk import tokenize as tok
from nltk.stem.snowball import SnowballStemmer # load nltk's SnowballStemmer as variabled 'stemmer'
import lda # topic modeling -NMF & LDA
import string
from nltk.tag import StanfordNERTagger

warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
# Tf-Idf and Clustering packages
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## Read in Tweet data

In [8]:
tweet_df_raw = pd.read_csv('../../data/data_all_tweets.csv')

In [9]:
search_terms = ['mortgage','current account','savings account','insurance','credit card','pension',
                'personal loan','money transfer','tax advice','investment','wealth management']

In [10]:
tweet_df_all.shape

(0, 0)

## Add List of Financial Institutions providing afformentioned products
---

In [12]:
tweet_df_raw = tweet_df_raw[tweet_df_raw['text'].notna()]

In [14]:
 print(tweet_df_raw.shape);tweet_df_raw.head()

(43704, 13)


Unnamed: 0,id,author_id,text,retweets,permalink,date,formatted_date,favorites,mentions,hashtags,geo,urls,search_term
0,1178457108276289536,40080176,This normalisation of no deal is horrendous. P...,0,https://twitter.com/KatarinaKeys/status/117845...,2019-09-29 23:50:43+00:00,Sun Sep 29 23:50:43 +0000 2019,0,,,,,mortgage
1,1178455823242035201,1126071201481334787,Jumbo Mortgage Program https:// conclud.com/ht...,0,https://twitter.com/Conclud2/status/1178455823...,2019-09-29 23:45:37+00:00,Sun Sep 29 23:45:37 +0000 2019,0,,,,https://conclud.com/https-www-madisonmortgageg...,mortgage
2,1178450126219685893,729387514914603009,If you have no work it's harder to feed your k...,0,https://twitter.com/cjhenrygonzo/status/117845...,2019-09-29 23:22:59+00:00,Sun Sep 29 23:22:59 +0000 2019,0,,,,,mortgage
3,1178446295985541120,1697126574,"Solution. ""You'll need to be: 18+ and a UK res...",0,https://twitter.com/blazedstorm/status/1178446...,2019-09-29 23:07:46+00:00,Sun Sep 29 23:07:46 +0000 2019,2,,,,,mortgage
4,1178446170722619393,1239955070,Kabaddi x3 UK Premier 1st show House Full Show...,0,https://twitter.com/habamoment/status/11784461...,2019-09-29 23:07:16+00:00,Sun Sep 29 23:07:16 +0000 2019,0,@Peepal,,,https://www.facebook.com/habteam/posts/1106547...,mortgage


In [24]:
 # create df object of companies looking to analyze
fin_firms = pd.read_csv('../../data/fin_firms.csv')
tweet_df_raw['text'] = tweet_df_raw['text'].str.lower()
tweet_df_raw['company']=''
fin_firms.head()

Unnamed: 0,sector,company_name
0,Banks,Barclays
1,Banks,Lloyds
2,Banks,HSBC
3,Banks,Citi Bank
4,Banks,Santander


In [25]:
 # locate instances of the firms being mentioned in the raw Twitter data
for comp in fin_firms['company_name'].unique():
    print(comp.lower())
    tweet_df_raw.loc[tweet_df_raw['text'].str.contains(comp.lower()),'company'] = comp

barclays
lloyds
hsbc
citi bank
santander
nationwide
allianz
western union
starling
axa
monzo
revolut
qbe


In [27]:
tweet_df_filtered = tweet_df_raw.merge(fin_firms, how='left', left_on='company', right_on='company_name')
tweet_df_filtered.head().T

Unnamed: 0,0,1,2,3,4
id,1178457108276289536,1178455823242035201,1178450126219685893,1178446295985541120,1178446170722619393
author_id,40080176,1126071201481334787,729387514914603009,1697126574,1239955070
text,this normalisation of no deal is horrendous. p...,jumbo mortgage program https:// conclud.com/ht...,if you have no work it's harder to feed your k...,"solution. ""you'll need to be: 18+ and a uk res...",kabaddi x3 uk premier 1st show house full show...
retweets,0,0,0,0,0
permalink,https://twitter.com/KatarinaKeys/status/117845...,https://twitter.com/Conclud2/status/1178455823...,https://twitter.com/cjhenrygonzo/status/117845...,https://twitter.com/blazedstorm/status/1178446...,https://twitter.com/habamoment/status/11784461...
date,2019-09-29 23:50:43+00:00,2019-09-29 23:45:37+00:00,2019-09-29 23:22:59+00:00,2019-09-29 23:07:46+00:00,2019-09-29 23:07:16+00:00
formatted_date,Sun Sep 29 23:50:43 +0000 2019,Sun Sep 29 23:45:37 +0000 2019,Sun Sep 29 23:22:59 +0000 2019,Sun Sep 29 23:07:46 +0000 2019,Sun Sep 29 23:07:16 +0000 2019
favorites,0,0,0,2,0
mentions,,,,,@Peepal
hashtags,,,,,


In [28]:
 # search terms that were used most commonly with the associated firms
 tweet_df_filtered.groupby('search_term')['id'].count()

search_term
credit card           2226
current account        254
insurance            10814
investment           15673
money transfer          69
mortgage              4704
pension               9347
peronal loan            20
savings account        182
tax advice             146
wealth management      269
Name: id, dtype: int64

## Topic Extraction: LDA Model
---

In [29]:
# remove extra values
# removing everthing that is not a regular expression
is_url = re.compile(r'http[s]?:// (?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', re.VERBOSE | re.IGNORECASE)
is_rt_username = re.compile(r'^RT+[\s]+(@[\w_]+:)',re.VERBOSE | re.IGNORECASE) #r'^RT+[\s]+(@[\w_]+:)'
# removing tags
is_entity = re.compile(r'@[\w_]+', re.VERBOSE | re.IGNORECASE)

# print topics
def print_topics(model, count_vectorizer, n_top_words):
    """
    Prints the topics of the twitter data
    """
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]])) 

# show top n keywords for each of the topics
def show_topics(vectorizer, lda_model, n_words=20):
    """
    Show the topcs and the most common keywords
    """
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))

    return topic_keywords
     
        
def clean_tweet(row):
    """
    Clean the tweets of urls, usernames, and excess words
    """
    row = is_url.sub("",row)
    row = is_rt_username.sub("",row)
    row = is_entity.sub("",row)

    return row

def tokenize_only(text):
    """
    Toeknize the tweets (Sentence -> words) and filter any numerical tokens
    """
    # tokenize by sentence,then word
    tokens = [word.lower() for sent in tok.sent_tokenize(text) for word in tok.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)

    return filtered_tokens

In [30]:
 # remove urls and retweets and entities from the text
tweet_df_filtered['text_clean'] = tweet_df_filtered['text'].apply(lambda row:clean_tweet(row))

# remove punctuations
RE_PUNCTUATION = '|'.join([re.escape(x) for x in string.punctuation])  
tweet_df_filtered['text_clean'] = tweet_df_filtered['text_clean'].str.replace(RE_PUNCTUATION, "")
tweet_df_filtered.head()

Unnamed: 0,id,author_id,text,retweets,permalink,date,formatted_date,favorites,mentions,hashtags,geo,urls,search_term,company,sector,company_name,text_clean
0,1178457108276289536,40080176,this normalisation of no deal is horrendous. p...,0,https://twitter.com/KatarinaKeys/status/117845...,2019-09-29 23:50:43+00:00,Sun Sep 29 23:50:43 +0000 2019,0,,,,,mortgage,,,,this normalisation of no deal is horrendous pe...
1,1178455823242035201,1126071201481334787,jumbo mortgage program https:// conclud.com/ht...,0,https://twitter.com/Conclud2/status/1178455823...,2019-09-29 23:45:37+00:00,Sun Sep 29 23:45:37 +0000 2019,0,,,,https://conclud.com/https-www-madisonmortgageg...,mortgage,,,,jumbo mortgage program https concludcomhttpsww...
2,1178450126219685893,729387514914603009,if you have no work it's harder to feed your k...,0,https://twitter.com/cjhenrygonzo/status/117845...,2019-09-29 23:22:59+00:00,Sun Sep 29 23:22:59 +0000 2019,0,,,,,mortgage,,,,if you have no work its harder to feed your ki...
3,1178446295985541120,1697126574,"solution. ""you'll need to be: 18+ and a uk res...",0,https://twitter.com/blazedstorm/status/1178446...,2019-09-29 23:07:46+00:00,Sun Sep 29 23:07:46 +0000 2019,2,,,,,mortgage,,,,solution youll need to be 18 and a uk resident...
4,1178446170722619393,1239955070,kabaddi x3 uk premier 1st show house full show...,0,https://twitter.com/habamoment/status/11784461...,2019-09-29 23:07:16+00:00,Sun Sep 29 23:07:16 +0000 2019,0,@Peepal,,,https://www.facebook.com/habteam/posts/1106547...,mortgage,,,,kabaddi x3 uk premier 1st show house full show...


In [34]:
# list of stopwords (used to remove stopwords)
stop_words = stopwords.words('english') #import stopwords from NLTK package
stop_words_df = pd.read_csv("../../data/pre_process_twitter_stop_words.csv", encoding='ISO-8859-1') # import stopwords from CSV file as pandas data frame
stop_words_df = stop_words_df.wordList.tolist() # convert pandas data frame to a list
stop_words_df.append('http')
stop_words_df.append('https')

# add in search terms as topic extraction is performed within each search topic, 
# we do not want the word or variation of the word captured as a topic word
search_terms_revised = ['mortgages','wealthmanagement','pensions','money','transfer']
stop_words_df.extend(search_terms)
stop_words_df.extend(search_terms_revised)

stop_list = stop_words + stop_words_df # combine two lists i.e. NLTK stop words and CSV stopwords
stop_list = list(set(stop_list)) # store only unique values 

In [37]:
# parameter for lda
number_topics = 5
number_words = 5

In [38]:
nltk.download('punkt')
tweets_all_topics = pd.DataFrame()
# term frequency modeling
for terms in tweet_df_comp['search_term'].unique():
    print(terms)
    tweets_search_topics  = tweet_df_filtered[tweet_df_filtered['search_term']==terms].reset_index(drop=True)
    corpus = tweets_search_topics['text_clean'].tolist()
    # print(corpus)
    tf_vectorizer = CountVectorizer(max_df=0.9, min_df=0.00, stop_words=stop_list, tokenizer=tokenize_only) # use tf (raw term      count) features for LDA.
    tf = tf_vectorizer.fit_transform(corpus)
    
    # create and fit the LDA model
    model = LDA(n_components=number_topics, n_jobs=-1)
    id_topic = model.fit(tf)
    # print the topics found by the LDA model
    print("Topics found via LDA:")
    topic_keywords = show_topics(vectorizer=tf_vectorizer, lda_model=model, n_words=number_words)        
    # topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
    df_topic_keywords = df_topic_keywords.reset_index()
    df_topic_keywords['topic_index'] = df_topic_keywords['index'].str.split(' ', n = 1, expand = True)[[1]].astype('int')
    print(df_topic_keywords)
    
    ############ get the dominat topic for each document in a data frame ###############
    # create document — Topic Matrix
    lda_output = model.transform(tf)
    # column names
    topicnames = ["Topic" + str(i) for i in range(model.n_components)]
    # index names
    docnames = ["Doc" + str(i) for i in range(len(corpus))]
    
    # make pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
    # get dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic   
    df_document_topic = df_document_topic.reset_index()
        
    # combine all the search terms into one data frame
    tweets_topics = tweets_search_topics.merge(df_document_topic, left_index=True, right_index=True, how='left')
    tweets_topics_words = tweets_topics.merge(df_topic_keywords, how='left', left_on='dominant_topic', right_on='topic_index')
    tweets_all_topics = tweets_all_topics.append(tweets_topics_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\seanm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
mortgage
Topics found via LDA:
     index   Word 0 Word 1  Word 2    Word 3  Word 4  topic_index
0  Topic 0  adviser   jobs    pass   trainee   cemap            0
1  Topic 1    house    pay  people       get   years            1
2  Topic 2      pay   need   least     years  people            2
3  Topic 3      get    buy     one  property    time            3
4  Topic 4      pay    get    rent    people    know            4
current account
  'stop_words.' % sorted(inconsistent))
Topics found via LDA:
     index   Word 0  Word 1  Word 2   Word 3   Word 4  topic_index
0  Topic 0       hi    help    bank   online  banking            0
1  Topic 1  deficit    card  switch  balance       im            1
2  Topic 2  service     set     new    china     make            2
3  Topic 3     also  switch     one    using    hello            3
4  Topic

In [39]:
tweets_all_topics = tweets_all_topics.reset_index(drop=True)
print(tweets_all_topics.shape)
tweets_all_topics.head()

(43704, 31)


Unnamed: 0,id,author_id,text,retweets,permalink,date,formatted_date,favorites,mentions,hashtags,...,Topic3,Topic4,dominant_topic,index_y,Word 0,Word 1,Word 2,Word 3,Word 4,topic_index
0,1178457108276289536,40080176,this normalisation of no deal is horrendous. p...,0,https://twitter.com/KatarinaKeys/status/117845...,2019-09-29 23:50:43+00:00,Sun Sep 29 23:50:43 +0000 2019,0,,,...,0.01,0.01,1,Topic 1,house,pay,people,get,years,1
1,1178455823242035201,1126071201481334787,jumbo mortgage program https:// conclud.com/ht...,0,https://twitter.com/Conclud2/status/1178455823...,2019-09-29 23:45:37+00:00,Sun Sep 29 23:45:37 +0000 2019,0,,,...,0.04,0.04,2,Topic 2,pay,need,least,years,people,2
2,1178450126219685893,729387514914603009,if you have no work it's harder to feed your k...,0,https://twitter.com/cjhenrygonzo/status/117845...,2019-09-29 23:22:59+00:00,Sun Sep 29 23:22:59 +0000 2019,0,,,...,0.01,0.01,1,Topic 1,house,pay,people,get,years,1
3,1178446295985541120,1697126574,"solution. ""you'll need to be: 18+ and a uk res...",0,https://twitter.com/blazedstorm/status/1178446...,2019-09-29 23:07:46+00:00,Sun Sep 29 23:07:46 +0000 2019,2,,,...,0.01,0.01,2,Topic 2,pay,need,least,years,people,2
4,1178446170722619393,1239955070,kabaddi x3 uk premier 1st show house full show...,0,https://twitter.com/habamoment/status/11784461...,2019-09-29 23:07:16+00:00,Sun Sep 29 23:07:16 +0000 2019,0,@Peepal,,...,0.02,0.02,1,Topic 1,house,pay,people,get,years,1


In [42]:
tweets_all_topics.to_csv('../../data/tweets_all_topics.csv', index=False)

## Sentiment Analysis with Deep Learning
---

I have trained the model usign the movie review data. The details of the training of the model can be found here: https://towardsdatascience.com/sentiment-analysis-for-text-with-deep-learning-2f0a0c6472b5

In [46]:
pip install keras

Collecting keras
  Using cached Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Installing collected packages: keras
Successfully installed keras-2.4.3
Note: you may need to restart the kernel to use updated packages.


In [75]:
import keras
import h5py
from keras.models import model_from_json
from tensorflow.keras.models import load_model
import json
from nltk.tokenize import RegexpTokenizer
import tensorflow as tf

In [77]:
# read in the weight of the trained model
weight_path = '../../data/model/dl_model.hdf5'

prd_model = load_model(weight_path)
prd_model.summary()
word_idx = json.load(open("../../data/model/word_idx.txt"))

AttributeError: 'list' object has no attribute 'items'

In [49]:
def get_sentiment_DL(prd_model, text_data, word_idx):

    # data = "Pass the salt"

    live_list = []
    batchSize = len(text_data)
    live_list_np = np.zeros((56,batchSize))
    for index, row in text_data.iterrows():
        # print (index)
        text_data_sample = text_data['text'][index]
        # split the sentence into its words and remove any punctuations.
        tokenizer = RegexpTokenizer(r'\w+')
        text_data_list = tokenizer.tokenize(text_data_sample)

        # text_data_list = text_data_sample.split()


        labels = np.array(['1','2','3','4','5','6','7','8','9','10'], dtype = "int")
        # word_idx['I']
        # get index for the live stage
        data_index = np.array([word_idx[word.lower()] if word.lower() in word_idx else 0 for word in text_data_list])
        data_index_np = np.array(data_index)

        # padded with zeros of length 56 i.e maximum length
        padded_array = np.zeros(56)
        padded_array[:data_index_np.shape[0]] = data_index_np[:56]
        data_index_np_pad = padded_array.astype(int)


        live_list.append(data_index_np_pad)

    live_list_np = np.asarray(live_list)
    score = prd_model.predict(live_list_np, batch_size=batchSize, verbose=0)
    single_score = np.round(np.dot(score, labels)/10,decimals=2)

    score_all  = []
    for each_score in score:

        top_3_index = np.argsort(each_score)[-3:]
        top_3_scores = each_score[top_3_index]
        top_3_weights = top_3_scores/np.sum(top_3_scores)
        single_score_dot = np.round(np.dot(top_3_index, top_3_weights)/10, decimals = 2)
        score_all.append(single_score_dot)

    text_data['Sentiment_Score'] = pd.DataFrame(score_all)

    return text_data

In [50]:
text_data =  tweets_all_topics
# deep Learning sentiment scoring
text_out = get_sentiment_DL(prd_model, text_data, word_idx)

NameError: name 'prd_model' is not defined

### Example negative tweets

In [23]:
text_out.sort_values(by='Sentiment_Score')[['text','Sentiment_Score']].head().T 

NameError: name 'text_out' is not defined

### Example positive tweets

In [24]:
text_out.sort_values(by='Sentiment_Score', ascending=False)[['text','Sentiment_Score']].head().T 

NameError: name 'text_out' is not defined

In [25]:
# save the output files
text_out.to_csv('../processed_data/tweets_topics_sentiment.csv', index=False)

NameError: name 'text_out' is not defined

## Named Entity Recognition
---

The below section is implementing a stanford 3 class NER tagger. The model is trained based on on supervised Conditional Random Field (CRF) model. Additional information on the model is available at https://nlp.stanford.edu/software/CRF-NER.html

In [None]:
def get_NER(text_data):
    # /Users/prajwalshreyas/Desktop/Singularity/dockerApps/ner-algo/stanford-ner-2015-01-30
    stanford_classifier = '../models/ner/english.all.3class.distsim.crf.ser.gz'
    stanford_ner_path = '../models/ner/stanford-ner.jar'

    # try:
        # Creating Tagger Object
    st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')
    # except Exception as e:
    #       print (e)

    # get keyword for the input data frame
    # keyword = tweetDataFrame.keyword.unique()
    # Subset column containing tweet text and convert to list
    # next insert a placeholder ' 12345678 ' to signify end of individual tweets

    # text_data = pd.read_json('/Users/prajwalshreyas/Desktop/Singularity/dockerApps/sentiment-algo/ app-sentiment-algo/sample_text.json')
    print ('start get_NER')
    text_out = text_data.copy()
    doc = [ docs + ' 12345678 ' for docs in list(text_data['text'])]
    # ------------------------- Stanford Named Entity Recognition
    tokens = nltk.word_tokenize(str(doc))
    entities = st.tag(tokens) # actual tagging takes place using Stanford NER algorithm


    entities = [list(elem) for elem in entities] # Convert list of tuples to list of list
    print ('tag complete')
    for idx,element in enumerate(entities):
        try:
            if entities[idx][0] == '12345678':
                entities[idx][1] = "DOC_NUMBER"  #  Modify data by adding the tag "Doc_Number"
            # elif entities[idx][0].lower() == keyword:
            # entities[idx][1] = "KEYWORD"
            # combine First and Last name into a single word
            elif entities[idx][1] == "PERSON" and entities[idx + 1][1] == "PERSON":
                entities[idx + 1][0] = entities[idx][0] + '-' + entities[idx+1][0]
                entities[idx][1] = 'Combined'
            # combine consecutive Organization names
            elif entities[idx][1] == 'ORGANIZATION' and entities[idx + 1][1] == 'ORGANIZATION':
                entities[idx + 1][0] = entities[idx][0] + '-' + entities[idx+1][0]
                entities[idx][1] = 'Combined'
        except IndexError:
            break
    print ('enumerate complete')
    # filter list of list for the words we are interested in
    filter_list = ['DOC_NUMBER','PERSON','LOCATION','ORGANIZATION']
    entityWordList = [element for element in entities if any(i in element for i in filter_list)]

    entityString = ' '.join(str(word) for insideList in entityWordList for word in insideList) 
    # convert list to string and concatenate it
    entitySubString = entityString.split("DOC_NUMBER") # split the string using the separator 'TWEET_NUMBER'
    del entitySubString[-1] # delete the extra blank row created in the previous step

    # store the classified NERs in the main tweet data frame
    for idx,docNER in enumerate(entitySubString):
        docNER = docNER.strip().split() # split the string into word list
        # filter for words tagged as Organization and store it in data frame
        text_out.loc[idx,'Organization'] =  ','.join([docNER[i-1]  for i,x in enumerate(docNER) if x
        == 'ORGANIZATION'])
        # filter for words tagged as LOCATION and store it in data frame
        text_out.loc[idx,'Place'] = ','.join([docNER[i-1] for i,x in enumerate(docNER) if x ==
        'LOCATION'])
        # filter for words tagged as PERSON and store it in data frame
        text_out.loc[idx,'Person'] = ','.join([docNER[i-1]  for i,x in enumerate(docNER) if x ==   
        'PERSON'])   

    print ('process complete')
    return text_out

In [26]:
text_ner_out = get_NER(text_out)

NameError: name 'get_NER' is not defined

In [27]:
# the outputs of the ner tagger
text_ner_out.loc[(text_ner_out['Place'] != '') | (text_ner_out['Organization'] != '')|(text_ner_out['Person'] != '')][['text','Organization','Place','Person']].head()

NameError: name 'text_ner_out' is not defined

In [28]:
text_ner_out.to_csv('../processed_data/tweets_topics_sentiment_ner.csv', index=False)

NameError: name 'text_ner_out' is not defined