In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import re

from nltk.stem.snowball import SnowballStemmer
import nltk
from gensim.parsing.preprocessing import STOPWORDS
import gensim

# Shared English Snowball stemmer; toword() applies it when called with stem=True.
stemmer = SnowballStemmer("english")

In [2]:
def unique(x):
    """Count how often each element of ``x`` occurs.

    Returns a list of ``(element, count)`` tuples ordered from the most to the
    least frequent element. Elements with equal counts keep the order in which
    they first appeared in ``x``.
    """
    counts = {}
    for element in x:
        if element in counts:
            counts[element] += 1
        else:
            counts[element] = 1
    pairs = list(counts.items())
    # A stable descending sort keeps first-appearance order among ties.
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

def find_kol(data,rank):
    """Return the usernames of the ``rank`` most frequently occurring users.

    Parameters
    ----------
    data : iterable of dict
        Records providing a ``'user_id'`` key; the first record of every
        returned user must also provide ``'username'``.
    rank : int
        Upper bound on the number of users returned (applied as a slice).

    Returns
    -------
    list
        Usernames ordered from the most to the least frequent user id. Ids
        with equal counts keep first-appearance order. The name reported for
        an id is the ``'username'`` of the first record carrying that id.
    """
    post_count={}
    first_record={}
    # Single pass: count records per id and remember each id's first record,
    # so no rescan of `data` is needed when resolving names.
    for record in data:
        uid=record['user_id']
        post_count[uid]=post_count.get(uid,0)+1
        if uid not in first_record:
            first_record[uid]=record
    # sorted() is stable, so ties stay in first-appearance order.
    ranked=sorted(post_count.items(),key=lambda item: -item[1])
    return [first_record[uid]['username'] for uid,_ in ranked[:rank]]

def unique_time(data):
    """Count records per timestamp prefix.

    The first ten characters of each record's ``'timestamp'`` are used as the
    grouping key (presumably a ``YYYY-MM-DD`` date; confirm against the data).
    Returns a list of ``(key, count)`` tuples sorted ascending by key.
    """
    day_keys=[record['timestamp'][:10] for record in data]
    per_day=unique(day_keys)
    per_day.sort(key=lambda pair: pair[0])
    return per_day

def exforeign(data):
    """Keep only records whose leading HTML tag is marked ``lang="en"``.

    For each record, only the part of ``'text_html'`` before the first ``'>'``
    is inspected. Matching records are returned in a new list, in their
    original order; the records themselves are not copied.
    """
    def _is_english(record):
        html=record['text_html']
        # str.find returns -1 when there is no '>', so the slice then drops
        # just the final character; this mirrors the established behaviour.
        return 'lang="en"' in html[:html.find('>')]

    return [record for record in data if _is_english(record)]

def duplicate(data):
    """Drop records whose ``'text'`` repeats that of an earlier record.

    Parameters
    ----------
    data : iterable of dict
        Records with a hashable ``'text'`` value (a string in this notebook).

    Returns
    -------
    list
        The first record for every distinct text, in original order. The
        records are the same objects as in ``data`` (no copies are made).
    """
    seen=set()  # set membership is O(1); a list made the whole pass quadratic
    result=[]
    for record in data:
        text=record['text']
        if text not in seen:
            seen.add(text)
            result.append(record)
    return result

def remove_link(data):
    """Strip URLs from each record's ``'text'`` in place.

    Only texts containing a ``'/'`` are processed. Two patterns are removed:
    anything starting with ``http``/``https`` up to the next whitespace, and
    scheme-less tokens of the form ``<something>.com<something>`` such as
    ``pic.twitter.com/abc``.

    The records in ``data`` are modified directly; ``data`` itself is returned
    for call chaining.
    """
    for record in data:
        text=record['text']
        if '/' in text:
            text=re.sub(r"(https|http)\S+", "", text)
            # The dot must be escaped: an unescaped '.' matches any character,
            # which deleted ordinary words containing "com" ("recommend",
            # "becomes", "outcome/...") instead of only ".com" links.
            text=re.sub(r"\S+\.com\S+", "", text)
            record['text']=text
    return data

def remove_sign(data):
    """Strip hashtags, non-ASCII characters, HTML tags and punctuation in place.

    For each record's ``'text'`` the following steps run in order:
    hashtags (``#`` plus the following non-space run) are removed, runs of
    non-ASCII characters (emoji etc.) are removed, ``<...>`` tags are removed,
    repeated spaces are collapsed, and finally every character of
    ``string.punctuation`` is deleted.

    The records in ``data`` are modified directly; ``data`` itself is returned
    for call chaining.

    NOTE(review): user mentions are not removed here; only the ``@`` sign is
    stripped as punctuation, so the handle text stays in the result.
    """
    from string import punctuation
    # re.escape is required inside the character class: unescaped, the
    # backslash in string.punctuation escapes the following ']' and so the
    # backslash itself was never stripped. Compiled once, outside the loop.
    punct_pattern=re.compile('[{}]'.format(re.escape(punctuation)))
    for record in data:
        # hash tag
        text=re.sub(r'#\S+', '', record['text'])
        # emoji / any other non-ASCII character
        text=re.sub(r'[^\x00-\x7F]+', '', text)
        # html tags
        text=re.sub(r'<.*?>', '', text)
        # extra spaces
        text=re.sub(r' +', ' ', text)
        # punctuation
        text=punct_pattern.sub('', text)
        record['text']=text
    return data
        
def toword(data,stem=False):
    """Tokenise each record's ``'text'`` into lower-cased, filtered words.

    Parameters
    ----------
    data : iterable of dict
        Records with a string ``'text'`` value.
    stem : bool, default False
        When true, every kept token is passed through the module-level
        Snowball ``stemmer`` before being lower-cased.

    Returns
    -------
    list of list of str
        One token list per record, in input order. A token is kept only if it
        is alphanumeric and neither its plain lower-cased form nor its
        (optionally stemmed) output form is in gensim's ``STOPWORDS``.
    """
    tokenizer=nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
    normalise=stemmer.stem if stem else (lambda token: token)
    result=[]
    for record in data:
        tokens=[]
        for raw in tokenizer.tokenize(record['text']):
            if not raw.isalnum():
                continue
            lowered=raw.lower()
            # Stem once per token instead of twice.
            word=normalise(raw).lower()
            # Check the unstemmed form as well: testing only the stem let
            # stop words through whenever stemming changed their spelling
            # (e.g. "very" -> "veri", "because" -> "becaus").
            if lowered in STOPWORDS or word in STOPWORDS:
                continue
            tokens.append(word)
        result.append(tokens)
    return result

def LDA(texts,topics=20,num_words=15,passes=20,random_state=None):
    """Fit a gensim LDA model on tokenised documents and print a summary.

    Parameters
    ----------
    texts : list of list of str
        Tokenised documents.
    topics : int, default 20
        Number of topics to fit.
    num_words : int, default 15
        Number of words shown per topic in the printed summary.
    passes : int, default 20
        Number of training passes over the corpus.
    random_state : int or None, default None
        Forwarded to ``LdaModel``; pass an integer for repeatable topics.

    Prints every fitted topic, followed by the topic distribution of the first
    document sorted by descending probability. Nothing is returned.
    """
    from gensim import corpora
    from gensim.models.ldamodel import LdaModel
    import pprint
    from operator import itemgetter
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(corpus,
              id2word=dictionary,
              num_topics=topics,
              passes=passes,
              random_state=random_state)
    pp = pprint.PrettyPrinter(indent=4)
    # print_topics shows at most 20 topics unless told otherwise, so pass the
    # fitted topic count explicitly to list all of them.
    pp.pprint(lda.print_topics(num_topics=topics,num_words=num_words))
    first_doc_topics = lda.get_document_topics(corpus[0],minimum_probability=0,per_word_topics=False)
    pp.pprint(sorted(first_doc_topics,key=itemgetter(1),reverse=True))

In [3]:
# Load the monthly JSON files for months 1 to 6 of 2017 into one list.
data=[]
for j in range(1,7):
    # Forward slashes work on every platform and avoid the invalid "\w"/"\W"
    # escape sequences of the former back-slashed literal.
    path='./wm/Wealth_Management{}-{:02d}.json'.format(2017,j)
    try:
        with open(path) as fh:  # the context manager closes the handle
            temp=json.load(fh)
    except FileNotFoundError:
        # Stop at the first missing month; any other error (for example
        # malformed JSON) is allowed to surface instead of silently
        # truncating the data set.
        break
    print(2017,j,len(temp))
    data+=temp

2017 1 8252
2017 2 8047
2017 3 8068
2017 4 8070
2017 5 8255
2017 6 8682


In [4]:
# Total number of records loaded from all monthly files.
len(data)

49374

In [5]:
import copy
# Keep only records tagged lang="en", then drop records with repeated text.
data=exforeign(data)
data=duplicate(data)
# remove_link/remove_sign overwrite each record's 'text' in place, so snapshot
# the records first; data0 keeps the pre-cleaning text for later comparison.
# NOTE(review): re-running this cell without reloading `data` makes data0 a
# copy of already-cleaned text.
data0=copy.deepcopy(data)
data=remove_link(data)
data=remove_sign(data)
#data=duplicate(data)

In [6]:
# Disabled experiment kept as a bare string literal (it is never executed);
# `datedist` is not defined in any cell shown here.
''' 
import statsmodels.api as sm
a=np.array([i[1] for i in datedist])
#plt.plot(np.diff(a))
plt.bar(range(41),sm.tsa.stattools.pacf(a))
'''

' \nimport statsmodels.api as sm\na=np.array([i[1] for i in datedist])\n#plt.plot(np.diff(a))\nplt.bar(range(41),sm.tsa.stattools.pacf(a))\n'

In [7]:
# Cleaned list vs. pre-cleaning snapshot: the link/sign removal steps rewrite
# text but never drop records, so both lengths should match.
print(len(data),len(data0))

40991 40991


In [8]:
# Token lists per record: `words` keeps the surface forms, `nstem` holds the
# Snowball-stemmed forms of the same texts.
words=toword(data)
nstem=toword(data,stem=True)

In [9]:
# Side-by-side preview of the first records: original text, cleaned text,
# stemmed tokens and unstemmed tokens. Capped at the number of available
# records so a small data set does not raise IndexError.
for i in range(min(100,len(data0))):
    print(data0[i]['text'])
    print(data[i]['text'])
    print(nstem[i])
    print(words[i])
    print()

High growth doesn't mean higher stock returnshttps://wi.st/2i50e76 #Videoblog #WealthManagement #CapitalAtRisk
High growth doesnt mean higher stock returns 
['high', 'growth', 'doesnt', 'mean', 'higher', 'stock', 'return']
['high', 'growth', 'doesnt', 'mean', 'higher', 'stock', 'returns']

Future ready: Seismic moves for digital wealth management via @finplan by @FinTechie http://ow.ly/z4Hr307lzjd pic.twitter.com/wpAhD7Xolm
Future ready Seismic moves for digital wealth management via finplan by FinTechie 
['futur', 'readi', 'seismic', 'digit', 'wealth', 'manag', 'finplan', 'fintechi']
['future', 'ready', 'seismic', 'moves', 'digital', 'wealth', 'management', 'finplan', 'fintechie']

Northland Wealth Management up for Four Family Wealth Report "made in Canada" https://hedgeaccordingly.com/2016/12/northland-wealth-management-up-for-four-family-wealth-report-made-in-canada-awards.html … #canada #housing #portfoliomanagers
Northland Wealth Management up for Four Family Wealth Report made i

### Aspect-Term Extraction

In [10]:
# Re-join the stemmed tokens of each record into one space-separated string.
# NOTE(review): these are stems, not surface words, so downstream tagging and
# embedding lookups see forms such as 'servic'; confirm this is intended.
texts=[' '.join(i) for i in nstem]

In [11]:
import spacy
# Small English spaCy pipeline; its tags, shapes and lemmas are used below.
nlp = spacy.load("en_core_web_sm")

In [12]:
# Run the pipeline on the first text to inspect what spaCy produces.
temp=nlp(texts[0])
temp

high growth doesnt mean higher stock return

In [13]:
# Indexing the parsed document yields individual token objects.
type(temp[0])

spacy.tokens.token.Token

In [14]:
# Show the fine-grained POS tag and the orthographic shape of every token;
# word_extraction() filters on exactly these two attributes.
for i in temp:
    print(i.tag_,i.shape_)

JJ xxxx
NN xxxx
VBZ xxxx
RB xx
VB xxxx
JJR xxxx
NN xxxx
NN xxxx


In [15]:
def word_extraction(texts):
    """Collect the lemmas of all singular-noun tokens found in ``texts``.

    Parameters
    ----------
    texts : iterable of str
        Documents to run through the module-level spaCy pipeline ``nlp``.

    Returns
    -------
    list of str
        ``lemma_`` of every token tagged ``'NN'`` whose shape is not ``'x'``,
        ``'xx'`` or ``'xxx'`` (this skips tokens of one to three lower-case
        letters). Duplicates are kept; order follows the input.
    """
    short_shapes=('x','xx','xxx')
    extracted_words = []
    # nlp.pipe batches the documents through the pipeline, which is
    # considerably faster than calling nlp() once per text.
    for doc in nlp.pipe(texts):
        for token in doc:
            if token.tag_=='NN' and token.shape_ not in short_shapes:
                extracted_words.append(token.lemma_)
    return extracted_words
        
# Candidate aspect terms: noun lemmas from every text (duplicates included).
extract_words=word_extraction(texts)

In [16]:
# Load pretrained word vectors stored in binary word2vec format from a path
# relative to the notebook's working directory.
model = gensim.models.KeyedVectors.load_word2vec_format('./pretrained/GoogleNews-vectors-negative300.bin', binary=True)

In [17]:
def word2vec(words):
    """Look up embedding vectors for the distinct in-vocabulary ``words``.

    Parameters
    ----------
    words : iterable of str
        Candidate terms; duplicates and terms missing from the module-level
        ``model`` vocabulary (e.g. misspellings) are dropped.

    Returns
    -------
    tuple (list, list)
        ``(vector_of_terms, filtered_terms)``: the vector of each kept term
        and the kept terms themselves, index-aligned and in first-appearance
        order.
    """
    # dict.fromkeys de-duplicates while preserving order, so results are
    # repeatable across runs (a set's iteration order is not). The `in model`
    # membership test works on both gensim 3.x and 4.x, whereas the `.vocab`
    # attribute was removed in 4.0.
    filtered_terms=[term for term in dict.fromkeys(words) if term in model]
    vector_of_terms=[model.get_vector(term) for term in filtered_terms]
    return vector_of_terms,filtered_terms

In [18]:
# Seed aspect words; each one is looked up in the embedding model and used as
# the query for its nearest extracted terms.
aspect=['price','service','selfservice','product','digital','robot','robo']

In [19]:
def topic_words(aspect,top_n=100):
    """Find, for each aspect word, the extracted terms closest in embedding space.

    Uses the module-level ``extract_words`` (candidate terms), ``word2vec``
    (vector lookup) and ``model`` (embedding of the aspect words).

    Parameters
    ----------
    aspect : sequence of str
        Aspect words; each must be present in ``model``.
    top_n : int, default 100
        Maximum number of nearest terms returned per aspect.

    Returns
    -------
    list of list of str
        One list per aspect word, ordered by increasing cosine distance.
    """
    vectors,terms=word2vec(extract_words)
    if not terms:
        # No candidate term has a vector: nothing to rank.
        return [[] for _ in aspect]
    # Build the term matrix and its row norms once; they do not depend on the
    # aspect being scored.
    term_matrix=np.asarray(vectors)
    term_norms=np.linalg.norm(term_matrix,axis=1)
    result=[]
    for word in aspect:
        aspect_vec=np.asarray(model.get_vector(word))
        # Cosine distance between every term and the aspect vector.
        distance=1-term_matrix@aspect_vec/np.linalg.norm(aspect_vec)/term_norms
        nearest=np.argsort(distance)[:top_n]
        result.append([terms[k] for k in nearest])
    return result

In [20]:
# Nearest extracted terms for every seed aspect word (one list per aspect).
candidate=topic_words(aspect)

In [21]:
# Raw nested-list dump of the candidates.
print(candidate)

[['price', 'premium', 'stock', 'cost', 'market', 'discount', 'sell', 'tariff', 'buyer', 'sale', 'seller', 'demand', 'markup', 'rate', 'rent', 'cent', 'dividend', 'wage', 'share', 'investor', 'benchmark', 'dollar', 'premia', 'level', 'profit', 'import', 'offer', 'trade', 'costco', 'worth', 'commod', 'broker', 'threshold', 'liter', 'minimum', 'auction', 'coupon', 'rental', 'supplier', 'payment', 'marketwatch', 'target', 'payout', 'aapl', 'output', 'index', 'cash', 'yield', 'takeup', 'preorder', 'tuition', 'trader', 'sector', 'fetch', 'date', 'fare', 'size', 'dealer', 'cusip', 'outlook', 'cashflow', 'fuel', 'increment', 'forecast', 'goog', 'ration', 'petroleum', 'highend', 'peso', 'ratio', 'deal', 'penney', 'premarket', 'hike', 'percent', 'longrun', 'hsbc', 'option', 'ownership', 'standard', 'lowcost', 'portfolio', 'current', 'q3', 'pump', 'debt', 'distributor', 'stake', 'maker', 'refund', 'ripoff', 'bread', 'margin', 'budget', 'afford', 'invest', 'number', 'analyst', 'multitrillion', 'tr

In [22]:
# One line per aspect with its candidate words separated by single spaces.
# Built directly from the lists rather than by regex-editing str(candidate),
# which left the outer "[['" and "']]" in the text and depended on how the
# list repr quotes each word.
s='\n'.join(' '.join(group) for group in candidate)

In [23]:
# Show the formatted candidate words.
print(s)

[['price premium stock cost market discount sell tariff buyer sale seller demand markup rate rent cent dividend wage share investor benchmark dollar premia level profit import offer trade costco worth commod broker threshold liter minimum auction coupon rental supplier payment marketwatch target payout aapl output index cash yield takeup preorder tuition trader sector fetch date fare size dealer cusip outlook cashflow fuel increment forecast goog ration petroleum highend peso ratio deal penney premarket hike percent longrun hsbc option ownership standard lowcost portfolio current q3 pump debt distributor stake maker refund ripoff bread margin budget afford invest number analyst multitrillion trend 
 servic network access serv portal care mail provi broadband transit staff traffic work turnkey prayer engin payment function phone support telemarket program product charter rental mission internet user branch takeup expeditor address unit site station travel taxi comfort platform sector ho