## Get recent tweets from 143 famous investors on Twitter

The Best Investing Blogs of 2017
https://thecollegeinvestor.com/15601/the-best-investing-blogs/

17 Venture Capital and Angel Investors to Follow on Twitter
https://www.inc.com/larry-kim/17-venture-capital-and-angel-investors-to-follow-on-twitter.html

Nine Twitter Accounts For Aspiring Real Estate Investors To Follow
https://www.forbes.com/sites/forbesrealestatecouncil/2017/05/03/9-twitter-accounts-for-aspiring-real-estate-investors-to-follow/#61c1f6a627a0

10 VCs & Angel Investors to Follow on Twitter
http://www.techinsurance.com/blog/business-tips/10-vcs-angel-investors-to-follow-on-twitter/

These Are The Top 20 Tech Investors You Should Follow On Twitter
http://www.businessinsider.com/top-20-tech-investors-on-twitter-2013-5?op=1

Top 50 Investors on Twitter and How To Engage Them
http://startupfundraising.com/top-50-investors-on-twitter-and-how-to-engage-them/

In [14]:
# Shared seed passed as `random_state` to the KMeans, LDA and TruncatedSVD estimators in this notebook.
random_state = 81917

In [2]:
import pandas as pd
import numpy as np
import tweepy
import os

In [4]:
from pymongo import MongoClient
# Open a MongoDB connection with PyMongo's default settings (no arguments given)
# and grab a handle on the collection that stores one document per investor.
client = MongoClient()
investor_tweets_db = client["investor_tweets_db"]
tweets_collection = investor_tweets_db["tweets_collection"]

In [3]:
# Twitter credentials are read from environment variables so that no secret
# is stored in the notebook; a missing variable raises KeyError.
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_TOKEN_SECRET"]
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Downloading full timelines for the whole user list issues many requests;
# wait_on_rate_limit makes tweepy sleep until the rate-limit window resets
# instead of raising in the middle of the download.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [5]:
# Read the investor screen names, one per line, dropping the trailing newline.
# The with-block closes the file handle, which the original left open.
with open("Investors_on_twitter.txt", "r") as text_file:
    user_list = [line.rstrip('\n') for line in text_file]

In [13]:
def get_all_tweets(screen_name):
    '''Download every tweet the timeline endpoint returns for one user.

    Pages backwards through the timeline 200 tweets at a time (200 is the
    maximum allowed count) using the module-level ``api`` client.

    Parameters
    ----------
    screen_name : str
        Twitter handle of the account to download.

    Returns
    -------
    list
        The tweet objects returned by ``api.user_timeline``, newest first.
        An empty list is returned for an account with no tweets (the
        original raised IndexError in that case).
    '''
    # accumulates the tweepy tweets across pages
    alltweets = []

    # first page: the most recent tweets, no max_id restriction
    new_tweets = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended')

    # keep paging until a request comes back empty
    while len(new_tweets) > 0:
        alltweets.extend(new_tweets)

        # id of the oldest tweet seen so far, less one; all subsequent
        # requests pass it as max_id to prevent duplicates
        oldest = alltweets[-1].id - 1

        new_tweets = api.user_timeline(screen_name=screen_name, count=200,
                                       max_id=oldest, tweet_mode='extended')

    print ("{}: {} tweets downloaded so far".format(screen_name, len(alltweets)))
    return alltweets

In [None]:
# Download and store the profile description and raw tweets of every investor.
# The whole list is iterated (the original resumed from a hard-coded offset of
# 100); users already present in the collection are skipped, so the cell can
# be re-run after an interruption without duplicating documents.
for user in user_list:
    print(user)
    # find_one works on every PyMongo release; Cursor.count() was removed in 4.0.
    if tweets_collection.find_one({'user': user}) is not None:
        continue

    # The keyword form is accepted by both tweepy 3.x and 4.x.
    user_twitter = api.get_user(screen_name=user)
    description = user_twitter.description

    all_tweets = get_all_tweets(user)
    # tweet_mode='extended' responses expose the untruncated text as full_text
    all_tweets_raw = [tweet.full_text for tweet in all_tweets]
    user_dict = {
        'user': user,
        'all_tweets_raw': all_tweets_raw,
        'description': description,
    }
    tweets_collection.insert_one(user_dict)

In [34]:
# Number of investor documents stored in the collection.
# NOTE(review): Collection.count() is deprecated since PyMongo 3.7 and removed in 4.0;
# count_documents({}) is the replacement there.
tweets_collection.count()

143

## Clean Tweets

In [36]:
import re
import sys
import unicodedata
from string import digits
import html

In [7]:
# Patterns are raw strings (no invalid "\s" / "\w" escapes) and are compiled
# once instead of once per tweet.
# Removed tokens: #hashtags, %, @mentions, scheme://urls, RT markers, and any
# leftover "http..." fragment. The original "(http.+?)" matched only "http"
# plus one character, leaving the rest of the fragment in the text.
_noise_pattern = re.compile(
    r'(#[A-Za-z0-9_]+)|(%)|(@[A-Za-z0-9_]+)|(\w+://\S+)'
    r'|(^rt)|(^RT)|(^Rt)|(\sRT\s)|(\sRt\s)|(\srt\s)|(http\S*)'
)
_whitespace_pattern = re.compile(r'\s+')


def _clean_tweet(tweet):
    """Return one raw tweet as plain text ready for tokenisation.

    Steps: unescape HTML entities, normalise to NFC and drop every character
    above U+FFFF (which removes emojis), blank out the noise tokens, strip
    digits, then collapse whitespace. Digits are removed before the
    whitespace collapse so that a removed number cannot leave a double space.
    """
    text = html.unescape(tweet)
    text = ''.join(c for c in unicodedata.normalize('NFC', text) if c <= '\uFFFF')
    text = _noise_pattern.sub(' ', text)
    text = ''.join(c for c in text if not c.isdigit())
    return _whitespace_pattern.sub(' ', text).strip()


# Store the cleaned tweets next to the raw ones on each investor document.
for investor in tweets_collection.find({}):
    all_tweets_clean = [_clean_tweet(tweet) for tweet in investor['all_tweets_raw']]
    tweets_collection.update_one({"_id": investor["_id"]}, {"$set": {"all_tweets_clean": all_tweets_clean}})

## Natural Language Processing

In [34]:
import spacy
# Load spaCy's English pipeline through the 'en' shortcut name.
# NOTE(review): spaCy 3 dropped shortcut names; there the model is loaded by its
# package name (e.g. 'en_core_web_sm') — confirm against the installed version.
nlp = spacy.load('en')

In [15]:
# Build one bag-of-words string per cleaned tweet: the lemma of every token
# that is not a stop word, whitespace, punctuation or number-like, each lemma
# followed by a single space.
for investor in tweets_collection.find({}):
    all_tweets_bow = []
    for clean_text in investor['all_tweets_clean']:
        kept_lemmas = [
            tok.lemma_
            for tok in nlp(clean_text)
            if not tok.is_stop and not tok.is_space and not tok.is_punct and not tok.like_num
        ]
        all_tweets_bow.append(''.join(lemma + ' ' for lemma in kept_lemmas))
    doc_filter = {"_id": investor["_id"]}
    # Drop any previously stored per-investor bag, then save the per-tweet bags.
    tweets_collection.update_one(doc_filter, {"$unset": {"bag_of_words": ""}})
    tweets_collection.update_one(doc_filter, {"$set": {"all_tweets_bow": all_tweets_bow}})

In [37]:
# Build a single bag-of-words string per investor from all cleaned tweets.
# Lemmas are collected in a list and joined once instead of growing a string
# token by token, and the whitespace pattern is a raw string (the original
# '\s+' is an invalid escape sequence in a normal string literal).
for investor in tweets_collection.find({}):
    lemmas = []
    for tweet in investor['all_tweets_clean']:
        for token in nlp(tweet):
            if not (token.is_stop or token.is_space or token.is_punct or token.like_num):
                lemmas.append(token.lemma_)
    # Same layout as before: every lemma is followed by one space.
    bag_of_words = ''.join(lemma + ' ' for lemma in lemmas)
    bag_of_words = re.sub(r'\s+', ' ', bag_of_words)  # collapse runs of whitespace
    tweets_collection.update_one({"_id": investor["_id"]}, {"$set": {"bag_of_words": bag_of_words}})

In [39]:
# Peek at the stored profile description of one investor document (whichever find_one returns).
tweets_collection.find_one({})['description']

"Macro Trader; tweeting on rates, FX, equities, commods, life's rich pageant...in roughly that order. Not looking to set the world to rights in 140 characters"

## Functions

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import textwrap
import plotly.plotly as py
import plotly.graph_objs as go
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from pymongo import MongoClient
import pickle

In [None]:
# One row per (user, per-tweet bag of words).
# Rows are collected in a list and the frame is built once: appending a
# one-row DataFrame per tweet is quadratic, and DataFrame.append was removed
# in pandas 2.0. The result has a normal 0..n-1 index instead of the
# all-zero index the repeated appends produced.
records = []
for user in user_list:
    investor = tweets_collection.find_one({'user': user})
    records.extend((user, bow) for bow in investor['all_tweets_bow'])
user_tweet_df = pd.DataFrame(records, columns=['user', 'tweet'])

In [None]:
# Cache the frame on disk; the with-block closes the file handle, which the
# original left open.
with open('user_tweet_df.p', 'wb') as pickle_file:
    pickle.dump(user_tweet_df, pickle_file)

In [2]:
def kmeans_inertias(ns_clusters, X):
    """Fit one KMeans model per cluster count and collect the inertias.

    Parameters
    ----------
    ns_clusters : iterable of int
        Cluster counts to try.
    X : array-like
        Data passed to ``KMeans.fit``.

    Returns
    -------
    list
        ``inertia_`` of each fitted model, in the order of ``ns_clusters``.
        Every model is seeded with the notebook-level ``random_state``.
    """
    return [
        KMeans(n_clusters=k, random_state=random_state).fit(X).inertia_
        for k in ns_clusters
    ]

In [3]:
def kmeans_Sil_coefs(ns_clusters, X, sample_size=None):
    """Fit one KMeans model per cluster count and collect silhouette scores.

    Parameters
    ----------
    ns_clusters : iterable of int
        Cluster counts to try.
    X : array-like
        Data passed to ``KMeans.fit`` and to ``silhouette_score``.
    sample_size : int or None, optional
        Forwarded to ``silhouette_score``. ``None`` (the default) scores all
        rows, as before. The score is based on pairwise distances, so an
        integer sample size keeps it tractable for large ``X``; the sample is
        drawn with the notebook-level ``random_state``.

    Returns
    -------
    list
        Euclidean silhouette score of each model, in the order of
        ``ns_clusters``.
    """
    Sil_coefs = []
    for n_clusters in ns_clusters:
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(X)
        Sil_coefs.append(
            silhouette_score(
                X,
                kmeans.labels_,
                metric='euclidean',
                sample_size=sample_size,
                random_state=random_state,
            )
        )
    return Sil_coefs

In [4]:
def make_hover_text(user_tweet_cluster, pred_cluster):
    """Build one HTML hover label per row of ``user_tweet_cluster``.

    Parameters
    ----------
    user_tweet_cluster : pandas.DataFrame
        Must provide a ``'user'`` and a ``'bow'`` column.
    pred_cluster : sequence
        Cluster label of each row, aligned by row position (not by index
        label, so frames with a non-default index work too).

    Returns
    -------
    list of str
        ``'User: ...<br>Bow: ...<br>Cluster: ...<br>'`` for each row, with
        the bag of words wrapped at 50 characters using ``<br>``.
    """
    hover_text = []
    for position, (_, row) in enumerate(user_tweet_cluster.iterrows()):
        bow = '<br>'.join(textwrap.wrap(row['bow'], 50))
        # The keyword names must match the placeholders; the original passed
        # name=/description= and raised before producing any label.
        hover_text.append(('User: {user}<br>'+
                          'Bow: {bow}<br>'+
                          'Cluster: {cluster}<br>').format(user=row['user'],
                                                bow=bow,
                                                cluster=pred_cluster[position]))
    return hover_text

In [5]:
def plot_3D(X, hover_text, pred_cluster):
    """Assemble the plotly trace list and layout for a 3-D cluster scatter.

    Parameters
    ----------
    X : array
        2-D array; columns 0, 1 and 2 become the x, y and z coordinates.
    hover_text : sequence of str
        Text shown when hovering over each marker.
    pred_cluster : array-like
        Values used to colour the markers.

    Returns
    -------
    tuple
        ``(data, layout)``: a one-element list holding the ``Scatter3d``
        trace and a ``Layout`` with all margins set to zero.
    """
    marker_style = {
        'size': 12,
        'color': pred_cluster,      # one colour value per point
        'colorscale': 'Viridis',
        'opacity': 0.8,
    }
    scatter = go.Scatter3d(
        x=X[:, 0],
        y=X[:, 1],
        z=X[:, 2],
        mode='markers',
        marker=marker_style,
        text=hover_text,
        hoverinfo='text',
    )
    layout = go.Layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
    return [scatter], layout

## LDA

In [6]:
# Re-open the MongoDB collection and reload the investor screen names for
# the modelling part of the notebook.
client = MongoClient()
investor_tweets_db = client.investor_tweets_db
tweets_collection = investor_tweets_db.tweets_collection
# The with-block closes the file handle, which the original left open.
with open("Investors_on_twitter.txt", "r") as text_file:
    user_list = [line.rstrip('\n') for line in text_file]

In [7]:
# Start from an empty corpus of per-tweet bag-of-words strings.
corpus = []

In [8]:
# Flatten every investor's per-tweet bags of words into one corpus, in
# user_list order. The name is rebound rather than extended in place, so
# re-running this cell cannot duplicate the documents.
corpus = [
    bow
    for user in user_list
    for bow in tweets_collection.find_one({'user': user})['all_tweets_bow']
]

In [9]:
# Term-count vectorizer. Float min_df / max_df are document-frequency proportions
# (scikit-learn convention): drop terms in fewer than 0.001% or more than 30% of the
# documents, plus the built-in English stop words.
vectorizer = CountVectorizer(min_df = 0.00001, max_df = 0.3, stop_words='english')

In [10]:
# Learn the vocabulary from the corpus and build the document-term count matrix
# (one row per corpus entry, i.e. per tweet).
X = vectorizer.fit_transform(corpus)

In [11]:
# (number of documents, vocabulary size)
X.shape

(412601, 22792)

In [12]:
# Vocabulary terms in column order of X. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(); use whichever the
# installed version offers and keep word_list a plain list either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    word_list = list(vectorizer.get_feature_names_out())
else:
    word_list = list(vectorizer.get_feature_names())

In [26]:
# Online-learning LDA with 5 topics, seeded with the notebook-level random_state.
# NOTE(review): the `n_topics` keyword was renamed `n_components` in scikit-learn 0.19
# and later removed — confirm against the installed version.
n_topics = 5
lda_model = LatentDirichletAllocation(n_topics=n_topics, max_iter=10, learning_method='online',                
                                learning_offset=50, random_state=random_state)

In [30]:
# Fit LDA on rows 3500..6999 of X only and get their topic weights.
# NOTE(review): rows of X are individual tweets (the corpus holds per-tweet bags of
# words), so despite its name this matrix is tweet-by-topic, not investor-by-topic.
investor_topic_matrix = lda_model.fit_transform(X[3500:7000])

In [31]:
# One row per topic, one column per vocabulary term (indexed as [topic, :] below).
topic_word_matrix = lda_model.components_

In [32]:
# Print and collect the 10 highest-weighted words of every topic.
# The unused `importance` array is gone and each top-word list is built once
# rather than separately for printing and storing.
words_in_topics = []
for i in range(n_topics):
    topic = topic_word_matrix[i, :]
    # argsort of the negated weights gives indices in descending weight order
    word_indices = (-topic).argsort()[:10]
    top_words = [word_list[word_index] for word_index in word_indices]
    print(top_words)
    words_in_topics.append(top_words)

['new', 'market', 'high', 'low', 'stock', 'day', 'study', 'spy', 'wix', 'today']
['debt', 'tonight', 'bring', 'thought', 'acquire', 'miami', 'chance', 'forget', 'competitor', 'pull']
['appreciate', 'player', 'lie', 'yup', 'sq', 'ft', 'profitability', 'bloomberg', 'theater', 'campus']
['man', 'lol', 'question', 'save', 'sunday', 'thanks', 'usain', 'skill', 'politic', 'star']
['year', 'good', 'great', 'study', 'apple', 'day', 'like', 'time', 'earning', 'company']


## SVD

In [20]:
# Re-weight the raw term counts in X with TF-IDF (default TfidfTransformer settings).
tf = TfidfTransformer()
X_tfidf =  tf.fit_transform(X)

In [120]:
# Reduce the TF-IDF matrix to 100 components with truncated SVD.
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=random_state)
svd.fit(X_tfidf)
X_reduced = svd.transform(X_tfidf)

In [121]:
# Explained-variance ratio of each of the 100 SVD components.
explained_variance_ratio = svd.explained_variance_ratio_

In [123]:
# Find the first component index at which the cumulative explained variance
# exceeds the target. After the loop, sum_ratio holds the cumulative value at
# that index, or the total over all components if the target is never reached.
variance_target = 0.95
sum_ratio = 0
for index, ratio in enumerate(explained_variance_ratio):
    sum_ratio += ratio
    if sum_ratio > variance_target:
        print(index)
        break
else:
    # The original printed nothing here, which hid the fact that the
    # available components fall short of the target.
    print("{} components explain only {:.1%} of the variance (target {:.0%})".format(
        len(explained_variance_ratio), sum_ratio, variance_target))

In [126]:
# Cumulative explained variance left in sum_ratio by the loop in the previous cell.
sum_ratio

0.15797616903886358

In [124]:
# 3-component SVD used for the 3-D scatter plot and the clustering below.
# NOTE(review): this one is fitted on the raw counts X, whereas the 100-component SVD
# above uses X_tfidf — confirm that the difference is intentional.
svd_3 = TruncatedSVD(n_components=3, n_iter=7, random_state=random_state)
svd_3.fit(X)
X_reduced_3 = svd_3.transform(X)

In [125]:
# Total share of variance captured by the three components.
print(np.sum(svd_3.explained_variance_ratio_))

0.0319885316371


In [127]:
# Component loadings: one row per SVD component, one column per vocabulary term.
components = svd_3.components_
components.shape

(3, 22949)

In [128]:
# For each SVD component, the 10 vocabulary words with the largest loadings
# (argsort of the negated loadings gives descending order).
words_in_component = [
    [word_list[word_index] for word_index in (-component).argsort()[:10]]
    for component in components
]

In [129]:
# Show the top words of each component, one list per line.
for component in words_in_component:
    print(component)

['be', 'not', 'good', 'like', 'new', 'time', 'day', 'think', 'year', 'know']
['not', 'good', 'new', 'day', 'time', 'like', 'year', 'know', 'great', 'market']
['not', 'be', 'know', 'will', 'wait', 'believe', 'understand', 'anymore', 'care', 'mean']


In [None]:
# Candidate cluster counts: 2 through 19.
ns_clusters = np.arange(2, 20, 1)

In [None]:
# Inertia and silhouette score of KMeans on the 3-component projection, per cluster count.
inertias = kmeans_inertias(ns_clusters, X_reduced_3) 
Sil_coefs = kmeans_Sil_coefs(ns_clusters, X_reduced_3)

In [None]:
# Inertia versus number of clusters; the trailing ';' suppresses the text repr of the last call.
plt.plot(ns_clusters, inertias)
plt.xlabel('n_cluster')
plt.ylabel('inertia');

In [None]:
# Silhouette score versus number of clusters (axis label spelling corrected);
# the trailing ';' suppresses the text repr of the last call.
plt.plot(ns_clusters, Sil_coefs)
plt.xlabel('n_cluster')
plt.ylabel('Silhouette score');

In [None]:
n = 6
# Only configure the estimator here. The original also fitted it on X_tfidf,
# but that fit is discarded: the next cell calls fit_predict on X_reduced_3,
# which refits the model from scratch.
kmeans = KMeans(n_clusters=n, random_state=random_state)

In [None]:
# Fit KMeans on the 3-component projection and get one cluster label per row.
pred_cluster = kmeans.fit_predict(X_reduced_3)

In [None]:
# NOTE(review): make_user_cluster_dataframe is not defined anywhere in the visible notebook;
# it presumably returns a DataFrame with a `cluster` column (used by the plotting cell) — confirm.
user_cluster = make_user_cluster_dataframe(pred_cluster)

In [None]:
# NOTE(review): this call does not match make_hover_text(user_tweet_cluster, pred_cluster) as
# defined above: `user_list` is not a parameter and the required `user_tweet_cluster` frame
# (with 'user' and 'bow' columns) is missing, so this raises TypeError as written.
hover_text = make_hover_text(pred_cluster=pred_cluster, user_list = user_list)

In [None]:
# Build the 3-D scatter of the SVD projection coloured by cluster and render it with plotly.
# NOTE(review): `py` is plotly.plotly (imported above), which presumably publishes the figure
# under the given filename to a Plotly account — confirm credentials and plotly version.
data, layout = plot_3D(X_reduced_3, hover_text, user_cluster.cluster)
fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename='3d-scatter-colorscale')