In [11]:
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas
import math

# make this point to the file with the clean tweets
DATAFILE = "data_clean_new.csv"
# make this point to the file with the text tweets
TWEETSFILE = "data_original.csv"

In [5]:
# import the tweets
text_tweets = pandas.read_csv(TWEETSFILE, index_col=0)
# read the csv
data = pandas.read_csv(DATAFILE, index_col=0)
# get all the tweets
# as of now, they are still strings, the string representation of the list
tweets = []
for tweet in data["text"]:
    tweets.append(eval(tweet))
    
# find all unique words and count them
uniqueWords = {}
total_words = 0
for tweet in tweets:
    for word in tweet:
        total_words += 1
        if word in uniqueWords.keys():
            uniqueWords[word] += 1
        else:
            uniqueWords[word] = 1
print(f"We have {total_words} words, of which {len(uniqueWords.keys())} are unique")
words_tuples = [(word, uniqueWords[word]) for word in uniqueWords.keys()]
words_tuples.sort(reverse=True, key=lambda t: t[1])
print(words_tuples[0:10])

We have 53640 words, of which 4819 are unique
[('great', 832), ('amp', 485), ('peopl', 433), ('countri', 410), ('border', 351), ('democrat', 342), ('get', 308), ('state', 303), ('presid', 299), ('u', 292)]


In [6]:
# create the one-hot encoding of each tweet
# start by fixing some ordering of the words
orderedWords = uniqueWords.keys()
mat = []
for tweet in tweets:
    one_hot_enc = []
    for word in orderedWords:
        one_hot_enc.append(tweet.count(word)) # most of these will be 0s
    mat.append(one_hot_enc)

In [38]:
# get a more time-efficient (but more memory-consuming) way of retrieving
# the different PCAs, for different numbers of components:
def get_pca(n):
    cached = get_pca.memory.get(n, None)
    if cached is not None:
        return cached
    else:
        pca = PCA(n_components = n, random_state=3534)
        reduced = pca.fit_transform(mat)
        get_pca.memory[n] = reduced
        return reduced
get_pca.memory = dict()

In [40]:
reduced = get_pca(20)
# apply the KMeans algorithm to the reduced data
kmeans = KMeans(random_state=73).fit(reduced)
sk_clusters = [[] for i in range(max(kmeans.labels_+1))]
for idx, lbl in enumerate(kmeans.labels_):
    sk_clusters[lbl].append(idx)
    
print(list(map(len, sk_clusters)))

random.seed(52345234)
# take some random tweets from every cluster
for cluster in sk_clusters:
    some_tweet_idx = random.sample(cluster, 5)
    for idx in some_tweet_idx:
        print(str(idx) + " - " + text_tweets["text"][idx])
    print("-"*40)

[1438, 567, 251, 164, 213, 139, 88, 175]
1083 - What was Nike thinking?
628 - Republicans will totally protect people with Pre-Existing Conditions, Democrats will not! Vote Republican.
820 - Women for Kavanaugh, and many others who support this very good man, are gathering all over Capital Hill in preparation for a 3-5 P.M. VOTE. It is a beautiful thing to see - and they are not paid professional protesters who are handed expensive signs. Big day for America!
1058 - The GDP Rate (4.2%) is higher than the Unemployment Rate (3.9%) for the first time in over 100 years!
1546 - https://t.co/E3xvdUGZqa
----------------------------------------
2003 - ....Germany pays 1% (slowly) of GDP towards NATO, while we pay 4% of a MUCH larger GDP. Does anybody believe that makes sense? We protect Europe (which is good) at great financial loss, and then get unfairly clobbered on Trade. Change is coming!
1563 - The Remains of American Servicemen will soon be leaving North Korea and heading to the United S

In [47]:
for ncomps in range(5, 21):
    # use PCA to reduce the dimension
    reduced = get_pca(ncomps)
    # apply the DBSCAN algorithm to the reduced data
    dbscan = DBSCAN().fit(reduced)
    dbscan_clusters = [[] for i in range(max(dbscan.labels_+1))]
    for idx, lbl in enumerate(dbscan.labels_):
        dbscan_clusters[lbl].append(idx)

    sizes = list(map(len, dbscan_clusters))
    print(f"Using {ncomps} PCA comps, got {len(dbscan_clusters)} clusters")
    print(f"Cluster size has std dev {math.sqrt(np.var(sizes))}")

Using 5 PCA comps, got 1 clusters
Cluster size has std dev 0.0
Using 6 PCA comps, got 7 clusters
Cluster size has std dev 908.5935531897412
Using 7 PCA comps, got 11 clusters
Cluster size has std dev 695.0778183439024
Using 8 PCA comps, got 14 clusters
Cluster size has std dev 577.9001617397035
Using 9 PCA comps, got 15 clusters
Cluster size has std dev 531.047853043605
Using 10 PCA comps, got 14 clusters
Cluster size has std dev 528.0810514862046
Using 11 PCA comps, got 15 clusters
Cluster size has std dev 431.5449249176987
Using 12 PCA comps, got 14 clusters
Cluster size has std dev 461.19490434730346
Using 13 PCA comps, got 15 clusters
Cluster size has std dev 456.37362860806155
Using 14 PCA comps, got 12 clusters
Cluster size has std dev 512.0425532989312
Using 15 PCA comps, got 13 clusters
Cluster size has std dev 505.66477423815616
Using 16 PCA comps, got 12 clusters
Cluster size has std dev 535.9487628392185
Using 17 PCA comps, got 13 clusters
Cluster size has std dev 530.974954