In [35]:
import ast
import math
import random
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Path to the CSV file holding the cleaned tweets; change it to point to your
# copy of the file. It is loaded with pandas.read_csv in the next cell.
DATAFILE = "data_clean.csv"

In [36]:
# read the csv
data = pandas.read_csv(DATAFILE, index_col=0)
# Each entry of the "text" column is the string representation of a list of
# words. Parse it with ast.literal_eval, which only accepts Python literals,
# rather than eval, which would execute any code stored in the file.
tweets = [ast.literal_eval(tweet) for tweet in data["text"]]

# count how often every word occurs; keys are kept in first-seen order
uniqueWords = Counter(word for tweet in tweets for word in tweet)
total_words = sum(uniqueWords.values())
print(f"We have {total_words} words, of which {len(uniqueWords)} are unique")

# (word, count) pairs, most frequent first; ties keep their first-seen order
words_tuples = sorted(uniqueWords.items(), key=lambda t: t[1], reverse=True)
# NOTE(review): the slice starts at 1, so the single most frequent word is not
# printed -- confirm this is intentional; use [:10] for the full top ten.
print(words_tuples[1:10])

We have 60873 words, of which 7496 are unique
[('amp', 528), ('people', 469), ('rt', 464), ('trump', 375), ('president', 367), ('country', 341), ('u', 306), ('democrats', 290), ('many', 283)]


In [37]:
# create the count encoding of each tweet: one row per tweet, one column per
# vocabulary word, each cell holding how often that word occurs in the tweet
# (a bag-of-words vector rather than a strict one-hot encoding)
# start by fixing some ordering of the words; list() takes a snapshot, whereas
# the bare keys() view would keep tracking later changes to uniqueWords
orderedWords = list(uniqueWords.keys())
# column index of every word, so that each tweet is scanned once instead of
# once per vocabulary word
word_column = {word: col for col, word in enumerate(orderedWords)}
mat = []
for tweet in tweets:
    one_hot_enc = [0] * len(orderedWords)  # most of these will stay 0
    for word in tweet:
        col = word_column.get(word)
        if col is not None:  # words outside the vocabulary are ignored
            one_hot_enc[col] += 1
    mat.append(one_hot_enc)

In [38]:
# reduce the dimension: project the per-tweet count vectors onto 20 principal
# components (the seed keeps the result reproducible between runs)
pca = PCA(
    n_components=20,
    random_state=3534,
)
reduced = pca.fit_transform(mat)

In [39]:
# sanity check: expect one row per tweet and one column per PCA component (20)
reduced.shape

(3557, 20)

In [40]:
# apply the KMeans algorithm to the reduced data
# n_clusters=8 is scikit-learn's default, written out so the choice is visible
kmeans = KMeans(n_clusters=8, random_state=73).fit(reduced)

# sk_clusters[k] holds the row positions of the tweets assigned to cluster k
sk_clusters = [[] for _ in range(kmeans.n_clusters)]
for idx, lbl in enumerate(kmeans.labels_):
    sk_clusters[lbl].append(idx)

# size of every cluster
print(list(map(len, sk_clusters)))

random.seed(52345234)
# take some random tweets from every cluster
for cluster in sk_clusters:
    # at most 10 per cluster; random.sample raises ValueError when asked for
    # more items than the cluster contains
    some_tweet_idx = random.sample(cluster, min(10, len(cluster)))
    for idx in some_tweet_idx:
        # idx is a row position, so use .iloc: the frame was loaded with
        # index_col=0 and its index labels need not be 0..n-1
        print(data["text"].iloc[idx].replace("', '", " "))
    print("-"*40)

[97, 181, 606, 1559, 208, 435, 275, 196]
['congressman pete sessions texas great job fighter tough crime border fight hard second amendment loves military vets full complete endorsement']
['paulsen love country great state minnesota winners always get job done need congress maga border military vets 2nd go vote minnesota strong endorsement']
['congressman john faso new york worked hard smart strong crime borders 2nd amendment john respected vote john complete total endorsement']
['congressman keith rothfus continues great job people pennsylvania keith strong crime border second amendment loves military vets total endorsement']
['incredible people great state wyoming go vote today foster friess fantastic governor strong crime borders amp 2nd amendment loves military amp vets complete total endorsement']
['governor doug ducey arizona great job would really nice show support tomorrow voting tuesdays primary doug strong crime border second amendment loves military amp vets full complete en