In [1]:
#!python -m pip install pysentiment==0.1

In [148]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas
import math

# make this point to the file with the clean tweets
DATAFILE = "data_clean.csv"
# make this point to the file with the text tweets
TWEETSFILE = "data_original.csv"
# this points to the csv with the sentiments
SENTIMENTCSV = "Data/sentiment_lexicons.csv"

In [123]:
# import the tweets
text_tweets = pandas.read_csv(TWEETSFILE, index_col=0)
# read the csv
data = pandas.read_csv(DATAFILE, index_col=0)
# get all the tweets
# as of now, they are still strings, the string representation of the list
tweets = []
for tweet in data["text"]:
    tweets.append(eval(tweet))
    
# find all unique words and count them
uniqueWords = {}
total_words = 0
for tweet in tweets:
    for word in tweet:
        total_words += 1
        if word in uniqueWords.keys():
            uniqueWords[word] += 1
        else:
            uniqueWords[word] = 1
print(f"We have {total_words} words, of which {len(uniqueWords.keys())} are unique")
words_tuples = [(word, uniqueWords[word]) for word in uniqueWords.keys()]
words_tuples.sort(reverse=True, key=lambda t: t[1])
print(words_tuples[0:10])

We have 60873 words, of which 7496 are unique
[('great', 877), ('amp', 528), ('people', 469), ('rt', 464), ('trump', 375), ('president', 367), ('country', 341), ('u', 306), ('democrats', 290), ('many', 283)]


In [124]:
# import the sentiments and set up their categories
# based on https://github.com/beefoo/text-analysis
sentiment_lexicons = pandas.read_csv(SENTIMENTCSV)
# keep only the words that we will eventually need
sentiment_lexicons = sentiment_lexicons[sentiment_lexicons.word.map(lambda x: x in uniqueWords.keys())]
sentiment_lexicons = sentiment_lexicons.drop(columns=["source"])

categories = {
  "emotion": ["anger", "fear", "anticipation", "trust", "surprise", "sadness", "joy", "disgust"],
  "subjectivity": ["weak", "strong"],
  "sentiment": ["positive", "negative"],
  "orientation": ["active", "passive"],
  "color": ["white", "black", "red", "green", "yellow", "blue", "brown", "pink", "purple", "orange", "grey"]
}

print(sentiment_lexicons[:10])

          word       emotion  color orientation sentiment subjectivity
5      abandon          fear    NaN         NaN  negative         weak
6    abandoned         anger  black         NaN  negative         weak
28     ability           NaN    NaN         NaN  positive         weak
33        able           NaN    NaN         NaN  positive         weak
37     abolish         anger    NaN      active  negative         weak
60    abruptly           NaN    red         NaN  negative          NaN
69    absolute           NaN    NaN         NaN  positive       strong
70  absolutely           NaN    NaN         NaN       NaN       strong
80   abundance  anticipation  green         NaN  negative         weak
82       abuse         anger  black      active  negative         weak


In [125]:
# we will cluster by emotion now
# if <categories> has E emotions, each tweet will be a vector v of length E
# for i < E, v[i] = {# of words associated with emotion categories["emotion"][i]}
mat = []
# at the cost of some temporary memory, make this processing faster
# cache[word] = i, if the word correspondes to the coordinate i of the vector,
# as per the comment above; cache[word] = -1 if the word has been processed BUT
# corresponds to no emotion
cache = {}
emotions = categories["emotion"]
E = len(emotions)
for tweet in tweets:
    vec = [0 for i in range(E)]
    for word in tweet:
        idx = cache.get(word, None)
        if idx is not None and idx > -1:
            vec[idx] += 1
        else:
            # is this word in our lexicon data base?
            p = sentiment_lexicons.loc[word == sentiment_lexicons.word]
            if len(p) == 0:
                cache[word] = -1
            else:
                emot = p.loc[p.index[0], "emotion"]
                idx = -1 if emot not in emotions else emotions.index(emot)
                cache[word] = idx
                if idx > -1:
                    vec[cache[word]] += 1
    mat.append(vec)

In [128]:
# apply the KMeans algorithm to the data
kmeans = KMeans(5, random_state=10032019).fit(mat)
sk_clusters = [[] for i in range(max(kmeans.labels_+1))]
for idx, lbl in enumerate(kmeans.labels_):
    sk_clusters[lbl].append(idx)
    
print(list(map(len, sk_clusters)))

random.seed(112233)
# take some random tweets from every cluster
for k, cluster in enumerate(sk_clusters):
    center = kmeans.cluster_centers_[k]
    # order the emotions by most present to least present, include the score for non-emotional words
    orders = [(emotions[i], center[i]) for i in range(len(emotions))]
    orders.sort(key=lambda t: t[1], reverse=True)
    print(orders)
    some_tweet_idx = random.sample(cluster, 5)
    for idx in some_tweet_idx:
        print(str(idx) + " - " + " ".join(tweets[idx]))
    print("-"*40)

[1343, 316, 522, 795, 581]
[('trust', 0.3276247207743721), ('anticipation', 0.31273268801190607), ('fear', 0.16753536857781032), ('joy', 0.16604616530156432), ('sadness', 0.11839166046165372), ('surprise', 0.09977661950856381), ('disgust', 0.045420699925538585), ('anger', -1.7541523789077473e-14)]
488 - ron desantis showed great courage hard fought campaign become governor florida congratulations ron family
1263 - rt trump train stand national anthem
870 - happy nationalfarmersday recent usmca great farmers better ever
1587 - going make america great never great believe governor highest taxed state u andrew cuomo total meltdown
3257 - fake news big ratings loser cnn
----------------------------------------
[('anger', 3.620253164556949), ('trust', 0.8386075949367089), ('anticipation', 0.6740506329113923), ('fear', 0.382911392405063), ('joy', 0.23734177215189867), ('sadness', 0.18037974683544306), ('surprise', 0.1550632911392406), ('disgust', 0.13924050632911397)]
704 - pennsylvania fant

In [146]:
# apply the DBSCAN algorithm to the data
dbscan = DBSCAN(eps=0.9, min_samples=50).fit(mat)
dbscan_clusters = [[] for i in range(max(dbscan.labels_+1))]
for idx, lbl in enumerate(dbscan.labels_):
    dbscan_clusters[lbl].append(idx)

print(list(map(len, dbscan_clusters)))

random.seed(1653)
# take some random tweets from every cluster
for k, cluster in enumerate(dbscan_clusters):
    center = dbscan.components_[k]
    # order the emotions by most present to least present, include the score for non-emotional words
    orders = [(emotions[i], center[i]) for i in range(len(emotions))]
    orders.sort(key=lambda t: t[1], reverse=True)
    print(orders)
    some_tweet_idx = random.sample(cluster, 5)
    for idx in some_tweet_idx:
        print(str(idx) + " - " + " ".join(tweets[idx]))
    print("-"*40)

[55, 64, 142, 455, 159, 68, 103, 2511]
[('anger', 1), ('trust', 1), ('fear', 0), ('anticipation', 0), ('surprise', 0), ('sadness', 0), ('joy', 0), ('disgust', 0)]
71 - wishing supreme court justice ruth bader ginsburg full speedy recovery
3196 - rt president dem fisa memo lot bad things happened side side
1623 - scott walker wisconsin tremendous governor done incredible things great state complete amp total endorsement brought amazing foxconn wisconsin 15 000 jobs much vote scott tuesday republican primary
182 - care think president cannot bleed fbi comey confirming bias fbi chris swecker
495 - florida important get vote florida congressional candidate michael waltz r strong endorsement
----------------------------------------
[('joy', 1), ('anger', 0), ('fear', 0), ('anticipation', 0), ('trust', 0), ('surprise', 0), ('sadness', 0), ('disgust', 0)]
1183 - love cajunnavy thank florencehurricane2018
2493 - matters never fired james comey russia corrupt mainstream media loves keep pushing

In [152]:
# apply agglomerative clustering
agg = AgglomerativeClustering(n_clusters=8).fit(mat)
agg_clusters = [[] for i in range(max(agg.labels_+1))]
for idx, lbl in enumerate(agg.labels_):
    agg_clusters[lbl].append(idx)

print(list(map(len, agg_clusters)))

random.seed(162323)
# take some random tweets from every cluster
for k, cluster in enumerate(agg_clusters):
    some_tweet_idx = random.sample(cluster, 5)
    for idx in some_tweet_idx:
        print(str(idx) + " - " + " ".join(tweets[idx]))
    print("-"*40)

[397, 962, 410, 318, 483, 596, 122, 269]
1537 - failing new york times wrote story made seem like white house councel turned president fact opposite amp two fake reporters knew fake news media become enemy people bad america
1902 - rigged witch hunt headed 13 angry democrats 4 added one worked directly obama w h seems intent damaging republican party chances november election democrat excuse losing 16 election never ends
1610 - strzok started illegal rigged witch hunt called probe ended immediately angry conflicted democrats instead looking crooked hillary
2318 - rt doj ig report 562 pages nothing bias animus
3242 - hope republicans great state pennsylvania challenge new pushed congressional map way supreme court necessary original correct let dems take elections away raise taxes amp waste money
----------------------------------------
3523 - fake news media barely mentions fact stock market hit another new record business u booming people know imagine president numbers would biggest s