In [9]:
#!python -m pip install pysentiment==0.1

In [11]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas
import math

# make this point to the file with the clean tweets
DATAFILE = "data_clean_notStemmed_withoutRT.csv"
# make this point to the file with the text tweets
TWEETSFILE = "data_original_notStemmed_withoutRT.csv"
# this points to the csv with the sentiments
SENTIMENTCSV = "Data/sentiment_lexicons.csv"

In [12]:
# import the tweets
text_tweets = pandas.read_csv(TWEETSFILE, index_col=0)
# read the csv
data = pandas.read_csv(DATAFILE, index_col=0)
# get all the tweets
# as of now, they are still strings, the string representation of the list
tweets = []
for tweet in data["text"]:
    tweets.append(eval(tweet))
    
# find all unique words and count them
uniqueWords = {}
total_words = 0
for tweet in tweets:
    for word in tweet:
        total_words += 1
        if word in uniqueWords.keys():
            uniqueWords[word] += 1
        else:
            uniqueWords[word] = 1
print(f"We have {total_words} words, of which {len(uniqueWords.keys())} are unique")
words_tuples = [(word, uniqueWords[word]) for word in uniqueWords.keys()]
words_tuples.sort(reverse=True, key=lambda t: t[1])
print(words_tuples[0:10])

We have 53640 words, of which 6661 are unique
[('great', 826), ('amp', 485), ('people', 432), ('country', 324), ('u', 292), ('trump', 287), ('president', 283), ('democrats', 272), ('many', 269), ('big', 257)]


In [13]:
# import the sentiments and set up their categories
# based on https://github.com/beefoo/text-analysis
sentiment_lexicons = pandas.read_csv(SENTIMENTCSV)
# keep only the words that we will eventually need
sentiment_lexicons = sentiment_lexicons[sentiment_lexicons.word.map(lambda x: x in uniqueWords.keys())]
sentiment_lexicons = sentiment_lexicons.drop(columns=["source"])

categories = {
  "emotion": ["anger", "fear", "anticipation", "trust", "surprise", "sadness", "joy", "disgust"],
  "subjectivity": ["weak", "strong"],
  "sentiment": ["positive", "negative"],
  "orientation": ["active", "passive"],
  "color": ["white", "black", "red", "green", "yellow", "blue", "brown", "pink", "purple", "orange", "grey"]
}

print(sentiment_lexicons[:10])

          word       emotion  color orientation sentiment subjectivity
5      abandon          fear    NaN         NaN  negative         weak
6    abandoned         anger  black         NaN  negative         weak
28     ability           NaN    NaN         NaN  positive         weak
33        able           NaN    NaN         NaN  positive         weak
37     abolish         anger    NaN      active  negative         weak
60    abruptly           NaN    red         NaN  negative          NaN
69    absolute           NaN    NaN         NaN  positive       strong
70  absolutely           NaN    NaN         NaN       NaN       strong
80   abundance  anticipation  green         NaN  negative         weak
82       abuse         anger  black      active  negative         weak


In [14]:
# we will cluster by emotion now
# if <categories> has E emotions, each tweet will be a vector v of length E
# for i < E, v[i] = {# of words associated with emotion categories["emotion"][i]}
mat = []
# at the cost of some temporary memory, make this processing faster
# cache[word] = i, if the word correspondes to the coordinate i of the vector,
# as per the comment above; cache[word] = -1 if the word has been processed BUT
# corresponds to no emotion
cache = {}
emotions = categories["emotion"]
E = len(emotions)
for tweet in tweets:
    vec = [0 for i in range(E)]
    for word in tweet:
        idx = cache.get(word, None)
        if idx is not None and idx > -1:
            vec[idx] += 1
        else:
            # is this word in our lexicon data base?
            p = sentiment_lexicons.loc[word == sentiment_lexicons.word]
            if len(p) == 0:
                cache[word] = -1
            else:
                emot = p.loc[p.index[0], "emotion"]
                idx = -1 if emot not in emotions else emotions.index(emot)
                cache[word] = idx
                if idx > -1:
                    vec[cache[word]] += 1
    mat.append(vec)

In [15]:
# apply the KMeans algorithm to the data
kmeans = KMeans(5, random_state=10032019).fit(mat)
sk_clusters = [[] for i in range(max(kmeans.labels_+1))]
for idx, lbl in enumerate(kmeans.labels_):
    sk_clusters[lbl].append(idx)
    
print(list(map(len, sk_clusters)))

random.seed(112233)
# take some random tweets from every cluster
for k, cluster in enumerate(sk_clusters):
    center = kmeans.cluster_centers_[k]
    # order the emotions by most present to least present, include the score for non-emotional words
    orders = [(emotions[i], center[i]) for i in range(len(emotions))]
    orders.sort(key=lambda t: t[1], reverse=True)
    print(orders)
    some_tweet_idx = random.sample(cluster, 5)
    for idx in some_tweet_idx:
        print(str(idx) + " - " + (text_tweets["text"][idx]))
    print("-"*40)

[1144, 488, 306, 438, 659]
[('anticipation', 0.2954545454545385), ('trust', 0.295454545454533), ('joy', 0.1713286713286703), ('fear', 0.16433566433566268), ('sadness', 0.1188811188811182), ('anger', 0.09353146853145045), ('surprise', 0.07779720279720355), ('disgust', 0.05069930069929941)]
504 - Get out and VOTE for @DeanHeller! https://t.co/4HEkpQL3oy
1252 - It was my great honor to host the Foreign Investment Risk Review Modernization Act Roundtable today at the @WhiteHouse! https://t.co/TjtI7ddtZM
803 - https://t.co/4ySIkmfllE
1666 - Thank you @RandPaul, you really get it! “The President has gone through a year and a half of totally partisan investigations - what’s he supposed think?”
769 - Don’t miss our GREAT @FLOTUS, Melania, on @ABC @ABC2020 tonight at 10pmE. Enjoy!
----------------------------------------
[('trust', 2.5860655737704765), ('anticipation', 0.7725409836065573), ('anger', 0.5348360655737743), ('fear', 0.34016393442622966), ('joy', 0.28483606557377084), ('surprise', 0

In [16]:
# apply the DBSCAN algorithm to the data
dbscan = DBSCAN(eps=0.9, min_samples=50).fit(mat)
dbscan_clusters = [[] for i in range(max(dbscan.labels_+1))]
for idx, lbl in enumerate(dbscan.labels_):
    dbscan_clusters[lbl].append(idx)

print(list(map(len, dbscan_clusters)))

random.seed(1653)
# take some random tweets from every cluster
for k, cluster in enumerate(dbscan_clusters):
    center = dbscan.components_[k]
    # order the emotions by most present to least present, include the score for non-emotional words
    orders = [(emotions[i], center[i]) for i in range(len(emotions))]
    orders.sort(key=lambda t: t[1], reverse=True)
    print(orders)
    some_tweet_idx = random.sample(cluster, 5)
    for idx in some_tweet_idx:
        print(str(idx) + " - " + (text_tweets["text"][idx]))
    print("-"*40)

[52, 111, 351, 123, 60, 81, 2257]
[('joy', 1), ('anger', 0), ('fear', 0), ('anticipation', 0), ('trust', 0), ('surprise', 0), ('sadness', 0), ('disgust', 0)]
200 - ....I am thankful to both of these incredible men for their service to our Country! Date of transition to be determined.
1479 - Thank you Pennsylvania. I love you! https://t.co/qoswnBZb3f
254 - We would save Billions of Dollars if the Democrats would give us the votes to build the Wall. Either way, people will NOT be allowed into our Country illegally! We will close the entire Southern Border if necessary. Also, STOP THE DRUGS!
674 - Beautiful evening in Mesa, Arizona with GREAT PATRIOTS - thank you! https://t.co/0pWiwCq4MH #MAGARally????replay: https://t.co/6vHEaB37VH https://t.co/pHmU6pMKh7
815 - Thank you Kansas - I love you! 
https://t.co/ymCFNr9WQY
----------------------------------------
[('trust', 1), ('anger', 0), ('fear', 0), ('anticipation', 0), ('surprise', 0), ('sadness', 0), ('joy', 0), ('disgust', 0)]
2384 - L

In [17]:
# apply agglomerative clustering
agg = AgglomerativeClustering(n_clusters=8).fit(mat)
agg_clusters = [[] for i in range(max(agg.labels_+1))]
for idx, lbl in enumerate(agg.labels_):
    agg_clusters[lbl].append(idx)

print(list(map(len, agg_clusters)))

random.seed(162323)
# take some random tweets from every cluster
for k, cluster in enumerate(agg_clusters):
    some_tweet_idx = random.sample(cluster, 5)
    for idx in some_tweet_idx:
        print(str(idx) + " - " + (text_tweets["text"][idx]))
    print("-"*40)

[886, 422, 362, 526, 112, 317, 237, 173]
978 - Tariffs have put the U.S. in a very strong bargaining position, with Billions of Dollars, and Jobs, flowing into our Country - and yet cost increases have thus far been almost unnoticeable. If countries will not make fair deals with us, they will be “Tariffed!”
1282 - A Blue Wave means Crime and Open Borders. A Red Wave means Safety and Strength!
1036 - 17 years since September 11th!
1673 - It was an honor to join you this morning. Thank you! https://t.co/NOUTroe8MV
2476 - Slippery James Comey, a man who always ends up badly and out of whack (he is not smart!), will go down as the WORST FBI Director in history, by far!
----------------------------------------
2600 - .@HowieCarrShow just wrote a book which everyone is talking about. He was a great help. He is a veteran journalist who had a great influence in NH and beyond. He calls it the most amazing political campaign of modern times. The book is called, “What Really Happened.” Enjoy! #MA