In [15]:
import ast
import random

import numpy as np
import pandas
from nltk.stem import PorterStemmer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# CSV holding the original tweet text; point this at your own copy.
# It is read into `text_tweets` and used to print query results.
TWEETSFILE = "data_original_notStemmed_withoutRT.csv"
# CSV holding the clean tweets; point this at your own copy.
# It is read into `data` and used to build the vocabulary and term matrix.
DATAFILE = "data_clean_stemmed_withoutRT.csv"

# Single shared stemmer instance; query() stems each query word with it
# before looking the word up in the vocabulary.
stemmer = PorterStemmer()

In [45]:
# Load the original tweet text; it is only used to display query results.
text_tweets = pandas.read_csv(TWEETSFILE, index_col=0)
# Load the clean tweets. Each cell of the "text" column is still a string:
# the string representation of a Python list of words.
data = pandas.read_csv(DATAFILE, index_col=0)

# Parse every cell back into a real list. ast.literal_eval only accepts
# Python literals, so -- unlike eval -- a malformed or malicious cell cannot
# run arbitrary code; it raises an exception instead.
tweets = [ast.literal_eval(tweet) for tweet in data["text"]]

# Build the vocabulary in order of first appearance:
#   words[j]          -> the j-th distinct word
#   wordIndex[word]   -> its index j (used as the column index of the matrix)
#   uniqueWords[word] -> number of occurrences of the word over all tweets
uniqueWords = {}
words = []
wordIndex = {}
for tweet in tweets:
    for word in tweet:
        if word in uniqueWords:
            uniqueWords[word] += 1
        else:
            wordIndex[word] = len(words)
            words.append(word)
            uniqueWords[word] = 1

In [3]:
# Term-count matrix A:
#   - one row per tweet, one column per vocabulary word
#   - A[i, j] counts how often word j occurs in tweet i

matrix = np.zeros([len(tweets), len(uniqueWords.keys())])
print(matrix.shape)

for row, tokens in enumerate(tweets):
    for token in tokens:
        # wordIndex maps a word to its column
        matrix[row, wordIndex[token]] += 1

(3035, 4820)


In [18]:
# Seed for TruncatedSVD's randomized solver, so that every kernel run fits
# the same components and therefore prints the same query results.
SVD_RANDOM_STATE = 0

def get_svd(n):
    """Return a TruncatedSVD with `n` components fitted on `matrix`.

    Fitted models are memoized per `n` in `get_svd.memory`, so asking for the
    same dimension again does not refit. The cache is NOT invalidated when
    `matrix` changes; re-run this cell to reset it.

    n -- number of components, i.e. the dimension of the reduced space
    """
    svd = get_svd.memory.get(n)
    if svd is None:
        # fixed random_state: without it the fit differs between runs
        svd = TruncatedSVD(n_components=n, random_state=SVD_RANDOM_STATE)
        svd.fit(matrix)
        get_svd.memory[n] = svd
    return svd
get_svd.memory = dict()

def get_reduced_tweets(n):
    """Return `matrix` transformed by the SVD model from get_svd(n).

    The transformed array is memoized per `n` in `get_reduced_tweets.memory`,
    so the transform runs at most once for each dimension.
    """
    store = get_reduced_tweets.memory
    hit = store.get(n, None)
    if hit is not None:
        return hit
    projected = get_svd(n).transform(matrix)
    store[n] = projected
    return projected
get_reduced_tweets.memory = dict()

In [53]:
def query(n, q, k):
    """Find the k tweets most similar to a free-text query.

    The query is split on whitespace, stemmed, turned into a word-count
    vector over the known vocabulary, transformed with the SVD model from
    get_svd(n), and compared to every reduced tweet by cosine similarity.
    The stemmed words and any ignored (unknown) words are printed.

    n -- dimension of the reduced space R^n
    q -- query string, for example a few terms or a whole tweet
    k -- number of matches wanted

    Returns a list with the ***indices*** (row positions in the tweet
    matrix) of the closest matches, best first, ties broken by lower index.
    The list has at most k entries and never repeats an index; it is empty
    when k <= 0 or when none of the query words is in the vocabulary.
    """
    terms = q.split()
    stemmed = [stemmer.stem(term) for term in terms]
    print(stemmed)

    # build the word-count vector; wordIndex gives each word's column
    vec = np.zeros([1, len(wordIndex)])
    for term, stem in zip(terms, stemmed):
        col = wordIndex.get(stem)
        if col is None:
            # if this word has never been seen, let the user know
            print(f"Ignoring word '{term}'")
        else:
            vec[0, col] += 1
    print()

    if k <= 0:
        return []
    if not vec.any():
        # An all-zero query has cosine similarity 0 with every tweet, so any
        # "top k" would just be the first k rows; report no match instead.
        print("No known words in the query, nothing to match")
        return []

    # transform the query into the same reduced space as the tweets
    projected = get_svd(n).transform(vec)
    reduced = get_reduced_tweets(n)

    # one similarity per tweet, flattened from shape (n_tweets, 1)
    sims = cosine_similarity(reduced, projected).ravel()

    # Stable descending sort: highest similarity first, equal similarities
    # keep their original (lower index first) order.
    order = np.argsort(-sims, kind="stable")
    return order[:k].tolist()

## Example usage

In [54]:
# does trump ever talk about cars..?
q = "car automobile vehicle"
dim = 20
topK = 10
idxs = query(dim, q, topK)

# print the closest matches
# query() returns row positions, so the text is looked up positionally with
# .iloc; plain [] on an integer-labelled index would be a label lookup.
# Assumes TWEETSFILE and DATAFILE list the tweets in the same row order.
tweet_texts = text_tweets["text"]
for idx in idxs:
    print(tweet_texts.iloc[idx])
    print()

['car', 'automobil', 'vehicl']

“Ford has abruptly killed a plan to sell a Chinese-made small vehicle in the U.S. because of the prospect of higher U.S. Tariffs.” CNBC.  This is just the beginning. This car can now be BUILT IN THE U.S.A. and Ford will pay no tariffs!

Canada charges the U.S. a 270%  tariff on Dairy Products! They didn’t tell you that, did they? Not fair to our farmers!

If the U.S. sells a car into China, there is a tax of 25%. If China sells a car into the U.S., there is a tax of 2%. Does anybody think that is FAIR? The days of the U.S. being ripped-off by other nations is OVER!

China has agreed to reduce and remove tariffs on cars coming into China from the U.S. Currently the tariff is 40%.

“U.S. Stocks Widen Global Lead” https://t.co/Snhv08ulcO

The U.S. is respected again! https://t.co/NtQ4vsoqnk

Secretary of Commerce Wilbur Ross will be speaking with representatives of the European Union about eliminating the large Tariffs and Barriers they use against the U.S.

In [56]:
# find tweets that could look similar to this one
q = "i am so great and incredible"
dim = 40
topK = 10
idxs = query(dim, q, topK)

# print the closest matches
# query() returns row positions, so the text is looked up positionally with
# .iloc; plain [] on an integer-labelled index would be a label lookup.
# Assumes TWEETSFILE and DATAFILE list the tweets in the same row order.
tweet_texts = text_tweets["text"]
for idx in idxs:
    print(tweet_texts.iloc[idx])
    print()

['i', 'am', 'so', 'great', 'and', 'incred']
Ignoring word 'i'
Ignoring word 'am'
Ignoring word 'so'
Ignoring word 'and'

Gina is Great! https://t.co/TyLQ2W42y5

.@seanhannity on @foxandfriends now! Great! 8:18 A.M.

Promises Kept for our GREAT Veterans! https://t.co/C0h8cW4FuH

Great to be in Singapore, excitement in the air!

Happy 243rd Birthday to our GREAT @USNavy! #243NavyBday https://t.co/m1YtoKSHFw

Great couple, great book! https://t.co/cLDI79rin8

Don’t miss our GREAT @FLOTUS, Melania, on @ABC @ABC2020 tonight at 10pmE. Enjoy!

The Queen of Soul, Aretha Franklin, is dead. She was a great woman, with a wonderful gift from God, her voice. She will be missed!

I will be interviewed tonight by Trish Regan on @FoxBusiness at 8:00 P.M., right after the great Lou Dobbs!

Budd and Mark, two great patriots for Congress! https://t.co/xx0cqUf7wj

