In [1]:
import ast
import random

import numpy as np
import pandas
from nltk.stem import PorterStemmer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Input files (edit these paths to match your local copies):
#   DATAFILE   - the cleaned tweets
#   TWEETSFILE - the text tweets
TWEETSFILE = "data_original_notStemmed_withoutRT.csv"
DATAFILE = "data_clean_stemmed_withoutRT.csv"

# Single stemmer instance shared by the rest of the notebook.
stemmer = PorterStemmer()

In [2]:
# Text tweets: loaded here so the matches of a query can be displayed later.
text_tweets = pandas.read_csv(TWEETSFILE, index_col=0)
# Cleaned tweets: each cell of the "text" column holds the string
# representation of a Python list.
data = pandas.read_csv(DATAFILE, index_col=0)
# Parse every cell back into a real list. ast.literal_eval only accepts Python
# literals, so (unlike eval) it cannot execute code that happens to be embedded
# in the CSV file.
tweets = [ast.literal_eval(tweet) for tweet in data["text"]]
    
# Build the vocabulary in first-seen order:
#   words[j]          -> the j-th distinct word
#   wordIndex[word]   -> j, the position of `word` in `words`
#   uniqueWords[word] -> total number of occurrences of `word` over all tweets
uniqueWords = {}
words = []
wordIndex = {}
for tweet in tweets:
    for word in tweet:
        if word not in uniqueWords:
            # first time we meet this word: give it the next free index
            wordIndex[word] = len(words)
            words.append(word)
            uniqueWords[word] = 0
        uniqueWords[word] += 1

In [3]:
# Build the tweet-by-word count matrix A:
#   row i     -> tweet i
#   column j  -> word j (column indices come from wordIndex)
#   A[i, j]   -> number of times word j shows up in tweet i

matrix = np.zeros([len(tweets), len(wordIndex)])
print(matrix.shape)

for i, tweet in enumerate(tweets):
    for word in tweet:
        matrix[i, wordIndex[word]] += 1

# Sanity check: every token of every tweet was counted exactly once.
assert matrix.sum() == sum(len(tweet) for tweet in tweets)

(3035, 4820)


In [4]:
def get_svd(n):
    """Return a ``TruncatedSVD`` with ``n`` components fitted on ``matrix``.

    Fitted models are memoised in ``get_svd.memory`` (keyed by ``n``), so each
    dimensionality is fitted at most once. The cache is never invalidated: if
    the module-level ``matrix`` is rebuilt, clear ``get_svd.memory`` by hand.

    Parameters
    ----------
    n : int
        Number of components to keep.

    Returns
    -------
    TruncatedSVD
        The fitted estimator (the same object on every call with the same ``n``).
    """
    svd = get_svd.memory.get(n)
    if svd is None:
        # Fixed seed: TruncatedSVD's default solver is randomized, so without
        # it the reduced space (and therefore the query results) would change
        # from one kernel restart to the next.
        svd = TruncatedSVD(n_components=n, random_state=0)
        svd.fit(matrix)
        get_svd.memory[n] = svd
    return svd
get_svd.memory = dict()

def get_reduced_tweets(n):
    """Return ``matrix`` transformed by the SVD obtained from ``get_svd(n)``.

    The transformed array is stored in ``get_reduced_tweets.memory`` under the
    key ``n`` and reused on later calls with the same ``n``.
    """
    reduced = get_reduced_tweets.memory.get(n, None)
    if reduced is None:
        # not computed yet for this dimensionality: transform and remember it
        reduced = get_svd(n).transform(matrix)
        get_reduced_tweets.memory[n] = reduced
    return reduced
get_reduced_tweets.memory = dict()

In [5]:
def query(n, q, k):
    """Find the tweets closest to a free-text query in the reduced space R^n.

    The query is split on whitespace and each term is stemmed with the
    module-level ``stemmer``. The stems are counted into a row vector indexed
    by ``wordIndex``, the vector is transformed with ``get_svd(n)``, and it is
    compared by cosine similarity with every row of ``get_reduced_tweets(n)``.

    The stemmed terms, and a notice for each term that is not in the
    vocabulary, are printed as a side effect.

    Parameters
    ----------
    n : int
        Number of SVD components of the reduced space.
    q : str
        The query, for example a few terms or a whole tweet.
    k : int
        Maximum number of matches to return.

    Returns
    -------
    list of int
        The ***indices*** (row positions) of the closest tweets, best match
        first, ties broken by lowest index. Holds at most ``k`` entries and
        never more than there are tweets. Empty when no query term is in the
        vocabulary.
    """
    terms = q.split()
    stemmed = [stemmer.stem(term) for term in terms]
    print(stemmed)

    # build the vector with the right representation
    vec = np.zeros([1, len(wordIndex)])
    for term, stem in zip(terms, stemmed):
        idx = wordIndex.get(stem)
        if idx is None:
            # if this word has never been seen, let the user know
            print(f"Ignoring word '{term}'")
        else:
            vec[0, idx] += 1
    print()

    if not vec.any():
        # An all-zero query has cosine similarity 0 with every tweet, so any
        # "top k" would just be the first k rows, not real matches.
        print("No query word is in the vocabulary; nothing to match.")
        return []

    # transform the vector into the reduced space
    transformed = get_svd(n).transform(vec)

    # similarity of the query with every reduced tweet, flattened to 1-D
    reduced_tweets = get_reduced_tweets(n)
    sims = cosine_similarity(reduced_tweets, transformed).ravel()

    # Never return more indices than there are tweets (and none for k <= 0).
    k = max(0, min(k, sims.size))
    # A stable sort on the negated scores orders by decreasing similarity and
    # keeps equal scores in increasing index order.
    ranking = np.argsort(-sims, kind="stable")
    return ranking[:k].tolist()

## Example usage

In [15]:
# does trump ever talk about cars..?
q = "car automobile vehicle"
dim = 10
topK = 10
idxs = query(dim, q, topK)

# print the closest matches
# query() returns row positions, so select with .iloc instead of index labels
for text in text_tweets["text"].iloc[idxs]:
    print(text)
    print()

['car', 'automobil', 'vehicl']

“U.S. Stocks Widen Global Lead” https://t.co/Snhv08ulcO

The U.S. is respected again! https://t.co/NtQ4vsoqnk

If the U.S. sells a car into China, there is a tax of 25%. If China sells a car into the U.S., there is a tax of 2%. Does anybody think that is FAIR? The days of the U.S. being ripped-off by other nations is OVER!

Iranian Harassment of U.S. Warships:

2015: 22
2016: 36
2017: 14
2018: 0

Source: @USNavy

GOD BLESS THE U.S.A.! https://t.co/n9OkDlqz11

GOD BLESS THE U.S.A.! #MAGA???? https://t.co/pquqyy5S3G

“Ford has abruptly killed a plan to sell a Chinese-made small vehicle in the U.S. because of the prospect of higher U.S. Tariffs.” CNBC.  This is just the beginning. This car can now be BUILT IN THE U.S.A. and Ford will pay no tariffs!

“Manufacturing in U.S. Expands at Fastest Pace Since May 2004” https://t.co/XZkwS8tTml

China has agreed to reduce and remove tariffs on cars coming into China from the U.S. Currently the tariff is 40%.

If the

In [17]:
# does trump ever talk about cars..? (same query, 20 dimensions)
q = "car automobile vehicle"
dim = 20
topK = 10
idxs = query(dim, q, topK)

# print the closest matches
# query() returns row positions, so select with .iloc instead of index labels
for text in text_tweets["text"].iloc[idxs]:
    print(text)
    print()

['car', 'automobil', 'vehicl']

“Ford has abruptly killed a plan to sell a Chinese-made small vehicle in the U.S. because of the prospect of higher U.S. Tariffs.” CNBC.  This is just the beginning. This car can now be BUILT IN THE U.S.A. and Ford will pay no tariffs!

If the U.S. sells a car into China, there is a tax of 25%. If China sells a car into the U.S., there is a tax of 2%. Does anybody think that is FAIR? The days of the U.S. being ripped-off by other nations is OVER!

“U.S. Stocks Widen Global Lead” https://t.co/Snhv08ulcO

The U.S. is respected again! https://t.co/NtQ4vsoqnk

Iranian Harassment of U.S. Warships:

2015: 22
2016: 36
2017: 14
2018: 0

Source: @USNavy

Canada charges the U.S. a 270%  tariff on Dairy Products! They didn’t tell you that, did they? Not fair to our farmers!

Secretary of Commerce Wilbur Ross will be speaking with representatives of the European Union about eliminating the large Tariffs and Barriers they use against the U.S.A. Not fair to our farmer

In [19]:
# does trump ever talk about cars..? (same query, 1000 dimensions)
q = "car automobile vehicle"
dim = 1000
topK = 10
idxs = query(dim, q, topK)

# print the closest matches
# query() returns row positions, so select with .iloc instead of index labels
for text in text_tweets["text"].iloc[idxs]:
    print(text)
    print()

['car', 'automobil', 'vehicl']

Sergio Marchionne, who passed away today, was one of the most brilliant &amp; successful car executives since the days of the legendary Henry Ford. It was a great honor for me to get to know Sergio as POTUS, he loved the car industry, and fought hard for it. He will be truly missed!

If the E.U. wants to further increase their already massive tariffs and barriers on U.S. companies doing business there, we will simply apply a Tax on their Cars which freely pour into the U.S. They make it impossible for our cars (and more) to sell there. Big trade imbalance!

If the U.S. sells a car into China, there is a tax of 25%. If China sells a car into the U.S., there is a tax of 2%. Does anybody think that is FAIR? The days of the U.S. being ripped-off by other nations is OVER!

The reason that the small truck business in the U.S. is such a go to favorite is that, for many years, Tariffs of 25% have been put on small trucks coming into our country. It is called the

In [26]:
# find tweets that could look similar to this one
q = "I am so great and incredible"
dim = 2000
topK = 10
idxs = query(dim, q, topK)

# print the closest matches
# query() returns row positions, so select with .iloc instead of index labels
for text in text_tweets["text"].iloc[idxs]:
    print(text)
    print()

['I', 'am', 'so', 'great', 'and', 'incred']
Ignoring word 'I'
Ignoring word 'am'
Ignoring word 'so'
Ignoring word 'and'

.@seanhannity on @foxandfriends now! Great! 8:18 A.M.

Great couple, great book! https://t.co/cLDI79rin8

Incredible to be with our GREAT HEROES today in California. We will always be with you! https://t.co/B1MCTF83Zf

Gina is Great! https://t.co/TyLQ2W42y5

Great #StateDinner2018 in Ohio tonight! Together, we are MAKING AMERICA GREAT AGAIN! https://t.co/ALU1PHEsvh

Our Country is doing GREAT. Best financial numbers on the Planet. Great to have USA WINNING AGAIN!

Great photo from Ocean City, Maryland. Thank you. MAKE AMERICA GREAT AGAIN! https://t.co/kILZz31yDJ

A great First Lady! https://t.co/Pt35aluI4C

....@NASCAR and Champion @MartinTruex_Jr were recently at the White House. It was a great day for a great sport!

Thank you to all of the incredible law enforcement officers and firefighters in Bethpage, New York. Keep up the great work! https://t.co/SMaZ8Hfas4



In [29]:
# find tweets that could look similar to this one
q = " Trump great incredible perfect best good "
dim = 4000
topK = 10
idxs = query(dim, q, topK)

# print the closest matches
# query() returns row positions, so select with .iloc instead of index labels
for text in text_tweets["text"].iloc[idxs]:
    print(text)
    print()

['trump', 'great', 'incred', 'perfect', 'best', 'good']

.@seanhannity on @foxandfriends now! Great! 8:18 A.M.

Our Country is doing GREAT. Best financial numbers on the Planet. Great to have USA WINNING AGAIN!

Great couple, great book! https://t.co/cLDI79rin8

Incredible to be with our GREAT HEROES today in California. We will always be with you! https://t.co/B1MCTF83Zf

Good (Great) meeting in the Oval Office tonight with the NRA!

“Trump the orator outlines the greatness of America to Democrats’ disgust” https://t.co/XpirrtTY6V

Great to have our incredible First Lady back home in the White House. Melania is feeling and doing really well. Thank you for all of your prayers and best wishes!

A productive dialogue is not only good for the United States and good for Russia, but it is good for the world. #HELSINKI2018 https://t.co/Q2Y1PhM9au

Great to have our incredible First Lady back home in the White House. Melanie is feeling and doing really well. Thank you for all of your prayers 

In [34]:
# Comparing to a Hillary Clinton pinned tweet (she is obviously very proud of this one)
q = "To all the little girls watching never doubt that you are valuable and powerful & deserving of every chance & opportunity in the world ."
dim = 1000
topK = 5
idxs = query(dim, q, topK)

# print the closest matches
# query() returns row positions, so select with .iloc instead of index labels
for text in text_tweets["text"].iloc[idxs]:
    print(text)
    print()

['To', 'all', 'the', 'littl', 'girl', 'watch', 'never', 'doubt', 'that', 'you', 'are', 'valuabl', 'and', 'power', '&', 'deserv', 'of', 'everi', 'chanc', '&', 'opportun', 'in', 'the', 'world', '.']
Ignoring word 'To'
Ignoring word 'all'
Ignoring word 'the'
Ignoring word 'that'
Ignoring word 'you'
Ignoring word 'are'
Ignoring word 'valuable'
Ignoring word 'and'
Ignoring word '&'
Ignoring word 'of'
Ignoring word '&'
Ignoring word 'in'
Ignoring word 'the'
Ignoring word '.'

A Harley-Davidson should never be built in another country-never! Their employees and customers are already very angry at them. If they move, watch, it will be the beginning of the end - they surrendered, they quit! The Aura will be gone and they will be taxed like never before!

If you meet every day with optimism – if you confront every obstacle with determination – if you refuse to give up, if you never quit, if you face every challenge with confidence and pride – then there is no goal you cannot achieve, and no drea

In [35]:
# Comparing to a Hillary Clinton pinned tweet (same query, 4500 dimensions)
q = "To all the little girls watching never doubt that you are valuable and powerful & deserving of every chance & opportunity in the world ."
dim = 4500
topK = 5
idxs = query(dim, q, topK)

# print the closest matches
# query() returns row positions, so select with .iloc instead of index labels
for text in text_tweets["text"].iloc[idxs]:
    print(text)
    print()

['To', 'all', 'the', 'littl', 'girl', 'watch', 'never', 'doubt', 'that', 'you', 'are', 'valuabl', 'and', 'power', '&', 'deserv', 'of', 'everi', 'chanc', '&', 'opportun', 'in', 'the', 'world', '.']
Ignoring word 'To'
Ignoring word 'all'
Ignoring word 'the'
Ignoring word 'that'
Ignoring word 'you'
Ignoring word 'are'
Ignoring word 'valuable'
Ignoring word 'and'
Ignoring word '&'
Ignoring word 'of'
Ignoring word '&'
Ignoring word 'in'
Ignoring word 'the'
Ignoring word '.'

A Harley-Davidson should never be built in another country-never! Their employees and customers are already very angry at them. If they move, watch, it will be the beginning of the end - they surrendered, they quit! The Aura will be gone and they will be taxed like never before!

If you meet every day with optimism – if you confront every obstacle with determination – if you refuse to give up, if you never quit, if you face every challenge with confidence and pride – then there is no goal you cannot achieve, and no drea

In [36]:
# find tweets that could look similar to this one
q = " my car is parked by the wall"
dim = 2000
topK = 10
idxs = query(dim, q, topK)

# print the closest matches
# query() returns row positions, so select with .iloc instead of index labels
for text in text_tweets["text"].iloc[idxs]:
    print(text)
    print()

['my', 'car', 'is', 'park', 'by', 'the', 'wall']
Ignoring word 'my'
Ignoring word 'is'
Ignoring word 'by'
Ignoring word 'the'

This isn’t about the Wall, everybody knows that a Wall will work perfectly (In Israel the Wall works 99.9%). This is only about the Dems not letting Donald Trump &amp; the Republicans have a win. They may have the 10 Senate votes, but we have the issue, Border Security. 2020!

.....on a Border is only effective in conjunction with a Wall. Properly designed and built Walls work, and the Democrats are lying when they say they don’t. In Israel the Wall is 99.9% successful. Will not be any different on our Southern Border! Hundreds of $Billions saved!

We need the Wall for the safety and security of our country. We need the Wall to help stop the massive inflow of drugs from Mexico, now rated the number one most dangerous country in the world. If there is no Wall, there is no Deal!

The Wall is different than the 25 Billion Dollars in Border Security. The complete W

In [38]:
# find tweets that could look similar to this one (same query, 4000 dimensions)
q = " my car is parked by the wall"
dim = 4000
topK = 5
idxs = query(dim, q, topK)

# print the closest matches
# query() returns row positions, so select with .iloc instead of index labels
for text in text_tweets["text"].iloc[idxs]:
    print(text)
    print()

['my', 'car', 'is', 'park', 'by', 'the', 'wall']
Ignoring word 'my'
Ignoring word 'is'
Ignoring word 'by'
Ignoring word 'the'

This isn’t about the Wall, everybody knows that a Wall will work perfectly (In Israel the Wall works 99.9%). This is only about the Dems not letting Donald Trump &amp; the Republicans have a win. They may have the 10 Senate votes, but we have the issue, Border Security. 2020!

.....on a Border is only effective in conjunction with a Wall. Properly designed and built Walls work, and the Democrats are lying when they say they don’t. In Israel the Wall is 99.9% successful. Will not be any different on our Southern Border! Hundreds of $Billions saved!

We need the Wall for the safety and security of our country. We need the Wall to help stop the massive inflow of drugs from Mexico, now rated the number one most dangerous country in the world. If there is no Wall, there is no Deal!

The Wall is different than the 25 Billion Dollars in Border Security. The complete W

In [40]:
# find tweets that could look similar to this one
q = " Michelle and I send our condolences to the people of New Zealand . We grieve with you and the Muslim community . All of us must stand against hatred in all its forms ."
dim = 4000
topK = 10
idxs = query(dim, q, topK)

# print the closest matches
# query() returns row positions, so select with .iloc instead of index labels
for text in text_tweets["text"].iloc[idxs]:
    print(text)
    print()

['michel', 'and', 'I', 'send', 'our', 'condol', 'to', 'the', 'peopl', 'of', 'new', 'zealand', '.', 'We', 'griev', 'with', 'you', 'and', 'the', 'muslim', 'commun', '.', 'all', 'of', 'us', 'must', 'stand', 'against', 'hatr', 'in', 'all', 'it', 'form', '.']
Ignoring word 'and'
Ignoring word 'I'
Ignoring word 'our'
Ignoring word 'to'
Ignoring word 'the'
Ignoring word 'of'
Ignoring word 'Zealand'
Ignoring word '.'
Ignoring word 'We'
Ignoring word 'with'
Ignoring word 'you'
Ignoring word 'and'
Ignoring word 'the'
Ignoring word 'Muslim'
Ignoring word '.'
Ignoring word 'All'
Ignoring word 'of'
Ignoring word 'against'
Ignoring word 'in'
Ignoring word 'all'
Ignoring word 'its'
Ignoring word '.'

Mexico, whose laws on immigration are very tough, must stop people from going through Mexico and into the U.S. We may make this a condition of the new NAFTA Agreement. Our Country cannot accept what is happening! Also, we must get Wall funding fast.

We send our deepest condolences to Lou and the entire 