In [14]:
import re
import tweepy
import numpy as np
from tweepy import OAuthHandler
import os

In [15]:
%load_ext dotenv
%dotenv ./auth.txt

In [16]:


class TwitterClient(object):
    def __init__(self):
        try:
            self.auth = OAuthHandler(os.getenv('api_key'), os.getenv('api_secret'))
            self.auth.set_access_token(os.getenv('oauth_token'), os.getenv('oauth_token_secret'))
            self.api = tweepy.API(self.auth)
            assert self.api
        except:
            print("Error: Authentication Failed")
    
    
    def get_tweets(self, query, count = 10):
        tweets = []
        try:
            fetched_tweets = self.api.search_tweets(q = query, count = count)
            for tweet in fetched_tweets:
                parsed_tweet = {}
                parsed_tweet['text'] = tweet.text
                parsed_tweet['sentiment'] = self.get_tweet_sentiment(tweet.text)
                if tweet.retweet_count > 0:
                    if parsed_tweet not in tweets:
                        tweets.append(parsed_tweet)
                else:
                    tweets.append(parsed_tweet)
            return tweets
        except tweepy.TweepyException as e:
            print("Error : " + str(e))

    def fetch_tweets(self, query, count = 10):
        try:
            return self.api.search_tweets(q = query, count = count)
        except tweepy.TweepyException as e:
            print("Error : " + str(e))  

In [22]:
api = TwitterClient()

In [23]:
with open('data.txt', 'w') as f:
    tweets = api.get_tweets('#Rhea', 20000)
    for tweet in tweets:
        f.write(tweet + "_$_")

AttributeError: 'TwitterClient' object has no attribute 'get_tweet_sentiment'

# Read Data

In [1]:
f = open('data.txt', 'r')
tweets = f.read().split('_$_')
f.close()

# Preprocess

In [26]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk import download
from collections import Counter

# download('stopwords')

def process_tweet(tweet):
    '''
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    '''
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    tweets_eng = []
    for word in tweets_clean:
        flag = True
        for i in word:
            if ord(i) >= 256:
                flag = False
                break
        if flag:
            tweets_eng.append(word)
    return tweets_eng

In [27]:
processed_tweets = [process_tweet(tweet) for tweet in tweets]
while [] in processed_tweets:
    processed_tweets.remove([])

In [28]:
len(processed_tweets)

66

In [29]:
def count_words(tweet:list, freqs:dict, wordToTweet:dict):
    for word in tweet:
        if word in freqs:
            freqs[word] += 1
            wordToTweet[word].append(tweet)
        else:
            freqs[word] = 1
            wordToTweet[word] = [tweet]
    return freqs

In [30]:
freqs = dict()
wordToTweet = dict()
for tweet in processed_tweets:
    count_words(tweet, freqs, wordToTweet)

In [31]:
# freqs

In [32]:
freq_sorted = list(freqs.items())
freq_sorted.sort(key = lambda x : -x[1])
freq_sorted[:10]

[('rhea', 50),
 ('fe3h', 16),
 ('...', 9),
 ('fireemblem', 9),
 ('ke', 8),
 ('fireemblemthreehous', 7),
 ('seteth', 5),
 ('sushantsinghrajput', 4),
 ('one', 4),
 ('titan', 4)]

In [33]:
def vectorize(tweet, alphabet):
    v = np.zeros(len(alphabet))
    for i in range(len(alphabet)):
        if alphabet[i] in tweet:
            v[i] += 1
    return v
def closestCluster(vector, centroids):
    closest = -1
    minDist = 2**30
    for key in centroids:
        dist = np.linalg.norm(centroids[key] - vector)
        if dist < minDist:
            minDist = dist
            closest = key
    return closest
def assignToCluster(clusters, vectors, centroids):
    for i in range(len(vectors)):
        c = closestCluster(vectors[i], centroids)
        clusters[c].append(i)
    return clusters
def kmeans(k, max_iter, vectors):
    clusters = {}
    centroids = {}
    idx = np.random.choice(len(vectors), k, replace=False)
    for i in range(k):
        clusters[i] = []
        centroids[i] = vectors[idx[i]] 
    clusters = assignToCluster(clusters, vectors, centroids)
    for _ in range(max_iter-1):
        for i in range(k):
            for j in clusters[i]:
                centroids[i] = centroids[i] + vectors[j]
            if clusters[i] != []:
                centroids[i] = centroids[i] / len(clusters[i])
            if len(clusters[i]):
                clusters[i].clear()
        clusters = assignToCluster(clusters, vectors, centroids)
    return clusters


def len_counts(clusters):
    lens = [len(cluster) for cluster in clusters.values()]
    return dict(Counter(lens))


def display_unique_tweets(tweets, cluster):
    c_tweets = [tweets[i] for i in cluster]
    for i in range(len(c_tweets)):
        if not c_tweets[i] in c_tweets[:i]:
            print(c_tweets[i])

In [34]:
alphabet = [i[0] for i in freq_sorted]
clusters = dict()
vectors = dict()
for i in range(len(processed_tweets)):
    vectors[i] = vectorize(processed_tweets[i], alphabet)
clusters[0] = [i for i in range(len(processed_tweets))]

In [35]:
clusters = kmeans(20, 100, vectors)

In [36]:
clusters

{0: [],
 1: [],
 2: [],
 3: [24, 41, 42],
 4: [0,
  1,
  2,
  4,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  26,
  28,
  29,
  30,
  32,
  36,
  39,
  40,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  61,
  62,
  63,
  64,
  65],
 5: [],
 6: [],
 7: [],
 8: [],
 9: [],
 10: [],
 11: [],
 12: [],
 13: [],
 14: [],
 15: [],
 16: [],
 17: [3, 5, 6, 20, 21, 22, 23, 25, 27, 31, 33, 34, 35, 37, 38, 60],
 18: [],
 19: []}

In [37]:
print(len(alphabet))

384


In [38]:
print("lengths:")
print({cluster:len(clusters[cluster]) for cluster in clusters})
print("len counts:")
print(len_counts(clusters))

lengths:
{0: 0, 1: 0, 2: 0, 3: 3, 4: 47, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 16, 18: 0, 19: 0}
len counts:
{0: 17, 3: 1, 47: 1, 16: 1}


In [39]:
display_unique_tweets(processed_tweets, clusters[17])

['beagl', 'run', 'part', '4', 'seteth', 'neat', 'byleth', 'setleth', 'rhea', 'rhealeth', 'fe3h', 'fireemblemthreehous']
['seteth', 'rhea', 'fe3h']
['children', 'goddess', 'go', 'trick', 'treat', 'fe3h', 'fehero', 'fireemblem', 'rhea', 'seteth']
['btw', 'final', 'cave', 'wrote', 'pine', 'rheagard', '<3', '3', 'rheagard', 'edelgard', 'rhea', 'fe3h']
['seiro', 'fire', 'emblem', 'feel', 'fe3h', 'rhea']
['->', 'fe3h', 'fire_emblem', 'rhea']
['get', 'loser', 'go', 'church', 'fe3h', 'seteth', 'flayn', 'rhea']
['immacul', 'one', 'fe3h', 'rhea', 'archbishoprhea']
['ladi', 'rhea', 'fanart', 'wip', 'rhea', 'fireemblem', 'fe3h', 'fehero', 'feh', 'jrpg']
['new', 'year', 'new', 'draw', 'manga', 'mangaka', 'anim', 'animeart', 'mangaart', 'fe3h', 'fireemblemthreehous', 'fireemblem', 'nintendo']
["he'", 'gone', 'mother', '...', 'rhea', 'fe3h']
['rhea', 'definit', 'problem', 'fireemblem', 'threehous', 'fe3h', 'fireemblemthreehous', 'rhea']
['halloween', 'rhea', 'beauti', '...', 'feh', 'rhea', 'fireemble