In [1]:
import re
import tweepy
import numpy as np
from tweepy import OAuthHandler
import os

In [2]:
# Load the `dotenv` IPython extension and point it at ./auth.txt.
# The Twitter credentials are later read with os.getenv (api_key, api_secret,
# oauth_token, oauth_token_secret), so they are presumably defined in that
# file rather than hard-coded in the notebook.
%load_ext dotenv
%dotenv ./auth.txt

In [3]:
class TwitterClient(object):
    """Small convenience wrapper around a ``tweepy.API`` handle.

    Credentials are taken from the environment variables ``api_key``,
    ``api_secret``, ``oauth_token`` and ``oauth_token_secret``.
    """

    def __init__(self):
        """Create the OAuth handler and the ``tweepy.API`` object.

        On a Tweepy error a message is printed and ``self.api`` is left unset.
        """
        try:
            self.auth = OAuthHandler(os.getenv('api_key'), os.getenv('api_secret'))
            self.auth.set_access_token(os.getenv('oauth_token'), os.getenv('oauth_token_secret'))
            self.api = tweepy.API(self.auth)
        # The API methods used below (search_tweets, get_place_trends) exist
        # only in Tweepy v4, where the base exception is TweepyException.
        except tweepy.TweepyException:
            print("Error: Authentication Failed")

    def get_tweets(self, query, count = 10):
        """Return the distinct tweet texts matching ``query``.

        Args:
            query: search string passed to ``search_tweets`` as ``q``.
            count: forwarded to ``search_tweets`` as ``count``.
                NOTE(review): a single search request is presumably capped by
                the API, so very large values may not be honoured - confirm.

        Returns:
            A list of unique tweet texts in the order they were returned.
            If the request fails, the error is printed and the (normally
            empty) list collected so far is returned instead of ``None``.
        """
        tweets = []
        seen = set()  # O(1) duplicate check instead of scanning the list
        try:
            fetched_tweets = self.api.search_tweets(q = query, count = count)
            for tweet in fetched_tweets:
                text = tweet.text
                if text not in seen:
                    seen.add(text)
                    tweets.append(text)
        except tweepy.TweepyException as e:
            print("Error : " + str(e))
        return tweets

    def get_trending_tags(self, woeid = 1):
        """Return the names of the trending topics that are hashtags.

        Args:
            woeid: place identifier passed to ``get_place_trends``; defaults
                to 1, the value this notebook has always used.

        Returns:
            A list of trend names starting with ``'#'``.
        """
        response = self.api.get_place_trends(woeid)
        trends = response[0]['trends']
        # startswith() is safe for an empty name, unlike indexing name[0].
        return [trend['name'] for trend in trends if trend['name'].startswith('#')]

In [4]:
# Instantiate the client (credentials come from the environment) and show the
# hashtags currently trending for place id 1.
api = TwitterClient()
# api.get_tweets(query = '#chess', count = 200)
api.get_trending_tags()

['#AEWDynamite', '#Survivor', '#LakeShow', '#FestaAFazenda', '#VijayDiwas']

In [5]:
# Fetch BEFORE opening the file: opening with 'w' truncates data.txt, so a
# failed or empty fetch must not be allowed to wipe previously saved tweets.
tweets = api.get_tweets('#FarmLaws', 20000)
if tweets:
    with open('data.txt', 'w') as f:
        for tweet in tweets:
            # "_$_" is the record separator expected by the reader cell.
            f.write(tweet + "_$_")
else:
    print("No tweets fetched; data.txt left unchanged")

# Read Data

In [6]:
# Read the cached tweets back. The context manager closes the file even if
# reading fails, and empty fragments (the writer ends every record, including
# the last one, with "_$_") are dropped so no blank tweet is produced.
with open('data.txt', 'r') as f:
    tweets = [text for text in f.read().split('_$_') if text]

# Preprocess

In [7]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk import download
# download('stopwords')

def process_tweet(tweet):
    '''
    Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_eng: a list of stemmed, lower-cased tokens with tickers,
            the leading "RT" marker, hyperlinks, '#' signs, @handles,
            English stopwords, single punctuation characters and any token
            containing a character with code point >= 256 removed

    '''
    stemmer = PorterStemmer()
    # A set gives O(1) membership tests; the list was scanned for every token.
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks -- only the URL itself; the previous pattern used
    # ".*" and therefore also deleted every word following a link on that line
    tweet = re.sub(r'https?://\S+', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets (lower-cases, strips @handles, shortens repeated chars)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        # NOTE: `in string.punctuation` is a substring test, so multi-character
        # tokens such as '...' are kept; this matches the existing behaviour.
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            tweets_clean.append(stemmer.stem(word))  # stemming word

    # keep only tokens whose characters all have code points below 256
    tweets_eng = [word for word in tweets_clean
                  if all(ord(ch) < 256 for ch in word)]
    return tweets_eng

In [8]:
# Preprocess every tweet and drop those left with no tokens, in a single pass
# (repeated `[] in list` / `list.remove([])` rescans the list each time).
processed_tweets = [tokens for tokens in map(process_tweet, tweets) if tokens]

In [9]:
# Number of tweets that still contain at least one token after preprocessing.
len(processed_tweets)

39

In [10]:
def count_words(tweet:list, freqs:dict, wordToTweet:dict):
    """Fold one tokenised tweet into the running corpus statistics.

    Both dictionaries are updated in place:
      * ``freqs[token]`` is incremented once per occurrence of ``token``.
      * ``wordToTweet[token]`` gains a reference to ``tweet`` once per
        occurrence (so a token repeated in a tweet lists that tweet twice).

    Returns the same ``freqs`` object that was passed in.
    """
    for token in tweet:
        if token not in freqs:
            # first sighting of this token anywhere in the corpus
            freqs[token] = 1
            wordToTweet[token] = [tweet]
            continue
        freqs[token] += 1
        wordToTweet[token].append(tweet)
    return freqs

In [11]:
# Accumulate corpus-wide statistics over all processed tweets:
#   freqs       : token -> number of occurrences
#   wordToTweet : token -> list of the token lists it occurs in
freqs = dict()
wordToTweet = dict()
for tweet in processed_tweets:
    count_words(tweet, freqs, wordToTweet)

In [12]:
# Inspect the token -> occurrence-count table.
freqs

{'touch': 2,
 'moment': 2,
 '...': 2,
 'daughter': 2,
 'hug': 2,
 'father': 2,
 'return': 4,
 'delhi': 3,
 'win': 2,
 'long': 4,
 'battl': 2,
 'farmer': 11,
 'victori': 2,
 'farmersprotest_fatehma': 1,
 'farmlaw': 22,
 'backbon': 1,
 'india': 5,
 "can't": 1,
 'decid': 1,
 'crop': 1,
 'cost': 1,
 'incredibleindia': 1,
 'ye': 1,
 'anoth': 1,
 'bill': 1,
 'like': 2,
 'indian': 1,
 'gather': 1,
 'voic': 1,
 'reveal': 1,
 'true': 1,
 'valu': 1,
 'produc': 1,
 'adjust': 1,
 'ineffici': 1,
 'leakag': 1,
 'loss': 1,
 'given': 1,
 'decent': 1,
 'burial': 1,
 'deserv': 1,
 'govern': 1,
 'bid': 1,
 'food-gr': 1,
 'farm': 2,
 'modi': 1,
 'exclus': 2,
 'msp': 3,
 'http': 1,
 ':/': 1,
 '5': 1,
 '10': 1,
 'live': 1,
 'baattochubhegi': 1,
 'justiceforlakhimpurfarm': 2,
 'mera': 1,
 'bhai': 1,
 'maar': 1,
 'diya': 1,
 'gund': 1,
 'ne': 1,
 'chief': 1,
 'economist': 1,
 'deputi': 1,
 'md': 1,
 'imf': 1,
 'geetagopinath': 1,
 'support': 1,
 'anti': 1,
 'peopl': 4,
 'tikait': 2,
 'wan': 1,
 'haryana': 1,


In [13]:
# Rank the vocabulary by descending count (ties keep insertion order because
# the sort is stable) and show the ten most frequent tokens.
freq_sorted = sorted(freqs.items(), key = lambda pair : -pair[1])
freq_sorted[:10]

[('farmlaw', 22),
 ('farmer', 11),
 ('india', 5),
 ('return', 4),
 ('long', 4),
 ('peopl', 4),
 ('ki', 4),
 ('delhi', 3),
 ('msp', 3),
 ('leader', 3)]

# Clustering

In [14]:
def vectorize(tweet, alphabet):
    """Encode ``tweet`` as a binary presence vector over ``alphabet``.

    Position ``i`` of the returned float array is 1.0 when ``alphabet[i]``
    is contained in ``tweet`` and 0.0 otherwise; repeated tokens do not
    raise the value above 1.
    """
    presence = [float(term in tweet) for term in alphabet]
    return np.array(presence, dtype=float)

In [15]:
def closestCluster(vector, centroids):
    """Return the key of the centroid nearest to ``vector``.

    Args:
        vector: NumPy array for the point being assigned.
        centroids: mapping of cluster key -> centroid array.

    Returns:
        The key with the smallest Euclidean distance to ``vector``; on ties
        the first such key in iteration order wins. Returns -1 when
        ``centroids`` is empty.
    """
    closest = -1
    # Start from infinity rather than a fixed constant, so points that are
    # arbitrarily far from every centroid are still assigned to one.
    minDist = float("inf")
    for key, centroid in centroids.items():
        dist = np.linalg.norm(centroid - vector)
        if dist < minDist:
            minDist = dist
            closest = key
    return closest

In [16]:
def assignToCluster(clusters, vectors, centroids):
    """Append every vector index to the member list of its nearest centroid.

    ``vectors`` is indexed with the integers ``0 .. len(vectors) - 1``.
    ``clusters`` (cluster key -> list of indices) is mutated in place and
    also returned.
    """
    total = len(vectors)
    for position in range(total):
        nearest = closestCluster(vectors[position], centroids)
        clusters[nearest].append(position)
    return clusters

In [17]:
def kmeans(k, max_iter, vectors, seed=None):
    """Partition ``vectors`` into ``k`` clusters with k-means.

    Args:
        k: number of clusters; must not exceed ``len(vectors)``.
        max_iter: maximum number of assignment passes (including the first).
        vectors: NumPy arrays indexable by the integers
            ``0 .. len(vectors) - 1`` (a list, or a dict with those keys).
        seed: optional seed for the choice of initial centroids. When
            ``None`` the global NumPy RNG is used, as before.

    Returns:
        dict mapping cluster id ``0 .. k-1`` to the list of indices of the
        vectors assigned to it.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Initial centroids are k distinct input vectors.
    idx = rng.choice(len(vectors), k, replace=False)
    centroids = {i: vectors[idx[i]] for i in range(k)}
    clusters = assignToCluster({i: [] for i in range(k)}, vectors, centroids)

    for _ in range(max_iter - 1):
        for i in range(k):
            members = clusters[i]
            # An empty cluster keeps its previous centroid; resetting it to
            # the origin would move it to an arbitrary location. The mean is
            # computed from the vectors themselves, so no notebook-global
            # (the former `alphabet` lookup) is needed for the dimension.
            if members:
                centroids[i] = sum(vectors[j] for j in members) / len(members)

        previous = clusters
        clusters = assignToCluster({i: [] for i in range(k)}, vectors, centroids)
        # Unchanged assignments mean unchanged centroids: converged, so the
        # remaining iterations would only repeat the same result.
        if clusters == previous:
            break
    return clusters

In [18]:
# Vocabulary ordered by descending frequency; its length fixes the dimension
# of every tweet vector.
alphabet = [entry[0] for entry in freq_sorted]
vectors = {}
clusters = {}
# One binary presence vector per processed tweet, keyed by tweet index.
for i in range(len(processed_tweets)):
    vectors[i] = vectorize(processed_tweets[i], alphabet)
# Start with every tweet in a single cluster.
clusters[0] = list(range(len(processed_tweets)))

In [19]:
# Seed NumPy's global RNG so the randomly chosen initial centroids, and hence
# the resulting clusters, are reproducible on Restart & Run All.
np.random.seed(42)
# 4 clusters, at most 100 assignment passes.
clusters = kmeans(4, 100, vectors)

In [20]:
# Cluster id -> list of indices into processed_tweets.
clusters

{0: [23],
 1: [0,
  1,
  2,
  3,
  4,
  6,
  7,
  8,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38],
 2: [5],
 3: [9]}

In [21]:
# Vocabulary size, i.e. the length of every tweet vector.
print(len(alphabet))

208


In [26]:
# Peek at the token list of the first processed tweet.
processed_tweets[0]

['touch',
 'moment',
 '...',
 'daughter',
 'hug',
 'father',
 'return',
 'delhi',
 'win',
 'long',
 'battl']

In [23]:
# Replace each cluster's member list with a NumPy array of the same indices.
for key in list(clusters.keys()):
    clusters[key] = np.array(clusters[key])

In [24]:
# Same clusters, with each member list now stored as a NumPy array.
clusters

{0: array([23]),
 1: array([ 0,  1,  2,  3,  4,  6,  7,  8, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38]),
 2: array([5]),
 3: array([9])}

In [25]:
# Total number of processed tweets (the cluster members index into this list).
len(processed_tweets)

39