In [1]:
import os
import re
import tweepy
from tweepy import OAuthHandler
from textblob import TextBlob

In [2]:
%load_ext dotenv
%dotenv ./auth.txt

In [3]:


class TwitterClient(object):
    def __init__(self):
        try:
            self.auth = OAuthHandler(os.getenv('api_key'), os.getenv('api_secret'))
            self.auth.set_access_token(os.getenv('oauth_token'), os.getenv('oauth_token_secret'))
            self.api = tweepy.API(self.auth)
            assert self.api
        except:
            print("Error: Authentication Failed")
    
    
    def get_tweets(self, query, count = 10):
        tweets = []
        try:
            fetched_tweets = self.api.search_tweets(q = query, count = count)
            for tweet in fetched_tweets:
                parsed_tweet = {}
                parsed_tweet['text'] = tweet.text
                parsed_tweet['sentiment'] = self.get_tweet_sentiment(tweet.text)
                if tweet.retweet_count > 0:
                    if parsed_tweet not in tweets:
                        tweets.append(parsed_tweet)
                else:
                    tweets.append(parsed_tweet)
            return tweets
        except tweepy.TweepyException as e:
            print("Error : " + str(e))

    def fetch_tweets(self, query, count = 10):
        try:
            return self.api.search_tweets(q = query, count = count)
        except tweepy.TweepyException as e:
            print("Error : " + str(e))  

In [4]:
tc = TwitterClient()
#tc.fetch_tweets('#FarmLaws',1)

In [5]:
## The tweets have been fetched and stored in a file. 
## Use the cached tweets for consistent results instead of fetching new from twitter.
with open('farmer.txt', 'w') as f:
    tweets = tc.fetch_tweets('#FarmLaws',8000)
    for tweet in tweets:
        f.write(tweet.text + "_$_")

# 1. Data Preparation

## 1.1 Fetch from file

In [6]:

f = open('farmer.txt', 'r')
tweets = f.read().split('_$_')
f.close()

In [7]:
tweets[0]

'RT @ajitanjum: ग़ाज़ीपुर बॉर्डर पर जश्न की रात .\nरवानगी से पहले जीत का जश्न मनाते किसानों की बॉर्डर पर ये आखिरी रात है .\nएक साल यूं ही सड़क…'

# 1.2 Preprocess each tweet

In [8]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk import download
# download('stopwords')

def process_tweet(tweet):
    '''
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    '''
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    tweets_eng = []
    for word in tweets_clean:
        flag = True
        for i in word:
            if ord(i) >= 256:
                flag = False
                break
        if flag:
            tweets_eng.append(word)
    return tweets_eng

In [9]:
processed_tweets = [process_tweet(tweet) for tweet in tweets]
print(tweets[0])
print(processed_tweets[0])

RT @ajitanjum: ग़ाज़ीपुर बॉर्डर पर जश्न की रात .
रवानगी से पहले जीत का जश्न मनाते किसानों की बॉर्डर पर ये आखिरी रात है .
एक साल यूं ही सड़क…
[]


# 2. Feature Extraction

## 2.1 Word counts  

In [10]:
def count_words(tweet:list, freqs:dict, wordToTweet:dict):
    for word in tweet:
        if word in freqs:
            freqs[word] += 1
            wordToTweet[word].append(tweet)
        else:
            freqs[word] = 1
            wordToTweet[word] = [tweet]
    return freqs

In [11]:
freqs = dict()
wordToTweet = dict()
for tweet in processed_tweets:
    count_words(tweet, freqs, wordToTweet)

In [12]:
max(freqs.values())

38

# 3. Analysis

In [13]:
#!pip3 install numpy pandas seaborn
import numpy as np
import pandas as pd
import seaborn as sb

In [14]:
# freqs.keys()
# list(freqs.items())

In [15]:
len(freqs.keys())

121

In [16]:
most_freq_word = np.argmax(list(freqs.values()))
list(freqs.items())[most_freq_word]

('farmlaw', 38)

In [17]:

# Sorted in descending order of frequencies
freq_sorted = list(freqs.items())
freq_sorted.sort(key = lambda x : -x[1])
freq_sorted[:10]

[('farmlaw', 38),
 ('farmersprotest', 21),
 ('farmer', 18),
 ('farmlawsrep', 16),
 ('farm', 14),
 ('usergener', 10),
 ('live', 8),
 ('baattochubhegi', 8),
 ('tribal', 8),
 ('farmerswon', 6)]

# Clustering

In [18]:
def vectorize(tweet, alphabet):
    v = np.zeros(len(alphabet))
    for i in range(len(alphabet)):
        if alphabet[i] in tweet:
            v[i] += 1
    return v

def normalize(v):
    norm = np.power(np.sum(np.power(v,2)), 0.5)
    if norm == 0:
        return 0
    return v / norm

In [19]:
from collections import Counter
def len_counts(clusters):
    lens = [len(cluster) for cluster in clusters.values()]
    return dict(Counter(lens))

In [20]:
def get_avg_dist(clusterA, clusterB, vectors):
    dist_sum = 0
    for i in clusterA:
        for j in clusterB:
            dist_sum += np.sum(np.power(vectors[i] - vectors[j], 2))
    
    dist_avg = dist_sum / (len(clusterA) * len(clusterB))
    return dist_avg

In [21]:
def mergeOnce(clusters, vectors):
    min_dist = 2**30
    min_loc = (0, 0)
    for clusterInxA in clusters:
        for clusterInxB in clusters:
            if clusterInxA != clusterInxB:
                dist = get_avg_dist(clusters[clusterInxA], clusters[clusterInxB], vectors)
                if dist < min_dist:
                    min_loc = (clusterInxA, clusterInxB)
                    min_dist = dist

    clusters[min_loc[0]] = clusters[min_loc[0]] + clusters[min_loc[1]]
    clusters.pop(min_loc[1])

def mergeToK(clusters, vectors, K):
    while len(clusters.keys()) > K:
        mergeOnce(clusters, vectors)

In [22]:
def display_unique_tweets(tweets, cluster):
    c_tweets = [tweets[i] for i in cluster]
    for i in range(len(c_tweets)):
        if not c_tweets[i] in c_tweets[:i]:
            print(c_tweets[i])
        

In [30]:
alphabet = [i[0] for i in freq_sorted]
n = len(processed_tweets)
m = len(alphabet)
clusters = dict()
vectors = dict()
for i in range(n):
    vectors[i] = vectorize(processed_tweets[i], alphabet)
    clusters[i] = [i]
mergeToK(clusters, vectors, 15)


In [32]:
# print("lengths:")
# print({cluster:len(clusters[cluster]) for cluster in clusters})
# print("len counts:")
# print(len_counts(clusters))
print(clusters)

{0: [0, 1, 3, 4, 5, 7, 9, 13, 17, 20, 25, 31, 33, 36, 42, 45, 47, 51, 52, 55, 56, 61, 62, 63, 64, 69, 71, 72, 75, 76, 78, 81, 82, 83, 90, 91, 92, 94, 96, 100, 98, 19, 49, 39, 21, 29, 34, 35, 74, 77, 85, 97, 22, 10, 23, 27, 32, 37, 48, 50, 59, 65, 67, 87, 93, 99], 2: [2, 15, 24, 28, 44, 46, 53, 57, 70, 84, 6, 8, 11, 58, 86, 88, 60], 12: [12, 26], 14: [14], 16: [16], 18: [18, 66], 30: [30], 38: [38], 40: [40], 41: [41, 73, 79, 89], 43: [43], 54: [54], 68: [68], 80: [80], 95: [95]}


In [36]:
display_unique_tweets(processed_tweets, clusters[2])

['farmer', 'farmersprotest', 'farmlaw', 'usergener']
['farmersprotest', 'farmlaw']
['farmerswon', 'farmlaw', 'farmersprotest']
['farmersprotest', 'farmerswon', 'farmlaw']
['...', 'farmlaw', 'farmersprotest', 'kisanandolan']


# Divisive Clustering

In [39]:
def vectorize(tweet, alphabet):
    v = np.zeros(len(alphabet))
    for i in range(len(alphabet)):
        if alphabet[i] in tweet:
            v[i] += 1
    return v

def normalize(v):
    norm = np.power(np.sum(np.power(v,2)), 0.5)
    if norm == 0:
        return 0
    return v / norm

In [97]:
def findMostDistant(cluster, vectors):
    if len(cluster)<=1:
        return -1
    maxDist = -1
    item = 0
    n = len(cluster)
    for i in cluster:
        dist = 0
        for j in cluster:
            dist += (np.sum(np.power(vectors[i] - vectors[j], 2)))**0.5
        dist /= n
        if dist>maxDist:
            maxDist = dist
            item = i
    return item

In [49]:
def isCloserToFirstCluster(cluster1, cluster2, item, vectors):
    dist1 = 0
    dist2 = 0
    n1 = len(cluster1)
    n2 = len(cluster2)
    for i in cluster1:
        dist1 += (np.sum(np.power(vectors[i] - vectors[item], 2)))**0.5
    dist1 /= n1
    for i in cluster2:
        dist2 += (np.sum(np.power(vectors[i] - vectors[item], 2)))**0.5
    dist2 /= n2
    if dist1<=dist2:
        return True
    return False

In [106]:
def dClusteringOnce(clusters, vectors):
    tmp = []
    for key in clusters:
        tmp.append(key)
    for key in tmp:
        item = findMostDistant(clusters[key], vectors)
        if item == -1:
            continue
        if item==key:
            item = clusters[key][1]
            clusters[item] = clusters[key][1:]
            for i in clusters[key][1:]:
                clusters[key].remove(i)
            continue
        clusters[item] = [item]
        clusters[key].remove(item)
        for i in clusters[key]:
            if isCloserToFirstCluster(clusters[item], clusters[key], i, vectors):
                clusters[item].append(i)
                clusters[key].remove(i)

In [128]:
def dClusteringMinK(clusters, vectors, k):
    while len(clusters)<k:
        dClusteringOnce(clusters, vectors)

In [61]:
while [] in processed_tweets:
    processed_tweets.remove([])
print(processed_tweets)

[['farmer', 'farmersprotest', 'farmlaw', 'usergener'], ['farmersprotest', 'farmlaw'], ['farmersprotest', 'farmlaw'], ['farmlawsrep', 'farmlaw', 'farm'], ['farmersprotest', 'farmlaw'], ['come', 'meet', 'parti', 'leader', 'start', 'parliament', 'session', 'come', 'loksabha'], ['tu', 'aur', 'tere', 'jais', 'laakh', 'bhatka', 'lain', 'ab', 'baar', 'koi', 'bhi', 'voter', 'demonetis', 'caa', 'nrc', 'farmlaw'], ['farmer', 'farmersprotest', 'farmlaw', 'usergener'], ['ahead', 'centr', 'brought', 'farmlaw', 'interest', 'farmer', 'farmer', 'union', 'happi', 'govern'], ['...', 'farmer', 'start', 'remov', 'makeshift', 'tent', 'skm', 'announc', 'decis', 'su'], ['cartoon', 'farmlaw'], ['live', 'baattochubhegi'], ['reporterdiari', 'farm'], ['farmlawsrep', 'farmlaw', 'farm'], ['farmer', 'farmersprotest', 'farmlaw', 'usergener'], ['come', 'meet', 'parti', 'leader', 'start', 'parliament', 'session', 'come', 'loksabha'], ['farmlawsrep', 'farmlaw', 'farm'], ['farmer', 'farmersprotest', 'farmlaw', 'usergene

In [126]:
alphabet = [i[0] for i in freq_sorted]
clusters = dict()
vectors = dict()
for i in range(len(processed_tweets)):
    vectors[i] = vectorize(processed_tweets[i], alphabet)
clusters[0] = [i for i in range(len(processed_tweets))]
print(clusters)
print(vectors)

{0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60]}
{0: array([1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.]), 1: array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [129]:
dClusteringMinK(clusters, vectors, 7)
print(clusters)
print(len(clusters))

{0: [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 28, 29, 30, 31, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 49, 50, 51, 52, 53, 54, 56, 58, 59, 60], 6: [6], 19: [19], 26: [26], 57: [57], 27: [27], 35: [35], 45: [45, 48, 55]}
8
