In [1]:
import os
import re
import tweepy
from tweepy import OAuthHandler
from textblob import TextBlob

In [2]:
%load_ext dotenv
%dotenv ./auth.txt

In [3]:


class TwitterClient(object):
    def __init__(self):
        try:
            self.auth = OAuthHandler(os.getenv('api_key'), os.getenv('api_secret'))
            self.auth.set_access_token(os.getenv('oauth_token'), os.getenv('oauth_token_secret'))
            self.api = tweepy.API(self.auth)
            assert self.api
        except:
            print("Error: Authentication Failed")
    
    
    def get_tweets(self, query, count = 10):
        tweets = []
        try:
            fetched_tweets = self.api.search_tweets(q = query, count = count)
            for tweet in fetched_tweets:
                parsed_tweet = {}
                parsed_tweet['text'] = tweet.text
                parsed_tweet['sentiment'] = self.get_tweet_sentiment(tweet.text)
                if tweet.retweet_count > 0:
                    if parsed_tweet not in tweets:
                        tweets.append(parsed_tweet)
                else:
                    tweets.append(parsed_tweet)
            return tweets
        except tweepy.TweepyException as e:
            print("Error : " + str(e))

    def fetch_tweets(self, query, count = 10):
        try:
            return self.api.search_tweets(q = query, count = count)
        except tweepy.TweepyException as e:
            print("Error : " + str(e))  

In [4]:
tc = TwitterClient()
#tc.fetch_tweets('#FarmLaws',1)

In [41]:
## The tweets have been fetched and stored in a file. 
## Use the cached tweets for consistent results instead of fetching new from twitter.
with open('divtext.txt', 'w') as f:
    tweets = tc.fetch_tweets('#messi',8000)
    for tweet in tweets:
        f.write(tweet.text + "_$_")

# 1. Data Preparation

## 1.1 Fetch from file

In [42]:

f = open('divtext.txt', 'r')
tweets = f.read().split('_$_')
f.close()

In [43]:
tweets

['RT @ActualiteBarca: 💣  Nouvelle vidéo ! \n\nLe #Barca en négociations avec CVC, si le deal se fait, le club pourra recruter «\xa0gros\xa0» en été.…',
 'RT @365Scores: 🚨Special Announcement: Our sharp minds at #365Scores have decided to pitch-in ALL our opinions and pick OUR #PlayerOfTheYear…',
 "#Aguero si ritira: arriva il commovente messaggio di #Messi: Il sette volte Pallone d'oro ha voluto omaggiare il su… https://t.co/d12QKGywVL",
 'RT @messi10_rey: Leo #Messi🗣️: Prácticamente toda una carrera juntos, Kun… Vivimos momentos muy lindos y otros q no lo fueron tanto, todos…',
 '#Messi despidió a su mejor amigo dentro de la cancha: "Voy a extrañar muchísimo estar con vos"\n\nhttps://t.co/xMmadqcCWc',
 'RT @tonydebiase10: El Mural más grande del mundo para el mejor jugador del mundo\n#Messi \n#rosario\n#MessiahHasCome \n#messi7 https://t.co/Ipi…',
 'RT @PabloFMarino: "Maradona conquistó Italia, Messi lo hizo en España y Agüero lo hizo en Inglaterra"...\nUna frase espectacular de Pep G

# 1.2 Preprocess each tweet

In [44]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk import download
# download('stopwords')

def process_tweet(tweet):
    '''
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    '''
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    tweets_eng = []
    for word in tweets_clean:
        flag = True
        for i in word:
            if ord(i) >= 256:
                flag = False
                break
        if flag:
            tweets_eng.append(word)
    return tweets_eng

In [45]:
processed_tweets = [process_tweet(tweet) for tweet in tweets]
print(tweets[0])
print(processed_tweets[0])

RT @ActualiteBarca: 💣  Nouvelle vidéo ! 

Le #Barca en négociations avec CVC, si le deal se fait, le club pourra recruter « gros » en été.…
['nouvel', 'vidéo', 'le', 'barca', 'en', 'négociat', 'avec', 'cvc', 'si', 'le', 'deal', 'se', 'fait', 'le', 'club', 'pourra', 'recrut', '«', 'gro', '»', 'en', 'été']


# 2. Feature Extraction

## 2.1 Word counts  

In [46]:
def count_words(tweet:list, freqs:dict, wordToTweet:dict):
    for word in tweet:
        if word in freqs:
            freqs[word] += 1
            wordToTweet[word].append(tweet)
        else:
            freqs[word] = 1
            wordToTweet[word] = [tweet]
    return freqs

In [47]:
freqs = dict()
wordToTweet = dict()
for tweet in processed_tweets:
    count_words(tweet, freqs, wordToTweet)

In [48]:
max(freqs.values())

86

# 3. Analysis

In [49]:
#!pip3 install numpy pandas seaborn
import numpy as np
import pandas as pd
import seaborn as sb

In [50]:
# freqs.keys()
# list(freqs.items())

In [51]:
len(freqs.keys())

361

In [52]:
most_freq_word = np.argmax(list(freqs.values()))
list(freqs.items())[most_freq_word]

('messi', 86)

In [53]:

# Sorted in descending order of frequencies
freq_sorted = list(freqs.items())
freq_sorted.sort(key = lambda x : -x[1])
freq_sorted[:10]

[('messi', 86),
 ('que', 50),
 ('¡', 45),
 ('de', 41),
 ('amigo', 38),
 ('le', 37),
 ('su', 35),
 ('para', 33),
 ('la', 32),
 ('en', 30)]

# Clustering

In [18]:
def vectorize(tweet, alphabet):
    v = np.zeros(len(alphabet))
    for i in range(len(alphabet)):
        if alphabet[i] in tweet:
            v[i] += 1
    return v

def normalize(v):
    norm = np.power(np.sum(np.power(v,2)), 0.5)
    if norm == 0:
        return 0
    return v / norm

In [19]:
from collections import Counter
def len_counts(clusters):
    lens = [len(cluster) for cluster in clusters.values()]
    return dict(Counter(lens))

In [20]:
def get_avg_dist(clusterA, clusterB, vectors):
    dist_sum = 0
    for i in clusterA:
        for j in clusterB:
            dist_sum += np.sum(np.power(vectors[i] - vectors[j], 2))
    
    dist_avg = dist_sum / (len(clusterA) * len(clusterB))
    return dist_avg

In [21]:
def mergeOnce(clusters, vectors):
    min_dist = 2**30
    min_loc = (0, 0)
    for clusterInxA in clusters:
        for clusterInxB in clusters:
            if clusterInxA != clusterInxB:
                dist = get_avg_dist(clusters[clusterInxA], clusters[clusterInxB], vectors)
                if dist < min_dist:
                    min_loc = (clusterInxA, clusterInxB)
                    min_dist = dist

    clusters[min_loc[0]] = clusters[min_loc[0]] + clusters[min_loc[1]]
    clusters.pop(min_loc[1])

def mergeToK(clusters, vectors, K):
    while len(clusters.keys()) > K:
        mergeOnce(clusters, vectors)

In [22]:
def display_unique_tweets(tweets, cluster):
    c_tweets = [tweets[i] for i in cluster]
    for i in range(len(c_tweets)):
        if not c_tweets[i] in c_tweets[:i]:
            print(c_tweets[i])
        

In [23]:
alphabet = [i[0] for i in freq_sorted]
n = len(processed_tweets)
m = len(alphabet)
clusters = dict()
vectors = dict()
for i in range(n):
    vectors[i] = vectorize(processed_tweets[i], alphabet)
    clusters[i] = [i]
mergeToK(clusters, vectors, 15)


In [24]:
# print("lengths:")
# print({cluster:len(clusters[cluster]) for cluster in clusters})
# print("len counts:")
# print(len_counts(clusters))
print(clusters)

{0: [0, 51, 81, 93, 55, 11, 25, 3, 13, 35, 39, 52, 59, 63, 64, 69, 71, 72, 73, 74, 75, 76, 77, 9, 21, 54, 12, 18, 28, 33, 40, 47, 50, 53, 57, 66, 67, 68, 84, 85, 91, 94, 95, 97, 99, 100, 38, 80, 20, 45, 78, 88, 79, 65, 48, 62, 49, 6, 42, 36, 26, 1, 5, 10, 8, 14, 82, 37, 24, 83, 15, 17, 23, 29, 30, 89, 27, 44, 43, 58, 46, 34, 61, 92], 2: [2], 4: [4], 7: [7, 31, 41, 60], 16: [16], 19: [19], 22: [22], 32: [32], 56: [56], 70: [70], 86: [86], 87: [87], 90: [90], 96: [96], 98: [98]}


In [25]:
display_unique_tweets(processed_tweets, clusters[2])

['pm', 'modi', 'lost', 'farmlaw', 'battl', 'jan', '28th', 'night', 'polic', 'reach', 'rakesh', 'tikait', 'ghazipur', 'border']


# Divisive Clustering

In [54]:
def vectorize(tweet, alphabet):
    v = np.zeros(len(alphabet))
    for i in range(len(alphabet)):
        if alphabet[i] in tweet:
            v[i] += 1
    return v

def normalize(v):
    norm = np.power(np.sum(np.power(v,2)), 0.5)
    if norm == 0:
        return 0
    return v / norm

In [55]:
def findMostDistant(cluster, vectors):
    if len(cluster)<=1:
        return -1
    maxDist = -1
    item = 0
    n = len(cluster)
    for i in cluster:
        dist = 0
        for j in cluster:
            dist += np.linalg.norm(vectors[i] - vectors[j])
        dist /= n
        if dist>maxDist:
            maxDist = dist
            item = i
    return item

In [56]:
def isCloserToFirstCluster(cluster1, cluster2, item, vectors):
    dist1 = 0
    dist2 = 0
    n1 = len(cluster1)
    n2 = len(cluster2)
    for i in cluster1:
        dist1 += np.linalg.norm(vectors[i] - vectors[item])
    dist1 /= n1
    for i in cluster2:
        dist2 += np.linalg.norm(vectors[i] - vectors[item])
    dist2 /= n2
    if dist1<=dist2:
        return True
    return False

In [57]:
def dClusteringOnce(clusters, vectors):
    tmp = []
    for key in clusters:
        tmp.append(key)
    for key in tmp:
        item = findMostDistant(clusters[key], vectors)
        if item == -1:
            continue
        if item==key:
            item = clusters[key][1]
            clusters[item] = clusters[key][1:]
            for i in clusters[key][1:]:
                if isCloserToFirstCluster(clusters[item], clusters[key], i, vectors):
                    clusters[item].append(i)
                    clusters[key].remove(i)
            continue
        clusters[item] = [item]
        clusters[key].remove(item)
        for i in clusters[key]:
            if isCloserToFirstCluster(clusters[item], clusters[key], i, vectors):
                clusters[item].append(i)
                clusters[key].remove(i)

In [58]:
def dClusteringMinK(clusters, vectors, k):
    while len(clusters)<k:
        dClusteringOnce(clusters, vectors)

In [59]:
while [] in processed_tweets:
    processed_tweets.remove([])
print(processed_tweets)

[['nouvel', 'vidéo', 'le', 'barca', 'en', 'négociat', 'avec', 'cvc', 'si', 'le', 'deal', 'se', 'fait', 'le', 'club', 'pourra', 'recrut', '«', 'gro', '»', 'en', 'été'], ['special', 'announc', 'sharp', 'mind', '365score', 'decid', 'pitch-in', 'opinion', 'pick', 'playeroftheyear'], ['aguero', 'si', 'ritira', 'arriva', 'il', 'commovent', 'messaggio', 'di', 'messi', 'il', 'sett', 'volt', 'pallon', "d'oro", 'ha', 'voluto', 'omaggiar', 'il', 'su'], ['leo', 'messi', 'prácticament', 'toda', 'una', 'carrera', 'junto', 'kun', 'vivimo', 'momento', 'muy', 'lindo', 'otro', 'q', 'lo', 'fueron', 'tanto', 'todo'], ['messi', 'despidió', 'su', 'mejor', 'amigo', 'dentro', 'de', 'la', 'cancha', 'voy', 'extrañar', 'muchísimo', 'estar', 'con', 'vo'], ['el', 'mural', 'má', 'grand', 'del', 'mundo', 'para', 'el', 'mejor', 'jugador', 'del', 'mundo', 'messi', 'rosario', 'messiahhascom', 'messi', '7'], ['maradona', 'conquistó', 'italia', 'messi', 'lo', 'hizo', 'en', 'españa', 'agüero', 'lo', 'hizo', 'en', 'inglate

In [60]:
alphabet = [i[0] for i in freq_sorted]
clusters = dict()
vectors = dict()
for i in range(len(processed_tweets)):
    vectors[i] = vectorize(processed_tweets[i], alphabet)
clusters[0] = [i for i in range(len(processed_tweets))]

In [61]:
dClusteringMinK(clusters, vectors, 7)
print(clusters)
print(len(clusters))

{0: [0, 6, 9, 10, 14, 15, 16, 17, 18, 20, 21, 22, 24, 25, 27, 29, 31, 34, 35, 45, 51, 52, 54, 62, 63, 67, 73, 75, 78, 80, 85, 86, 90, 91, 95, 99], 1: [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 92, 93, 94, 95, 96, 97, 98, 99, 1, 11, 12, 13, 19, 23, 26, 28, 30, 32, 33, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 53, 55, 57, 58, 59, 60, 61, 64, 65, 66, 68, 69, 70, 71, 72, 74, 76, 77, 79, 81, 82, 83, 84, 87, 88, 93, 97, 98], 3: [3], 62: [62], 4: [4, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 18, 20, 21, 22, 24, 25, 27, 29, 31, 34, 35, 45, 51, 52, 54, 56, 62, 63, 67, 73, 75, 78, 80, 85, 86, 89, 90, 91, 92, 94, 95, 96, 99, 4, 5, 7, 8, 56, 89, 92, 94, 96], 2: [2, 2], 67: [67, 91, 67, 91]}
7


In [62]:
len(alphabet)

361

In [1]:
len(processed_tweets)

NameError: name 'processed_tweets' is not defined

In [64]:
for a in clusters.values():
    print(len(a))

36
148
1
1
53
2
4
