In [None]:
#Import packages needed for processing
import json
import os
import re
import xml
from collections import Counter

import numpy as np
from sklearn.cluster import KMeans
from TwitterAPI import TwitterAPI # in case you need to install this package, see practical 6

import requests

# disabling urllib3 warnings
requests.packages.urllib3.disable_warnings()

import matplotlib.pyplot as plt
%matplotlib inline

# If you need to add any additional packages, add them below


In [None]:
# The three search keywords used throughout the notebook,
# e.g. keywords = ["abc", "def", "ghi"].
# Later cells index this list positionally (keywords[0], keywords[1], keywords[2]).

keywords =  ["rain", "LPL", "gas price"]

# Group identifier; used as the prefix of the JSON file names the tweets are saved to.
group_id = "OVA"

In [None]:
# Twitter API credentials.
# They are read from environment variables so that secrets never have to be
# typed into (and saved with) the notebook. Each value falls back to the empty
# string, which is what the notebook used before, when its variable is not set.
CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY", "") #API key
CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET", "") #API Key Secret
OAUTH_TOKEN = os.environ.get("TWITTER_OAUTH_TOKEN", "")
OAUTH_TOKEN_SECRET = os.environ.get("TWITTER_OAUTH_TOKEN_SECRET", "")

# Authenticating with your application credentials
api = TwitterAPI(CONSUMER_KEY, CONSUMER_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET, api_version='2')

print(api)

In [None]:
# Geographic coordinates (latitude / longitude) of the centre of the desired place

PLACE_LAT = 7.8731
PLACE_LON = 80.7718
# Half-height / half-width of the bounding box around that centre: retrieve_tweets
# builds the box as [lon - DELTA_LON, lat - DELTA_LAT, lon + DELTA_LON, lat + DELTA_LAT].
DELTA_LAT = 1.0
DELTA_LON = 1.0

In [None]:
def retrieve_tweets(api, keyword, batch_count, total_count, latitude, longitude, delta_lat, delta_lon):
    """
    Collect recent English, non-retweet tweets matching ``keyword``.

    Requests go to the 'tweets/search/recent' endpoint in batches of
    ``batch_count`` and are paged backwards with ``until_id`` until at least
    ``total_count`` tweets have passed the filter, the search returns no more
    tweets, or the API reports an error.

    api:          object exposing ``request(endpoint, params)``; the response
                  must provide ``json()`` and be iterable over tweet dicts
    keyword:      search query
    batch_count:  number of tweets requested per call ('max_results')
    total_count:  minimum number of filtered tweets to collect
    latitude, longitude, delta_lat, delta_lon:
                  centre and half-size of the bounding box sent with the request

    returns the list of tweet dicts that passed the filter. The list may hold
    more than ``total_count`` tweets, and fewer (possibly none) when the search
    is exhausted or an error occurs. It is always a list, never None, so callers
    can safely use len() on the result.
    """

    def build_params(until_id=None):
        # NOTE(review): the bounding box is sent as a nested 'place.fields' value,
        # exactly as before. It is not clear that the endpoint treats this as a
        # geographic filter - confirm against the API documentation.
        params = {
            'query': keyword,
            'max_results': str(batch_count),
            'tweet.fields': {'lang': 'en'},
            'place.fields': {
                'geo': {
                    "type": "Feature",
                    "bbox": [
                        longitude - delta_lon,
                        latitude - delta_lat,
                        longitude + delta_lon,
                        latitude + delta_lat
                    ],
                    "properties": {}
                }
            }
        }
        if until_id is not None:
            params['until_id'] = until_id
        return params

    tweets_unfiltered = []   # everything the API returned
    tweets = []              # the subset that passed the filter
    oldest_id = None         # paging cursor: smallest tweet id seen so far

    while True:
        resp = api.request('tweets/search/recent', build_params(oldest_id))

        # An error payload without any data means the request failed; stop and
        # hand back whatever has been collected instead of returning None.
        payload = resp.json()
        if 'errors' in payload and 'data' not in payload:
            title = payload.get('title', 'Unknown error')
            if title == 'Invalid Request':
                print('Too many attempts to load tweets or too many tweets to load.')
                print('You need to wait for a few minutes before accessing Twitter API again or reduce max_results.')
            else:
                print('Twitter API error: {}'.format(title))
            break

        batch = list(resp)
        if not batch:
            # No older tweets left: without this check the loop would request
            # the same empty page forever.
            print("No more tweets available for keyword {}. ".format(keyword))
            break

        tweets_unfiltered.extend(batch)
        oldest_id = str(min(int(tweet['id']) for tweet in tweets_unfiltered))

        # keep English tweets that are not retweets
        tweets = [tweet for tweet in tweets_unfiltered
                  if 'RT @' not in tweet['text'] and tweet['lang'] == 'en']

        print("{} tweets are collected for keyword {}. ".format(len(tweets), keyword))

        if len(tweets) >= total_count:
            break

    print("{} total tweets pulled. ".format(len(tweets_unfiltered)))
    return tweets

In [None]:
# Collect the tweets for the three assigned keywords with
#      retrieve_tweets(api, keyword, batch_count, total_count, latitude, longitude, delta_lat, delta_lon)
# using batches of 50 and a target of 200 tweets per keyword.

k1_tweets, k2_tweets, k3_tweets = [
    retrieve_tweets(api, keywords[position], 50, 200, PLACE_LAT, PLACE_LON, DELTA_LAT, DELTA_LON)
    for position in range(3)
]

# PLEASE NOTE THAT IF YOU RUN THIS CELL, IT MIGHT TAKE A WHILE TO DOWNLOAD ALL THE TWEETS REQUIRED.
# MAKE SURE THAT YOU WAIT UNTIL THE CELL FINISHES RUNNING.

In [None]:
# Report how many tweets were collected for each keyword.
for keyword, tweets in zip(keywords, (k1_tweets, k2_tweets, k3_tweets)):
    print("Total of {} Tweets for the Keyword {}.".format(len(tweets), keyword))

In [None]:
# Show the Python type of a single collected tweet (the first one for keyword 1)
print(type(k1_tweets[0]))

In [None]:
# Fields available on a collected tweet
print(k1_tweets[0].keys())

# Print the text of the first tweet collected for each keyword.
for keyword, tweets in zip(keywords, (k1_tweets, k2_tweets, k3_tweets)):
    print("\nThe text of the first tweet for \"{}\":\n".format(keyword))
    print(tweets[0]["text"])

In [None]:
def save_to_json(obj, filename):
    """
    Write ``obj`` to ``filename`` as JSON.

    obj: list of dictionaries (anything json can serialise)
    filename: path of the file to (over)write

    The output is indented by 4 spaces and dictionary keys are sorted.
    """
    with open(filename, 'w') as out_file:
        out_file.write(json.dumps(obj, indent=4, sort_keys=True))

In [None]:
# Save the tweets in three JSON files, one per keyword. Each file is named
# "<group_id>_<keyword>.json" with spaces in the keyword replaced by underscores.

for tweets, keyword in zip((k1_tweets, k2_tweets, k3_tweets), keywords):
    save_to_json(tweets, "{}_{}.json".format(group_id, keyword.replace(" ", "_")))

In [None]:
def read_json_file(filename):
    """
    Parse the JSON document stored in ``filename`` and return the decoded data.
    """
    with open(filename, 'r') as in_file:
        return json.loads(in_file.read())

In [None]:
# Load the tweets back from the three JSON files saved in Part 1
# (one "<group_id>_<keyword>.json" file per keyword).

k1_tweets, k2_tweets, k3_tweets = [
    read_json_file("{}_{}.json".format(group_id, keywords[position].replace(" ", "_")))
    for position in range(3)
]


In [None]:
# Print the number of tweets contained in the three variables
# k1_tweets, k2_tweets and k3_tweets.

for keyword, tweets in zip(keywords, (k1_tweets, k2_tweets, k3_tweets)):
    print("Total of {} Tweets for the Keyword {}.".format(len(tweets), keyword))

In [None]:
def is_short_tweet(tweet):
    '''
    Return True when the "text" of ``tweet`` is shorter than 50 characters,
    False otherwise.
    '''
    return len(tweet["text"]) < 50

    

In [None]:
# Remove every tweet with fewer than 50 characters from k1_tweets, k2_tweets
# and k3_tweets; the survivors go into k1_tweets_filtered, k2_tweets_filtered
# and k3_tweets_filtered respectively.

k1_tweets_filtered = list(filter(lambda tweet: not is_short_tweet(tweet), k1_tweets))
k2_tweets_filtered = list(filter(lambda tweet: not is_short_tweet(tweet), k2_tweets))
k3_tweets_filtered = list(filter(lambda tweet: not is_short_tweet(tweet), k3_tweets))

# number of tweets for each keyword before and after filtering
for collected, kept in ((k1_tweets, k1_tweets_filtered),
                        (k2_tweets, k2_tweets_filtered),
                        (k3_tweets, k3_tweets_filtered)):
    print(len(collected), len(kept))

In [None]:
# For each keyword, print how many tweets the length filter removed.

for keyword, collected, kept in zip(keywords,
                                    (k1_tweets, k2_tweets, k3_tweets),
                                    (k1_tweets_filtered, k2_tweets_filtered, k3_tweets_filtered)):
    print("{} from keyword {} removed.".format(len(collected) - len(kept), keyword))

In [None]:
# Print the first 5 filtered tweets for each keyword.
# Slicing is used instead of indexing 0..4 so that a keyword with fewer than
# five tweets prints what it has rather than raising IndexError.

filtered_by_keyword = zip(keywords, (k1_tweets_filtered, k2_tweets_filtered, k3_tweets_filtered))
for position, (keyword, tweets) in enumerate(filtered_by_keyword):
    heading = 'The first 5 tweets for \"{}\":\n'.format(keyword)
    # every heading but the first is preceded by a blank line
    print(heading if position == 0 else '\n' + heading)
    for tweet in tweets[:5]:
        print(tweet)

In [None]:
def remove_non_ascii(s):
    """Return ``s`` without the characters whose code point is 128 or above."""
    return "".join(ch for ch in s if ord(ch) < 128)


def pre_process(doc):
    """
    Pre-process a document (tweet text) and return its list of tokens.

      * converts the text to lower case,
      * drops non-ASCII characters,
      * replaces every URL with the token 'url',
      * removes tokens starting with $, @, &, *, ~ and runs starting with a digit,
      * replaces punctuation with spaces,
      * splits on whitespace,
      * removes words shorter than 3 characters
    """

    doc = remove_non_ascii(doc.lower())

    # Replacing URLs. The dots in "www." and "bit.ly" are escaped so they only
    # match a literal dot; unescaped, "awww so cute" was rewritten to "aurl cute".
    url_pattern = r"http://[^\s]+|https://[^\s]+|www\.[^\s]+|[^\s]+\.com|bit\.ly/[^\s]+"
    doc = re.sub(url_pattern, 'url', doc)

    # removing dollars and usernames and other unnecessary stuff
    userdoll_pattern = r"\$[^\s]+|\@[^\s]+|\&[^\s]+|\*[^\s]+|[0-9][^\s]+|\~[^\s]+"
    doc = re.sub(userdoll_pattern, '', doc)

    # replacing punctuation with spaces so the surrounding words stay separate
    punctuation = r"\(|\)|#|\'|\"|-|:|\\|\/|!|\?|_|,|=|;|>|<|\.|\@"
    doc = re.sub(punctuation, ' ', doc)

    return [w for w in doc.split() if len(w) > 2]

In [None]:
# First tweet of keyword 1, before and after pre-processing.
tweet_k1 = k1_tweets_filtered[0]['text']
tweet_k1_processed = pre_process(tweet_k1)

# tweet_k1_processed is a list of words, so it is joined back into a single
# string with ' '.join() before being printed on the line after the raw text.
print(tweet_k1, ' '.join(tweet_k1_processed), sep='\n')

In [None]:
# Display the first tweet stored in k2_tweets_filtered and in k3_tweets_filtered
# before and after it has been pre-processed with pre_process().

for tweets in (k2_tweets_filtered, k3_tweets_filtered):
    tweet = tweets[0]['text']
    print(tweet + "\n" + ' '.join(pre_process(tweet)) + "\n")


In [None]:
# Pre-process the text of every tweet in k1_tweets_filtered, k2_tweets_filtered
# and k3_tweets_filtered with pre_process(); the results (one list of words per
# tweet) are stored in k1_tweets_processed, k2_tweets_processed and
# k3_tweets_processed.

k1_tweets_processed = list(map(pre_process, (tweet["text"] for tweet in k1_tweets_filtered)))
k2_tweets_processed = list(map(pre_process, (tweet["text"] for tweet in k2_tweets_filtered)))
k3_tweets_processed = list(map(pre_process, (tweet["text"] for tweet in k3_tweets_filtered)))

In [None]:
# Print the first 5 processed tweets for each keyword.
# Each processed tweet is a list of words, so it is joined into a string first.
# Slicing is used instead of indexing 0..4 so that a keyword with fewer than
# five tweets prints what it has rather than raising IndexError.

processed_by_name = (("k1_tweets_processed", k1_tweets_processed),
                     ("k2_tweets_processed", k2_tweets_processed),
                     ("k3_tweets_processed", k3_tweets_processed))
for position, (name, processed) in enumerate(processed_by_name):
    heading = 'The first 5 processed tweets for {}:'.format(name)
    # every heading but the first is preceded by a blank line
    print(heading if position == 0 else '\n' + heading)
    for words in processed[:5]:
        print(' '.join(words))


In [None]:
def construct_termdoc(docs, vocab=None):
    """
    Construct a term-by-document matrix.

    docs: corpus, an iterable of documents where each document is an iterable of terms
    vocab: pre-defined vocabulary (sequence of terms);
           if not supplied (None or empty) it is induced from the data and sorted

    returns (termdoc, vocab): ``termdoc`` is an integer array with one row per
    document and one column per vocabulary entry, holding the frequency of each
    term in each document; ``vocab`` is the vocabulary that defines the columns.
    Terms of a document that are not in a supplied vocabulary are ignored.
    """

    # per-document term frequencies
    termdoc_sparse = [Counter(doc) for doc in docs]

    if vocab is None or len(vocab) == 0:
        vocab = sorted(set().union(*termdoc_sparse))

    # term -> column lookup built once; setdefault keeps the first position of a
    # repeated term, which is what list.index() used to return
    column_of = {}
    for column, term in enumerate(vocab):
        column_of.setdefault(term, column)

    termdoc_dense = np.zeros((len(termdoc_sparse), len(vocab)), dtype=int)

    for row, doc_sparse in enumerate(termdoc_sparse):
        for term, freq in doc_sparse.items():
            column = column_of.get(term)
            if column is not None:      # out-of-vocabulary terms are skipped
                termdoc_dense[row, column] = freq

    return termdoc_dense, vocab

In [None]:
# Compute the term-by-document matrix and the vocabulary from the tweets
# collected for the first keyword.

k1_termdoc, k1_vocab = construct_termdoc(k1_tweets_processed)

# the matrix itself, followed by only the first 5 vocabulary entries
print(k1_termdoc)
print(' '.join(k1_vocab[:5]))

# visualise the term-by-document matrix as an image
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(k1_termdoc)
ax.set(xlabel='term (vocabulary)', ylabel='documents (tweets)')
ax.set_title('Term-by-Document matrix from tweets collected for keyword \"{}\"'.format(keywords[0]))

In [None]:
def Euclidean_distance(x,y):
    '''
    Return the Euclidean distance between the vectors x and y, i.e. the
    square root of the sum of their squared element-wise differences.
    '''
    difference = np.subtract(x, y)
    squared_sum = np.sum(np.square(difference))
    return np.sqrt(squared_sum)

In [None]:
def cosine_distance(x,y):
    '''
    Compute and return the cosine distance between two vectors x and y,
    i.e. 1 - (x . y) / (|x| * |y|).

    A vector of all zeros (e.g. a tweet left without words after
    pre-processing) has no direction, so the formula would divide by zero and
    yield NaN. In that case the distance is defined as 0.0 when both vectors
    are zero and 1.0 when only one of them is.
    '''

    x_magnitude = np.sqrt(np.sum(np.power(x, 2)))
    y_magnitude = np.sqrt(np.sum(np.power(y, 2)))

    if x_magnitude == 0 or y_magnitude == 0:
        return 0.0 if x_magnitude == y_magnitude else 1.0

    dot_product = np.dot(x, y)
    return  1 - (dot_product / (x_magnitude * y_magnitude))

In [None]:
def compute_distance_matrices(termdoc):
    '''
    Compute the pairwise distance matrices of a term-by-document matrix.

    termdoc: matrix with one row per tweet

    returns (euclidean_distance_matrix, cosine_distance_matrix): two arrays
    whose element (i, j) stores, respectively, the Euclidean distance and the
    cosine distance between the i-th and the j-th tweet.
    '''
    euclidean_rows = [
        np.array([Euclidean_distance(tweet_i, tweet_j) for tweet_j in termdoc])
        for tweet_i in termdoc
    ]
    cosine_rows = [
        np.array([cosine_distance(tweet_i, tweet_j) for tweet_j in termdoc])
        for tweet_i in termdoc
    ]

    return np.array(euclidean_rows), np.array(cosine_rows)


In [None]:
# Distance matrices of k1_termdoc, obtained with compute_distance_matrices

k1_euclidean_distance, k1_cosine_distance = compute_distance_matrices(k1_termdoc)

# Visualise both distance matrices for this keyword, one figure each

for heading, distances in (("Euclidean Distance Matrix", k1_euclidean_distance),
                           ("Cosine Distance Matrix", k1_cosine_distance)):
    plt.figure(figsize=(10,10))
    plt.title(heading)
    plt.imshow(distances)
    plt.colorbar()

    plt.show()


In [None]:
# Scatter plot of Euclidean against cosine distance for every pair of keyword-1 tweets
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(k1_euclidean_distance, k1_cosine_distance)
ax.set_xlabel('Euclidean Distance')
ax.set_ylabel('Cosine Distance')
ax.set_title('Relationship between Euclidean Distance and Cosine Distance')
plt.show()

In [None]:

# Compute the term-by-document matrix and the vocabulary for the tweets stored
# in k2_tweets_processed

k2_termdoc, k2_vocab = construct_termdoc(k2_tweets_processed)

# print out the first 5 vocabulary entries
print(' '.join(k2_vocab[:5]))

# visualise the term-by-document matrix as an image
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(k2_termdoc)
ax.set(xlabel='term (vocabulary)', ylabel='documents (tweets)')
ax.set_title('Term-by-Document matrix from tweets collected for keyword \"{}\"'.format(keywords[1]))



In [None]:
# Distance matrices of k2_termdoc, obtained with compute_distance_matrices
k2_euclidean_distance, k2_cosine_distance = compute_distance_matrices(k2_termdoc)

# Visualise both distance matrices for this keyword, one figure each

for heading, distances in (("Euclidean Distance Matrix", k2_euclidean_distance),
                           ("Cosine Distance Matrix", k2_cosine_distance)):
    plt.figure(figsize=(10,10))
    plt.title(heading)
    plt.imshow(distances)
    plt.colorbar()

    plt.show()

In [None]:

# Compute the term-by-document matrix and the vocabulary for the tweets stored
# in k3_tweets_processed

k3_termdoc, k3_vocab = construct_termdoc(k3_tweets_processed)

# print out the first 5 vocabulary entries
print(' '.join(k3_vocab[:5]))

# visualise the term-by-document matrix as an image
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(k3_termdoc)
ax.set(xlabel='term (vocabulary)', ylabel='documents (tweets)')
ax.set_title('Term-by-Document matrix from tweets collected for keyword \"{}\"'.format(keywords[2]))


In [None]:
# Distance matrices of k3_termdoc, obtained with compute_distance_matrices

k3_euclidean_distance, k3_cosine_distance = compute_distance_matrices(k3_termdoc)

# Visualise both distance matrices for this keyword, one figure each

for heading, distances in (("Euclidean Distance Matrix", k3_euclidean_distance),
                           ("Cosine Distance Matrix", k3_cosine_distance)):
    plt.figure(figsize=(10,10))
    plt.title(heading)
    plt.imshow(distances)
    plt.colorbar()

    plt.show()


In [None]:
# 1. Scatter plot of Euclidean vs cosine distance for all pairs of tweets.
# NOTE(review): all_euclidean_distances and all_cosine_distances are not defined
# in any cell shown here; they are assumed to be the two distance matrices of the
# combined term-by-document matrix (e.g. from compute_distance_matrices) - confirm.

# One (euclidean, cosine) point per pair of tweets.
euclidean_flat = np.asarray(all_euclidean_distances, dtype=float).ravel()
cosine_flat = np.asarray(all_cosine_distances, dtype=float).ravel()

# polyfit cannot work with NaN/inf values, so such pairs are left out.
finite = np.isfinite(euclidean_flat) & np.isfinite(cosine_flat)
euclidean_flat = euclidean_flat[finite]
cosine_flat = cosine_flat[finite]

plt.figure(figsize=(15, 10))
plt.scatter(euclidean_flat, cosine_flat, label="Default")
plt.xlabel('Euclidean Distance')
plt.ylabel('Cosine Distance')
plt.title('Relationship between Euclidean Distance and Cosine Distance')

# 2. Fit first and second order polynomials to the scatter data and overplot them.
# Each polynomial is fitted once to ALL points and then drawn as a curve; the
# coefficients returned by polyfit are not data points and must not be plotted
# as such.
first_order = np.polyfit(euclidean_flat, cosine_flat, deg=1)
second_order = np.polyfit(euclidean_flat, cosine_flat, deg=2)

x_grid = np.linspace(euclidean_flat.min(), euclidean_flat.max(), 200)
plt.plot(x_grid, np.polyval(first_order, x_grid), color='red', label="First Order")
plt.plot(x_grid, np.polyval(second_order, x_grid), color='green', label="Second Order")

plt.legend()
plt.show()

In [None]:
# Initialise a KMeans object from the scikit-learn package

n_clusters = 3
kmeans = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init=5,
    max_iter=3000,
    tol=1e-6,
    verbose=True,
    random_state=123456,
)


In [None]:
# Perform the clustering on the data stored in the variable all_termdoc.
# NOTE(review): all_termdoc is not defined in any cell shown here - confirm it
# is built before this cell runs.

kmeans.fit(all_termdoc)


In [None]:
# Print out the cluster centers found by KMeans.

cluster_centers = kmeans.cluster_centers_

print(cluster_centers)

In [None]:
# 1. Plot one bar chart per cluster obtained from KMeans, showing the 20
# strongest words of that cluster sorted by their presence strength.
# NOTE(review): all_vocab is not defined in any cell shown here; it is assumed to
# be the vocabulary whose order matches the columns of the clustered matrix - confirm.

color_codes = ['deepskyblue', 'lime', 'darkorange']
vocab_array = np.array(all_vocab)
for i, center in enumerate(cluster_centers):
    # indices of the vocabulary entries, strongest first
    sort_index = np.argsort(-center)[:20]
    words = vocab_array[sort_index]
    strength = np.array(center)[sort_index]
    plt.figure(figsize=(15, 10))
    plt.bar(words, strength, color = color_codes[i % len(color_codes)])
    # Cluster numbers are arbitrary and do not correspond to keywords, so the
    # chart is labelled with the cluster index instead of keywords[i].
    plt.title("Top 20 strongest words in cluster {}.".format(i))
    plt.xlabel("Words")
    plt.ylabel("Strength")

plt.show()


In [None]:
# Print out the cluster labels assigned to the first 200 tweets.

labels = kmeans.labels_

print(labels[0:200])


In [None]:
# Split the labels by keyword: the labels of the first keyword go into k1_labels,
# those of the second keyword into k2_labels and those of the third keyword into
# k3_labels. The split points are the numbers of rows of k1_termdoc and k2_termdoc.

k1_end = len(k1_termdoc)
k2_end = k1_end + len(k2_termdoc)

k1_labels = labels[:k1_end]
k2_labels = labels[k1_end:k2_end]
k3_labels = labels[k2_end:]


In [None]:
# Count, for each keyword, the tweets that were assigned to the first cluster
# (label 0). Despite the "idx" in their names, these variables hold the NUMBER
# of such tweets, not a list of their indices.

# tweets of keyword k1 with label 0 in k1_labels
k1_idx_label0 = sum(1 for label in k1_labels if label == 0)

# tweets of keyword k2 with label 0 in k2_labels
k2_idx_label0 = sum(1 for label in k2_labels if label == 0)

# tweets of keyword k3 with label 0 in k3_labels
k3_idx_label0 = sum(1 for label in k3_labels if label == 0)


In [None]:

# Bar chart of the number of tweets of each keyword that are assigned to the first cluster.

idx_label0 = [k1_idx_label0, k2_idx_label0, k3_idx_label0]

fig, ax = plt.subplots(figsize=(7, 7))
ax.bar(keywords, idx_label0, width=0.5, color=color_codes)
ax.set_title("Number of Tweets in cluster 0 from each Keyword")
ax.set_xlabel("Keyword")
ax.set_ylabel("Number of Tweets")

plt.show()



In [None]:
# Count, for each keyword, the tweets that were assigned to the second cluster
# (label 1). Despite the "idx" in their names, these variables hold the NUMBER
# of such tweets, not a list of their indices.

# tweets of keyword k1 with label 1 in k1_labels
k1_idx_label1 = sum(1 for label in k1_labels if label == 1)

# tweets of keyword k2 with label 1 in k2_labels
k2_idx_label1 = sum(1 for label in k2_labels if label == 1)

# tweets of keyword k3 with label 1 in k3_labels
k3_idx_label1 = sum(1 for label in k3_labels if label == 1)

# Bar chart of the number of tweets of each keyword that are assigned to the second cluster

idx_label1 = [k1_idx_label1, k2_idx_label1, k3_idx_label1]

fig, ax = plt.subplots(figsize=(7, 7))
ax.bar(keywords, idx_label1, width=0.5, color=color_codes)
ax.set_title("Number of Tweets in cluster 1 from each Keyword")
ax.set_xlabel("Keyword")
ax.set_ylabel("Number of Tweets")

plt.show()


In [None]:

# Count, for each keyword, the tweets that were assigned to the third cluster
# (label 2). Despite the "idx" in their names, these variables hold the NUMBER
# of such tweets, not a list of their indices.

# tweets of keyword k1 with label 2 in k1_labels
k1_idx_label2 = sum(1 for label in k1_labels if label == 2)

# tweets of keyword k2 with label 2 in k2_labels
k2_idx_label2 = sum(1 for label in k2_labels if label == 2)

# tweets of keyword k3 with label 2 in k3_labels
k3_idx_label2 = sum(1 for label in k3_labels if label == 2)

# Bar chart of the number of tweets of each keyword that are assigned to the third cluster

idx_label2 = [k1_idx_label2, k2_idx_label2, k3_idx_label2]

fig, ax = plt.subplots(figsize=(7, 7))
ax.bar(keywords, idx_label2, width=0.5, color=color_codes)
ax.set_title("Number of Tweets in cluster 2 from each Keyword")
ax.set_xlabel("Keyword")
ax.set_ylabel("Number of Tweets")

plt.show()
