# Using KMeans clustering to find Topics in Tweets about the Covid Vaccine

## Import needed libraries

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
##from sklearn.model_selection import GridSearchCV
from datetime import datetime
from gensim.models import Phrases
from gensim.models.phrases import Phraser

### Gather the Data

In [None]:
# Load the raw tweets into a DataFrame; the preprocessing step below reads
# the tweet text from its 'content' column.
df = pd.read_json('test.json')

## Preprocess Data
I chose to keep numbers for now, as they may fit into bigrams and provide some meaning.
Any numbers that don't end up inside a bigram are deleted later.

In [None]:
# English stopwords as a set, used by remove_stop() for membership tests.
stop = set(stopwords.words('english'))
# Shared WordNet lemmatiser, used when preprocessing each tweet.
lm = WordNetLemmatizer()
def remove_stop(text):
    """Lower-case ``text`` and drop every token that is in the ``stop`` set.

    The text is split on whitespace; the surviving tokens are returned
    joined by single spaces.
    """
    tokens = text.lower().split()
    return ' '.join(token for token in tokens if token not in stop)

def get_processed_data(tweets=None):
    """Clean raw tweets and return them as a list of strings.

    Each tweet is lower-cased, stripped of links, '&amp' entities and every
    character that is not a letter, digit or whitespace, then has its
    stopwords removed and the remaining words lemmatised.

    Args:
        tweets: optional iterable of raw tweet strings. Defaults to
            ``df['content']`` so existing zero-argument calls keep working.

    Returns:
        list[str]: one cleaned, space-separated string per input tweet.
    """
    if tweets is None:
        tweets = df['content']

    processed_tweets = []
    for tweet in tweets:
        prep_tweet = re.sub(r'(\w+://\S+)', '', tweet.lower())  # remove links
        prep_tweet = re.sub(r'&amp', '', prep_tweet)  # HTML-escaped ampersands show up a lot
        prep_tweet = re.sub(r'[^a-zA-Z0-9\s]', '', prep_tweet)  # remove emojis and punctuation
        prep_tweet = remove_stop(prep_tweet)  # remove the stopwords - maybe this is too early
        prep_tweet = ' '.join(lm.lemmatize(word) for word in prep_tweet.split())  # lemmatise
        # The lemmatiser turns the important term 'nhs' into 'nh'; restore it.
        # A word-boundary regex is used instead of replacing ' nh ' so that the
        # term is also restored at the very start or end of a tweet and when
        # it occurs twice in a row.
        prep_tweet = re.sub(r'\bnh\b', 'nhs', prep_tweet)
        processed_tweets.append(prep_tweet)
    return processed_tweets

### Need to cut down the data in my set - I don't have enough RAM at the moment

In [None]:
# Keep only the first 10,000 processed tweets to stay within available RAM.
# The preprocessing function is named get_processed_data; the previous call
# to get_processed_tweets() raised a NameError.
processed_tweets = get_processed_data()[:10000]

# Create bigrams and conduct more preprocessing

## define the functions to preprocess the data

### Consider moving this to a separate Python file - all the functions here are used by each modelling technique

In [None]:
def make_bigrams(texts):
    """Train a gensim phrase model on ``texts`` and apply it back to them.

    ``texts`` is iterated twice: once to train the model and once to
    transform each document, so it must be re-iterable.
    NOTE(review): gensim's Phrases presumably expects each document to be a
    list of tokens rather than a raw string - confirm against the caller.

    Returns:
        list: one transformed document per entry in ``texts``.
    """
    phrase_model = Phrases(texts, min_count=5, threshold=100)
    frozen_model = Phraser(phrase_model)
    merged_docs = []
    for document in texts:
        merged_docs.append(frozen_model[document])
    return merged_docs

In [None]:
def popular_word_culler(doc_list, max_doc_count=600, min_df=10, wanted_words=None):
    """Remove words that occur in too many documents to be informative.

    The document frequency of every token of three or more word characters
    is counted (ignoring tokens seen in fewer than ``min_df`` documents).
    Tokens found in more than ``max_doc_count`` documents are removed from
    every document unless they are listed in ``wanted_words``.

    Whole tokens are removed. The previous ``str.replace`` approach also
    deleted the word wherever it appeared *inside* another word or bigram,
    leaving fragments behind.

    Args:
        doc_list: list of strings, one whitespace-separated document each.
        max_doc_count: a word in more than this many documents is culled.
        min_df: passed to ``CountVectorizer``; rarer words are never counted.
        wanted_words: words to keep regardless of frequency. Defaults to the
            hand-picked list below.

    Returns:
        tuple: ``(documents_as_strings, documents_as_token_lists)``.

    Raises:
        ValueError: propagated from ``CountVectorizer`` when no term
            survives the ``min_df`` pruning (e.g. a very small ``doc_list``).
    """
    if wanted_words is None:
        wanted_words = ['biden', 'boosted', 'booster', 'case',
                        'child', 'country', 'death', 'died',
                        'everyone', 'first', 'fully', 'good',
                        'health', 'kid', 'know', 'long',
                        'mandate', 'mask', 'new', 'pandemic',
                        'pfizer', 'rate', 'realcandaceo',
                        'risk', 'sorry', 'think', 'trump', 'work']
    protected = set(wanted_words)

    cvec = CountVectorizer(analyzer='word',
                           min_df=min_df,
                           token_pattern=r'[\w]{3,}',
                           )
    bow = cvec.fit_transform(doc_list)

    # Document frequency per term, computed on the sparse matrix directly:
    # no dense copy of the bag-of-words and no per-column Python loops.
    doc_counts = np.asarray((bow > 0).sum(axis=0)).ravel()
    unwanted_words = {
        word
        for word, count in zip(cvec.get_feature_names_out(), doc_counts)
        if count > max_doc_count and word not in protected
    }

    new_tweets_list = [
        [word for word in tweet.split() if word not in unwanted_words]
        for tweet in doc_list
    ]
    new_tweets = [' '.join(words) for words in new_tweets_list]
    return new_tweets, new_tweets_list

In [None]:
def short_word_culler(doc_list): #doc_list is a list of strings
    """Drop every word of three characters or fewer from each document.

    Returns:
        tuple: ``(documents_as_strings, documents_as_token_lists)``, where
        each string is the kept words of one document joined by single spaces.
    """
    min_length = 4
    kept_tokens = [
        [token for token in document.split() if len(token) >= min_length]
        for document in doc_list
    ]
    kept_strings = [' '.join(tokens) for tokens in kept_tokens]
    return kept_strings, kept_tokens

In [None]:
def number_culler(doc_list): #doc_list is a list of strings
    """Remove stand-alone numbers from every document.

    A number is removed only when it forms a whole word. Digits that are
    part of a longer token (e.g. the bigram ``covid_19`` or ``b12``) are
    kept, because ``_`` and letters are word characters and so no ``\\b``
    boundary exists next to the digits.

    Returns:
        tuple: ``(documents_as_strings, documents_as_token_lists)``. The
        strings keep the whitespace that surrounded a removed number.
    """
    # Must be a raw string: in a normal string literal '\b' is a backspace
    # character, so the old pattern never matched any number.
    standalone_number = re.compile(r'\b\d+\b')
    a = [standalone_number.sub('', doc) for doc in doc_list]
    return a, [doc.split() for doc in a]

## Preprocess the data

In [None]:
# make_bigrams() hands the documents to gensim's Phrases, which works on
# token lists, so the cleaned strings are tokenised first and the merged
# bigram tokens are joined back into strings afterwards.
tokenised_tweets = [tweet.split() for tweet in processed_tweets]
processed_tweets = [' '.join(doc) for doc in make_bigrams(tokenised_tweets)]

# Each culler takes a list of strings and returns a (strings, token_lists)
# pair; only the strings are passed on to the next step. Feeding the whole
# pair into the next culler raised an AttributeError.
processed_tweets, _ = popular_word_culler(processed_tweets)
processed_tweets, _ = short_word_culler(processed_tweets)
processed_tweets, _ = number_culler(processed_tweets)

# Create a dtm using TF\*IDF

In [None]:
# Build the TF-IDF document-term matrix (kept sparse in tfidf_vecs).
tfidf = TfidfVectorizer()
tfidf_vecs = tfidf.fit_transform(processed_tweets)
feature_names = tfidf.get_feature_names_out()

# Sparse-backed DataFrame view of the same matrix. The previous
# sparse -> dense matrix -> list of Python floats -> DataFrame route made
# several full dense copies, which is very memory hungry for a
# documents x vocabulary matrix.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_vecs, columns=feature_names)

# K-Means Clustering of Tweets

## Find the optimal number of topics through Elbow Method

I have gone off the data that I used in the scratchpad - the results have likely changed since then, because the processing method has also changed

In [None]:
# Elbow method: fit KMeans for each candidate cluster count and record the
# within-cluster sum of squares (the model's inertia_).
cluster_counts = range(5, 80, 5)  # single definition shared by the loop and the plot
wcss = []
for n_clusters in cluster_counts:
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=0)
    # fit() is enough here: fit_transform() additionally computed a
    # sample-to-centroid distance matrix that was thrown away.
    kmeans_model.fit(tfidf_vecs)
    wcss.append(kmeans_model.inertia_)
plt.plot(cluster_counts, wcss)
plt.title('elbow n clusters')
plt.xlabel('n clusters')
plt.ylabel('wcss')
plt.show()

The graph has two noticeable bends - fine-tune through targeted graphing

### Elbow Method - 10 to 20 clusters

In [None]:
# Elbow method, zoomed in on 10-18 clusters (step 2).
cluster_counts = range(10, 20, 2)  # single definition shared by the loop and the plot
wcss = []
for n_clusters in cluster_counts:
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=0)
    # fit() is enough here: fit_transform() additionally computed a
    # sample-to-centroid distance matrix that was thrown away.
    kmeans_model.fit(tfidf_vecs)
    wcss.append(kmeans_model.inertia_)
plt.plot(cluster_counts, wcss)
plt.title('elbow n clusters')
plt.xlabel('n clusters')
plt.ylabel('wcss')
plt.show()

## Display found topics for 14 clusters

In [None]:
# Cluster the tweets into topics and show the highest-weighted terms of each
# cluster centroid, one row per topic.
n_topics = 14
top_n_terms = 9
kmeans_model = KMeans(n_clusters=n_topics, random_state=0)
kmeans_model.fit(tfidf_vecs)
# Column indices of each centroid sorted by descending weight.
order_centroids = kmeans_model.cluster_centers_.argsort()[:, ::-1]
# The vocabulary lives in feature_names (the previously used name 'terms'
# was never defined). Only the top terms are looked up instead of building
# a vocabulary-wide table and slicing it afterwards.
pd.DataFrame(np.asarray(feature_names)[order_centroids[:, :top_n_terms]],
             index=[f'Topic {x}' for x in range(n_topics)])

### Elbow Method - 30 to 40 clusters

In [None]:
# Elbow method, zoomed in on 30-38 clusters (step 2).
cluster_counts = range(30, 40, 2)  # single definition shared by the loop and the plot
wcss = []
for n_clusters in cluster_counts:
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=0)
    # fit() is enough here: fit_transform() additionally computed a
    # sample-to-centroid distance matrix that was thrown away.
    kmeans_model.fit(tfidf_vecs)
    wcss.append(kmeans_model.inertia_)
plt.plot(cluster_counts, wcss)
plt.title('elbow n clusters')
plt.xlabel('n clusters')
plt.ylabel('wcss')
plt.show()

## Display found topics for 32 clusters

In [None]:
# Cluster the tweets into topics and show the highest-weighted terms of each
# cluster centroid, one row per topic.
n_topics = 32
top_n_terms = 9
kmeans_model = KMeans(n_clusters=n_topics, random_state=0)
kmeans_model.fit(tfidf_vecs)
# Column indices of each centroid sorted by descending weight.
order_centroids = kmeans_model.cluster_centers_.argsort()[:, ::-1]
# The vocabulary lives in feature_names (the previously used name 'terms'
# was never defined). Only the top terms are looked up instead of building
# a vocabulary-wide table and slicing it afterwards.
pd.DataFrame(np.asarray(feature_names)[order_centroids[:, :top_n_terms]],
             index=[f'Topic {x}' for x in range(n_topics)])

# Conclusions

## Make word clouds from interesting topics found in the above data

There are too many topics to make word clouds for all of them

In [None]:
#Still need to work on this
# generate_from_frequencies() needs a {word: weight} mapping; calling it with
# no argument raised a TypeError. The weights used here are the centroid
# weights of one cluster of the most recently fitted kmeans_model.
topic_to_plot = 0  # index of the topic to draw; change to look at another one
centroid = kmeans_model.cluster_centers_[topic_to_plot]
# Zero-weight terms are skipped so the cloud only holds words that actually
# contribute to this topic.
frequencies = {
    term: float(weight)
    for term, weight in zip(feature_names, centroid)
    if weight > 0
}
wc = WordCloud(max_words=100,width=2000,height=1000).generate_from_frequencies(frequencies)

# The on-screen size of the image is set by the figure size, in inches.
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()