**Steps:**
- Import Libraries and Load Data into Memory
- K-Means Clustering Using HashingVectorizer
    - Clustering using Scikit-Learn's HashingVectorizer and KMeans modules
    - PCA dimension reduction to 2D in order to visualize the data points
    - Making WordClouds to show the clusters
        - Inspired by Vadim Nareyko's Kaggle notebook "Google word2vec, KMeans, PCA": https://www.kaggle.com/nareyko/google-word2vec-kmeans-pca (thank you!)
- TextBlob Clustering
    - Output 1: Clustering and TextBlob output
- Finding the best K for K-means Using SpaCy, TF-IDF, and Elbow Method
    - Output 2: Clustering 10 Out

# Import Libraries and Load Data into Memory

In [None]:
import numpy as np
import pandas as pd 
import os
import random
from sklearn.utils import shuffle

#for text cleaning
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter

# clustering
from gensim.models import word2vec, KeyedVectors
from sklearn.cluster import KMeans
from sklearn.neighbors import KDTree
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

import re;
import logging;
import sqlite3;
import time;
import sys;
import multiprocessing;
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
import matplotlib.pyplot as plt;
from itertools import cycle;

from tqdm import tqdm
tqdm.pandas()


In [None]:
# Fixed seed passed to random.seed() before sampling a row, so the sample is repeatable.
SEED = 2021

### Import DataFrame into Memory

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)


# Load the tweet dataset into memory.
data = pd.read_csv("../input/covidvaccine-tweets/covidvaccine.csv")

Take a look at the Data Frame:

In [None]:
data.head()

See a random data entry:

In [None]:
random.seed(SEED)
# randrange excludes its upper bound. randint(0, len(data)) could return
# len(data) itself, which is one past the last valid position for iloc.
data.iloc[random.randrange(len(data))]

In [None]:
data.shape #183494 entries

### Data Cleaning

In [None]:
# Contraction -> expansion lookup used by clean_text.
_CONTRACTIONS = {
    "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", 
    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", 
    "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", 
    "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have",
    "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
    "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", 
    "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", 
    "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", 
    "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
    "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", 
    "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", 
    "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", 
    "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
    "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", 
    "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", 
    "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", 
    "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", 
    "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", 
    "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
    "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", 
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
    "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are",
    "y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
    "you'll've": "you will have", "you're": "you are", "you've": "you have"}

# Longest alternatives first: regex alternation takes the first branch that
# matches, so "I'd've" has to be tried before its prefix "I'd".
# IGNORECASE lets "Don't" match "don't"; ASCII keeps the case folding to A-Z so
# every match can be looked up again through str.lower().
_CONTRACTIONS_RE = re.compile(
    '(%s)' % '|'.join(
        re.escape(key) for key in sorted(_CONTRACTIONS, key=len, reverse=True)),
    re.IGNORECASE | re.ASCII)

# One translation table deletes ASCII punctuation and digits in a single pass.
_STRIP_TABLE = str.maketrans('', '', string.punctuation + string.digits)

# Built once instead of once per tweet.
_STEMMER = PorterStemmer()
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    '''Return NLTK's English stop words as a set, reading the corpus only once.'''
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Aliased local import: a later cell rebinds the global name `stopwords`
        # to a plain set, which would break `stopwords.words(...)` on a re-run.
        from nltk.corpus import stopwords as nltk_stopwords
        _ENGLISH_STOP_WORDS = set(nltk_stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(txt):
    '''
    cleans the input text in the following steps:
    1 - replace contractions (matched case-insensitively)
    2 - removing punctuation and digits
    3 - spliting into words
    4 - lower-case everything
    5 - removing stopwords (after lower-casing, so "The" is dropped like "the")
    6 - removing leftover non-alphabetic tokens
    7 - stemming each word with the Porter stemmer

    txt: the raw text (str) to clean.
    returns: the surviving stems joined by single spaces ('' when none survive).
    '''
    # replace contractions; every key has a lower-case entry in the table
    txt = _CONTRACTIONS_RE.sub(
        lambda match: _CONTRACTIONS[match.group(0).lower()], txt)

    # remove punctuation and digits
    txt = txt.translate(_STRIP_TABLE)

    # split into words and lower-case them before any filtering
    words = [w.lower() for w in word_tokenize(txt)]

    # drop stopwords and leftover non-alphabetic tokens, then stem
    stop_words = _english_stop_words()
    words = [_STEMMER.stem(w) for w in words
             if w.isalpha() and w not in stop_words]

    return ' '.join(words)

In [None]:
# Keep only the tweet body and its hashtags; missing values become empty strings.
data = data.loc[:, ['text', 'hashtags']].fillna('')
data.head()

# One raw string per tweet: body, a space, then the hashtags.
data['raw_tweet'] = data['text'].str.cat(data['hashtags'], sep=' ')

data.head()

In [None]:
# Clean every raw tweet; progress_apply is tqdm's progress-bar variant of apply.
data['tweet'] = data['raw_tweet'].progress_apply(clean_text)
data.head()

# `clean_df` was read here before ever being assigned (NameError).
# Build it from the cleaned tweets, split into word lists.
clean_df = data['tweet'].str.split()
clean_df.head()

In [None]:
data.sort_values(by=['tweet']).head()
# Sorting by the cleaned text shows the smallest values first: there are still
# a bunch of empty strings in the dataset. We will have to remove them before clustering.

In [None]:
# Keep only rows whose cleaned tweet is non-empty (an empty string casts to False).
has_text = data['tweet'].astype(bool)
df = data.loc[has_text]
df.sort_values(by=['tweet']).head()

In [None]:
# I only want to keep hashtags and tweet.
# .copy() makes df an independent frame, so the columns added to it in later
# cells are not written to a slice of `data` (pandas' SettingWithCopyWarning).
df = df[['hashtags', 'tweet']].copy()
df.sort_values(by=['tweet']).head()

In [None]:
# Word-list version of each cleaned tweet (split on whitespace).
df['tweet_split'] = df['tweet'].str.split()
df.head()

# Clustering using HashingVectorizer in sklearn

This part is based on Vadim Nareyko's Kaggle notebook "Google word2vec, KMeans, PCA".
Here is the link to his notebook: https://www.kaggle.com/nareyko/google-word2vec-kmeans-pca
Thank you.

In [None]:
# Length, in characters, of the longest cleaned tweet.
tweet_lengths = df['tweet'].str.len()
max_len = tweet_lengths.max()
print(max_len)

In [None]:
# Small demonstration of HashingVectorizer on a single sentence.
from sklearn.feature_extraction.text import HashingVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
# create the transform (each document is hashed into 20 feature slots)
vectorizer = HashingVectorizer(n_features=20)
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

In [None]:
# One feature slot per character of the longest cleaned tweet.
vectorizer = HashingVectorizer(n_features=max_len)
# HashingVectorizer is stateless, so a single transform() over the whole column
# gives the same rows as calling fit_transform() once per tweet, without the
# per-row Python overhead.
hashed = vectorizer.transform(df['tweet']).toarray()
# Keep one (1, n_features) array per row so np.concatenate(df['vector'].values)
# still produces an (n_tweets, n_features) matrix.
df['vector'] = [row.reshape(1, -1) for row in hashed]
df.head()

In [None]:
# Stack the per-tweet row vectors into one 2-D feature matrix.
X = np.concatenate(list(df['vector']), axis=0)

In [None]:
# Seed the centroid initialisation so the cluster labels are reproducible between runs.
kmeans = KMeans(n_clusters=4, random_state=SEED)
df['cluster'] = kmeans.fit_predict(X)

### PCA for visualization

In [None]:
# Project the feature matrix onto its first two principal components for plotting.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X)
df['x'], df['y'] = pca_result[:, 0], pca_result[:, 1]
df.head()

In [None]:
# Colour palette indexed by cluster label. `pd.np` is a deprecated alias that
# was removed in pandas 2.0, so use numpy directly.
cluster_colors = np.array(['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000'])
df['color'] = cluster_colors[df.cluster.values]
# Short preview of each cleaned tweet (first 50 characters), used as the plot tooltip.
df['text'] = df.tweet.str[:50]

In [None]:
import bokeh.io
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet

# from bokeh.charts import Donut, HeatMap, Histogram, Line, Scatter, show, output_notebook, output_file
bokeh.io.output_notebook()

In [None]:
#visualize the data using bokeh
#output_file("top_artists.html", title="top artists")
# TOOLS = "pan,wheel_zoom,box_zoom,reset,hover,previewsave"

# Columns the plot needs: the 2-D coordinates, the per-cluster colour and the truncated tweet text.
source = ColumnDataSource.from_df(df[['x', 'y', 'color', 'text']])
# Hover tooltip shows each point's `text` column.
TOOLTIPS = [("text", "@text")]
TOOLS = "pan,wheel_zoom,box_zoom,reset,hover,save"

# NOTE(review): plot_width / plot_height look version-dependent in Bokeh — confirm against the installed release.
plot = figure(plot_width=800, plot_height=450, tooltips=TOOLTIPS, tools=TOOLS)

#draw circles
plot.circle(y='y', x='x', source=source, size=15, fill_color='color')
show(plot)

### WordClouds

In [None]:
df.head()

In [None]:
# KMeans was fitted with n_clusters=4, so labels run 0..3. Label 4 never
# exists and the query always came back empty. Inspect the last real cluster.
df.query('cluster == 3').tweet

In [None]:
# Cleaned tweets of each of the four clusters.
cluster0 = df.query('cluster == 0').tweet
cluster1 = df.query('cluster == 1').tweet
cluster2 = df.query('cluster == 2').tweet
cluster3 = df.query('cluster == 3').tweet


# One long string per cluster, used as word-cloud input.
text0 = ' '.join(cluster0)
text1 = ' '.join(cluster1)
text2 = ' '.join(cluster2)
text3 = ' '.join(cluster3)

# The message reports words, so count whitespace-separated tokens.
# len() of the joined string would count characters instead.
for cluster_label, cluster_text in enumerate((text0, text1, text2, text3)):
    print(f'There are {len(cluster_text.split())} words in the combination of all cells in column "tweet" labeled as cluster {cluster_label}.')

In [None]:
# Word-cloud stop words: WordCloud's built-in list plus corpus-specific (stemmed) terms.
# NOTE: this rebinds the name `stopwords`, which the import cell bound to nltk.corpus.stopwords.
stopwords = set(STOPWORDS) | {
    'amp', 'covid', 'covidvaccin', 'vaccin', 'today',
    'peopl', 'coronaviru', 'say', 'day', 'one',
}
# I want to see what's left after filtering out the common keywords.

In [None]:
# One word cloud per cluster, generated with the same settings.
wordcloud = [
    WordCloud(stopwords=stopwords, background_color='white').generate(cluster_text)
    for cluster_text in (text0, text1, text2, text3)
]

# Individual names used by the plotting cells.
wordcloud0, wordcloud1, wordcloud2, wordcloud3 = wordcloud

In [None]:
# cluster 0
plt.figure(figsize=(10,5))
plt.imshow(wordcloud0, interpolation='bilinear')
plt.title('Cluster 0 Key words', size=16)
# tight_layout only adjusts axes that already exist, so it has to run after
# the image is drawn; called on the empty figure it had nothing to adjust.
plt.tight_layout(pad=0)
plt.show()

In [None]:
# cluster 1
plt.figure(figsize=(10,5))
plt.imshow(wordcloud1, interpolation='bilinear')
plt.title('Cluster 1 Key words', size=16)
# tight_layout only adjusts axes that already exist, so it has to run after
# the image is drawn; called on the empty figure it had nothing to adjust.
plt.tight_layout(pad=0)
plt.show()

In [None]:
# cluster 2
plt.figure(figsize=(10,5))
plt.imshow(wordcloud2, interpolation='bilinear')
plt.title('Cluster 2 Key words', size=16)
# tight_layout only adjusts axes that already exist, so it has to run after
# the image is drawn; called on the empty figure it had nothing to adjust.
plt.tight_layout(pad=0)
plt.show()

In [None]:
# cluster 3
plt.figure(figsize=(10,5))
plt.imshow(wordcloud3, interpolation='bilinear')
plt.title('Cluster 3 Key words', size=16)
# tight_layout only adjusts axes that already exist, so it has to run after
# the image is drawn; called on the empty figure it had nothing to adjust.
plt.tight_layout(pad=0)
plt.show()

# Using TextBlob for Sentiment Analysis

In [None]:
from textblob import TextBlob

def get_subjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def get_polarity(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def get_sentiment(score):
    """Map a polarity score to a label.

    Returns 'Neutral' for exactly zero, 'Positive' for anything greater than
    zero and 'Negative' for every other value.
    """
    if score == 0:
        return 'Neutral'
    return 'Positive' if score > 0 else 'Negative'

In [None]:
# Score the full cleaned tweet. `df['text']` only holds the first 50 characters
# (it was created as the plot tooltip), so using it cut the input to TextBlob short.
# NOTE(review): `tweet` is stemmed; scoring the unstemmed raw text may match
# TextBlob's lexicon better — confirm which input is intended.
df['subjectivity'] = df['tweet'].apply(get_subjectivity)
df['polarity'] = df['tweet'].apply(get_polarity)
df['sentiment'] = df['polarity'].apply(get_sentiment)
df.head(20)

In [None]:
# View sentiment counts

df.sentiment.value_counts()

## Visualize Polarity and Subjectivity of Tweets

In [None]:
# Scatter of polarity (x) against subjectivity (y), one small dot per tweet.
fig = plt.figure(figsize=(12,6))
polarity_values, subjectivity_values = df['polarity'], df['subjectivity']
plt.scatter(polarity_values, subjectivity_values, s=4)

plt.ylabel('Subjectivity')
plt.xlabel('Polarity')

### Upload Output 1: TextBlob Sentiments

In [None]:
# Columns to export, ordered by sentiment label.
output_columns = ['hashtags', 'tweet','subjectivity', 'polarity', 'sentiment']
textblob_output = df[output_columns].sort_values('sentiment')
textblob_output.head()

In [None]:
# Write the sentiment table to CSV without the index column.
# NOTE(review): the file name says 'textblog' while the content is TextBlob output — possibly a typo; confirm before renaming.
textblob_output.to_csv('textblog.csv',index=False)

# TF-IDF Vectorizer and Finding the Best K with Elbow Method

### Read the Data

In [None]:
# Re-read the raw CSV. This rebinds `df`, replacing the frame used for the
# clustering and sentiment steps above.
df = pd.read_csv("../input/covidvaccine-tweets/covidvaccine.csv")
df.head()

In [None]:
df.info()

In [None]:
df.is_retweet.value_counts()

### Data Preprocessing

In [None]:
# Single-column frame holding just the tweet text, with the column named 'Tweets'.
data = df[['text']].rename(columns={'text': 'Tweets'})
data.head()

In [None]:
# We display the first 10 elements of the dataframe,
# after raising pandas' column display width to 170 so the tweets are not cut off early.
pd.set_option('max_colwidth',170)
display(data.head(10))

In [None]:
# Work on the text of the first 1000 tweets only.
docs = df['text'].iloc[:1000].to_numpy()
type(docs)

In [None]:
# Wherever a ':' is followed only by non-newline characters up to the end of
# the text, replace that whole tail with a lone ':'.
# NOTE(review): this presumably targets trailing links, but it also cuts any
# other text after such a colon — confirm that is intended.
docs_clean = [re.sub(r':.*$', ":", doc) for doc in docs]

docs_clean[:20]

In [None]:
docs2=docs_clean

In [None]:
# remove punctuations
punctuationChars = '!@#$%^&*(){}{}|;:",./<>?' # you might choose different charcters to drop
for i in punctuationChars:
    docs2 = np.char.replace(docs2, i, ' ')
# remove apostrophes (single quotes)
docs2 = np.char.replace(docs2,"'",' ')
# remove line feeds
docs2 = np.char.replace(docs2,"\n",' ')

# make lower case (vectorised; also normalises "HTTPS" before the next step)
docs2 = np.char.lower(docs2)

# remove URL scheme remnants. ':' was already replaced above, so matching
# "https:" could never succeed; only the bare words are left. "https" has to
# go first because "http" is its prefix.
docs2 = np.char.replace(docs2,"https",' ')
docs2 = np.char.replace(docs2,"http",' ')
    
# Show the cleaned data
# Show the beginning of each document

#for i in range(len(docs2)):
#        print(f'\ndoc{i}: {docs2[i]}') 
docs2[:10]

### Define SpaCy Tokenizer

In [None]:
def spacy_tokenizer(document):
    """Tokenise ``document`` with the global spaCy pipeline ``nlp`` and return lemmas.

    Stop words, punctuation tokens and tokens whose lemma is empty once
    whitespace is stripped are left out. Returns a list of lemma strings.
    """
    return [
        tok.lemma_
        for tok in nlp(document)
        if not tok.is_stop and not tok.is_punct and tok.lemma_.strip() != ''
    ]

In [None]:
# Three tiny documents for trying out the spaCy tokenizer.
# (Line continuations are implicit inside the brackets, so no backslashes are needed.)
example_corpus = [
    "Monsters are bad. They likes to eat geese. I saw one goose flying away",
    "I saw a monster yesterday. The meaning is so obvious!",
    "Why are we talking about bad monsters? They are meanness.",
]

In [None]:
# Load the spaCy pipeline; spacy_tokenizer reads it through the global name `nlp`.
nlp = spacy.load("en_core_web_sm")

# TF-IDF vectorizer that delegates tokenisation to spacy_tokenizer.
tfidf_vector = TfidfVectorizer(input = 'content', tokenizer = spacy_tokenizer)
# test
corpus=example_corpus
# fit: learns vocabulary and idf
# transform: transforms documents into document-term matrix
result_test = tfidf_vector.fit_transform(corpus)
result_test

### The tokenizer successfully extracts the intended meaning of the words: 14 tokens for the example corpus.

In [None]:
dense = result_test.todense()
denselist = dense.tolist()
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement,
# get_feature_names_out(), whenever the installed version provides it.
if hasattr(tfidf_vector, 'get_feature_names_out'):
    feature_names = tfidf_vector.get_feature_names_out()
else:
    feature_names = tfidf_vector.get_feature_names()
# One row per example document, one column per term.
df_test = pd.DataFrame(denselist, columns=feature_names)
df_test

### Apply the spaCy tokenizer, TF-IDF, and K-means to the first 1000 tweets.

In [None]:
# Fresh vectorizer (same settings as the test above), fitted on the cleaned tweets in docs2.
tfidf_vector = TfidfVectorizer(input = 'content', tokenizer = spacy_tokenizer)
corpus = docs2

# fit: learns vocabulary and idf
# transform: transforms documents into document-term matrix
result = tfidf_vector.fit_transform(corpus)
result

It’s a sparse matrix with 1000 tweets and 3191 terms; out of those 3,191,000 possible entries, only 9169 are non-zero TF-IDF values. We can check which terms are actually considered from the sentences with the get_feature_names method:

In [None]:
# We can check which terms are actually considered from the sentences.
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement,
# get_feature_names_out(), whenever the installed version provides it.
if hasattr(tfidf_vector, 'get_feature_names_out'):
    feature_names = tfidf_vector.get_feature_names_out()
else:
    feature_names = tfidf_vector.get_feature_names()
list(feature_names)[1:20]

The sparse matrix format is an efficient way to store this information, but you might want to convert it to a more readable, dense matrix format using the todense method. To create a pandas DataFrame from the results, you can use the following code:

In [None]:
dense = result.todense()
denselist = dense.tolist()
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement,
# get_feature_names_out(), whenever the installed version provides it.
if hasattr(tfidf_vector, 'get_feature_names_out'):
    feature_names = tfidf_vector.get_feature_names_out()
else:
    feature_names = tfidf_vector.get_feature_names()
# One row per tweet, one column per term. Note that this rebinds `df`.
df = pd.DataFrame(denselist, columns=feature_names)
df

### Let's see the weights for words contained in the first tweet:

In [None]:
df[["australia", "manufacture", "covid-19"]]

### Check cosine similarity

In [None]:
from sklearn.metrics.pairwise import linear_kernel
# TfidfVectorizer L2-normalises rows by default, so the linear kernel of the
# TF-IDF matrix with itself is the cosine similarity between tweets.
# A single call fills the whole matrix. The previous `range(999)` loop stopped
# one document short and left the last column as NaN.
cos_df = pd.DataFrame(linear_kernel(result, result), columns=df.index)

cos_df

### Create the clustering table

In [None]:
# Fit one K-means model for every k from 2 to 13, keyed by k.
# random_state makes the labels reproducible between runs.
kmeans_models = {
    k: KMeans(n_clusters=k, random_state=SEED).fit(result)
    for k in range(2, 13 + 1)
}

In [None]:
# Original tweet text plus one label column per fitted K-means model.
cluster_df = pd.DataFrame({'Review Texts': docs})
for k in range(2, 13+1):
    cluster_df[f'{k}means_label'] = kmeans_models[k].labels_
cluster_df

### Elbow Method to determine the best K

In [None]:
# Inertia (sum of squared distances to the nearest centroid) for k = 1..17.
# random_state keeps the elbow curve reproducible between runs.
K = range(1,18)
Sum_of_squared_distances = [
    KMeans(n_clusters=k, random_state=SEED).fit(result).inertia_ for k in K
]

In [None]:
# Plot inertia against k; the "elbow" of the curve suggests a reasonable k.
plt.figure(figsize=(16,8))
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

### Choose K=10 to experiment

In [None]:
# Select the columns by name. The positional form iloc[:, [0, 9]] only hits
# '10means_label' while the label columns start at k=2 and are contiguous.
cluster10 = cluster_df[['Review Texts', '10means_label']]
cluster10_0 = cluster10.loc[cluster10["10means_label"] == 0]
cluster10_0.head(50)

In [None]:
# Tweets assigned to cluster 1 of the 10-cluster model.
in_cluster_1 = cluster10["10means_label"] == 1
cluster10_1 = cluster10[in_cluster_1]
cluster10_1.head(50)

### Cluster 2 focuses on topics related to the Russian vaccine

In [None]:
# Tweets assigned to cluster 2 of the 10-cluster model.
in_cluster_2 = cluster10["10means_label"] == 2
cluster10_2 = cluster10[in_cluster_2]
cluster10_2

### Cluster 3 contains more rumors and negative reactions

In [None]:
# Tweets assigned to cluster 3 of the 10-cluster model.
in_cluster_3 = cluster10["10means_label"] == 3
cluster10_3 = cluster10[in_cluster_3]
cluster10_3.head(50)

In [None]:
# Tweets assigned to cluster 4 of the 10-cluster model.
in_cluster_4 = cluster10["10means_label"] == 4
cluster10_4 = cluster10[in_cluster_4]
cluster10_4

In [None]:
# Tweets assigned to cluster 5 of the 10-cluster model.
in_cluster_5 = cluster10["10means_label"] == 5
cluster10_5 = cluster10[in_cluster_5]
cluster10_5

### Upload Output 2: 10 clusters Tweets

In [None]:
# Order the tweets by their 10-cluster label and write them to CSV without the index column.
cluster10 = cluster10.sort_values(by='10means_label')
cluster10.to_csv('cluster10.csv',index=False)