In [None]:
import numpy as np
import pandas as pd 
import os
from sklearn.feature_extraction.text import TfidfVectorizer


from sklearn.cluster import KMeans
from sklearn.neighbors import KDTree
from sklearn.decomposition import PCA

import re;
import logging;
import sqlite3;
import time;
import sys;
import multiprocessing;
import matplotlib.pyplot as plt;


# Tokenisation & TF-IDF
import spacy
# Only lemmas and the lexical stop-word / punctuation flags are read from the
# parsed tokens in this notebook, so the dependency parser and the named-entity
# recogniser are switched off to make every nlp(...) call cheaper.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


Read the data

In [None]:
# Load the raw tweets. The Kaggle input mount is tried first; when it does not
# exist (e.g. when running locally) a copy next to the notebook is used instead.
KAGGLE_CSV_PATH = "../input/covidvaccine-tweets/covidvaccine.csv"
LOCAL_CSV_PATH = "covidvaccine.csv"
csv_path = KAGGLE_CSV_PATH if os.path.exists(KAGGLE_CSV_PATH) else LOCAL_CSV_PATH
df = pd.read_csv(csv_path)
df.head(20)

In [None]:
# (number of rows, number of columns) of the frame loaded from the CSV.
df.shape

## There are no rows with `is_retweet = True`

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# How often each value of the is_retweet column occurs.
df["is_retweet"].value_counts()

## **Data Preprocessing**

In [None]:
# Keep only the tweet text, as a one-column frame whose column is named "Tweets".
data = df[["text"]].rename(columns={'text': 'Tweets'})
data.head()

In [None]:
# Allow up to 170 characters per cell when frames are rendered, then show the
# first 10 rows of the tweet frame.
pd.set_option("display.max_colwidth", 170)
display(data.head(10))

In [None]:
# Work on the first 1000 tweets only, as a NumPy array of texts.
docs = df["text"].head(1000).to_numpy()
type(docs)

In [None]:
# Cut each text after the first colon from which the remainder contains no line
# break, keeping the colon itself ('.' does not cross newlines and '$' is not
# used in MULTILINE mode).
# NOTE(review): presumably meant to drop the trailing "https://..." link, but it
# also drops the body of tweets shaped like "RT @user: ..." -- confirm intent.
after_colon = re.compile(r':.*$')
docs_clean = [after_colon.sub(":", doc) for doc in docs]

docs_clean[:20]


In [None]:
# docs2 starts out as a second name for the docs_clean list (no copy is made).
docs2=docs_clean

In [None]:
# Characters that are replaced by a space (extend or trim the set as needed).
punctuationChars = '!@#$%^&*(){}|;:",./<>?'

# Apostrophes and line feeds are blanked out the same way as the punctuation.
# dtype=str keeps the array a string array even when there are no documents.
docs2 = np.asarray(docs2, dtype=str)
for ch in punctuationChars + "'\n":
    docs2 = np.char.replace(docs2, ch, ' ')

# Blank out what is left of the link scheme. A separate "https:" replacement
# could never match at this point because ':' has already been replaced above.
docs2 = np.char.replace(docs2, "https", ' ')

# Lower-case every document in one vectorised call.
docs2 = np.char.lower(docs2)

# Show the beginning of the cleaned data; slicing keeps this safe when fewer
# than 100 documents are available.
for i, doc in enumerate(docs2[:100]):
    print(f'\ndoc{i}: {doc}')

In [None]:
# Quick look at TF-IDF on the first 10 cleaned tweets, using the default
# tokenizer of TfidfVectorizer.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(docs2[0:10])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
feature_names = vectorizer.get_feature_names_out()
# A dedicated name is used so that the raw tweet frame `df` is not overwritten
# by this preview table.
tfidf_preview_df = pd.DataFrame(vectors.toarray(), columns=feature_names)
tfidf_preview_df

### Define Spacy Tokenizer

In [None]:
def spacy_tokenizer(document):
    """Tokenize `document` with the loaded spaCy pipeline and return lemmas.

    Stop words, punctuation tokens and tokens whose lemma is empty after
    stripping whitespace are left out.

    Parameters
    ----------
    document : str
        Text of a single document.

    Returns
    -------
    list of str
        The lemma of every kept token, in document order.
    """
    return [
        tok.lemma_
        for tok in nlp(document)
        if not tok.is_stop and not tok.is_punct and tok.lemma_.strip() != ''
    ]

In [None]:
# Small hand-written corpus for checking what the spaCy tokenizer produces.
example_corpus = [
    "Monsters are bad. They likes to eat geese. I saw one goose flying away",
    "I saw a monster yesterday. The meaning is so obvious!",
    "Why are we talking about bad monsters? They are meanness.",
]

In [None]:
# TF-IDF vectorizer that delegates tokenisation to spacy_tokenizer, tried out on
# the small example corpus first.
tfidf_vector = TfidfVectorizer(input="content", tokenizer=spacy_tokenizer)
corpus = example_corpus
# fit_transform learns the vocabulary and idf weights and returns the
# document-term matrix in a single step.
result_test = tfidf_vector.fit_transform(corpus)
result_test

### The tokenizer successfully extracts the intended meaning of the words: 14 tokens for the example corpus.

In [None]:
# Dense view of the example TF-IDF matrix, one column per learned term.
# get_feature_names() was removed in scikit-learn 1.2, so the supported
# get_feature_names_out() accessor supplies the column labels.
df_test = pd.DataFrame(
    result_test.toarray(),
    columns=tfidf_vector.get_feature_names_out(),
)
df_test

## Apply Spacy tokenizer, TF-IDF, K-means for our first 1000 tweets.

In [None]:
# Same vectorizer set-up as for the example corpus, now fitted on the cleaned
# tweets.
tfidf_vector = TfidfVectorizer(input="content", tokenizer=spacy_tokenizer)
corpus = docs2

# fit_transform learns the vocabulary and idf weights and returns the
# document-term matrix in a single step.
result = tfidf_vector.fit_transform(corpus)
result

It’s a sparse matrix with 1000 tweets and 3191 terms. Out of those 3,191,000 possible entries, only 9169 are non-zero TF-IDF values. We can check which terms are actually considered from the sentences with the get_feature_names method:

In [None]:
# Inspect part of the learned vocabulary (entries 1 to 499).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an array, so it is converted back to a plain list for display.
tfidf_vector.get_feature_names_out()[1:500].tolist()

The sparse matrix format is an efficient way to store this information, but you might want to convert it to a more readable, dense matrix format using the todense method. 
To create a pandas DataFrame from the results, you can use the following code:

In [None]:
# One row per tweet, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2, so the supported
# get_feature_names_out() accessor supplies the column labels.
# NOTE: this rebinds the name `df`, which earlier cells use for other tables;
# the name is kept because the following cells read this TF-IDF frame as `df`.
df = pd.DataFrame(
    result.toarray(),
    columns=tfidf_vector.get_feature_names_out(),
)
df


### Let's see the weights for words contained in the first Tweet.

In [None]:
# TF-IDF weights of three selected terms, shown for every tweet.
tweet_terms = ["australia", "manufacture", "covid-19"]
df[tweet_terms]

### check the cosine similarity

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity of all tweets in a single call. TfidfVectorizer
# L2-normalises its rows by default, so the plain dot product computed by
# linear_kernel equals the cosine similarity. Computing the whole matrix at
# once also covers the last tweet, which a loop over range(999) would leave
# unfilled.
cos_df = pd.DataFrame(
    linear_kernel(result, result),
    index=df.index,
    columns=df.index,
)
cos_df

## Create the clustering table

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit one K-Means model per candidate k (2..13) on the TF-IDF matrix.
# A fixed random_state makes the cluster labels reproducible between runs, which
# the per-cluster inspection further down relies on; n_init is spelled out
# because its default differs between scikit-learn versions.
kmeans_models = {
    k: KMeans(n_clusters=k, n_init=10, random_state=42).fit(result)
    for k in range(2, 13 + 1)
}

In [None]:
# Original tweets next to the label assigned by each fitted model
# (one "<k>means_label" column per k from 2 to 13).
label_columns = {
    f'{k}means_label': kmeans_models[k].labels_ for k in range(2, 13 + 1)
}
cluster_df = pd.DataFrame({'Review Texts': docs, **label_columns})
cluster_df

## Elbow Method to determine the best K

In [None]:
# Inertia (sum of squared distances of samples to their closest centroid) for
# k = 1..17. random_state and n_init are fixed so that the elbow curve is the
# same on every run.
K = range(1, 18)
Sum_of_squared_distances = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(result).inertia_
    for k in K
]

In [None]:
# Elbow plot: inertia against the number of clusters.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, Sum_of_squared_distances, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Sum_of_squared_distances')
ax.set_title('Elbow Method For Optimal k')
plt.show()

### Choose K = 10 to do the experiment

In [None]:
# Keep the tweet text and the k=10 labels. Columns are selected by name rather
# than by position, so the selection stays correct if the column layout of
# cluster_df ever changes.
cluster10 = cluster_df[["Review Texts", "10means_label"]]
cluster10_0 = cluster10.loc[cluster10["10means_label"] == 0]
cluster10_0.head(50)

In [None]:
# Tweets labelled 1 by the k=10 model (first 50 shown).
in_cluster_1 = cluster10["10means_label"].eq(1)
cluster10_1 = cluster10[in_cluster_1]
cluster10_1.head(50)

## Cluster 2 focuses on topics related to the Russian vaccine.

In [None]:
# Tweets labelled 2 by the k=10 model.
in_cluster_2 = cluster10["10means_label"].eq(2)
cluster10_2 = cluster10[in_cluster_2]
cluster10_2

## Cluster 3 contains more rumors and negative reactions.

In [None]:
# Tweets labelled 3 by the k=10 model (first 50 shown).
in_cluster_3 = cluster10["10means_label"].eq(3)
cluster10_3 = cluster10[in_cluster_3]
cluster10_3.head(50)

In [None]:
# Tweets labelled 4 by the k=10 model.
in_cluster_4 = cluster10["10means_label"].eq(4)
cluster10_4 = cluster10[in_cluster_4]
cluster10_4

In [None]:
# Tweets labelled 5 by the k=10 model.
in_cluster_5 = cluster10["10means_label"].eq(5)
cluster10_5 = cluster10[in_cluster_5]
cluster10_5

## What's Next?
### 1. Use KNN or cosine similarity to classify new tweets.
### 2. Based on the insight gained from the existing clusters, extract the useful information related to the topic you are interested in.