In [None]:
# Install the two third-party packages this notebook imports further down:
# `kneed` (provides KneeLocator) and `raceplotly` (provides barplot).
# NOTE(review): versions are not pinned, so a fresh run may pull newer releases than the notebook was written against.
!pip install kneed
!pip install raceplotly

<p style = "font-family:courier,arial,helvetica;font-size:350%;">
Reddit Vaccine Myths Analysis</p>

![](https://media0.giphy.com/media/iFgzUCWgxj7B22ik2K/giphy.gif)

<p style = "font-family:courier,arial,helvetica;font-size:300%;">
Importing Packages</p>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import colorama
from colorama import Fore as F
from time import sleep
from nlp_package_pv import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import normalize
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from warnings import filterwarnings
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.cm as cm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score
from kneed import KneeLocator
from brown_clustering_yangyuan import *
from sklearn.cluster import Birch
from sklearn.cluster import *
from sklearn.decomposition import TruncatedSVD
import plotly.express as px
from nltk.tokenize import RegexpTokenizer
import umap
import plotly
from raceplotly.plots import barplot
plotly.offline.init_notebook_mode (connected = True)
filterwarnings("ignore")

<p style = "font-family:courier,arial,helvetica;font-size:300%;">
Importing Data</p>

In [None]:
# Read the Reddit vaccine-myths CSV into `data`, with a coloured status line before and after.
print(f"{F.YELLOW}Importing Data ....")
sleep(2)  # two-second pause between the announcement and the read
data = pd.read_csv('../input/reddit-vaccine-myths/reddit_vm.csv')
print(f"{F.YELLOW}Imported Data Successfully !!!!")

In [None]:
# Checking for NaN values: number of missing entries in each column
data.isna().sum()

In [None]:
# Drop the URL and ID columns, discard rows that still contain NaN values,
# and convert the timestamp column to datetime - all in one chained expression.
data = (
    data
    .drop(columns=['url', 'id'])
    .dropna()
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)

# Preview the cleaned frame
data.head()

<p style = "font-family:courier,arial,helvetica;font-size:300%;">
Preprocessing The Data</p>

In [None]:
# Preprocessing the data

# Joining the title and the body into one 'text' column.
# The word 'Comment' seems to be the default title value and is not needed, so it
# is blanked out for the combined text only. The replacement is applied to a
# derived Series, which leaves the 'title' column itself untouched and avoids a
# chained in-place replace on a column selection (that pattern does not update
# the parent frame under pandas copy-on-write).
data['text'] = data['title'].replace({'Comment': ''}) + ' ' + data['body']

# Removing links from the data (pandas string methods, so no reliance on `re`
# being leaked into the namespace by a wildcard import)
data['text'] = data['text'].str.replace(r"http\S+", "", regex=True)

# Removing special characters: keep only runs of word characters, re-joined with single spaces
data['text'] = data['text'].str.findall(r'\w+').str.join(' ')

# The three helpers below come from a wildcard import; their return values are
# not used here, so they presumably update data['text'] in place - confirm in nlp_package_pv.

# Removing the stopwords and tokenizing the data
rem_stopwords_tokenize(data,'text')

# Lemmatizing the sentences
lemmatize_all(data,'text')

# Making all the tokens back to sentences
make_sentences(data,'text')


# Having a look at the data
data.head()

<p style = "font-family:'Brush Script MT', cursive;font-size:200%;">
We have successfully preprocessed the data</p>

<p style = "font-family:courier,arial,helvetica;font-size:300%;">
What is the length of the data?</p>

In [None]:
# Report the number of rows left in the dataframe
print(f"{F.YELLOW}The length of the dataframe is : {F.CYAN}{len(data)}")

<p style = "font-family:courier,arial,helvetica;font-size:300%;">
Applying Brown Clustering On The Data</p>

In [None]:
# Documents as plain Python strings
sample_data = data.text.astype('str').tolist()

# Tokenize: lower-case every document, then split it into runs of word characters
tokenizer = RegexpTokenizer(r'\w+')
sample_data_tokenized = [tokenizer.tokenize(document.lower()) for document in sample_data]

# Wrap the tokenized documents in a Corpus and train the Brown clustering model on it
# NOTE(review): 0.001 and 6 are positional arguments whose meaning is defined by brown_clustering_yangyuan - confirm there
corpus = Corpus(sample_data_tokenized, 0.001)
clustering = BrownClustering(corpus, 6)
clustering.train()


<p style = "font-family:courier,arial,helvetica;font-size:300%;">
Closest Word Clusters To the word Vaccine</p>

![](https://img.etimg.com/thumb/width-1200,height-900,imgsize-261105,resizemode-1,msid-79592510/prime/pharma-and-healthcare/2021-is-all-about-vaccine-transportation-piramal-schott-kaisha-are-ready-with-sturdy-vials.jpg)

In [None]:
# Ask the trained Brown clustering model for the entries it considers similar to 'vaccine'
clustering.get_similar('vaccine')

<p style = "font-family:courier,arial,helvetica;font-size:200%;">
Brown clustering is used for word clustering but we need to cluster the sentences 🤔🤔🤔🤔
    <br> We can just use kmeans clustering on the score we got using the brown clustering to perform the sentence clustering
</p>

<p style = "font-family:courier,arial,helvetica;font-size:300%;">
Tokenizing and Padding the data </p>

In [None]:
# Let's use both kmeans and brown clustering together :)
# Replace every token with its entry in the Brown clustering vocabulary.
# The result goes into a new list, so the tokenized documents are left intact
# and this cell can be re-run without failing on tokens that were already converted.
sample_data_encoded = [
    [clustering.vocabulary[token] for token in document]
    for document in sample_data_tokenized
]

# Padding the sequences: every document becomes exactly 20 values, padded at the end
# NOTE(review): `truncating` is not passed, so longer documents are cut according to pad_sequences' default - confirm that is intended
padded_sequence = pad_sequences(sample_data_encoded, maxlen=20, padding='post')

print(F.YELLOW+"The vocabulary of the data is " +F.CYAN + str(len(corpus.vocabulary))+F.YELLOW+' words long :)')

<p style = "font-family:courier,arial,helvetica;font-size:300%;">
Applying K Means </p>

In [None]:
# Function to get the best k for the data using the elbow method
def get_k(X, print_plot=False, max_k=10):
    """Estimate a good number of KMeans clusters for ``X`` with the elbow method.

    A KMeans model is fitted for every k from 1 up to ``max_k`` and the inertia
    (within-cluster sum of squared distances) of each fit is collected.
    ``KneeLocator`` is then asked for the knee of that convex, decreasing curve.

    Parameters
    ----------
    X : array-like
        Samples to cluster, one row per sample.
    print_plot : bool, default False
        When true, also draw the inertia curve with a dashed line at the knee
        and print the knee.
    max_k : int, default 10
        Largest number of clusters to try. It is capped at the number of rows
        in ``X`` because KMeans cannot form more clusters than there are samples.

    Returns
    -------
    The ``knee`` attribute reported by ``KneeLocator``. It is returned in both
    modes, so the value is available to the caller even when the plot is drawn.
    """
    largest_k = min(max_k, len(X))
    candidate_ks = list(range(1, largest_k + 1))

    wcss = []
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, init='k-means++', random_state=0)
        kmeans.fit(X)
        wcss.append(kmeans.inertia_)

    kn = KneeLocator(candidate_ks, wcss, curve='convex', direction='decreasing')

    if print_plot:
        plt.xlabel('number of clusters k')
        plt.ylabel('Sum of squared distances')
        plt.plot(candidate_ks, wcss, 'rx-')
        # Dashed vertical marker spanning the current y-range at the detected knee
        plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
        plt.show()
        print('The elbow is formed at :',kn.knee)

    return kn.knee

In [None]:
# Run the elbow search on the padded sequences and draw the elbow plot
get_k(padded_sequence,print_plot=True)

In [None]:
# Assign every padded sequence to one of 6 KMeans clusters (random_state fixed at 20)
# NOTE(review): n_clusters=6 is hard-coded rather than taken from get_k - confirm it matches the elbow that was found
clusters=KMeans(n_clusters=6,random_state=20).fit_predict(padded_sequence)

In [None]:
def plot_tsne_pca(data, labels, sample_size=1000, plot_size=320, random_state=None):
    """Draw side-by-side PCA and t-SNE scatter plots of a random sample of ``data``.

    Parameters
    ----------
    data : 2-D array
        Feature matrix, one row per sample; it is indexed as ``data[rows, :]``.
    labels : array-like
        Integer cluster label per row of ``data``; used only to colour the points.
    sample_size : int, default 1000
        Number of rows drawn (without replacement) for the projections.
        Capped at the number of rows available.
    plot_size : int, default 320
        Number of the sampled rows that are actually drawn. Capped at the
        number of sampled rows.
    random_state : int or None, default None
        Seed for the row sampling and for t-SNE. ``None`` keeps the global
        NumPy random state in charge, so runs are not repeatable.
    """
    labels = np.asarray(labels)  # allows plain lists as well as arrays
    n_rows = data.shape[0]
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    max_label = labels.max() + 1

    # Never request more rows than exist: sampling without replacement would raise
    sample_size = min(sample_size, n_rows)
    max_items = rng.choice(range(n_rows), size=sample_size, replace=False)
    sample = data[max_items, :]

    pca = PCA(n_components=2).fit_transform(sample)
    # t-SNE runs on a PCA-reduced copy; the reduction cannot have more components
    # than the sample has rows or columns
    n_reduced = min(10, sample.shape[0], sample.shape[1])
    tsne = TSNE(random_state=random_state).fit_transform(
        PCA(n_components=n_reduced).fit_transform(sample)
    )

    # Draw only a subset of the sampled points
    plot_size = min(plot_size, pca.shape[0])
    idx = rng.choice(range(pca.shape[0]), size=plot_size, replace=False)
    label_subset = labels[max_items]
    # One hue per cluster, spread over the hsv colour map
    label_subset = [cm.hsv(i/max_label) for i in label_subset[idx]]

    f, ax = plt.subplots(1, 2, figsize=(14, 6))

    ax[0].scatter(pca[idx, 0], pca[idx, 1], c=label_subset)
    ax[0].set_title('PCA Cluster Plot')

    ax[1].scatter(tsne[idx, 0], tsne[idx, 1], c=label_subset)
    ax[1].set_title('TSNE Cluster Plot')
    
# Visualise the KMeans assignment of the padded sequences with PCA and t-SNE projections
plot_tsne_pca(padded_sequence, clusters)

In [None]:
# Store the KMeans cluster id of every row in a new 'Kmeans clusters' column
data['Kmeans clusters']=clusters

<p style = "font-family:courier,arial,helvetica;font-size:300%;">
Having a look at cluster 2 </p>

In [None]:
# Preview the first rows whose cluster id is 2
data[data['Kmeans clusters']==2].head()

<p style = "font-family:courier,arial,helvetica;font-size:200%;">
These look like questions about the vaccine, and they show some of the worries and myths in people's minds.</p>


<p style = "font-family:courier,arial,helvetica;font-size:300%;">
 A Brief Look at Cluster 5</p>


In [None]:
# Preview the first rows whose cluster id is 5
data[data['Kmeans clusters']==5].head()

<p style = "font-family:courier,arial,helvetica;font-size:200%;">
This cluster is more about people describing their experience before or after the vaccine shot.</p>



<p style = "font-family:courier,arial,helvetica;font-size:300%;">
 Using LSA for topic modelling</p>


In [None]:
 def get_me_topics(cluster_id=1, n_topics=4, n_terms=7):
     """Print the highest-weighted terms of each LSA topic found in one cluster.

     The 'text' of the rows whose 'Kmeans clusters' value equals ``cluster_id``
     is vectorized with TF-IDF and decomposed with truncated SVD. For every SVD
     component the ``n_terms`` terms with the largest weights are printed.

     Parameters
     ----------
     cluster_id : int, default 1
         Cluster whose documents are analysed.
     n_topics : int, default 4
         Number of SVD components (topics) to extract.
     n_terms : int, default 7
         Number of terms printed per topic.
     """
     vectorizer = TfidfVectorizer()
     X = vectorizer.fit_transform(data[data['Kmeans clusters']==cluster_id]['text'])
     # SVD represents documents and terms as vectors
     svd_model = TruncatedSVD(n_components=n_topics, algorithm='randomized', n_iter=100, random_state=122)
     svd_model.fit(X)

     # get_feature_names() was removed from recent scikit-learn releases;
     # use the replacement when it exists and fall back for older installations
     if hasattr(vectorizer, 'get_feature_names_out'):
         terms = vectorizer.get_feature_names_out()
     else:
         terms = vectorizer.get_feature_names()

     for i, comp in enumerate(svd_model.components_):
         # Rank the terms by their weight in this component, largest first
         sorted_terms = sorted(zip(terms, comp), key=lambda pair: pair[1], reverse=True)[:n_terms]
         print()
         print("Topic "+str(i)+": ")
         for t in sorted_terms:
             print(t[0],end=' ')
             print(" ",end=' ')

In [None]:
# Print the LSA topics for the documents in cluster 1
get_me_topics(cluster_id=1)

<p style = "font-family:courier,arial,helvetica;font-size:300%;">
Plotting The Clusters</p>


In [None]:
# Project every document into a 4-component LSA space (TF-IDF followed by truncated SVD)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['text'])
svd_model = TruncatedSVD(
    n_components=4,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
)
X_topics = svd_model.fit_transform(X)

# Reduce the topic space to two dimensions with UMAP so it can be drawn
embedding = umap.UMAP(
    n_neighbors=150,
    min_dist=0.5,
    random_state=12,
).fit_transform(X_topics)

# One dot per document, coloured by its KMeans cluster id
plt.figure(figsize=(7, 5))
plt.scatter(
    x=embedding[:, 0],
    y=embedding[:, 1],
    c=data['Kmeans clusters'],
    s=10,  # size
    edgecolor='none',
)
plt.show()

<p style = "font-family:courier,arial,helvetica;font-size:200%;">As you can see above, the result is quite beautiful. Each dot represents a document and the colours represent the clusters. Our LSA model seems to have done a good job.</p>


<p style = "font-family:courier,arial,helvetica;font-size:300%;">
 More Detailed Plot For Topics</p>


In [None]:
def make_topic_plot(cluster=1,topics=1,n=20) :
    """Show the top terms of each LSA topic of one cluster as an animated scatter plot.

    The 'text' of the rows in ``cluster`` is vectorized with TF-IDF and
    decomposed with truncated SVD into ``topics`` components. For every
    component the ``n`` highest-weighted terms are placed at random positions,
    sized and coloured by their weight. The 'topic' column drives the animation
    frames, so each frame shows one topic.

    Parameters
    ----------
    cluster : int, default 1
        Value of 'Kmeans clusters' selecting the documents to analyse.
    topics : int, default 1
        Number of SVD components to extract.
    n : int, default 20
        Maximum number of terms shown per topic; fewer are shown when the
        vocabulary is smaller than ``n``.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data[data['Kmeans clusters']==cluster]['text'])
    svd_model = TruncatedSVD(n_components=topics, algorithm='randomized', n_iter=100, random_state=122)
    svd_model.fit(X)
    comp = svd_model.components_

    # get_feature_names() was removed from recent scikit-learn releases;
    # use the replacement when it exists and fall back for older installations
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # One small frame per topic, combined once at the end
    # (DataFrame.append no longer exists in current pandas)
    topic_frames = []
    for i in range(topics):
        top_terms = sorted(zip(terms, comp[i]), key=lambda pair: pair[1], reverse=True)[:n]
        # Use the real number of terms so every column has the same length
        n_shown = len(top_terms)
        topic_frames.append(pd.DataFrame({
            # Random positions: the layout carries no meaning, it only spreads the labels out
            'x': np.random.randint(10, 200, n_shown),
            'y': np.random.randint(10, 200, n_shown),
            'text': [str(term) for term, _ in top_terms],
            'score': np.array([weight for _, weight in top_terms], dtype='float32'),
            'topic': i + 1,
        }))

    if topic_frames:
        plot_frame = pd.concat(topic_frames, ignore_index=True)
    else:
        plot_frame = pd.DataFrame(columns=['x','y','text','score','topic'])

    titl='Topic Plot For Cluster'+str(cluster)
    fig=px.scatter(plot_frame,x='x',y='y',text='text',size='score',size_max=40,color='score',
               color_continuous_scale='sunset',labels={'x':'','y':''},title=titl,animation_frame='topic')
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False)
    fig.show()


In [None]:
# Draw the animated topic plot for cluster 1 with four topics (default of 20 terms per topic)
make_topic_plot(cluster=1,topics=4)

<p style = "font-family:courier,arial,helvetica;font-size:300%;">
 How Topics Change Over Time For Different Clusters</p>


In [None]:
# Important Topics in a cluster changing by time
def make_race_plot(cluster=1,top_n=4):
    """Animate how the top terms of one cluster change as documents accumulate over time.

    The documents of ``cluster`` are ordered by 'timestamp'. For every timestamp
    from the fifth one onwards, all documents up to and including that timestamp
    are vectorized with TF-IDF and reduced to a single SVD component; the
    ``top_n`` highest-weighted terms of that component are recorded together
    with the timestamp. The records are handed to raceplotly's ``barplot`` to
    build the animated bar chart.

    Parameters
    ----------
    cluster : int, default 1
        Value of 'Kmeans clusters' selecting the documents to analyse.
    top_n : int, default 4
        Maximum number of terms recorded per timestamp.
    """
    # Filter and sort the cluster once instead of on every iteration
    cluster_rows = data[data['Kmeans clusters']==cluster].sort_values('timestamp')

    snapshots = []
    # The first four timestamps are skipped, so the earliest snapshot already
    # contains at least five documents
    for i in cluster_rows['timestamp'].values[4:]:
        texts_so_far = cluster_rows[cluster_rows['timestamp']<=i]['text']

        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(texts_so_far)
        svd_model = TruncatedSVD(n_components=1, algorithm='randomized', n_iter=100, random_state=122)
        svd_model.fit(X)
        comp = svd_model.components_[0]

        # get_feature_names() was removed from recent scikit-learn releases;
        # use the replacement when it exists and fall back for older installations
        if hasattr(vectorizer, 'get_feature_names_out'):
            terms = vectorizer.get_feature_names_out()
        else:
            terms = vectorizer.get_feature_names()

        top_terms = sorted(zip(terms, comp), key=lambda pair: pair[1], reverse=True)[:top_n]
        snapshots.append(pd.DataFrame({
            'text': [str(term) for term, _ in top_terms],
            'score': np.array([weight for _, weight in top_terms], dtype='float32'),
            # Sized by the real number of terms so the columns always line up
            'time': [i] * len(top_terms),
        }))

    # Combine all snapshots in one step (DataFrame.append no longer exists in
    # current pandas, and appending inside the loop re-copies the frame each time)
    if snapshots:
        data_time = pd.concat(snapshots)
    else:
        data_time = pd.DataFrame(columns=['text','score','time'])

    my_raceplot = barplot(data_time,
                          item_column='text',
                          value_column='score',
                          time_column='time')

    fig=my_raceplot.plot(title = 'Change in most common word for a cluster over time',
                     item_label = 'Text',
                     value_label = 'Score',
                     time_label='Creation Time :',
                     frame_duration = 1600)
    fig.show()

In [None]:
# Animate how the top-scoring terms of cluster 2 change over time
make_race_plot(cluster=2)

<p style = "font-family:courier,arial,helvetica;font-size:300%;">
 Relation between clusters and number of comments</p>


In [None]:
# Cluster vs number of comments
# Mean of every numeric column per cluster. numeric_only=True keeps the text
# columns out of the aggregation; current pandas raises on them otherwise.
grouped=data.groupby('Kmeans clusters').mean(numeric_only=True)
# NOTE: the x axis and the colour legend show the cluster id shifted by one (index + 1)
px.bar(grouped,x=grouped.index+1,y='comms_num',labels={'x':'Cluster'},color=(grouped.index+1).astype(str))

<p style = "font-family:courier,arial,helvetica;font-size:200%;">We can clearly see that posts in cluster 1 receive the highest mean number of comments, which suggests how controversial that cluster may be, unlike cluster 3, which has the lowest mean number of comments.</p>



<p style = "font-family:courier,arial,helvetica;font-size:300%;">
 Relation between clusters and score</p>


In [None]:
# Cluster vs Score
# Mean of every numeric column per cluster. numeric_only=True keeps the text
# columns out of the aggregation; current pandas raises on them otherwise.
grouped=data.groupby('Kmeans clusters').mean(numeric_only=True)
# NOTE: the x axis and the colour legend show the cluster id shifted by one (index + 1)
px.bar(grouped,x=grouped.index+1,y='score',labels={'x':'Cluster'},color=(grouped.index+1).astype(str))

<p style = "font-family:courier,arial,helvetica;font-size:200%;">There does not seem to be much relation between the clusters and the score.</p>




<p style = "font-family:courier,arial,helvetica;font-size:300%;">
 Hoping you liked it :) <br> Would really like to know your feedback :) <br>
Will try to explore this data more :) </p>


![](https://www.icegif.com/wp-content/uploads/thank-you-icegif-10.gif)