In [1]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
import re

In [2]:
# Load the headline spreadsheet into a DataFrame; the path is resolved relative
# to the notebook's working directory.
data = pd.read_excel('corona_news_adj.xlsx')
# Display the loaded frame (the rendered output shows a `title` column).
data

Unnamed: 0,title
0,British coronavirus evacuee who was 'taken ill' on last flight out of Wuhan
1,CDC files for emergency approval of its coronavirus test
2,Can ANY face mask really protect you from the coronavirus?
3,Experts: You won't catch coronavirus from packages from China
4,Seventh American diagnosed with coronavirus as US declares public health emergency
...,...
161649,Why have deaths stayed steady while infections are spiking?
161650,Trump says three novel coronavirus vaccine candidates looking really good
161651,Coronavirus: Trump touts response as COVID-19 daily cases surge
161652,"Woman, 21, 'attacked five airline ticketing agents, coughed on cops while claiming to have COVID-19'"


In [3]:
def preprocess(text):
    """Normalise a headline to lowercase ASCII letters separated by single spaces.

    Every run of characters other than ``a-z``/``A-Z`` (digits, punctuation,
    whitespace, non-ASCII letters) is replaced by exactly one space, so
    ``"COVID-19"`` becomes ``"covid"``.

    Parameters
    ----------
    text : object
        Raw headline. Non-string values are converted with ``str()``.
        ``None`` and float NaN (a missing cell) are treated as empty text so
        they do not turn into the tokens ``"none"`` / ``"nan"``.

    Returns
    -------
    str
        The cleaned, lowercased text without leading or trailing spaces.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # One pass over letters only. The previous two-step version kept 1-9 but
    # not 0 and then stripped digits, which left runs of spaces in the result.
    return re.sub('[^a-zA-Z]+', ' ', str(text)).lower().strip()

# Clean every headline: the 'title' column is overwritten with the output of
# preprocess(), so the raw text is no longer available in `data` afterwards.
data['title'] = data.title.map(preprocess)
data

Unnamed: 0,title
0,british coronavirus evacuee who was taken ill on last flight out of wuhan
1,cdc files for emergency approval of its coronavirus test
2,can any face mask really protect you from the coronavirus
3,experts you won t catch coronavirus from packages from china
4,seventh american diagnosed with coronavirus as us declares public health emergency
...,...
161649,why have deaths stayed steady while infections are spiking
161650,trump says three novel coronavirus vaccine candidates looking really good
161651,coronavirus trump touts response as covid daily cases surge
161652,woman attacked five airline ticketing agents coughed on cops while claiming to have covid


In [4]:
# English stop-word list from NLTK, held in a set for fast membership tests.
stop = set(stopwords.words("english"))

def remove_stopwords(text):
    """Lowercase ``text`` and drop every token that appears in ``stop``.

    Tokens are obtained by splitting on whitespace and the survivors are
    re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept)

# Strip stop words from every cleaned headline; the 'title' column is
# overwritten again with the filtered text.
data['title'] = data.title.map(remove_stopwords)
data

Unnamed: 0,title
0,british coronavirus evacuee taken ill last flight wuhan
1,cdc files emergency approval coronavirus test
2,face mask really protect coronavirus
3,experts catch coronavirus packages china
4,seventh american diagnosed coronavirus us declares public health emergency
...,...
161649,deaths stayed steady infections spiking
161650,trump says three novel coronavirus vaccine candidates looking really good
161651,coronavirus trump touts response covid daily cases surge
161652,woman attacked five airline ticketing agents coughed cops claiming covid


In [5]:
def build_corpus(data):
    """Tokenise each headline into a list of words.

    Parameters
    ----------
    data : pandas.Series of str (or any iterable of str)
        The cleaned headlines.

    Returns
    -------
    list of list of str
        One token list per input element, in input order. A blank headline
        yields an empty list.
    """
    # Iterate over the values directly: Series.iteritems() was removed in
    # pandas 2.0. split() without an argument never produces '' tokens, so a
    # blank headline no longer adds an empty-string word to the vocabulary.
    return [sentence.split() for sentence in data]

# One token list per headline, in row order; this is what Word2Vec is trained on.
corpus = build_corpus(data['title'])

[['british',
  'coronavirus',
  'evacuee',
  'taken',
  'ill',
  'last',
  'flight',
  'wuhan'],
 ['cdc', 'files', 'emergency', 'approval', 'coronavirus', 'test'],
 ['face', 'mask', 'really', 'protect', 'coronavirus'],
 ['experts', 'catch', 'coronavirus', 'packages', 'china'],
 ['seventh',
  'american',
  'diagnosed',
  'coronavirus',
  'us',
  'declares',
  'public',
  'health',
  'emergency'],
 ['rep',
  'paul',
  'gosar',
  'coronavirus',
  'save',
  'lives',
  'must',
  'restrict',
  'travel',
  'affected',
  'areas',
  'us'],
 ['cambodian',
  'pm',
  'says',
  'visit',
  'students',
  'china',
  'wuhan',
  'moral',
  'support'],
 ['burdened',
  'sanctions',
  'north',
  'korea',
  'sees',
  'coronavirus',
  'threaten',
  'economic',
  'lifelines'],
 ['two', 'coronavirus', 'cases', 'confirmed', 'uk'],
 ['coronavirus', 'worldwide', 'cases', 'surpass', 'sars', 'outbreak'],
 ['coronavirus', 'catching', 'diseases', 'animals'],
 ['coronavirus',
  'cement',
  'mixers',
  'become',
  'cel

In [6]:
# Train 100-dimensional word embeddings on the tokenised headlines.
# min_count=1 keeps every word, including those that occur only once.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) — confirm the pinned gensim version before re-running.
model = Word2Vec(corpus, size=100, min_count=1)

collecting all words and their counts
PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
PROGRESS: at sentence #10000, processed 85390 words, keeping 8357 word types
PROGRESS: at sentence #20000, processed 168698 words, keeping 12104 word types
PROGRESS: at sentence #30000, processed 250568 words, keeping 15077 word types
PROGRESS: at sentence #40000, processed 333508 words, keeping 17739 word types
PROGRESS: at sentence #50000, processed 417802 words, keeping 20054 word types
PROGRESS: at sentence #60000, processed 502255 words, keeping 22136 word types
PROGRESS: at sentence #70000, processed 587489 words, keeping 23953 word types
PROGRESS: at sentence #80000, processed 672141 words, keeping 25530 word types
PROGRESS: at sentence #90000, processed 756843 words, keeping 26976 word types
PROGRESS: at sentence #100000, processed 841520 words, keeping 28343 word types
PROGRESS: at sentence #110000, processed 926732 words, keeping 29602 word types
PROGRESS: at sentence #1200

In [7]:
# fit a 2D PCA model to the vectors
# Take the vocabulary once, so `words` and the rows of `vectors` are guaranteed
# to be in the same order.
words = list(model.wv.vocab)
# Index the KeyedVectors (model.wv) instead of the model itself: model[...] is
# deprecated and emits a DeprecationWarning.
vectors = model.wv[words]
# Fix random_state so the projection is repeatable when scikit-learn selects a
# randomized SVD solver for this matrix size.
pca = PCA(n_components=2, random_state=0)
PCA_result = pca.fit_transform(vectors)

  vectors = model[model.wv.vocab]


In [8]:
# prepare a dataframe
# One row per vocabulary word with its two PCA coordinates. The frame is built
# directly from the word list and the coordinate array: the inputs are not
# rebound to a different type, and no index merge with clashing column labels
# is needed.
PCA_data_complet = pd.DataFrame({
    'word': words,
    'x_values': PCA_result[:, 0],  # first principal component
    'y_values': PCA_result[:, 1],  # second principal component
})
PCA_data_complet.to_csv('PCA_data_complet.csv', index=False)

In [9]:
# Plain-text preview of the word/coordinate table (pandas truncates long frames).
print(PCA_data_complet)

              word  x_values  y_values
0          british  2.523623  1.836101
1      coronavirus  2.056741 -0.672322
2          evacuee  0.525597  0.406559
3            taken  1.963146  1.201577
4              ill  2.184186  2.426750
...            ...       ...       ...
35170    masterjee -0.517581 -0.025725
35171     netizens -0.514347 -0.050055
35172     untraced -0.598825 -0.076903
35173      furough -0.539792 -0.037048
35174  explosively -0.521675 -0.076559

[35175 rows x 3 columns]


In [10]:
# word frequencies calculation
from collections import Counter
# count how many times each word occurs
def counter_word(text_col):
    """Tally how many times each whitespace-separated token occurs.

    Parameters
    ----------
    text_col : object exposing ``.values`` (e.g. a pandas Series of str)
        The texts to scan.

    Returns
    -------
    collections.Counter
        Maps each token to its number of occurrences across all texts.
    """
    tokens = (token for entry in text_col.values for token in entry.split())
    return Counter(tokens)

# Word -> occurrence count over all cleaned headlines.
counter_all = counter_word(data.title)

# The 100 most frequent words as a two-column frame. Naming the columns when
# the frame is created avoids copying positional columns afterwards and also
# works when the counter is empty.
words_all = pd.DataFrame(counter_all.most_common(100), columns=['word', 'count'])
words_all.to_csv('top100_words.csv', index=False)

In [11]:
# restriction to 100 most frequent words
top100_words = pd.read_csv('top100_words.csv')
# Inner join on the shared 'word' column: only words present in the top-100
# list keep their PCA coordinates, and each row gains its 'count'.
PCA_data_top_100 = pd.merge(PCA_data_complet, top100_words, how='inner', on='word')
PCA_data_top_100.to_csv('PCA_data_top_100.csv', index=False)

In [None]:
# K-means cluster visualisation
# NOTE(review): 'K-means_clustering.xlsx' is not written by any cell visible in
# this notebook — presumably produced by an external clustering step; confirm.
K_means_data = pd.read_excel('K-means_clustering.xlsx')

sns.set_style('ticks')
# fit_reg=False turns lmplot into a plain scatter plot; points are coloured by
# the 'Cluster' column.
sns.lmplot(
    data=K_means_data,
    x='x_values',
    y='y_values',
    hue='Cluster',
    fit_reg=False,
    legend=True,
)
# Resize the figure lmplot just created before writing it to disk.
fig = plt.gcf()
fig.set_size_inches(8, 6)

plt.savefig('word2vec_clustering.png')

plt.show()

In [18]:
# Show the clustering table that feeds the scatter plot.
K_means_data

Unnamed: 0,x_values,y_values,count,Selected,word,Cluster,Silhouette,Selected (1),Selected (2)
0,2.114283,-0.601364,101954,No,coronavirus,C3,0.500000,No,No
1,2.701490,-0.471515,1627,No,wuhan,C8,0.493042,No,No
2,3.716232,-3.177167,1511,No,emergency,C6,0.592801,No,No
3,2.604488,0.994713,2819,No,test,C4,0.579329,No,No
4,3.222806,-1.000762,2132,No,face,C8,0.620955,No,No
...,...,...,...,...,...,...,...,...,...
95,3.978523,-4.784225,1962,No,rise,C5,0.639390,No,No
96,4.510755,2.439026,1504,No,boris,C7,0.618911,No,No
97,4.282936,-4.727693,1697,No,reopen,C5,0.664938,No,No
98,3.357510,-4.645325,1589,No,states,C6,0.619798,No,No
