This notebook will explore the use of Word2Vec by training on: the sms corpus, and our consolidated ns reviews data. After that, the model will be fitted to our ns-reviews data to get word vectors for each word.
Clustering will be explored using KMeans and visualisations of the word vectors in a 2-Dimensional space.

### Word2Vec

In [1]:
import pandas as pd
import numpy as np
import json
import nltk
import regex
import collections, re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer 
import itertools
from sklearn.cluster import KMeans
from tqdm import tqdm
from gensim.models import Word2Vec 
#NLP libraries
from sklearn.feature_extraction.text import CountVectorizer
#for visualization
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE

In [2]:
import pandas as pd
import matplotlib as mpl 
import matplotlib.pyplot as plt 

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

In [3]:
#loading the sms corpus
with open("smsCorpus_en_2015.03.09_all.json") as f:
    data = json.load(f)

In [8]:
def preprocess(df, colname):
    tester = df[colname]
    #colname = str(colname)
    #convert to lowercase
    tester = tester.apply(lambda x: " ".join(str(x).lower() for x in str(x).split()))
    #remove punctuations
    tester = tester.str.replace('[^\w\s]','')
    #remove stopwords
    stop = stopwords.words('english')
    tester = tester.apply(lambda x: " ".join(x for x in str(x).split() if x not in stop))
    #remove common words
    freq = pd.Series(' '.join(tester).split()).value_counts()
    freq = list(freq.index)
    df[colname] = df[colname].apply(lambda x: " ".join(x for x in str(x).split() if x not in freq))
    #remove rare words
    rare = pd.Series(' '.join(df[colname]).split()).value_counts()[-10:]
    rare = list(rare.index)
    df[colname] = df[colname].apply(lambda x: " ".join(x for x in str(x).split() if x not in rare))
    return tester

In [9]:
#progress bar
def progress(count, total, status = ''):
    bar_len = 60
    filled_len = int(round(bar_len*count/float(total)))
    percents = round(100.0*count/float(total),1)
    bar = '='*filled_len + '-'*(bar_len-filled_len)
    sys.stdout.write('[%s] %s%s ...%s\r' % (bar,percents,'%',status))
    sys.stdout.flush()

In [10]:
#sms corpus preprocessing
listofDict = data['smsCorpus']['message']
fullData = pd.DataFrame(listofDict)
smsData = fullData[['@id','text']]

In [12]:
smsData

Unnamed: 0,@id,text
0,10120,{'$': 'Bugis oso near wat...'}
1,10121,"{'$': 'Go until jurong point, crazy.. Availabl..."
2,10122,{'$': 'I dunno until when... Lets go learn pil...
3,10123,{'$': 'Den only weekdays got special price... ...
4,10124,{'$': 'Meet after lunch la...'}
...,...,...
55830,45714,{'$': 'I LOVE YOU TOO'}
55831,45715,{'$': 'C-YA'}
55832,45716,{'$': ':-)'}
55833,45717,{'$': 'BE MY GUEST'}


In [13]:
tester = smsData['text']
#colname = str(colname)
#convert to lowercase
tester = tester.apply(lambda x: " ".join(str(x).lower() for x in str(x).split()))
#remove punctuations
tester = tester.str.replace('[^\w\s]','')
#remove stopwords
stop = stopwords.words('english')
tester = tester.apply(lambda x: " ".join(x for x in str(x).split() if x not in stop))
#remove common words
freq = pd.Series(' '.join(tester).split()).value_counts()
freq = list(freq.index)
tester = tester.apply(lambda x: " ".join(x for x in str(x).split() if x not in freq))
#remove rare words
rare = pd.Series(' '.join(dtester).split()).value_counts()[-10:]
rare = list(rare.index)
tester = dtester.apply(lambda x: " ".join(x for x in str(x).split() if x not in rare))


  tester = tester.str.replace('[^\w\s]','')


KeyboardInterrupt: 

In [11]:
sms_corpus = preprocess(smsData, 'text')
length = len(sms_corpus)
token_sms = [nltk.word_tokenize(t) for t in sms_corpus]
for index, t in enumerate(sms_corpus):
    text = nltk.word_tokenize(t)
    token_sms.append(text)
    progress(index, length, status = 'tokenising')

  tester = tester.str.replace('[^\w\s]','')


KeyboardInterrupt: 

In [None]:
ns_reviews = pd.read_csv('corpus-full-review.csv')
ns_reviews.dropna(inplace = True)

In [None]:
#ns reviews pre processing
ns_corpus = preprocess(ns_reviews, 'content')

In [None]:
#reviews = ns_corpus['content']
token_ns = [nltk.word_tokenize(r) for r in ns_corpus]

In [None]:
#word2vec model building and training
model_sms = Word2Vec(window = 3, min_count = 2,  negative = 10, # for negative sampling
                          alpha=0.03, min_alpha=0.0007,seed = 14)
model_sms.build_vocab(token_sms, progress_per=200)
model_sms.train(token_ns, total_examples=model_sms.corpus_count,epochs=model_sms.epochs)
#to end the model
model_sms.init_sims(replace=True)

In [None]:
print(model_sms)

In [None]:
#words in the sms model
model_sms.wv.vocab.keys()

In [None]:
#top 5 similar words to experience
model_sms.most_similar('experience')[:5]

In [None]:
#to 5 similar words to medical
model_sms.most_similar('medical')[:5]

In [None]:
#plot of word vectors
words = model_sms.wv.index2word
wvs = model_sms.wv[words]

tsne = TSNE(n_components=2, random_state=0, n_iter=5000, perplexity=2)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
labels = words

plt.figure(figsize=(15, 10))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
for label, x, y in zip(labels, T[:, 0], T[:, 1]):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')

In [None]:
#taking the word vectors from sms model
X = np.array([model_sms[word] for word in model_sms.wv.vocab.keys()])

In [None]:
#initialise KMeans to find optimal number of clusters, based on sms model
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1,10))

visualizer.fit(X)    # Fit the data to the visualizer
visualizer.poof() 

In [None]:
#from the plot, the optimal number of clusters is: 
#fit into KMeans
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)

In [None]:
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis')

centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5);

In [None]:
#building word2vec model with ns data (same parameters as sms model)
model_ns = Word2Vec(window = 3, min_count = 2,  negative = 10, # for negative sampling
                          alpha=0.03, min_alpha=0.0007,seed = 14)
model_ns.build_vocab(tok_reviews, progress_per=200)
model_ns.train(tok_reviews, total_examples=model_ns.corpus_count,epochs=model_ns.epochs)
#to end the model
model_ns.init_sims(replace=True)

In [None]:
print(model_ns)

In [None]:
model_ns.wv.vocab.keys()

In [None]:
model_ns.most_similar('experience')[:5]

In [None]:
model_ns.most_similar('medical')[:5]

In [None]:
#plot of word vectors
words = model_ns.wv.index2word
wvs = model_ns.wv[words]

tsne = TSNE(n_components=2, random_state=0, n_iter=5000, perplexity=2)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
labels = words

plt.figure(figsize=(15, 10))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
for label, x, y in zip(labels, T[:, 0], T[:, 1]):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')

In [None]:
#taking the word vectors from ns model
X = np.array([model_ns[word] for word in model_ns.wv.vocab.keys()])

In [None]:
#initialise KMeans to find optimal number of clusters, based on ns model
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1,10))

visualizer.fit(X)    # Fit the data to the visualizer
visualizer.poof() 

In [None]:
#from the plot, the optimal number of clusters is: 
#fit into KMeans
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)

In [None]:
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis')

centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5);

In [None]:
def cosine_distance (model, word,target_list , num) :
    cosine_dict ={}
    word_list = []
    a = model[word]
    for item in target_list :
        if item != word :
            b = model [item]
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            cosine_dict[item] = cos_sim
    dist_sort=sorted(cosine_dict.items(), key=lambda dist: dist[1],reverse = True) ## in Descedning order 
    for item in dist_sort:
        word_list.append((item[0], item[1]))
    return word_list[0:num]

In [None]:
cosine_distance (model_sms,'checkup',features,5)

In [None]:
cosine_distance (model_ns,'checkup',features,5)

### Occurrence Pairing

In [None]:
tokenizer = dict()
ns_reviews['Content'] = ns_reviews['Content'].apply(
    lambda Content: [tokenizer.setdefault(named_entitie, len(tokenizer))
                            for named_entitie in Content.split(' ')])
ns_reviews.head()

In [None]:
pairs_df = ns_reviews['Content'].apply(lambda Content: list(itertools.combinations(Content, 2)))
pairs_df = pairs_df[pairs_df.apply(len) > 0]
pairs_df = pd.DataFrame(np.concatenate(pairs_df.values), columns=['named_entity_1', 'named_entity_2'])
pairs_df.head()

In [None]:
NAMED_ENTITIES_CO_OCCURENCE_THRESHOLD = 1

edges_df = pairs_df.groupby(['named_entity_1', 'named_entity_2']).size().reset_index(name='weight')
edges_df = edges_df[edges_df['weight'] > NAMED_ENTITIES_CO_OCCURENCE_THRESHOLD]
edges_df[['named_entity_1', 'named_entity_2', 'weight']].to_csv('edges.csv', header=False, index=False, sep=' ')
edges_df

In [None]:
nodes = pd.DataFrame(list(tokenizer.items()), columns = ['word','number'])
nodes

In [None]:
edges_df.to_csv('edges.csv')
nodes.to_csv('nodes.csv')

### with reddit data

In [None]:
reddit = pd.read_csv('nationalservicesg_combineddata.csv')

In [None]:
reddit['content']

In [None]:
reddit = preprocess(reddit, 'content')
tok_reddit = [nltk.word_tokenize(r) for r in reddit]

In [None]:
reddit = preprocess(reddit, 'content')
tok_reddit = [nltk.word_tokenize(r) for r in reddit]
redditmodel = Word2Vec(window = 3, min_count = 2,  negative = 10, # for negative sampling
                          alpha=0.03, min_alpha=0.0007,seed = 14)
redditmodel.build_vocab(tok_reddit, progress_per=200)
redditmodel.train(tok_reddit, total_examples=redditmodel.corpus_count,epochs=redditmodel.epochs)

In [None]:
redditmodel.init_sims(replace=True)
print(redditmodel)

In [None]:
words = redditmodel.wv.index2word
wvs = redditmodel.wv[words]

tsne = TSNE(n_components=2, random_state=0, n_iter=5000, perplexity=2)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
labels = words

plt.figure(figsize=(25, 20))
plt.scatter(T[:, 0], T[:, 1], c='orange', edgecolors='r')
for label, x, y in zip(labels, T[:, 0], T[:, 1]):
    plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')

In [None]:
redditmodel.most_similar('experience')[:5]

In [None]:
redditmodel.most_similar('medical')[:5]