In [57]:
from gensim.models import word2vec
from gensim.models import KeyedVectors
# from glove import Corpus, Glove
import pandas as pd
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet as wn
import nltk
# from sklearn.svm import SVC
# import numpy as np

from copy import deepcopy
# from collections import Counter, defaultdict
# from tabulate import tabulate
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import BernoulliNB, MultinomialNB

# from sklearn.cross_validation import cross_val_score
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import ExtraTreesClassifier

In [2]:
def create_glove(data, LEARNING_RATE=0.05, EPOCHS=5, NO_THREADS=4, EMBEDDING_DIM=100):
    model = None
    corpus = Corpus()
    corpus.fit(data, window=10)
    model = Glove(no_components=EMBEDDING_DIM,learning_rate=LEARNING_RATE)
    model.fit(corpus.matrix, epochs=EPOCHS,no_threads=NO_THREADS,verbose=True)
    model.add_dictionary(corpus.dictionary)
    return model

def create_word2vec(data,EMBEDDING_DIM=100):
    model = word2vec.Word2Vec(data, size=EMBEDDING_DIM)
    return model

def preprocess_text(posts):
    text = str(posts['post_title'])+'. '+ str(posts['post_text'])
    text =  re.sub('tl[;]?dr','',text,flags=re.IGNORECASE)
    text = re.sub('[ \(\[]+[0-9]+[s]?[ /\(,)]*f[ \]\)]+',' ',text,flags=re.IGNORECASE)
    text = re.sub('[ \(\[]+[0-9]+[s]?[ /\(,)]*m[ \]\)]+',' ',text,flags=re.IGNORECASE)
    text = re.sub('[ \(\[]+f[ /\(,)]*[0-9]+[s]?[ \]\)]+',' ',text,flags=re.IGNORECASE)
    text = re.sub('[ \(\[]+m[ /\(,)]*[0-9]+[s]?[ \]\)]+',' ',text,flags=re.IGNORECASE)
    text = re.sub('[0-9]+','NUM',text,flags=re.IGNORECASE)
    text = re.sub('u/[^\s]+','AT_USER',text,flags=re.IGNORECASE)
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',text,flags=re.IGNORECASE)  #Convert www.* or https?://* to <url>
    text = text.split("[.]?\n[\* \[\(/]*[eE]dit")[0]
    text = text.split("[.]?\n[\* \[\(/]*EDIT")[0]
    text = text.split("[.]?\n[\* \[\(/]*big edit")[0]
    text = text.split("[.]?\n[\* \[\(/]*important edit")[0]
    text = text.split("[.]?\n[\* \[\(/]*[uU]pdate")[0]
    text = text.split("[.]?\n[\* \[\(/]*UPDATE")[0]
    text = text.split("[.]?\n[\* \[\(/]*big update")[0]
    text = text.split("[.]?\n[\* \[\(/]*important update")[0]
    text = text.split("[.]?\nfor an update")[0]
    text = text.replace('\r', '')
    return text

#calculate two things,
#specificness how good is the cluster and the elements similar to each other/ how well can other elements be described by their neighbors
#uniqueness is how differnet this cluster to others
def cluster_score(clean_cluster,c_syn,k):
    unique_clusters = clean_cluster.loc[:,['cluster']]
    unique_clusters = unique_clusters.drop_duplicates()
    tot_spec = 0
    tot_uni = 0
    for num,c in unique_clusters.iterrows():
        union = clean_cluster[clean_cluster['cluster']==c['cluster']].merge(c_syn[c_syn['cluster']==c['cluster']],how='inner',left_on=['word'],right_on=['syn'])
        specificness = len(union)/len(clean_cluster[clean_cluster['cluster']==c['cluster']])
        union = clean_cluster[clean_cluster['cluster']!=c['cluster']].merge(c_syn[c_syn['cluster']==c['cluster']],how='inner',left_on=['word'],right_on=['syn'])
        uniqueness = 1 - (len(union)/(len(clean_cluster)-len(clean_cluster[clean_cluster['cluster']==c['cluster']])))
        tot_spec = tot_spec + specificness
        tot_uni = tot_uni + uniqueness
    tot_spec = tot_spec/len(unique_clusters)
    tot_uni = tot_uni/len(unique_clusters)
    return {'spec':tot_spec,'uni':tot_uni,'k':k}

    
def generate_syn_info(cluster):
    cluster_syn = pd.DataFrame()
    unique_clusters = cluster.loc[:,['cluster']]
    unique_clusters = unique_clusters.drop_duplicates()
    for cnum,c in unique_clusters.iterrows():
#         print('starting cluster...',c[0])
        cur_cluster = cluster[cluster['cluster']==c['cluster']]
        syns = []
        for wnum,word in cur_cluster.iterrows():
            for s in wn.synsets(word['word']):
                syn = s.name().split('.')[0]
                if syn.find('_')<0:  #filter out composed words
                    syns.append(syn)
            #syns.append(word['word'])
        this_cluster = pd.DataFrame(syns,columns=['syn'])
        this_cluster['cluster'] = c[0]
        this_cluster = this_cluster.drop_duplicates()
        cluster_syn = pd.concat([cluster_syn,this_cluster])         
    return cluster_syn

In [3]:
#prepare sentences
c_train = pd.read_csv('data/c_train2.csv')
c_test = pd.read_csv('data/c_test2.csv')
c_data = pd.concat([c_train,c_test],sort=False)
c_data = c_data.loc[:,['post_created_utc', 'full_link', 'post_id', 'post_num_comments',
       'post_score', 'subreddit', 'post_title', 'post_text']]

nc_train = pd.read_csv('data/nc_train2.csv')
nc_test = pd.read_csv('data/nc_test2.csv')
nc_data = pd.concat([nc_train,nc_test],sort=False)
nc_data = nc_data.loc[:,['post_created_utc', 'full_link', 'post_id', 'post_num_comments',
       'post_score', 'subreddit', 'post_title', 'post_text']]

full_data = pd.concat([c_data,nc_data],sort=False)
full_data = full_data.sample(len(full_data))
posts = full_data.apply(preprocess_text,axis=1)
data_sentences = []
for post in posts:
    sent_tokenize_list = sent_tokenize(post)
    data = [nltk.word_tokenize(sentence) for sentence in sent_tokenize_list]
    data_sentences = data_sentences + data 
len(data_sentences)

359557

In [4]:
print('start training...')
# gloveModel = create_glove(data_sentences)
w2vModel = create_word2vec(data_sentences)
#is ok to train the model with the full dataset as we are not providing labels.
w2v = {w: vec for w, vec in zip(w2vModel.wv.index2word, w2vModel.wv.syn0)}
# glove = {w: vec for w, vec in zip(gloveModel.dictionary, gloveModel.word_vectors)}

start training...




In [5]:
#clustering w2v
print (w2vModel.similarity('this', 'is'))
# w2vModel.wv.index2word
# print (w2vModel.most_similar(positive=['hello'], negative=[], topn=10))

# w2v

  from ipykernel import kernelapp as app


-0.22433679958269076


In [324]:
# from nltk.cluster import KMeansClusterer
# import nltk
# X = w2vModel[w2vModel.wv.index2word]
# NUM_CLUSTERS=10
# kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
# assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
# # print (assigned_clusters)

In [325]:
# words = pd.DataFrame(list(w2vModel.wv.index2word),columns=['word'])
# clusters = pd.DataFrame(list(assigned_clusters),columns=['cluster'])
# features = pd.DataFrame(w2vModel.wv.syn0)

# result = words.merge(clusters,left_index=True,right_index=True)
# result = result.merge(features,left_index=True,right_index=True)
# result.to_csv('nltk_clusters10.csv',index=False,encoding='utf-8')

In [12]:
# from sklearn import cluster
# from sklearn import metrics


# X = w2vModel[w2vModel.wv.index2word]
# NUM_CLUSTERS = 150
# kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
# kmeans.fit(X)
# labels = kmeans.labels_
# words = pd.DataFrame(list(w2vModel.wv.index2word),columns=['word'])
# clusters = pd.DataFrame(list(labels),columns=['cluster'])
# features = pd.DataFrame(w2vModel.wv.vectors)
# result = words.merge(clusters,left_index=True,right_index=True)
# result = result.merge(features,left_index=True,right_index=True)
    
# clean_cluster = result
# clean_cluster = clean_cluster[clean_cluster['word']==clean_cluster['word']]

# for num,c in result.iterrows():
#     try:
#         w1 = wn.synsets(c['word'])
#         if len(w1)==0:
#             clean_cluster = clean_cluster[clean_cluster['word']!=c['word']]
#     except:
#         clean_cluster = clean_cluster[clean_cluster['word']!=c['word']]
#     clean_cluster['word'] = clean_cluster.apply(lambda row: row['word'].lower(),axis=1)
#     clean_cluster = clean_cluster.loc[:,['word','cluster']]
#     clean_cluster = clean_cluster.drop_duplicates()
    
#     c_syn = generate_syn_info(clean_cluster)
#     score = cluster_score(clean_cluster,c_syn,NUM_CLUSTERS)
#     final_vals = final_vals.append(score,ignore_index=True)
# #     final_vals.to_csv('cluster_eval.csv',encoding='utf-8',index=False)




In [67]:
# final_vals
wn.synsets('sorry')

[Synset('regretful.a.01'),
 Synset('deplorable.s.01'),
 Synset('good-for-nothing.s.01'),
 Synset('blue.s.08')]

In [85]:
result[result['word']=='ran']

result[(result['cluster']==79) & (~result['word'].isin(['example','excuse','opportunity','option','incident']))]

Unnamed: 0,word,cluster,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
108,got,79,0.053381,-1.936365,0.561727,-1.378119,0.132917,5.273778,-0.025414,0.23115,...,-2.206438,-2.622548,0.020733,-0.913719,0.762811,-2.355414,-0.921598,-2.747892,0.625829,1.551328
160,went,79,0.042044,-1.70798,1.142836,-0.004915,0.386709,3.767817,-1.365376,3.451374,...,-0.63577,-0.976915,3.296836,-0.683126,1.684834,1.090186,-0.055105,-0.434413,0.496267,0.416987
257,left,79,-1.456957,-1.699758,1.478573,-1.753642,-0.371985,2.490468,-0.260113,1.041301,...,-1.268819,1.306178,0.626147,-1.218425,0.605352,0.32012,0.584418,-0.003564,-0.069379,-0.068601
267,found,79,2.265918,-0.968295,-1.647641,0.484722,-1.52275,4.67881,0.581851,-1.220346,...,-0.228626,-0.747722,1.109239,-1.819456,0.687684,0.321745,-2.132119,-1.349787,-1.3791,0.301172
277,came,79,-0.463607,-2.136055,1.442872,0.372457,0.354493,4.310445,0.089965,1.507177,...,-0.663894,-0.929912,1.81884,-0.628678,0.912052,-0.169193,-0.065466,-2.231793,-0.408411,1.179361
354,took,79,1.114481,-0.710591,-0.304834,-1.296521,0.667023,2.175258,-1.702568,1.774679,...,0.351849,-0.873941,2.864876,0.223393,1.311705,0.827495,0.23313,-2.768774,0.957672,-1.084769
423,comes,79,-1.403478,-1.069754,0.811411,1.503335,0.00912,2.599219,-0.858298,-0.852993,...,-0.244975,0.126007,-0.506222,-0.422775,0.281179,0.217244,1.608713,0.26321,0.448479,2.090702
435,goes,79,0.390273,-0.666014,0.277956,1.113418,0.499086,2.934326,-3.656014,-0.174089,...,0.013967,1.840238,1.573067,-1.18924,1.262,0.248559,1.868664,1.695083,0.504797,1.674304
553,turned,79,1.706041,0.922434,2.38613,1.205237,1.970052,2.095382,-1.556233,-1.698806,...,-1.63546,0.741163,0.593997,0.541251,2.336679,1.149196,-0.686093,0.338912,0.658016,0.825516
610,worked,79,1.471462,-2.06994,0.900406,1.020264,-0.695869,2.575898,-0.334781,0.034287,...,-1.307653,0.215437,0.973429,-1.771111,1.192339,1.518461,0.604623,-0.800338,1.954622,1.228762


In [93]:
len(result)/1400

12.609285714285715

In [104]:
import numpy as np
cand_vocab = np.array(w2vModel.wv.index2word)
synset_vocab = [wn.synsets(word) for word in cand_vocab]

In [187]:
synset_vocab[:2]

[[Synset('iodine.n.01'),
  Synset('one.n.01'),
  Synset('i.n.03'),
  Synset('one.s.01')],
 []]

In [185]:
last_cluster_id = 0
syn_cluster = [None] * len(synset_vocab)
syn_set_list = []
for i, syns_x in enumerate(synset_vocab):
    if syns_x == []: continue
    syns_x = set(syns_x)
    overlapped_dict = {}
    for j, syn_set in enumerate(syn_set_list):
        intsec = syns_x.intersection(syn_set)
        if intsec:
            overlapped_dict[j] = len(intsec)
    if len(overlapped_dict) == 0:
        syn_set_list.append(syns_x)
        syn_cluster[i] = len(syn_set_list)-1
        continue
    sorted_intsec = sorted(overlapped_dict.items(), key=lambda x: x[1], reverse=True)
    top_set_id = sorted_intsec[0][0]
    syn_set_list[top_set_id].update(syns_x) 
    syn_cluster[i] = top_set_id

In [200]:
#now that we have the synonym clusters, we put the names back
df_syn_olp_cluster = pd.DataFrame(syn_cluster, index=cand_vocab)
df_n = pd.DataFrame({'cluster': df_syn_olp_cluster[0], 'word': df_syn_olp_cluster.index})
df_n = df_n[df_n.cluster >= 0]

In [216]:
from sklearn.metrics import homogeneity_score, completeness_score
import pandas as pd

def eval_syn_cluster(df_word2cluster, df_syn_cluster):
    val_vocab = list(df_syn_cluster[df_syn_cluster.cluster >= 0].index)
    y_pred = list(df_word2cluster[df_word2cluster.word.isin(val_vocab)].cluster)
    y_true = list(df_syn_cluster[df_syn_cluster.cluster >= 0].cluster.apply(int))
    return {'homogeneity_score':homogeneity_score(labels_pred=y_pred, labels_true=y_true),
            'completeness_score':completeness_score(labels_pred=y_pred, labels_true=y_true)}

eval_syn_cluster(result,df_n)

{'completeness_score': 0.4268017890729324,
 'homogeneity_score': 0.16331547703777072}

In [236]:
from sklearn import cluster
from sklearn import metrics


cand_vocab = np.array(w2vModel.wv.index2word)
synset_vocab = [wn.synsets(word) for word in cand_vocab]
last_cluster_id = 0
syn_cluster = [None] * len(synset_vocab)
syn_set_list = []
for i, syns_x in enumerate(synset_vocab):
    if syns_x == []: continue
    syns_x = set(syns_x)
    overlapped_dict = {}
    for j, syn_set in enumerate(syn_set_list):
        intsec = syns_x.intersection(syn_set)
        if intsec:
            overlapped_dict[j] = len(intsec)
    if len(overlapped_dict) == 0:
        syn_set_list.append(syns_x)
        syn_cluster[i] = len(syn_set_list)-1
        continue
    sorted_intsec = sorted(overlapped_dict.items(), key=lambda x: x[1], reverse=True)
    top_set_id = sorted_intsec[0][0]
    syn_set_list[top_set_id].update(syns_x) 
    syn_cluster[i] = top_set_id
df_n = pd.DataFrame({'cluster': df_syn_olp_cluster[0], 'word': df_syn_olp_cluster.index})
df_n = df_n[df_n.cluster >= 0]
    
    
final_vals = pd.read_csv('cluster_eval_sysnet.csv')
init_val = int(final_vals.clusters.max() - 1)
kfold = 2000
for k in range(init_val,kfold):
    X = w2vModel[w2vModel.wv.index2word]
    NUM_CLUSTERS = k+2
    kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
    kmeans.fit(X)
    labels = kmeans.labels_
    words = pd.DataFrame(list(w2vModel.wv.index2word),columns=['word'])
    clusters = pd.DataFrame(list(labels),columns=['cluster'])
    features = pd.DataFrame(w2vModel.wv.vectors)
    result = words.merge(clusters,left_index=True,right_index=True)
    result = result.merge(features,left_index=True,right_index=True)

    score = eval_syn_cluster(result,df_n)
    score['clusters'] = NUM_CLUSTERS
    final_vals = final_vals.append(score,ignore_index=True)
    final_vals.to_csv('cluster_eval_sysnet.csv',encoding='utf-8',index=False)




KeyboardInterrupt: 

4
