In [375]:
from gensim.models import word2vec
from gensim.models import KeyedVectors
# from glove import Corpus, Glove
import pandas as pd
import re
from nltk.tokenize import sent_tokenize
import nltk
# from sklearn.svm import SVC
# import numpy as np

from copy import deepcopy
# from collections import Counter, defaultdict
# from tabulate import tabulate
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import BernoulliNB, MultinomialNB

# from sklearn.cross_validation import cross_val_score
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import ExtraTreesClassifier

In [376]:
def create_glove(data, LEARNING_RATE=0.05, EPOCHS=5, NO_THREADS=4, EMBEDDING_DIM=100):
    model = None
    corpus = Corpus()
    corpus.fit(data, window=10)
    model = Glove(no_components=EMBEDDING_DIM,learning_rate=LEARNING_RATE)
    model.fit(corpus.matrix, epochs=EPOCHS,no_threads=NO_THREADS,verbose=True)
    model.add_dictionary(corpus.dictionary)
    return model

def create_word2vec(data,EMBEDDING_DIM=100):
    model = word2vec.Word2Vec(data, size=EMBEDDING_DIM)
    return model

def preprocess_text(posts):
    text = str(posts['post_title'])+'. '+ str(posts['post_text'])
    text =  re.sub('tl[;]?dr','',text,flags=re.IGNORECASE)
    text = re.sub('[ \(\[]+[0-9]+[s]?[ /\(,)]*f[ \]\)]+',' ',text,flags=re.IGNORECASE)
    text = re.sub('[ \(\[]+[0-9]+[s]?[ /\(,)]*m[ \]\)]+',' ',text,flags=re.IGNORECASE)
    text = re.sub('[ \(\[]+f[ /\(,)]*[0-9]+[s]?[ \]\)]+',' ',text,flags=re.IGNORECASE)
    text = re.sub('[ \(\[]+m[ /\(,)]*[0-9]+[s]?[ \]\)]+',' ',text,flags=re.IGNORECASE)
    text = re.sub('[0-9]+','NUM',text,flags=re.IGNORECASE)
    text = re.sub('u/[^\s]+','AT_USER',text,flags=re.IGNORECASE)
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',text,flags=re.IGNORECASE)  #Convert www.* or https?://* to <url>
    text = text.split("[.]?\n[\* \[\(/]*[eE]dit")[0]
    text = text.split("[.]?\n[\* \[\(/]*EDIT")[0]
    text = text.split("[.]?\n[\* \[\(/]*big edit")[0]
    text = text.split("[.]?\n[\* \[\(/]*important edit")[0]
    text = text.split("[.]?\n[\* \[\(/]*[uU]pdate")[0]
    text = text.split("[.]?\n[\* \[\(/]*UPDATE")[0]
    text = text.split("[.]?\n[\* \[\(/]*big update")[0]
    text = text.split("[.]?\n[\* \[\(/]*important update")[0]
    text = text.split("[.]?\nfor an update")[0]
    text = text.replace('\r', '')
    return text

#calculate two things,
#specificness how good is the cluster and the elements similar to each other/ how well can other elements be described by their neighbors
#uniqueness is how differnet this cluster to others
def cluster_score(clean_cluster,c_syn,k):
    unique_clusters = clean_cluster.loc[:,['cluster']]
    unique_clusters = unique_clusters.drop_duplicates()
    tot_spec = 0
    tot_uni = 0
    for num,c in unique_clusters.iterrows():
        union = clean_cluster[clean_cluster['cluster']==c['cluster']].merge(c_syn[c_syn['cluster']==c['cluster']],how='inner',left_on=['word'],right_on=['syn'])
        specificness = len(union)/len(clean_cluster[clean_cluster['cluster']==c['cluster']])
        union = clean_cluster[clean_cluster['cluster']!=c['cluster']].merge(c_syn[c_syn['cluster']==c['cluster']],how='inner',left_on=['word'],right_on=['syn'])
        uniqueness = 1 - (len(union)/(len(clean_cluster)-len(clean_cluster[clean_cluster['cluster']==c['cluster']])))
        tot_spec = tot_spec + specificness
        tot_uni = tot_uni + uniqueness
    tot_spec = tot_spec/len(unique_clusters)
    tot_uni = tot_uni/len(unique_clusters)
    return {'spec':tot_spec,'uni':tot_uni,'k':k}

    
def generate_syn_info(cluster):
    cluster_syn = pd.DataFrame()
    unique_clusters = cluster.loc[:,['cluster']]
    unique_clusters = unique_clusters.drop_duplicates()
    for cnum,c in unique_clusters.iterrows():
#         print('starting cluster...',c[0])
        cur_cluster = cluster[cluster['cluster']==c['cluster']]
        syns = []
        for wnum,word in cur_cluster.iterrows():
            for s in wn.synsets(word['word']):
                syn = s.name().split('.')[0]
                if syn.find('_')<0:  #filter out composed words
                    syns.append(syn)
            #syns.append(word['word'])
        this_cluster = pd.DataFrame(syns,columns=['syn'])
        this_cluster['cluster'] = c[0]
        this_cluster = this_cluster.drop_duplicates()
        cluster_syn = pd.concat([cluster_syn,this_cluster])         
    return cluster_syn

In [377]:
#prepare sentences
c_train = pd.read_csv('data/c_train2.csv')
c_test = pd.read_csv('data/c_test2.csv')
c_data = pd.concat([c_train,c_test],sort=False)
c_data = c_data.loc[:,['post_created_utc', 'full_link', 'post_id', 'post_num_comments',
       'post_score', 'subreddit', 'post_title', 'post_text']]

nc_train = pd.read_csv('data/nc_train2.csv')
nc_test = pd.read_csv('data/nc_test2.csv')
nc_data = pd.concat([nc_train,nc_test],sort=False)
nc_data = nc_data.loc[:,['post_created_utc', 'full_link', 'post_id', 'post_num_comments',
       'post_score', 'subreddit', 'post_title', 'post_text']]

full_data = pd.concat([c_data,nc_data],sort=False)
full_data = full_data.sample(len(full_data))
posts = full_data.apply(preprocess_text,axis=1)
data_sentences = []
for post in posts:
    sent_tokenize_list = sent_tokenize(post)
    data = [nltk.word_tokenize(sentence) for sentence in sent_tokenize_list]
    data_sentences = data_sentences + data 
len(data_sentences)

359557

In [378]:
print('start training...')
# gloveModel = create_glove(data_sentences)
w2vModel = create_word2vec(data_sentences)
#is ok to train the model with the full dataset as we are not providing labels.
w2v = {w: vec for w, vec in zip(w2vModel.wv.index2word, w2vModel.wv.syn0)}
# glove = {w: vec for w, vec in zip(gloveModel.dictionary, gloveModel.word_vectors)}

start training...




In [379]:
#clustering w2v
print (w2vModel.similarity('this', 'is'))
# w2vModel.wv.index2word
# print (w2vModel.most_similar(positive=['hello'], negative=[], topn=10))

# w2v

-0.21456987528821345


  from ipykernel import kernelapp as app


In [324]:
# from nltk.cluster import KMeansClusterer
# import nltk
# X = w2vModel[w2vModel.wv.index2word]
# NUM_CLUSTERS=10
# kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
# assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
# # print (assigned_clusters)

In [325]:
# words = pd.DataFrame(list(w2vModel.wv.index2word),columns=['word'])
# clusters = pd.DataFrame(list(assigned_clusters),columns=['cluster'])
# features = pd.DataFrame(w2vModel.wv.syn0)

# result = words.merge(clusters,left_index=True,right_index=True)
# result = result.merge(features,left_index=True,right_index=True)
# result.to_csv('nltk_clusters10.csv',index=False,encoding='utf-8')

In [None]:
from sklearn import cluster
from sklearn import metrics

final_vals = pd.read_csv('cluster_eval.csv')
init_val = int(final_vals.k.max() - 1)
kfold = 2000
for i in range(init_val,kfold):
    print(i)
    X = w2vModel[w2vModel.wv.index2word]
    NUM_CLUSTERS=i+2
    kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
    kmeans.fit(X)
    labels = kmeans.labels_
    words = pd.DataFrame(list(w2vModel.wv.index2word),columns=['word'])
    clusters = pd.DataFrame(list(labels),columns=['cluster'])
    features = pd.DataFrame(w2vModel.wv.vectors)
    result = words.merge(clusters,left_index=True,right_index=True)
    result = result.merge(features,left_index=True,right_index=True)
    
    clean_cluster = result
    clean_cluster = clean_cluster[clean_cluster['word']==clean_cluster['word']]

    for num,c in result.iterrows():
        try:
            w1 = wn.synsets(c['word'])
            if len(w1)==0:
                clean_cluster = clean_cluster[clean_cluster['word']!=c['word']]
        except:
            clean_cluster = clean_cluster[clean_cluster['word']!=c['word']]
    clean_cluster['word'] = clean_cluster.apply(lambda row: row['word'].lower(),axis=1)
    clean_cluster = clean_cluster.loc[:,['word','cluster']]
    clean_cluster = clean_cluster.drop_duplicates()
    
    c_syn = generate_syn_info(clean_cluster)
    score = cluster_score(clean_cluster,c_syn,NUM_CLUSTERS)
    final_vals = final_vals.append(score,ignore_index=True)
    final_vals.to_csv('cluster_eval.csv',encoding='utf-8',index=False)


340




In [373]:
final_vals

Unnamed: 0,k,spec,uni
0,2.0,0.705829,0.581182
1,3.0,0.714255,0.730760
2,4.0,0.689490,0.799573
3,5.0,0.685451,0.830179
4,6.0,0.694096,0.858262
5,7.0,0.714711,0.875957
6,8.0,0.685145,0.890216
7,9.0,0.713638,0.903003
8,10.0,0.719913,0.913042
9,11.0,0.699059,0.919799


In [369]:
cc = clean_cluster[clean_cluster['cluster']==1]
cs = c_syn[c_syn['cluster']==1]

len(cc.merge(cs,how='inner',left_on='word',right_on='syn'))
(845/1052 + 7481/12359 )/2
# len(cc)

0.7042699059843707

In [336]:

cluster_score(clean_cluster,c_syn)



{'spec': 1.1523101492404357, 'uni': 1.0679645566819487}

In [198]:
good = wn.synsets('goodness')
good1 = good[0]
good1.name().split('.')[0]

'good'

In [245]:
wn.synsets('do')

[Synset('bash.n.02'),
 Synset('do.n.02'),
 Synset('doctor_of_osteopathy.n.01'),
 Synset('make.v.01'),
 Synset('perform.v.01'),
 Synset('do.v.03'),
 Synset('do.v.04'),
 Synset('cause.v.01'),
 Synset('practice.v.01'),
 Synset('suffice.v.01'),
 Synset('do.v.08'),
 Synset('act.v.02'),
 Synset('serve.v.09'),
 Synset('do.v.11'),
 Synset('dress.v.16'),
 Synset('do.v.13')]

In [None]:
# t = 'gola como star'
# t.split()
print(range(4:5))