In [62]:
#turn off logging
import logging, sys
logging.disable(sys.maxsize)

import numpy as np
import pandas as pd
import hdbscan
import operator
import operator
import pylab
import json
import spacy
import pickle

from sklearn.preprocessing import MinMaxScaler
from itertools import combinations
from nltk import Tree
from nltk.corpus import wordnet as wn
from pytrips.ontology import load
ont = load()
en_nlp = spacy.load('en')

In [45]:
#vector file 
with open("./data/countries.json", "r") as file:
    countryFile = file.read()
countries = json.loads(countryFile)
vectorDir = "/Users/aeshaanwahlang/Documents/QuantitativeSemantics/NounVectors/"
file = "nounVectorFull.csv"
vectors = pd.read_csv(vectorDir+file)
data = pd.DataFrame(vectors)
dataz = vectors.copy()

In [454]:
data.shape

(10093, 249)

In [450]:
# Remove useless columns and normalization (min-max)
def clean(df):
    data = df.copy()
    data = data.drop(columns=['Noun', 'ROOT;', 'punct;'])
    scaler = MinMaxScaler() 
    scaled_values = scaler.fit_transform(data) 
    data.loc[:,:] = scaled_values
    return data

In [78]:
# for c in vectors.columns:
#     print("\""+c+"\"")

In [373]:
# Clustering parameters
metric = 'braycurtis'
min = 3
# initialize clusterer
clusterer = hdbscan.HDBSCAN(min_cluster_size = min, metric = metric, prediction_data=True,
                           cluster_selection_method = "leaf", alpha=0.8)

In [833]:
# class definition
class Cluster:
    @staticmethod
    def getAgreement(tType, data):
        c = 0
        found = False
        for word in data:
            for t in word.tType['lex']:
                if t == tType:
                    c += 1
                    found = True
                    break
            if not found:
                for t in word.tType['wn']:
                    if t == tType:
                        c += 1
                        break
            else: found = False
                
        return c/len(data)
    
    def __init__(self, tType, content):
        self.__words = []
        self.tType = ont.get_trips_type(tType)
        for w in content:
            self.__words.append(Word(w))
        self.agreement = Cluster.getAgreement(self.tType, self.__words)
        self.size = len(self.__words)
        
    def __str__(self):
        print("-----------------")
        print("TRIPS type: " + str(self.tType))
        print("Agreement: " + str(self.acceptance))
        print("Size: " + str(self.size))
        print("")
        for w in self.__words: print(w)
        return "-----------------"

    def to_dict(self):
        dump = {}
        dump['trips'] = str(self.tType)
        dump['agreement'] = self.agreement
        dump['size'] = self.size
        dump['words'] = []
        for w in self.__words: dump['words'].append(str(w))
        
        return dump
    
class Word:
    tType = None
    string = None
    
    def __init__(self, string):
        self.string = string
        self.tType = tripsType(string)
        
    def __str__(self):
        return self.string
        

def organizeClusters(df):
    clustering = []
    group = list(df['Noun'].groupby(df["TRIPS_Node"]))
    for g in group:
        clustering.append(Cluster(g[0], g[1]))
    return clustering


In [631]:
# Clutering results
# find tripsType with PyTrips
def tripsType(word):
    if not isinstance(word, str): return ("N/A", [])
    ontType = ont[("q::"+word, "n")]
    country = countryCheck(word)
    if country: ontType["lex"].append(ont.get_trips_type("ont::geographic-region"))
    return ontType

# count frequencies to dic
def countToDic(dic, val):
    if val in dic: dic[val] += 1
    else: dic[val] = 1

# check if word is a country
def countryCheck(word):
    for c in countries:
        if c['name']['common'].lower() == word: return "ont::geographic-region"
        if c['name']['official'].lower() == word: return "ont::geographic-region"
        if c['region'].lower() == word: return "ont::geographic-region"
        alt = list(map(str.lower,c['altSpellings']))
        if word in alt: return "ont::geographic-region"
         
# find frequencies 
def findFreq(freqDict, nodes):
    l = 0
    for node in nodes:
        if isinstance(node, list):
            for n in node:
                countToDic(freqDict, n)
                l +=1
        else:
            countToDic(freqDict, node)
            l +=1
    for k in freqDict: freqDict[k] = freqDict[k]/l
    return

# Assign TRIPS node to cluster
def clusterTtype(cluster):
    freq = {}
    
    def addToDFreq(tType):
        resnicPath = [(2**-i, j) for i, j in enumerate(tType.path_to_root())]
        for node in resnicPath:
            sNode = str(node[1])
            if sNode in freq: freq[sNode] += node[0]
            else: freq[sNode] = node[0]
    
    for word in cluster:
        ontTypes = tripsType(word)
        for tType in ontTypes["lex"]: addToDFreq(tType)
        for tType in ontTypes["wn"]: addToDFreq(tType)
    
    if not freq: return "N/A"
    val = max(freq.items(), key=operator.itemgetter(1))[0]
#     print(freq)
    return val


In [619]:
# cluster details
def showCluster_details(clusters):
    res = clusters.labels_
    unique, counts = np.unique(res, return_counts = True)
    uni = dict(zip(unique, counts))
    # print(uni)
    del uni[-1] #remove noise labels
    max = 0
    total = 0
    for key in uni:
        val = uni[key]
        if(val > max):
            max = val
        total += val
    sorted_uni = sorted(uni.items(), key=operator.itemgetter(1))
    print("Clusters found: " + str(res.max()+1))
    noise = np.count_nonzero(res == -1)
    noise2 = round(noise/len(res), 3)*100
    print("Noise: " + str(noise) + " (" + str(noise2) + "%)")
    print("Avarage cluster size: " + str(round(total/len(uni), 3)) + "\nLargest cluster: " + str(max))
#     print("10 largest clusters:")
#     for i in range(len(sorted_uni)):
#         print(sorted_uni[len(sorted_uni)-(1+i)])
#         if(i > 10):
#             break
            
        

# format output string for cluster results
def outStr(row, showTypes):
    out = row[0] + " " * (20-len(row[0]))
    out = out + str(row[1]) + " " * (5-len(str(row[1])))
    out += str(row[2]) + " " *(20 - len(str(row[2])))
    if showTypes: out += row[3]
    return out

def showClusters(dataz, showTypes=False):
    if showTypes: nf = dataz.loc[dataz['cluster'] >= 0].sort_values(['cluster'])[['Noun','cluster', 'TRIPS_Node', 'TRIPS_type']]
    else: nf = dataz.loc[dataz['cluster'] >= 0].sort_values(['cluster'])[['Noun','cluster', 'TRIPS_Node']]
    print("Noun \t\t Cluster \t TRIPS Node \t\t TRIPS Type")
    print("---------------------------------------------------------------------------------------------------------")
    c = 0
    for row in nf.iterrows():
        if row[1][1] != c:
            print("-----------------------")
            c = row[1][1]            
        print(outStr(row[1], showTypes))
        

In [825]:
# assign clusters and TRIPS type to each cluster
def assignClusters(df, res):
    df['TRIPS_type'] = df['Noun'].apply(tripsType)
    for i in range(0, res.max()+1):
        cluster = df.loc[df["cluster"]==i, "Noun"].values
        df.loc[df.cluster ==  i, "TRIPS_Node"] = clusterTtype(cluster)

def clusterAnalysis(clustering):
    size = 0
    agg = 0
    agg05 = 0
    agg03 = 0
    agg07 = 0
    l = len(clustering)
    for c in clustering:
        size += c.size
        agg += c.agreement
        if c.agreement >= 0.5: agg05 += 1
        if c.agreement >= 0.3: agg03 += 1
        if c.agreement >= 0.7: agg07 += 1
    
    print("Number of TRIPS types: " + str(l))
    print("Avg size: " + str(size/l))
    print("Avg Agreement: " + str(agg/l))
    print("Number of Agreement >= 70%: " + str(agg07))
    print("Number of Agreement >= 50%: " + str(agg05))
    print("Number of Agreement >= 30%: " + str(agg03))
    

In [559]:
# find words that TRIPS dosent assign
def findUnassigned(words):
    unassigned = []
    for word in words:
        tType = tripsType(word, returnDict=True)
        if not isinstance(tType, dict):
            continue
        if tType["lex"] == [] and tType["wn"] == []:
            if countryCheck(word) == None: unassigned.append(word)
    return unassigned

# returns how many words in a cluster actually belong to a TRIPS type
def getClusterConsistency(cluster, tType):
    found = 0
    for word in cluster:
        cType = tripsType(word, returnDict=True)
        if tType in cType["lex"] or tType in cType["wn"]: found += 1
    return round(found/len(cluster), 4)

# returns the % of words found in the word closure of the TRIPS type
def getWordClosureCount(cluster, closure):
    count = 0
    for c in cluster:
        if c in closure: count+=1
    return count/len(closure)

# return avg % of words in TRIPS type word closure for a group
def groupAnalysis(group, show = False, get = False):
    leaves = {}
    parents = 0
    
    for g in group:
        if g[0] == "None": continue
        if g[0] == "ont::geographic-region":
            parents += 1
            continue
            
        words = g[0].word_closure()
        tType = str(g[0])
        if words:
            leaves[tType] = {}
            leaves[tType]['cluster_consistency'] = getClusterConsistency(g[1], g[0])
            leaves[tType]['word_closure'] = getWordClosureCount(g[1], words)
            leaves[tType]['cluster_size'] = len(g[1])
        else: parents += 1
            
    if show:
        l = len(leaves)
        print("Number of Parent nodes: "+ str(parents))
        print("Number of Leaf nodes: "+ str(l))
        avgCC = 0
        avgWC = 0
        avgSize = 0
        for key,val in leaves.items():
            avgCC += val['cluster_consistency']
            avgWC += val['word_closure']
            avgSize += val['cluster_size']
        print("Avg Cluster Consistency: "+str(round(avgCC/l,4)))
        print("Avg Word Closure:" + str(round(avgWC/l,4)))
        print("Avg Cluster Size: " + str(round(avgSize/l, 4)))
        
    if get:
        return (leaves, parents)

    
# find how many clusters have trips_nodes 
def showClusterTRIPSanalysis(data):
    size = 0
    for c in data:
        group = list(c[1].groupby(c[1]))
        if c[0] == "None":
            print("Number of clusters with no TRIPS Node:" + str(len(group)))
        else: size += len(group)
    print("Total merged cluster:" + str(size))
    print("Avg number of cluster merged: "+ str(round(size/len(data), 2)))  

In [814]:
# function to run clustering and return cluster obj
def makeClusters(data, clusterer, threshold):
    size = data.shape[0]
    #backup may not need if all goes well
    dataz = data.copy()
    combined = None
    count = 1
    initial = True
    predictionData = []
    
    #initial clustering 
    cleanedData = clean(dataz)
    clusters = clusterer.fit(cleanedData)
    dataz['cluster'] = clusterer.labels_
    assignClusters(dataz, clusterer.labels_)
    print("Clusterings done: " + str(count) + " size:" + str(size))

    while(size > threshold):
        noise = dataz[dataz["cluster"] < 0]
        cleanNoise = noise.drop(columns=['Noun', 'cluster', 'TRIPS_type', 'TRIPS_Node'])
        noiseClusters = clusterer.fit(cleanNoise)
        noise['cluster'] = clusterer.labels_
        count += 1
        print("Clusterings done: " + str(count) + " size:" + str(cleanNoise.shape[0]))
        
        assignClusters(noise, clusterer.labels_)
        g2 = noise[noise['cluster'] >= 0]
        
        if initial:
            g1 = dataz[dataz['cluster'] >= 0]
            combined = g1.append(g2)
            initial = False
        else:
            combined = combined.append(g2)
        
        dataz = noise
        size = dataz.shape[0]
        print("combined: " + str(combined.shape[0]))
        
        
    print("Completed")
    print("leftovers: " + str(size))
    final = organizeClusters(combined)
    return (final, combined)
     
    

In [44]:
res = makeClusters(data, clusterer, 2000)

In [826]:
# group = list(res[1]['Noun'].groupby(res[1]["TRIPS_Node"]))
clusterAnalysis(res[0])

Number of TRIPS types: 336
Avg size: 26.988095238095237
Avg Agreement: 0.4031633691030206
Number of Agreement >= 70%: 20
Number of Agreement >= 50%: 100
Number of Agreement >= 30%: 261


In [888]:
# Run clustring 
cleanedData = clean(data)
clusters = clusterer.fit(cleanedData)
# predic_data = clusters.prediction_data_
showCluster_details(clusters)
# assignClusters(dataz, clusters.labels_)

  return self.partial_fit(X, y)


Clusters found: 555
Noise: 7621 (75.5%)
Avarage cluster size: 4.454
Largest cluster: 25


In [889]:
dataz['cluster'] = clusterer.labels_
dataClustered = dataz.loc[dataz['cluster'] >= 0]
assignClusters(dataClustered, clusters.labels_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [895]:
#Test prediction
iv = data.loc[data['Noun'] == "bat"]
iv = iv.drop(columns=['Noun', 'ROOT;', 'punct;'])
test = hdbscan.approximate_predict(clusterer, iv)
# print(iv)

In [14]:
import sys
print(sys.path)

['', '/anaconda3/lib/python36.zip', '/anaconda3/lib/python3.6', '/anaconda3/lib/python3.6/lib-dynload', '/anaconda3/lib/python3.6/site-packages', '/anaconda3/lib/python3.6/site-packages/aeosa', '/anaconda3/lib/python3.6/site-packages/IPython/extensions', '/Users/aeshaanwahlang/.ipython']


In [896]:
print(test)
print(dataClustered.loc[dataClustered['cluster'] == 82, 'TRIPS_Node'])

(array([-1]), array([0.]))
3664    ont::day-name
5757    ont::day-name
7838    ont::day-name
8841    ont::day-name
9178    ont::day-name
9446    ont::day-name
9860    ont::day-name
Name: TRIPS_Node, dtype: object


In [81]:
# returns combination of relelvant dependencies of word in sentence
def depParse(sentence, word):
    doc = en_nlp(sentence)
    word = word.lower()
    deps =[]
    combi = []
    for token in doc:
        if token.text.lower() == word or token.head.text.lower() == word:
            deps.append(token.dep_+";")
    for i in range(0, len(deps)+1):
        for subset in combinations(deps, i):
            combi.append(subset)
    return combi

# returns instance vector
def getInstanceVector(word, deps, df):
    with open('vector_map_full.pkl', 'rb') as f:
        vmap = pickle.load(f)
    print(len(vmap))
    wordVec = df.loc[df["Noun"] == word.lower()]
    

Object saved


In [82]:
d = depParse("The quick brow fox jumped over the lazy dog", "fox")
getInstanceVector("test", d, vectors)

186


In [842]:
r1 = organizeClusters(res[1])

In [850]:
dump = {}
for clus in r1:
    dump[str(clus.tType)] = clus.to_dict()

In [852]:
s = json.dumps(dump)
with open('/Users/aeshaanwahlang/Desktop/ClustersDump.json', "w") as f:
    f.write(s)

In [245]:
noise.shape

(7621, 252)

In [827]:
#Recluster Noise
# noise = dataz.copy()
# noise = noise[noise["cluster"] < 0]
# noise2 = noise.drop(columns=['Noun', 'cluster', 'TRIPS_type', 'TRIPS_Node'])
# noise_clusters = clusterer.fit(noise2)
# cluster_details(noise_clusters)

In [448]:
# assignClusters(noise, noise_clusters.labels_)
# showClusters(noise, showTypes=False)

In [445]:
# assignedData1 = dataz[dataz["cluster"] >= 0]
# assignedData2 = noise[noise['cluster'] >= 0]
# combinedData = assignedData1.append(assignedData2)
# mergedClusters = Clusters(combinedData)
# groups = combinedData['Noun'].groupby(combinedData["TRIPS_Node"])
# clustesToTripsGroups = combinedData['cluster'].groupby(combinedData['TRIPS_Node'])
# groups = list(groups)
# g2 = list(clustesToTripsGroups)

In [679]:
# print(mergedClusters)

In [354]:
x = g2[0][1]
y = list(x.groupby(x))
print(y[6])

(276, 1063    276
7015    276
7602    276
Name: cluster, dtype: int64)


In [368]:
# merged cluster analysis
groupAnalysis(groups, show=True)
print("*****")
showClusterTRIPSanalysis(g2)

Number of Parent nodes: 20
Number of Leaf nodes: 160
Avg Cluster Consistency: 0.5118
Avg Word Closure:0.0371
Avg Cluster Size: 14.1625
*****
Number of clusters with no TRIPS Node:323
Total merged cluster:600
Avg number of cluster merged: 3.31


In [25]:
# #print Trees and poking around with clustrer
# plot = clusterer.condensed_tree_.plot(select_clusters=False, selection_palette=sns.color_palette())
# # plot = clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True)
# # plt.switch_backend('QT4Agg')
# # figManager = plt.get_current_fig_manager()
# # figManager.window.showMaximized()
# plt.savefig("singleLinlkageTree.png", dpi = 500)
# plt.show()
# # fig = plot.get_fi 

# # clusterer.minimum_spanning_tree_.plot(edge_cmap='viridis',
# #                                       edge_alpha=0.2,
# #                                       node_size=8,
# #                                       edge_linewidth=1)

# # G = clusterer.condensed_tree_.to_networkx()
# # num = clusterer.condensed_tree_.to_numpy()
# # predicData = clusterer.prediction_data_.cluster_map
# # cTree = clusterer.prediction_data_.cluster_tree
# # print(predicData)
# # print("------------")
# # print(cTree)

# # for k in predic_data.keys():
# #     n = data.loc[]
# #     print(n)
# #     break

# # indeg = G.in_degree()
# # # print(G.edges)
# # nx.draw_networkx(G)
# # mng = plt.get_current_fig_manager()
# # mng.frame.Maximize(True)
# # plt.show()

In [174]:
# #Save clustring results 
# #We should maybe standardize this output also
# # dataz = dataz.loc[:, dataz.columns.isin(['Noun', 'cluster'])]
# test = True
# if(test):
#     saveName =  'TestResults/HDBSCAN_' + met + '_' + str(min) +'('+file+')'+ '.csv'
# else:
#     saveName = 'ClusteringResults/HDBSCAN_' + met + '_' + str(min) +'('+file+')'+ '.csv'
# dataz.to_csv(saveName, sep=',', encoding='utf-8', index=False)
# # print(dataz)

In [26]:
# #Soft Clustring
# vec = hdbscan.all_points_membership_vectors(clusterer)
# vec.shape

In [278]:
#save Soft Clustring results as Membership Probability

# cols = [str(x) for x in range(vec.shape[1])]
# mem_prob = pd.DataFrame(0, index = np.arange(vec.shape[0]), columns=cols)
# mem_prob['Noun'] = dataz['Noun']
# count = 0
# for row in vec:
#     for col in cols:
#         mem_prob.loc[count,col] = row[int(col)]
#     count +=1
# mem_prob.to_csv('Membership_Probability.csv', index = False)

In [314]:
# mem_prob
# print(predic_data.cluster_map)
# outliers = clusterer.outlier_scores_


In [27]:
# def save_obj(obj, name ):
#     with open('./'+ name + '.pkl', 'wb') as f:
#         pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
#     print("Object saved")
# save_obj(vdict, "vector_map_full")

# vmap = ["compound;","punct;","dep;","case;","appos;","nsubj;","compound;punct;","compound;dep;","nmod:of;","det:the;","amod;","conj:and;","dobj;","nmod:other;","nmod:poss;","case;nmod:of;","det:the;nmod:of;","case;det:the;","amod;nmod:of;","conj:and;nmod:of;","dobj;nmod:of;","case;nmod:other;","case;compound;","det:the;nsubj;","nmod:for;","ROOT;","det:a/an;","dep;punct;","ROOT;punct;","ROOT;dep;","ROOT;dep;punct;","case;nmod:poss;","nmod:in;","nmod:on;","nmod:other;nmod:poss;","amod;case;","case;nmod:in;","case;nmod:other;nmod:poss;","cc;","cop;","amod;nsubj;","det:a/an;nsubj;","det:a/an;punct;","nsubj;punct;","cop;det:a/an;","cop;punct;","cop;nsubj;","ROOT;nsubj;","ROOT;cop;","det:a/an;nsubj;punct;","cop;det:a/an;punct;","cop;det:a/an;nsubj;","cop;nsubj;punct;","ROOT;nsubj;punct;","ROOT;cop;nsubj;","ROOT;cop;punct;","ROOT;cop;nsubj;punct;","acl:to;","dobj;nmod:poss;","det:the;dobj;","amod;dobj;","amod;nmod:poss;","nmod:of;nsubj;","nmod:of;nmod:other;","det:the;nmod:other;","case;nmod:for;","case;det:the;nmod:of;","det:the;nmod:of;nmod:other;","det:the;dobj;nmod:of;","amod;det:a/an;","case;det:a/an;","det:a/an;dobj;","amod;case;nmod:of;","nmod:in;nmod:of;","det:the;nmod:in;","case;nmod:in;nmod:of;","case;det:the;nmod:in;","det:the;nmod:of;nsubj;","case;conj:and;","case;cc;","cc;conj:and;","case;cc;conj:and;","appos;punct;","case;det:the;nmod:other;","det:a/an;nmod:of;","amod;det:the;","amod;nmod:other;","det:a/an;nmod:other;","case;nmod:of;nmod:other;","case;det:a/an;nmod:other;","compound;det:the;","compound;nmod:of;","compound;nmod:other;","case;compound;det:the;","case;compound;nmod:of;","case;compound;nmod:other;","case;nmod:on;","det:the;nmod:on;","case;det:the;nmod:on;","nmod:with;","amod;nmod:with;","case;nmod:with;","det:a/an;nmod:with;","amod;case;det:a/an;","amod;case;nmod:with;","nmod:to;","compound;det:a/an;","amod;conj:and;","case;nmod:to;","mwe;","nmod:poss;nsubj;","dep;nsubj;","compound;nmod:poss;","nummod;","case;nummod;","dobj;nummod;","nmod:other;nummod;","det:a/an;nmod:for;","compound;nsubj;","compound;dobj;","compound;conj:and;","acl:relcl;","ref;","det:the;punct;","amod;punct;","acl:relcl;ref;","amod;case;det:the;","conj:and;dobj;","amod;compound;","det:a/an;nmod:to;","det:the;nmod:for;","case;det:the;nmod:of;nmod:other;","conj:and;det:the;","appos;compound;","amod;nmod:in;","amod;case;nmod:in;","det:a/an;nmod:in;","case;det:a/an;nmod:in;","amod;det:a/an;dobj;","ROOT;det:a/an;","ROOT;det:a/an;nsubj;","ROOT;det:a/an;punct;","ROOT;cop;det:a/an;","cop;det:a/an;nsubj;punct;","ROOT;det:a/an;nsubj;punct;","ROOT;cop;det:a/an;nsubj;","ROOT;cop;det:a/an;punct;","ROOT;cop;det:a/an;nsubj;punct;","compound;nmod:in;","case;compound;nmod:in;","advmod;","cop;det:the;","amod;case;nmod:other;","nmod:of;nmod:poss;","case;punct;","nmod:of;punct;","cop;nmod:of;","det:a/an;nmod:of;nsubj;","amod;det:a/an;nmod:of;","cop;nmod:of;nsubj;","det:a/an;dobj;nmod:of;","appos;det:a/an;","ROOT;nmod:of;","ROOT;nmod:of;punct;","amod;det:the;nmod:of;","det:the;nmod:to;","amod;cop;","cop;det:the;nmod:of;","cop;det:the;nsubj;","nummod;punct;","det:the;nmod:poss;","case;det:the;nmod:poss;","case;det:a/an;nmod:of;","case;mwe;","case;det:a/an;nmod:with;","nmod:in;nmod:poss;","case;nmod:in;nmod:poss;","neg;","case;nsubj;","case;cop;","case;cop;nsubj;","case;det:the;nmod:to;","amod;cop;nsubj;","nmod:poss;punct;","dep;nummod;","nsubjpass;"]
# vdict = {}
# for val in vmap:
#     vdict[val] = True
