In [1]:
import os
import numpy as np
import pandas as pd
import gensim.models
from gensim import utils
from graph import *
import networkx as nx
import random

In [73]:
class Corpus:
    
    def __init__(self, pt, testProp, p, q):
        self.path = pt
        print('building train/test sets')
        self.build(pt, testProp)
        self.p = p
        self.q = q
        print('building graph')
        self.generateCorpus()
        
        pass
    
    def build(self, pt, testProp):
        #note: temp1 is the training/test set; temp2 is the graph
        df = pd.read_csv(pt).astype(str) 
        temp1 = df[df['type'] == 'ground_truth']
        temp2 = df[df['type'] != 'ground_truth']
        temp1 = temp1.drop(columns = 'type')
        temp1['label'] = 1
        
        temp1 = self.generateNegativeSamples(temp1, temp2)
        self.trainTestSplit(temp1, testProp)
        self.buildGraph(temp2)
        
    
    def generateNegativeSamples(self,temp1, temp2):
        allItems = set(temp2.source)
        negSamples = []
        for i, row in temp1.iterrows():
            source = row['source']
            target = row['target']
            negSamples.append((source, random.sample(allItems.difference(set([target])), 1)[0]))
        t = pd.DataFrame(negSamples, columns = ['source','target'])
        t['label'] = 0
        return pd.concat([t, temp1])
    
    def trainTestSplit(self, df, testProp):
        df = df.sample(len(df))
        cutoff = int(len(df) * testProp)
        test = df.iloc[0:643]
        train = df.iloc[643:]
        self.train = train
        self.test = test
    
    def buildGraph(self, df):
        G=nx.Graph()
        
        temp = set([str(x) for x in df['source']])
        temp.update(set([str(x) for x in df['target']]))
        edges = tuple(zip([str(x) for x in list(df['source'])], [str(x) for x in list(df['target'])]))
        
        G.add_nodes_from(temp)
        G.add_edges_from(edges, weight = 1)
        self.G = G
    
    def generateCorpus(self):
        self.corpus = Graph(self.G, is_directed= False, p = self.p, q = self.q)
        self.corpus.preprocess_transition_probs()

In [74]:
class Model:
    
    def __init__(self, corpus, embedder = None, model = None):
        self.corpus = corpus
        self.embedder = embedder
        self.model = model
    
    def trainW2V(self, numWalks, walkLength, embedLength):
        sentences = test.corpus.simulate_walks(numWalks,walkLength)
        if (self.embedder == None):
            self.embedder = Word2Vec(sentences, size=embedLength, workers=4)
        else:
            self.embedder.build_vocab(more_sentences, update=True)
            self.embedder.train(sentences, total_examples=self.embedder.corpus_count, epochs=self.embedder.iter)
    
    def saveW2V(self, fname):
        self.embedder.save(fname)
    
    def loadW2V(self, fname):
        self.embedder = gensim.models.Word2Vec.load(fname)
    
    def trainModel(self):
        trainData = corpus.train
        pass
    
    def saveModel(self, fname):
        pass
    
    def loadModel(self, fname):
        pass
    
    def evaluate(self, setType = 'train'):
        pass
    
    

In [75]:
#note: place model into a seperate class with calls on Corpus

In [76]:
gensim.models.doc2vec.FAST_VERSION > -1


True

In [80]:
mdl = gensim.models.Word2Vec(sentences = t, workers=-1, size=10, sg = 1, hs = 0, negative = 10)

  """Entry point for launching an IPython kernel.


In [29]:
temp

array([[-0.0582807 , -1.2342066 ],
       [-0.3527027 , -0.75731725]], dtype=float32)

In [78]:
test = Corpus("../data/gen/abt_buy_graph.csv", 0.3, 1, 2)

building train/test sets
building graph


In [79]:
t = test.corpus.simulate_walks(1000,10)

Walk iteration:
1 / 1000
2 / 1000
3 / 1000
4 / 1000
5 / 1000
6 / 1000
7 / 1000
8 / 1000
9 / 1000
10 / 1000
11 / 1000
12 / 1000
13 / 1000
14 / 1000
15 / 1000
16 / 1000
17 / 1000
18 / 1000
19 / 1000
20 / 1000
21 / 1000
22 / 1000
23 / 1000
24 / 1000
25 / 1000
26 / 1000
27 / 1000
28 / 1000
29 / 1000
30 / 1000
31 / 1000
32 / 1000
33 / 1000
34 / 1000
35 / 1000
36 / 1000
37 / 1000
38 / 1000
39 / 1000
40 / 1000
41 / 1000
42 / 1000
43 / 1000
44 / 1000
45 / 1000
46 / 1000
47 / 1000
48 / 1000
49 / 1000
50 / 1000
51 / 1000
52 / 1000
53 / 1000
54 / 1000
55 / 1000
56 / 1000
57 / 1000
58 / 1000
59 / 1000
60 / 1000
61 / 1000
62 / 1000
63 / 1000
64 / 1000
65 / 1000
66 / 1000
67 / 1000
68 / 1000
69 / 1000
70 / 1000
71 / 1000
72 / 1000
73 / 1000
74 / 1000
75 / 1000
76 / 1000
77 / 1000
78 / 1000
79 / 1000
80 / 1000
81 / 1000
82 / 1000
83 / 1000
84 / 1000
85 / 1000
86 / 1000
87 / 1000
88 / 1000
89 / 1000
90 / 1000
91 / 1000
92 / 1000
93 / 1000
94 / 1000
95 / 1000
96 / 1000
97 / 1000
98 / 1000
99 / 1000
100

755 / 1000
756 / 1000
757 / 1000
758 / 1000
759 / 1000
760 / 1000
761 / 1000
762 / 1000
763 / 1000
764 / 1000
765 / 1000
766 / 1000
767 / 1000
768 / 1000
769 / 1000
770 / 1000
771 / 1000
772 / 1000
773 / 1000
774 / 1000
775 / 1000
776 / 1000
777 / 1000
778 / 1000
779 / 1000
780 / 1000
781 / 1000
782 / 1000
783 / 1000
784 / 1000
785 / 1000
786 / 1000
787 / 1000
788 / 1000
789 / 1000
790 / 1000
791 / 1000
792 / 1000
793 / 1000
794 / 1000
795 / 1000
796 / 1000
797 / 1000
798 / 1000
799 / 1000
800 / 1000
801 / 1000
802 / 1000
803 / 1000
804 / 1000
805 / 1000
806 / 1000
807 / 1000
808 / 1000
809 / 1000
810 / 1000
811 / 1000
812 / 1000
813 / 1000
814 / 1000
815 / 1000
816 / 1000
817 / 1000
818 / 1000
819 / 1000
820 / 1000
821 / 1000
822 / 1000
823 / 1000
824 / 1000
825 / 1000
826 / 1000
827 / 1000
828 / 1000
829 / 1000
830 / 1000
831 / 1000
832 / 1000
833 / 1000
834 / 1000
835 / 1000
836 / 1000
837 / 1000
838 / 1000
839 / 1000
840 / 1000
841 / 1000
842 / 1000
843 / 1000
844 / 1000
845 / 1000

In [81]:
df = test.train.iloc[10:20]

In [95]:
embedding1 = mdl[[str(x) for x in df.source.values]]
temp = []
for i in [str(x) for x in df.target.values]:
    try:
        temp.append(mdl[i])
    except:
        temp.append(np.zeros((10,1)))


  """Entry point for launching an IPython kernel.
  """


In [97]:
#embedding1
temp

[array([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]]),
 array([ 0.04878885,  0.01511762,  0.01013294, -0.01682081, -0.02100872,
         0.02279851, -0.0423637 ,  0.00705993, -0.00212572,  0.00717573],
       dtype=float32),
 array([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]]),
 array([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]]),
 array([ 0.02803069, -0.03814266,  0.04944248,  0.03941452, -0.00517569,
         0.02770262,  0.0487096 , -0.03482559, -0.01133828, -0.01789218],
       dtype=float32),
 array([ 0.00349108, -0.01271948, -0.02727021,  0.04878216,  0.02787178,
        -0.00787482, -0.04069341,  0.02984492,  0.04499304, -0.04695385],
       dtype=float32),
 array([-0.00919762,  0.0304446 , -0.0255971 ,  0.0246009 , -0.0

In [84]:
df[df['target'] == '205769439']

Unnamed: 0,source,target,label
1013,35903.0,205769439,1


In [86]:
df2 = pd.read_csv("../data/gen/abt_buy_graph.csv")

In [91]:
df[df2['target'] == 'source']

  """Entry point for launching an IPython kernel.


Unnamed: 0,source,target,label


In [43]:
[str(x) for x in df.source.values]

['18524.0',
 '21875.0',
 '23325.0',
 '33569.0',
 '34009.0',
 '32228.0',
 '30159.0',
 '27920.0',
 '34815.0',
 '34666.0',
 '13580.0',
 '38798.0',
 '36859.0',
 '35519.0',
 '6493.0',
 '32405.0',
 '32112.0',
 '31635.0',
 '35591.0',
 '34156.0',
 '33601.0',
 '38697.0',
 '36808.0',
 '20448.0',
 '37934.0',
 '34824.0',
 '33160.0',
 '38726.0',
 '33458.0',
 '32512.0',
 '16877.0',
 '22727.0',
 '37462.0',
 '33581.0',
 '37825.0',
 '36458.0',
 '27778.0',
 '16329.0',
 '35983.0',
 '35475.0',
 '37091.0',
 '37856.0',
 '34662.0',
 '33971.0',
 '25990.0',
 '37301.0',
 '37141.0',
 '36990.0',
 '35983.0',
 '17191.0',
 '14563.0',
 '19112.0',
 '6742.0',
 '38726.0',
 '26137.0',
 '36722.0',
 '37055.0',
 '34976.0',
 '34881.0',
 '18837.0',
 '32340.0',
 '35987.0',
 '37650.0',
 '27729.0',
 '33801.0',
 '34392.0',
 '33464.0',
 '36906.0',
 '26525.0',
 '38806.0',
 '28981.0',
 '9071.0',
 '34174.0',
 '35332.0',
 '38096.0',
 '34659.0',
 '32027.0',
 '29493.0',
 '35012.0',
 '36620.0',
 '37859.0',
 '38640.0',
 '31887.0',
 '38957

In [None]:
test.corpus.simulate_walks(10, 10)

In [None]:
test.test.label.mean()

In [None]:
test.train.label.mean()

In [None]:
df = pd.read_csv("../data/gen/abt_buy_graph.csv")

In [None]:
temp1 = df[df['type'] == 'ground_truth']
temp2 = df[df['type'] != 'ground_truth']

In [None]:
temp1 = temp1.drop(columns = 'type')

In [None]:
graph = nx.fast_gnp_random_graph(n=100, p=0.5)

In [None]:
graph[0][4]

In [None]:
Graph()

In [None]:
allItems = set(temp2.source)

In [None]:
G=nx.Graph()

In [None]:
temp = set(df.source)
len(temp)

In [None]:
temp.update(set(df.target))

In [None]:
temp = [str(x) for x in list(temp)]

In [None]:
len(temp)

In [None]:
G.add_nodes_from(temp)

In [None]:
edges = tuple(zip([str(x) for x in list(df['source'])], [str(x) for x in list(df['target'])]))

In [None]:
G.add_edges_from(edges)

In [None]:
list(G.neighbors('nan'))

In [None]:
#G.nodes

In [None]:
#temp

In [None]:
df = pd.read_csv("../data/gen/abt_buy_graph.csv")

In [None]:
df[['target','type']]

In [None]:
utils.simple_preprocess('cat dog')

In [None]:

sentences = ['dog, cat']
model = gensim.models.Word2Vec(sentences=utils.simple_preprocess('cat dog'))