In [2]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt 
import random 
import networkx as nx 
import itertools 
import pickle 
import sklearn 
from sklearn.feature_extraction.text import CountVectorizer

# process graph 

In [2]:
# Build adjlist.txt by prefixing line k of links.txt with the node id found on
# line k of idxs.txt, giving "<node id> <contents of the links line>" rows.
# The output is opened with 'w' (not 'a'): in append mode every re-run of this
# cell added another full copy of the adjacency list to the existing file.
# zip() stops at the shorter input, so a links line without a matching id is
# never written with an empty source column.
with open('data/Cora_enrich/links.txt') as links, \
     open('data/Cora_enrich/idxs.txt') as idxs, \
     open('data/Cora_enrich/adjlist.txt','w') as adj:
    for idx,line in zip(idxs,links):
        adj.write('%s %s\n'%(idx.strip(),line.strip()))

In [3]:
# Parse the adjacency list written above into a directed graph keyed by integer node ids.
G=nx.read_adjlist(
    'data/Cora_enrich/adjlist.txt',
    create_using=nx.DiGraph,
    nodetype=int,
)

In [6]:
# Node ids in file order. dtype=int replaces the `np.int` alias, which was
# deprecated in NumPy 1.20 and removed in 1.24 (it was always the builtin int).
idxs=np.loadtxt('data/Cora_enrich/idxs.txt',dtype=int)

In [9]:
# Re-label the graph so that every node is identified by the 0-based position
# of its id in idxs.txt instead of by the id itself.
idx_dict={node_id:position for position,node_id in enumerate(idxs)}
edges=[(idx_dict[s],idx_dict[r]) for s,r in G.edges]
G_idx=nx.DiGraph()
G_idx.add_edges_from(edges)
# add_edges_from only creates nodes that take part in an edge; add the
# remaining nodes of G afterwards so that nodes without links are not dropped.
# Adding them last keeps the node order of the edge-derived nodes unchanged.
G_idx.add_nodes_from(idx_dict[node_id] for node_id in G.nodes)

In [14]:
# Persist the position-indexed graph; the "generate subgraphs" section reads this file back.
nx.write_adjlist(G_idx,'data/Cora_enrich/idx_adjlist.txt') 

# process text

## utils

In [2]:
# Read one document per line and one label per line, stripping surrounding whitespace.
with open('data/Cora_enrich/texts.txt') as f:
    texts=[line.strip() for line in f]
with open('data/Cora_enrich/labels.txt') as f:
    labels=[line.strip() for line in f]

In [3]:
# Group document positions by label: class name -> list of positions, in file order.
class_ids={}
for position,label in enumerate(labels):
    class_ids.setdefault(label,[]).append(position)

In [4]:
def get_token_frequency(texts,class_name=None,class_ids=None,range=None,method='absolute'):
    '''
    Rank the tokens of a corpus (or of one class of it) by frequency.

    args:
    texts: list of strs, one per document
    class_name: if given, only the documents listed in class_ids[class_name] are counted
    class_ids: dict of (class name: list of document positions in texts); required with class_name
    range: (start, stop) slice of the descending ranking to return, default (0, 20).
        The name shadows the builtin but is kept because callers pass it by keyword.
    method: 'absolute' sums token occurrences, 'df' counts the documents containing the token

    return:
    list of (count, token) tuples, most frequent first; counts are plain Python ints

    raises:
    ValueError: if method is neither 'absolute' nor 'df', or class_name is given without class_ids
    '''
    # Validate up front: an unknown method previously fell through both branches
    # and failed later with an unbound-variable error.
    if method not in ('absolute','df'):
        raise ValueError("method must be 'absolute' or 'df', got %r"%(method,))
    if class_name is not None:
        if class_ids is None:
            raise ValueError('class_ids is required when class_name is given')
        docs=[texts[i] for i in class_ids[class_name]]
    else:
        docs=texts
    start,stop=(0,20) if range is None else range

    vectorizer=CountVectorizer(stop_words='english')
    # Keep the document-term matrix sparse; column sums do not need a dense copy.
    counts=vectorizer.fit_transform(docs)
    if method=='df':
        counts=counts>0
    # Sparse sums come back two-dimensional; flatten to one count per token.
    token_counts=np.asarray(counts.sum(axis=0)).ravel()

    sorted_idx=np.argsort(-token_counts)[start:stop]
    # Fetch the vocabulary once (it was rebuilt for every returned token before).
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on versions that predate get_feature_names_out().
    if hasattr(vectorizer,'get_feature_names_out'):
        feature_names=vectorizer.get_feature_names_out()
    else:
        feature_names=vectorizer.get_feature_names()
    return [(int(token_counts[i]),str(feature_names[i])) for i in sorted_idx]

In [54]:
# scikit-learn's built-in English stop-word collection; it is combined with the
# corpus-specific `overlap` tokens when the final vectorizer is built.
stop_words_default=sklearn.feature_extraction.text.ENGLISH_STOP_WORDS

## analysis

In [5]:
# Baseline bag-of-words: English stop words removed, no document-frequency pruning.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(texts)

In [6]:
# (number of documents, vocabulary size) of the baseline matrix.
X.shape 

(2708, 25793)

In [13]:
# Same as the baseline, but max_df=0.75 also drops tokens that occur in more than 75% of the documents.
# NOTE: X is rebound here and no longer refers to the baseline matrix.
vectorizer_max = CountVectorizer(stop_words='english',max_df=0.75)
X = vectorizer_max.fit_transform(texts)

In [14]:
# (number of documents, vocabulary size) after max_df pruning.
X.shape 

(2708, 25789)

In [15]:
# Tokens the fitted vectorizer discarded because of the max_df threshold.
vectorizer_max.stop_words_ 

{'gener', 'problem', 'result', 'use'}

In [221]:
# min_df=0.005 drops tokens that occur in fewer than 0.5% of the documents.
# NOTE: X is rebound again; it now holds the min_df-pruned matrix.
vectorizer_min = CountVectorizer(stop_words='english',min_df=0.005)
X = vectorizer_min.fit_transform(texts)

In [222]:
# (number of documents, vocabulary size) after min_df pruning.
X.shape 

(2708, 3890)

In [223]:
# Tokens discarded by the min_df threshold. This displays the whole set, which
# is very long; NOTE(review): consider showing only its size or a small sample.
vectorizer_min.stop_words_ 

{'winograd',
 'ski',
 'weed',
 'movi',
 'fskbann',
 'mangea',
 'lyon',
 'spoke',
 'kci',
 'monodi',
 'persimmon',
 'rigoutso',
 'federhen',
 'wjh',
 'exponent',
 'clidean',
 'cantor',
 'immunolog',
 'discord',
 'idi',
 'howl',
 'cern',
 'competi',
 'rflv',
 'plural',
 'predistort',
 'consideredand',
 'subcategor',
 'preferr',
 'recherch',
 'symplect',
 'trbp',
 'ffjd',
 'pedest',
 'layman',
 'stroock',
 'pele',
 'geisser',
 'sunroof',
 'odysseu',
 'longlif',
 'career',
 'quaker',
 'sdram',
 'rossini',
 'sumner',
 'costello',
 'ltu',
 'monocular',
 'thatacross',
 'wfl',
 'muta',
 'pola',
 'typ',
 'lj',
 'bewilder',
 'sonnet',
 'ballestero',
 'bollerslev',
 'paulokat',
 'exacli',
 'hgp',
 'evinc',
 'tmc',
 'strikingli',
 'sounder',
 'uq',
 'cellagain',
 'indecis',
 'oval',
 'hussam',
 'swarm',
 'hire',
 'ccm',
 'oriet',
 'sfring',
 'fem',
 'whang',
 'cujo',
 'ssn',
 'sedgewick',
 'evident',
 'autocorrelogram',
 'pratic',
 'suspens',
 'schacter',
 'harman',
 'falk',
 'rc',
 'smale',
 'bay

In [7]:
# Document-frequency ranks 40-59 of the tokens in the Genetic_Algorithms class.
get_token_frequency(
    texts,
    class_name='Genetic_Algorithms',
    class_ids=class_ids,
    range=(40,60),
    method='df',
)

[(204, 'size'),
 (203, 'studi'),
 (202, 'crossov'),
 (202, 'point'),
 (201, 'evalu'),
 (201, 'experi'),
 (199, 'techniqu'),
 (198, 'form'),
 (197, 'evolv'),
 (197, 'mani'),
 (196, 'complex'),
 (196, 'evolut'),
 (196, 'space'),
 (193, 'evolutionari'),
 (191, 'learn'),
 (191, 'model'),
 (189, 'solv'),
 (188, 'order'),
 (188, 'best'),
 (188, 'section')]

## Processing approach: remove low-frequency and non-informative tokens

1. Remove tokens whose document frequency is below a threshold (`min_df`).
2. Remove tokens that occur frequently in all seven classes.

In [45]:
# For every class collect its 50 most frequent tokens (by absolute count), then
# keep only the tokens that show up in the top 50 of every class.
top_tokens=[]
for text_class in set(labels):
    top_tokens.append({
        token
        for _count,token in get_token_frequency(texts,text_class,class_ids,(0,50),method='absolute')
    })
overlap=set.intersection(*top_tokens)

In [59]:
# Tokens shared by the top-50 lists of all classes.
overlap

{'algorithm',
 'approach',
 'base',
 'differ',
 'exampl',
 'function',
 'gener',
 'learn',
 'method',
 'model',
 'problem',
 'result',
 'set',
 'use'}

In [67]:
# Final bag-of-words: drop the default English stop words plus the tokens that
# are frequent in every class (`overlap`), and prune tokens occurring in fewer
# than 0.5% of the documents.
# NOTE: `vectorizer` and `X` are rebound here, replacing the earlier experiments.
vectorizer = CountVectorizer(stop_words=list(stop_words_default)+list(overlap),min_df=0.005)
X = vectorizer.fit_transform(texts).toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on versions that predate it. tokens stays a plain list.
if hasattr(vectorizer,'get_feature_names_out'):
    tokens=vectorizer.get_feature_names_out().tolist()
else:
    tokens=vectorizer.get_feature_names()

In [68]:
# (number of documents, vocabulary size) of the final matrix.
X.shape 

(2708, 3876)

In [69]:
# Displays the complete vocabulary list; NOTE(review): a slice such as tokens[:20] would keep the output short.
tokens

['aaai',
 'ab',
 'abandon',
 'abbrevi',
 'abduct',
 'abe',
 'abil',
 'abl',
 'absenc',
 'absent',
 'absolut',
 'absorb',
 'abstract',
 'abund',
 'ac',
 'acceler',
 'accept',
 'acceptor',
 'access',
 'accommod',
 'accompani',
 'accomplish',
 'accord',
 'accordingli',
 'account',
 'accumul',
 'accur',
 'accuraci',
 'achiev',
 'acid',
 'ackley',
 'acknowledg',
 'acm',
 'acoust',
 'acquir',
 'acquisit',
 'act',
 'action',
 'activ',
 'actor',
 'actual',
 'actuat',
 'acut',
 'acycl',
 'ad',
 'adaboost',
 'adapt',
 'add',
 'addit',
 'address',
 'adequ',
 'adf',
 'adjac',
 'adjust',
 'admiss',
 'admit',
 'adopt',
 'adult',
 'advanc',
 'advantag',
 'advers',
 'adversari',
 'advic',
 'advis',
 'advoc',
 'affect',
 'affer',
 'affin',
 'afford',
 'afosr',
 'afterward',
 'ag',
 'age',
 'agenc',
 'agent',
 'aggreg',
 'aggress',
 'agnost',
 'ago',
 'agre',
 'agreement',
 'aha',
 'ahead',
 'ai',
 'aic',
 'aid',
 'aim',
 'air',
 'aircraft',
 'airplan',
 'akaik',
 'akin',
 'al',
 'alarm',
 'albeit',
 'a

In [71]:
# Write the dense bag-of-words matrix as integer text, one matrix row per line.
np.savetxt(
    'data/Cora_enrich/BOW_texts_3876.txt',
    X,
    fmt='%i',
)

In [73]:
# Store the vocabulary list so that matrix columns can be mapped back to tokens later.
with open('data/Cora_enrich/tokens_3876.pickle','wb') as token_file:
    pickle.dump(tokens,token_file,pickle.HIGHEST_PROTOCOL)

# generate subgraphs

Generate subgraphs of the original network by selecting nodes according to their class labels.

In [21]:
# Load the position-indexed graph. NOTE: this rebinds G, which earlier held the graph keyed by raw node ids.
G=nx.read_adjlist(
    'data/Cora_enrich/idx_adjlist.txt',
    create_using=nx.DiGraph,
    nodetype=int,
)

In [62]:
# Reload the label file and the artefacts saved by the text-processing section
# as NumPy arrays, so they can be indexed with lists of node positions.
with open('data/Cora_enrich/labels.txt') as f:
    labels=np.array([line.strip() for line in f])

with open('data/Cora_enrich/tokens_3876.pickle','rb') as f:
    # pickle executes code on load; only read pickles produced by this notebook.
    tokens=np.array(pickle.load(f))

# NOTE: from here on `texts` is the integer bag-of-words matrix, not the list
# of raw strings loaded in the text-processing section.
# dtype=int replaces the `np.int` alias, which was removed in NumPy 1.24.
texts=np.loadtxt('data/Cora_enrich/BOW_texts_3876.txt',dtype=int)

In [27]:
# Distinct class names present in the label file.
set(labels)  

{'Case_Based',
 'Genetic_Algorithms',
 'Neural_Networks',
 'Probabilistic_Methods',
 'Reinforcement_Learning',
 'Rule_Learning',
 'Theory'}

In [28]:
# Class names selected for sub-network extraction (presumably passed to get_sub_input as `classes`).
classes=['Neural_Networks','Probabilistic_Methods','Theory']

In [67]:
def get_sub_input(classes,G,labels,texts):
    '''
    Extract the sub-network spanned by the given classes and re-index it from 0.

    args:
    classes: list of strs (a single str is treated as one class name)
    G: original graph (DiGraph); node k corresponds to labels[k] and texts[k]
    labels: original labels (array, list or tuple of strs)
    texts: original BOW texts, indexable by a list of row positions
    
    return:
    G_sub_idx: subgraph from G containing nodes in the specified classes and edges between them. 
        Isolates are removed. Nodes are 0..n-1 and are inserted in that order, so
        G_sub_idx.nodes lines up with the rows of labels_sub and texts_sub.
    labels_sub: labels consistent with G_sub_idx
    texts_sub: BOW texts consistent with G_sub_idx 
    sub_idx_dict: the map of (original idx: new_idx) 
    '''
    # A bare string would otherwise be matched by substring in the membership test.
    if isinstance(classes,str):
        classes=[classes]
    wanted=set(classes)
    # Plain sequences cannot be indexed with a list of positions; arrays can.
    if isinstance(labels,(list,tuple)):
        labels=np.asarray(labels)

    sub_idx_list=[i for i,l in enumerate(labels) if l in wanted]

    # Copy the induced subgraph into a fresh DiGraph so nodes can be removed from it.
    G_sub=nx.DiGraph(nx.subgraph(G,sub_idx_list))
    G_sub.remove_nodes_from(list(nx.isolates(G_sub)))

    sub_idx_dict={old_idx:new_idx for new_idx,old_idx in enumerate(G_sub.nodes)}
    kept_idx=list(sub_idx_dict)

    G_sub_idx=nx.DiGraph()
    # Insert nodes in index order first: adding edges alone would order the nodes
    # by first appearance in an edge, misaligning G_sub_idx.nodes with the rows
    # of labels_sub/texts_sub for any consumer that relies on node order.
    G_sub_idx.add_nodes_from(range(len(kept_idx)))
    G_sub_idx.add_edges_from((sub_idx_dict[i],sub_idx_dict[j]) for i,j in G_sub.edges)

    labels_sub=labels[kept_idx]
    texts_sub=texts[kept_idx]

    return G_sub_idx,labels_sub,texts_sub,sub_idx_dict 