In [72]:
import sys;sys.path.append('..')
from ppanlp import *
import networkx as nx
ppa = PPA()
pd.options.display.max_rows=100

In [70]:
def graph_cooccurrences(path, top_n_neighb=None, lim_edges_by_frac=None, lim_edges_by_n=None, only_signif=False, force=False):
    import networkx as nx

    gpath=f'{path}.top_n_neighb__{top_n_neighb}.lim_edges_by_frac__{lim_edges_by_frac}.only_signif={only_signif}.graphml'
    if not force and os.path.exists(gpath):
        return nx.read_graphml(gpath)
    
    df=pd.read_csv(path)
    G=nx.Graph()

    with logwatch('preparing data for graph'):
        mentioned = set(df[df.type_x=='Person'].entity_x) | set(df[df.type_y=='Person'].entity_y)
        def cleanent(x):
            x=x.replace('_',' ')
            if x.startswith('AUTHOR'): 
                dat=x.split()
                lname=dat[1].strip(punctuation)
                dat=tokenize_agnostic(x)
                nums=[y[:4] for y in dat if y[0].isdigit() and len(y)>=4]
                date=f'\n({nums[0]}-{nums[1]})' if len(nums)>=2 else (f'\n({nums[0]})' if nums else '')
                return f'{lname}{date}'
            if x.startswith('PERIOD'):
                return ' '.join(x.split()[1:])
            if x.startswith('TOPIC'):
                return '[' + ('\n'.join(x.split()[2:]).lower()) + ']'
            return x
        df['entity_x']=df.entity_x.apply(cleanent)
        df['entity_y']=df.entity_y.apply(cleanent)
    
        def is_person_mentioned(x):
            if x in mentioned: return True
            for p in mentioned:
                if p in x:
                    return True
            return False
    
        def is_valid_ent(ent,type):
            if type=='Author' and is_person_mentioned(ent): return False
            return True
        
        df=df[[is_valid_ent(ent,type) for ent,type in zip(df.entity_x,df.type_x)]]
        df=df[[is_valid_ent(ent,type) for ent,type in zip(df.entity_y,df.type_y)]]
        
        dfpos = df[df.odds_ratio>1].sort_values('odds_ratio',ascending=False)
        if only_signif: dfpos=dfpos[dfpos.fisher_exact_p<=.05]
        ents = set(df.entity_x) | set(df.entity_y)
    
    def ensure_node(d, xy='x'):
        node=d[f'entity_{xy}']
        if not G.has_node(node):
            noded={k[:-2]:v for k,v in d.items() if k.endswith('_'+xy)}
            G.add_node(node,group=noded['type'],**noded)
    
    def ensure_edge(d):
        node1,node2=d['entity_x'],d['entity_y']
        if not G.has_edge(node1,node2):
            weight = d['prob_xy_obsexp_log']
            G.add_edge(node1,node2,weight=weight,**d)

    def has_edge(d):
        return G.has_edge(d['entity_x'],d['entity_y'])
    
    def ensure_row(d):
        ensure_node(d,'x')
        ensure_node(d,'y')
        ensure_edge(d)

    with logwatch('building graph') as lw:
    
        if top_n_neighb:
            for ent in tqdm(ents,desc=f'Finding top {top_n_neighb} neighbors for entities'):
                i=0
                dfent = dfpos[(dfpos.entity_x.str==ent) or (dfpos.entity_y==ent)]
                for d in dfent.to_dict('record'):
                    if not has_edge(d):
                        ensure_row(d)
                        i+=1
                    if i>=top_n_neighb: break
        else:
            if lim_edges_by_frac:
                lim_edges=int(len(ents)*lim_edges_by_frac)
            else:
                lim_edges = lim_edges_by_n
    
            for d in dfpos.to_dict('record')[:lim_edges]:
                ensure_row(d)
        lw.log(f'Built {G} and {nx.number_connected_components(G)} components')
    
    
    with logwatch(f'saving graph to {gpath}'):
        nx.write_graphml(G,gpath)
    
    return G

In [71]:
path=os.path.join(ppa.path_data,'data.ner+topic.stats.v5.csv')
G=graph_cooccurrences(path,top_n_neighb=1,only_signif=True)
G=graph_cooccurrences(path,top_n_neighb=2,only_signif=True)
G=graph_cooccurrences(path,top_n_neighb=3,only_signif=True)
G=graph_cooccurrences(path,lim_edges_by_frac=1,only_signif=True)
G=graph_cooccurrences(path,lim_edges_by_frac=1.5,only_signif=True)
G=graph_cooccurrences(path,lim_edges_by_frac=2,only_signif=True)
G

[32m2023-12-07 10:35:35,225[0m [34m[1m| preparing data for graph[0m
[32m2023-12-07 10:35:42,046[0m [34m[1m| 6.82 seconds[0m
[32m2023-12-07 10:35:42,047[0m [34m[1m| building graph[0m
Finding top 1 neighbors for entities: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1690/1690 [00:09<00:00, 179.93it/s]
[32m2023-12-07 10:35:51,444[0m [34m[1m< Built Graph with 1639 nodes and 1459 edges and 180 components[0m
[32m2023-12-07 10:35:51,445[0m [34m[1m| 9.4 seconds[0m
[32m2023-12-07 10:35:51,446[0m [34m[1m| saving graph to /Users/ryanheuser/ppa_data/corpus/data/data.ner+topic.stats.v5.csv.top_n_neighb__1.lim_edges_by_frac__None.only_signif=True.graphml[0m
[32m2023-12-07 10:35:51,764[0m [34m[1m| 0.32 seconds[0m
[32m2023-12-07 10:35:52,771[0m [34m[1m| preparing data for graph[0m
[32m2023-12-0

<networkx.classes.graph.Graph at 0x143cab2b0>