In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import os
import csv
from ash_model import ASH
import openMindednessFunctions as omf
from ash_model import readwrite as io
import json
import tqdm

In [2]:
def prepare_data(datasetname):

    h = io.read_ash_from_json(f'datasets/in/{datasetname}/{datasetname}.json')

    timestamps = {'2017-01-01_2017-07-01':0, '2017-07-01_2018-01-01':1, '2018-01-01_2018-07-01':2, '2018-07-01_2019-01-01':3, '2019-01-01_2019-07-01':4}

    t2node2attrs = dict()
    for filename in os.listdir(f'datasets/in/{datasetname}'):
        if filename.endswith('nodelist.csv'):
            t = filename.split('_')
            t = t[1]+'_'+t[2]
            if t not in t2node2attrs:
                t2node2attrs[timestamps[t]] = dict()
            file = open(f'datasets/in/{datasetname}/'+filename, 'r')
            csvreader = csv.reader(file)
            header = []
            header = next(csvreader)
            rows = []
            for row in csvreader:
                rows.append(row)
            file.close()
            for row in rows:
                t2node2attrs[timestamps[t]][int(row[0])] = {'opinion': round(float(row[1]), 2), 'cluster':row[2]}

    # with open('datasets/in/guncontrol/'+'guncontrol.json', 'r') as file:
    #     h = json.load(file)

    t2he2attrs = dict()

    for tid in [0, 1, 2, 3, 4]:
        t2he2attrs[tid] = dict()
        for he in (h.get_hyperedge_id_set(tid=tid)):
            t2he2attrs[tid][he] = dict()
            nodes = h.get_hyperedge_nodes(he)
            for node in nodes:
                t2he2attrs[tid][he][node] = t2node2attrs[tid][node]['opinion']


    t2node2avgs = dict()

    for tid in [0, 1, 2, 3, 4]:
        t2node2avgs[tid] = dict()
        for u in h.get_node_set(tid=tid):
            if (tid+1) in h.get_node_presence(node=u):
                t2node2avgs[tid][u] = dict()
                t2node2avgs[tid][u]['avgs'] = dict()
                t2node2avgs[tid][u]['stds'] = dict()
                #posso calcolare open mindedness
                hes = h.get_star(node=u, tid=tid)
                for he in hes:
                    nodes = t2he2attrs[tid][he]
                    ops = []
                    for node in nodes:
                        if node != u:
                            ops.append(nodes[node])
                    avg, std = np.average(np.array(ops)), np.std(np.array(ops))
                    t2node2avgs[tid][u]['avgs'][he] = avg
                    t2node2avgs[tid][u]['stds'][he] = std 
    
    return h, t2node2attrs, t2node2avgs

In [3]:
def estimation(opvt, opvt1, sorted_vals):
    errs = []
    estimated_opinions = []
    est_opvt1=opvt
    for oput in sorted_vals:
        est_opvt1 = (est_opvt1 + oput)/2
        err = abs(est_opvt1 - opvt1)
        estimated_opinions.append(est_opvt1)
        errs.append(err)
    try:
        i = len(errs) - 1 - errs[::-1].index(min(errs))
        last_op = sorted_vals[i]
        cb = abs(last_op - opvt) 
    
        if errs[i] < abs(opvt-opvt1):
            return cb, errs[i], estimated_opinions[i]        
        else:
            return 0.0, abs(opvt-opvt1), opvt
    except:
        return -1.0, abs(opvt-opvt1), opvt

In [4]:
def createDataset():
    timestamps = {0:'2017-01-01_2017-07-01', 1:'2017-07-01_2018-01-01', 2:'2018-01-01_2018-07-01', 3:'2018-07-01_2019-01-01', 4:'2019-01-01_2019-07-01'}
    data = {}
    for dataset_name in ['minority', 'guncontrol', 'politics']:
        print('doing ', dataset_name)
        h, t2node2attrs, t2node2avgs = prepare_data(dataset_name)
        '''
        per ogni time step mi ritorna un dizionario nodo: {dizionario con delle info}
        '''
        data[dataset_name] = {}
        for t in tqdm.tqdm([0, 1, 2, 3]):
            data[dataset_name][timestamps[t]] = {}
            t1=t+1
            print('there are ', len(h.get_node_set(t)), ' nodes at time ', t)
            a = 0
            b = 0
            c = 0
            for v in h.get_node_set(tid=t):
                if t1 in h.get_node_presence(node=v) and h.get_star(node=v, tid=t):
                    c+=1
                    opvt = t2node2attrs[t][v]['opinion']
                    opvt1 = t2node2attrs[t1][v]['opinion']
                    sortedNeighOps = sorted(t2node2avgs[t][v]['avgs'].values())
                    eps, err, estOp = estimation(opvt, opvt1, sortedNeighOps)
                    orientation = omf.politicalLeaning(opvt) 
                    nactiveint = len([op for op in sortedNeighOps if op <= eps])
                    data[dataset_name][timestamps[t]][v] = {'dataset':dataset_name, 'opt':opvt, 'opt1': opvt1, 'neighbors':list(t2node2avgs[t][v]['avgs'].keys()), 
                                                    'opChange':opvt1-opvt, 'homophily':np.average(np.array(list(t2node2avgs[t][v]['stds'].values()))), 'orientation':orientation, 
                                                    'neighborsOps':sortedNeighOps, 'error':err, 'estOp':estOp, 
                                                    'openMindedness':eps, 'nactiveint':nactiveint}    
                if not t1 in h.get_node_presence(node=v): a+=1
                if not h.get_star(v, t): b+=1
            print('we were able to estimate ', c, ' values at time ', t)
            print(a, b)
    return data

In [5]:
data = createDataset()

doing  minority


 25%|█████████████████████                                                               | 1/4 [00:00<00:00,  8.69it/s]

there are  1204  nodes at time  0
we were able to estimate  195  values at time  0
997 1204
there are  1221  nodes at time  1


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  8.26it/s]

we were able to estimate  208  values at time  1
1001 1221
there are  1372  nodes at time  2
we were able to estimate  222  values at time  2
1137 314
there are  1392  nodes at time  3
we were able to estimate  214  values at time  3
1162 783


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  8.34it/s]


doing  guncontrol


 25%|█████████████████████                                                               | 1/4 [00:00<00:00,  7.49it/s]

there are  990  nodes at time  0
we were able to estimate  138  values at time  0
845 990
there are  1051  nodes at time  1


 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:00<00:00,  7.57it/s]

we were able to estimate  148  values at time  1
892 1051
there are  1281  nodes at time  2
we were able to estimate  166  values at time  2
1106 301
there are  1229  nodes at time  3


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  7.61it/s]


we were able to estimate  162  values at time  3
1054 713
doing  politics


 50%|██████████████████████████████████████████                                          | 2/4 [00:00<00:00, 13.42it/s]

there are  1204  nodes at time  0
we were able to estimate  162  values at time  0
1022 1204
there are  1101  nodes at time  1
we were able to estimate  167  values at time  1
895 1101
there are  1089  nodes at time  2
we were able to estimate  162  values at time  2
903 283
there are  1044  nodes at time  3


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 13.18it/s]

we were able to estimate  164  values at time  3
861 679





In [6]:
import pickle
with open(f'openMindednessData_h.pickle', 'wb') as ofile:
    pickle.dump(data, ofile)