In [16]:
from src import *

@dataclasses.dataclass
class Analysis(Base):
    nodes_tbl            : str
    stack_size           : int = 10
    pop_imbalance_thresh : float = 10.0
        
    def __post_init__(self):
        self.results_stem = self.nodes_tbl.split('.')[-1][6:]
        self.abbr, self.yr, self.level, self.district_type = self.results_stem.split('_')
        self.results_bq = f'{root_bq}.{self.results_stem}'
        self.tbl = f'{self.results_bq}.{self.results_stem}_0000000_allresults'
        self.pq = root_path / f'results/{self.results_stem}/{self.results_stem}_0000000_allresults.parquet'
        delete_table(self.tbl)

    def compute_results(self):
        self.tbls = dict()
        for src_tbl in bqclient.list_tables(self.results_bq, max_results=1000):
            full  = src_tbl.full_table_id.replace(':', '.')
            short = src_tbl.table_id
            seed = short.split('_')[-2]
            key  = short.split('_')[-1]
            if seed.isnumeric():
                try:
                    self.tbls[seed][key] = full
                except:
                    self.tbls[seed] = {key : full}
        
#         cols = [c for c in get_cols(self.nodes) if c not in Levels + District_types + ['geoid', 'county', 'total_pop', 'polygon', 'aland', 'perim', 'polsby_popper', 'density', 'point']]
        cols = [c for c in ['total_white', 'total_black', 'total_native', 'total_asian', 'total_pacific', 'total_other'] if c not in Levels + District_types + ['geoid', 'county', 'total_pop', 'polygon', 'aland', 'perim', 'polsby_popper', 'density', 'point']]
        
        def join(d):
            query = f"""
select
    A.seed,
    A.plan,
    A.{self.district_type},
    A.geoid,
    C.hash as hash_plan,
    C.pop_imbalance as pop_imbalance_plan,
    C.polsby_popper as polsby_popper_plan,
    B.polsby_popper as polsby_popper_district,
    B.aland,
    B.total_pop,
    B.total_pop / B.aland as density
from
    {d['plans']} as A
inner join
    {d['stats']} as B
on
    A.seed = B.seed and A.plan = B.plan and A.{self.district_type} = B.{self.district_type}
inner join (
    select
        *
    from
        {d['summaries']}
    where
        pop_imbalance < {self.pop_imbalance_thresh}
    ) as C
on
    A.seed = C.seed and A.plan = C.plan
"""
            return query
        
        temp_tbls = list()
        u = '\nunion all\n'
        k = len(self.tbls)
        for seed, tbls in self.tbls.items():
            k -= 1
            if len(tbls) == 3:
                try:
                    stack_query = stack_query + u + join(tbls)
                except:
                    stack_query = join(tbls)
                
            if k % self.stack_size == 0:
                query = f"""
select
    A.seed,
    A.plan,
    A.{self.district_type},
    max(A.hash_plan) as hash_plan,
    max(A.pop_imbalance_plan) as pop_imbalance_plan,
    max(A.polsby_popper_plan) as polsby_popper_plan,
    max(A.polsby_popper_district) as polsby_popper_district,
    max(A.aland) as aland,
    max(A.total_pop) as total_pop,
    max(A.density) as density,
    {join_str(1).join([f'sum(B.{c}) as {c}' for c in cols])}
from (
    select
        *
    from (
        select
            *,
            row_number() over (partition by hash_plan order by plan asc, seed asc) as r
        from (
            {subquery(stack_query, indents=3)}
            )
        )
    where r = 1
    ) as A
inner join
    {self.nodes_tbl} as B
on
    A.geoid = B.geoid
group by
    seed, plan, {self.district_type}
"""
                temp_tbls.append(self.tbl+f'_{k}')
                print(temp_tbls)
                load_table(tbl=temp_tbls[-1], query=query)
        stack_query = u.join([f'select * from {tbl}' for tbl in temp_tbls])
        query = f"""
select
    *
from (
    select
        *,
        row_number() over (partition by hash_plan order by plan asc, seed asc) as r
    from (
        {subquery(stack_query, indents=3)}
        )
    )
where
    r = 1
order by
    seed, plan, {self.district_type}
"""
        load_table(tbl=self.tbl, query=query)
        for t in temp_tbls:
            delete_table(t)
        self.df = read_table(self.tbl)
        self.df.to_parquet(self.pq)
        to_gcs(self.pq)
        


start = time.time()
A = Analysis('cmat-315920.redistricting_data.nodes_TX_2020_cntyvtd_cd')
A.compute_results()
self=A


print(time_formatter(time.time() - start))

['cmat-315920.TX_2020_cntyvtd_cd.TX_2020_cntyvtd_cd_0000000_allresults_110']
['cmat-315920.TX_2020_cntyvtd_cd.TX_2020_cntyvtd_cd_0000000_allresults_110', 'cmat-315920.TX_2020_cntyvtd_cd.TX_2020_cntyvtd_cd_0000000_allresults_100']


KeyboardInterrupt: 

In [14]:
self.df

Unnamed: 0,seed,plan,cd,hash_plan,pop_imbalance_plan,polsby_popper_plan,polsby_popper_district,aland,total_pop,density,total_white,total_black,total_native,total_asian,total_pacific,total_other
0,0001,650,08,-5803358598168886262,4.114933,18.398440,20.134636,1719.209394,754823,439.052394,407926,35394,6049,21830,732,63994
1,0001,645,08,-7152189766319209742,4.184817,18.658303,20.134636,1719.209394,754823,439.052394,407926,35394,6049,21830,732,63994
2,0001,643,08,2034784780069066746,4.184817,18.507514,20.134636,1719.209394,754823,439.052394,407926,35394,6049,21830,732,63994
3,0001,644,08,-7491253030417255574,4.184817,18.613251,20.134636,1719.209394,754823,439.052394,407926,35394,6049,21830,732,63994
4,0004,329,37,-3720688398094604924,8.388148,18.256944,40.949254,1119.545023,744235,664.765583,407926,35394,6049,21830,732,63994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2257,0004,901,37,-4436318378517418056,0.604704,18.211240,31.264807,1136.458965,764206,672.444869,407926,35394,6049,21830,732,63994
2258,0004,905,37,-1931022233685493302,0.604704,18.245265,31.264807,1136.458965,764206,672.444869,407926,35394,6049,21830,732,63994
2259,0004,889,37,5540242426213104142,0.604704,18.164538,31.264807,1136.458965,764206,672.444869,407926,35394,6049,21830,732,63994
2260,0004,909,37,8908305054695077262,0.604704,18.066432,31.264807,1136.458965,764206,672.444869,407926,35394,6049,21830,732,63994


In [None]:
from src import *

@dataclasses.dataclass
class Analysis(Base):
    nodes_tbl            : str
    stack_size           : int = 50
    pop_imbalance_thresh : float = 10.0
        
    def __post_init__(self):
        self.results_stem = self.nodes_tbl.split('.')[-1][6:]
        self.abbr, self.yr, self.level, self.district_type = self.results_stem.split('_')
        self.results_bq = f'{root_bq}.{self.results_stem}'
        self.tbl = f'{self.results_bq}.{self.results_stem}_0000000_allresults'
        self.pq = root_path / f'results/{self.results_stem}/{self.results_stem}_0000000_allresults.parquet'
        delete_table(self.tbl)

    def compute_results(self):
        self.tbls = dict()
        for src_tbl in bqclient.list_tables(self.results_bq, max_results=1000):
            full  = src_tbl.full_table_id.replace(':', '.')
            short = src_tbl.table_id
            seed = short.split('_')[-2]
            key  = short.split('_')[-1]
            if seed.isnumeric():
                try:
                    self.tbls[seed][key] = full
                except:
                    self.tbls[seed] = {key : full}
        
#         cols = [c for c in get_cols(self.nodes) if c not in Levels + District_types + ['geoid', 'county', 'total_pop', 'polygon', 'aland', 'perim', 'polsby_popper', 'density', 'point']]
        cols = [c for c in ['total_white', 'total_black', 'total_native', 'total_asian', 'total_pacific', 'total_other'] if c not in Levels + District_types + ['geoid', 'county', 'total_pop', 'polygon', 'aland', 'perim', 'polsby_popper', 'density', 'point']]
        
        def join(d):
            query = f"""
select
    A.seed,
    A.plan,
    A.{self.district_type},
    C.hash as hash_plan,
    C.pop_imbalance as pop_imbalance_plan,
    C.polsby_popper as polsby_popper_plan,
    B.polsby_popper as polsby_popper_district,
    B.aland,
    B.total_pop
from
    {d['plans']} as A
inner join
    {d['stats']} as B
on
    A.seed = B.seed and A.plan = B.plan and A.{self.district_type} = B.{self.district_type}
inner join (
    select
        *
    from
        {d['summaries']}
    where
        pop_imbalance < {self.pop_imbalance_thresh}
    ) as C
on
    A.seed = C.seed and A.plan = C.plan
"""
            return query
        
        temp_tbls = list()
        u = '\nunion all\n'
        k = len(self.tbls)
        for seed, tbls in self.tbls.items():
            k -= 1
            if len(tbls) == 3:
                try:
                    query = query + u + join(tbls)
                except:
                    query = join(tbls)
                
            if k % self.stack_size == 0:
                temp_tbls.append(self.tbl+f'_{k}')
                load_table(tbl=temp_tbls[-1], query=query)
        stack_query = u.join([f'select * from {tbl}' for tbl in temp_tbls]) + f' order by seed, plan, {self.district_type}'
        query = f"""
select
    C.seed,
    C.plan,
    C.{self.district_type},
    max(C.hash_plan) as hash_plan,
    max(C.pop_imbalance) as pop_imbalance_plan,
    max(C.polsby_popper) as polsby_popper_plan,
    max(C.polsby_popper) as polsby_popper_district,
    max(C.aland) as aland,
    max(C.total_pop) as total_pop,
    max(C.total_pop) / sum(D.aland) as density,
    {join_str(1).join([f'sum(D.{c}) as {c}' for c in cols])}
from (
    select
        *
    from (
        select
            *,
            row_number() over (partition by A.hash order by plan asc, seed asc) as r
        from (
            {stack_query}
            ) as A
        ) as B
    where r = 1
    ) as C
inner join
    {self.nodes_tbl} as D
on
    C.geoid = D.geoid
group by
    seed, plan, {self.district_type}
"""
        
#         inner join

        load_table(tbl=self.tbl, query=query)
        for t in temp_tbls:
            delete_table(t)
        df = read_table(self.tbl)
        df.to_parquet(self.pq)
        to_gcs(self.pq)
        


start = time.time()
A = Analysis('cmat-315920.redistricting_data.nodes_TX_2020_cntyvtd_cd')
A.compute_results()
self=A


print(time_formatter(time.time() - start))

In [None]:
from src import *

@dataclasses.dataclass
class Analysis(Base):
    nodes_tbl            : str
    stack_size           : int = 10
    pop_imbalance_thresh : float = 10.0
        
    def __post_init__(self):
        self.results_stem = self.nodes_tbl.split('.')[-1][6:]
        self.abbr, self.yr, self.level, self.district_type = self.results_stem.split('_')
        self.results_bq = f'{root_bq}.{self.results_stem}'
        self.tbl = f'{self.results_bq}.{self.results_stem}_0000000_allresults'
        self.pq = root_path / f'results/{self.results_stem}/{self.results_stem}_0000000_allresults.parquet'
        delete_table(self.tbl)

    def compute_results(self):
        self.tbls = dict()
        for src_tbl in bqclient.list_tables(self.results_bq, max_results=1000):
            full  = src_tbl.full_table_id.replace(':', '.')
            short = src_tbl.table_id
            seed = short.split('_')[-2]
            key  = short.split('_')[-1]
            if seed.isnumeric():
                try:
                    self.tbls[seed][key] = full
                except:
                    self.tbls[seed] = {key : full}
        
#         cols = [c for c in get_cols(self.nodes) if c not in Levels + District_types + ['geoid', 'county', 'total_pop', 'polygon', 'aland', 'perim', 'polsby_popper', 'density', 'point']]
        cols = [c for c in ['total_white', 'total_black', 'total_native', 'total_asian', 'total_pacific', 'total_other'] if c not in Levels + District_types + ['geoid', 'county', 'total_pop', 'polygon', 'aland', 'perim', 'polsby_popper', 'density', 'point']]
        
        def join(d):
            query = f"""
select
    A.seed,
    A.plan,
    A.{self.district_type},
    max(C.hash) as hash_plan,
    max(C.pop_imbalance) as pop_imbalance_plan,
    max(C.polsby_popper) as polsby_popper_plan,
    max(B.polsby_popper) as polsby_popper_district,
    max(B.aland) as aland,
    max(B.total_pop) as total_pop,
    max(B.total_pop) / sum(D.aland) as density,
    {join_str(1).join([f'sum(D.{c}) as {c}' for c in cols])}
from
    {d['plans']} as A
inner join
    {d['stats']} as B
on
    A.seed = B.seed and A.plan = B.plan and A.{self.district_type} = B.{self.district_type}
inner join (
    select
        *
    from
        {d['summaries']}
    where
        pop_imbalance < {self.pop_imbalance_thresh}
    ) as C
on
    A.seed = C.seed and A.plan = C.plan
inner join
    {self.nodes_tbl} as D
on
    A.geoid = D.geoid
group by
    seed, plan, {self.district_type}
"""
            return query
        
        temp_tbls = list()
        u = '\nunion all\n'
        k = len(self.tbls)
        for seed, tbls in self.tbls.items():
            k -= 1
            if len(tbls) == 3:
                try:
                    query = query + u + join(tbls)
                except:
                    query = join(tbls)
                
            if k % self.stack_size == 0:
                temp_tbls.append(self.tbl+f'_{k}')
                load_table(tbl=temp_tbls[-1], query=query)
        query = u.join([f'select * from {tbl}' for tbl in temp_tbls]) + f' order by seed, plan, {self.district_type}'
        load_table(tbl=self.tbl, query=query)
        for t in temp_tbls:
            delete_table(t)
        df = read_table(self.tbl)
        df.to_parquet(self.pq)
        to_gcs(self.pq)
        


start = time.time()
A = Analysis('cmat-315920.redistricting_data.nodes_TX_2020_cntyvtd_cd')
A.compute_results()
self=A


print(time_formatter(time.time() - start))

In [None]:
from src import *

@dataclasses.dataclass
class Analysis(Base):
    nodes_tbl : str
        
    def __post_init__(self):
        self.results_stem = self.nodes_tbl.split('.')[-1][6:]
        self.abbr, self.yr, self.level, self.district_type = self.results_stem.split('_')
        self.results_bq = f'{root_bq}.{self.results_stem}'
        self.tbl = f'{self.results_bq}.{self.results_stem}_0000000_allresults'
        delete_table(self.tbl)
#         ot_path / f'results/{self.results_stem}/{self.results_stem}_0000000_allresults.parquet'


    def compute_results(self):
        self.tbls = {'plans':list(), 'stats':list(), 'summaries':list()}
        for src_tbl in bqclient.list_tables(self.results_bq, max_results=1000):
            key = src_tbl.table_id.split('_')[-1]
            self.tbls[key].append(src_tbl.full_table_id.replace(':', '.'))
        
        u = "\nunion all\n"
        self.stack = {key: u.join([f'select * from {tbl}' for tbl in tbl_list[100:]]) for key, tbl_list in self.tbls.items()}
        
#         cols = [c for c in get_cols(self.nodes) if c not in Levels + District_types + ['geoid', 'county', 'total_pop', 'polygon', 'aland', 'perim', 'polsby_popper', 'density', 'point']]
        cols = [c for c in ['total_white', 'total_black', 'total_native', 'total_asian', 'total_pacific', 'total_other'] if c not in Levels + District_types + ['geoid', 'county', 'total_pop', 'polygon', 'aland', 'perim', 'polsby_popper', 'density', 'point']]

    
    
        query = f"""
select
    B.seed,
    B.plan,
    C.{self.district_type},
    max(B.hash) as hash_plan,
    max(B.pop_imbalance) as pop_imbalance_plan,
    max(B.polsby_popper) as polsby_popper_plan,
    max(C.polsby_popper) as polsby_popper_district,
    max(C.aland) as aland,
    max(C.total_pop) as total_pop,
    max(C.total_pop) / sum(E.aland) as density,
    {join_str(1).join([f'sum(E.{c}) as {c}' for c in cols])}
from (
    select
        *
    from (
        select
            *,
            row_number() over (partition by A.hash order by plan asc, seed asc) as r
        from (
            select
                *
            from (
                {subquery(self.stack['summaries'], indents=4)}
                )
            where
                pop_imbalance < 10.0
            ) as A
        )
    where r = 1
    ) as B
inner join (
    {subquery(self.stack['stats'], indents=1)}
    ) as C
on
    B.seed = C.seed and B.plan = C.plan
inner join (
    select
        *
    from (
        {subquery(self.stack['plans'], indents=2)}
        )
    ) as D
on
    C.seed = D.seed and C.plan = D.plan and C.{self.district_type} = D.{self.district_type}
inner join
    {self.nodes_tbl} as E
on
    D.geoid = E.geoid
group by
    seed, plan, {self.district_type}
order by
    seed, plan, {self.district_type}
"""
        load_table(tbl=self.tbl, query=query)
        
start = time.time()
A = Analysis('cmat-315920.redistricting_data.nodes_TX_2020_cntyvtd_cd')
A.compute_results()
print(time_formatter(time.time() - start))


In [None]:
self.pq

In [None]:
self.df.to_parquet(root_path / 'temp.parquet')

In [None]:
s = '00001'
s.isnumeric()

In [None]:
self.df['hash_plan'].nunique()
# self.tbls

In [None]:
list(self.tbls.keys())

In [None]:
self.tbls

In [None]:
            
            key = src_tbl.table_id.split('_')[-1]
            self.tbls[key].append(src_tbl.full_table_id.replace(':', '.'))
        
        u = "\nunion all\n"
        self.stack = {key: u.join([f'select * from {tbl}' for tbl in tbl_list[100:]]) for key, tbl_list in self.tbls.items()}
        
#         cols = [c for c in get_cols(self.nodes) if c not in Levels + District_types + ['geoid', 'county', 'total_pop', 'polygon', 'aland', 'perim', 'polsby_popper', 'density', 'point']]
        cols = [c for c in ['total_white', 'total_black', 'total_native', 'total_asian', 'total_pacific', 'total_other'] if c not in Levels + District_types + ['geoid', 'county', 'total_pop', 'polygon', 'aland', 'perim', 'polsby_popper', 'density', 'point']]

    
    
        query = f"""
select
    B.seed,
    B.plan,
    C.{self.district_type},
    max(B.hash) as hash_plan,
    max(B.pop_imbalance) as pop_imbalance_plan,
    max(B.polsby_popper) as polsby_popper_plan,
    max(C.polsby_popper) as polsby_popper_district,
    max(C.aland) as aland,
    max(C.total_pop) as total_pop,
    max(C.total_pop) / sum(E.aland) as density,
    {join_str(1).join([f'sum(E.{c}) as {c}' for c in cols])}
from (
    select
        *
    from (
        select
            *,
            row_number() over (partition by A.hash order by plan asc, seed asc) as r
        from (
            select
                *
            from (
                {subquery(self.stack['summaries'], indents=4)}
                )
            where
                pop_imbalance < 10.0
            ) as A
        )
    where r = 1
    ) as B
inner join (
    {subquery(self.stack['stats'], indents=1)}
    ) as C
on
    B.seed = C.seed and B.plan = C.plan
inner join (
    select
        *
    from (
        {subquery(self.stack['plans'], indents=2)}
        )
    ) as D
on
    C.seed = D.seed and C.plan = D.plan and C.{self.district_type} = D.{self.district_type}
inner join
    {self.nodes_tbl} as E
on
    D.geoid = E.geoid
group by
    seed, plan, {self.district_type}
order by
    seed, plan, {self.district_type}
"""
        load_table(tbl=self.tbl, query=query)
        
start = time.time()
A = Analysis('cmat-315920.redistricting_data.nodes_TX_2020_cntyvtd_cd')
A.compute_results()
print(time_formatter(time.time() - start))

In [None]:
self=A
k = 1 + len(self.stack['plans']) // 500
for 

len(self.stack['plans']), k
# for tbl in stack['plans']

In [None]:
df = read_table(self.tbl)

In [None]:
df['hash_plan'].nunique()
# df['pop_imbalance_plan'].max()

In [None]:
X = np.linspace(0,10,100)
Y = [df.query('pop_imbalance_plan < @x')['hash_plan'].nunique() for x in X]

In [None]:
plt.plot(X,Y)

In [None]:
gcs_path  = 'math_for_unbiased_maps_tx'
gcs_bucket = gcsclient.get_bucket(gcs_path)
self=A
bqclient.extract_table(self.tbl, f'gs://{gcs_path}/hi.parquet')

In [None]:
reversed(list(bqclient.list_tables(f'{proj_id}.results')))