In [1]:
%%typecheck

from pprint import pprint
from math import fsum,sqrt
from typing import Iterable, Tuple, List,Sequence,Dict,DefaultDict
from dis import dis
from collections import defaultdict
from functools import partial
from random import sample

Point = Tuple[int,...]
Centroid = Point


points: List[Point] = [
        (10, 41, 23),
        (22, 30, 29),
        (11, 42, 5),
        (20, 32, 4),
        (12, 40, 12),
        (21, 36, 23),
    ]

def transpose(data: Sequence[Tuple[int,...]]):
    return list(zip(*data))

def mean(data: Iterable[float])-> float:
    data = list(data)
    return fsum(data)/len(data)
    
def dist(p:Point,q:Point, fsum=fsum, sqrt=sqrt, zip=zip)-> float:
    return sqrt(fsum( (p-q)**2 for p,q in zip(p,q))) 

def assign_data(centroids: Sequence[Centroid] , data: Iterable[Point])-> Dict[Centroid, List[Point]]:
    d:DefaultDict = defaultdict(list)
    
    for point in data:        
        closest_centroid = min(centroids, key= partial( dist, point))
        d[closest_centroid].append(point)
    return dict(d)

def compute_centroids(groups:Iterable[Sequence[Point]])->List[Centroid]:
    return[tuple(map(mean,transpose(group))) for group in groups]

def k_means(data: Iterable[Point], k:int=3, iterations:int=1000)-> List[Centroid]:
    data = list(data)
    centroids = sample(data, k)
    for i in range(iterations):        
        labeled = assign_data(centroids, data)
        centroids = compute_centroids(labeled.values())
    return centroids

    
    

# dis(dist.__code__)

groups= [
    [(10, 41, 23),
     (11, 42, 5),
     (20, 32, 4),
     (12, 40, 12)],
    [(22, 30, 29), 
     (21, 36, 23)]
]

centroids = k_means(points)
d = assign_data(centroids, points)
pprint(d)

<string>:38: error: No overload variant of "min" matches argument types [typing.Sequence[builtins.tuple[builtins.int]], functools.partial[builtins.float*]]
<string>:43: error: Argument 1 to "map" has incompatible type "Callable[[Iterable[float]], float]"; expected "Callable[[Any], int]"

{(10.0, 41.0, 23.0): [(10, 41, 23)],
 (14.333333333333334, 38.0, 7.0): [(11, 42, 5), (20, 32, 4), (12, 40, 12)],
 (21.5, 33.0, 26.0): [(22, 30, 29), (21, 36, 23)]}


## Congress Data Analysis

####  Load and Accumulate votes by Senator

In [30]:
%%typecheck 

import csv
from collections import defaultdict
from pprint import pprint
from glob import glob
from typing import DefaultDict,NamedTuple,Dict,List,Tuple


from math import fsum,sqrt
from typing import Iterable, Tuple, List,Sequence,Dict,DefaultDict
from dis import dis
from collections import defaultdict,Counter
from functools import partial
from random import sample

NUM_SENATORS=100
VoteValue = int
VoteHistory = Tuple[VoteValue,...]

Senator = NamedTuple('Senator', [('name', str),('party','str'),('state',str)])


accumulated_record: DefaultDict[Senator, List[VoteValue]] = defaultdict(list)

vote_value: Dict[str, VoteValue] ={'Yea':1, 'Not Voting':0 ,'Nay':-1}
for filename in glob("congress_data/congress_votes*.csv"):
    with open(filename, encoding='utf-8') as f:
        reader = csv.reader(f)
        vote_topic = next(reader)
        headers= next(reader)
        for person, state, district, vote, name, party in reader:
            senator = Senator(name=name, party=party, state=state)
            accumulated_record[senator].append(vote_value[vote])



records: Dict[Senator, VoteHistory] = {senator: tuple(votes) for senator, votes in accumulated_record.items()}
#pprint(record)


Point = Tuple[int,...]
Centroid = Point


def transpose(data: Sequence[Tuple[int,...]]):
    return list(zip(*data))

def mean(data: Iterable[float])-> float:
    data = list(data)
    return fsum(data)/len(data)
    
def dist(p:Point,q:Point, fsum=fsum, sqrt=sqrt, zip=zip)-> float:
    return sqrt(fsum( (p-q)**2 for p,q in zip(p,q))) 

def assign_data(centroids: Sequence[Centroid] , data: Iterable[Point])-> Dict[Centroid, List[Point]]:
    d:DefaultDict = defaultdict(list)
    
    for point in data:        
        closest_centroid = min(centroids, key= partial( dist, point))
        d[closest_centroid].append(point)
    return dict(d)

def compute_centroids(groups:Iterable[Sequence[Point]])->List[Centroid]:
    return[tuple(map(mean,transpose(group))) for group in groups]

def k_means(data: Iterable[Point], k:int=3, iterations:int=1000)-> List[Centroid]:
    data = list(data)
    centroids = sample(data, k)
    for i in range(iterations):        
        labeled = assign_data(centroids, data)
        centroids = compute_centroids(labeled.values())
    return centroids


centroids = k_means(records.values(), k=3)
clustered_votes = assign_data(centroids, records.values())

votes_to_senators:DefaultDict[VoteHistory, List[Senator]] = defaultdict(list)
for senator, vote_history in records.items():
        votes_to_senators[vote_history].append(senator)

assert sum([len(cluster) for cluster in votes_to_senators.values()]) == NUM_SENATORS


for i, votes_in_cluster in enumerate(clustered_votes.values(),start=1):
    print(f'================= Voting Cluster {i} ================= ')
    party_totals= Counter()
    for votes in set(votes_in_cluster):
        for senator in votes_to_senators[votes]:
            party_totals[senator.party]+=1
            print(senator)
    print(party_totals)  

<string>:60: error: No overload variant of "min" matches argument types [typing.Sequence[builtins.tuple[builtins.int]], functools.partial[builtins.float*]]
<string>:65: error: Argument 1 to "map" has incompatible type "Callable[[Iterable[float]], float]"; expected "Callable[[Any], int]"
<string>:88: error: Need type annotation for variable
<string>:91: error: Cannot determine type of 'party_totals'
<string>:93: error: Cannot determine type of 'party_totals'

Senator(name='Sen. Richard Blumenthal [D]', party='Democrat', state='CT')
Senator(name='Sen. Angus King [I]', party='Independent', state='ME')
Senator(name='Sen. Mark Kirk [R]', party='Republican', state='IL')
Senator(name='Sen. James Risch [R]', party='Republican', state='ID')
Senator(name='Sen. Alan “Al” Franken [D]', party='Democrat', state='MN')
Senator(name='Sen. Brian Schatz [D]', party='Democrat', state='HI')
Senator(name='Sen. Ron Johnson [R]', party='Republican', state='WI')
Senator(name='Sen. Harry Reid [D]', party='Democ