In [26]:
import feedparser
import re

# Returns title and dictionary of word counts for an RSS feed
def getwordcounts(url):
    # Parse the feed
    d=feedparser.parse(url)
    wc={}
    # Loop over all the entries
    for e in d.entries:
        if 'summary' in e: summary=e.summary
        else: summary=e.description
        # Extract a list of words
        words=getwords(e.title+' '+summary)
        for word in words:
            wc.setdefault(word,0)
            wc[word]+=1
    return getattr(d.feed, 'title', 'Unknown title'), wc

In [27]:
def getwords(html):
    # Remove all the HTML tags
    txt=re.compile(r'<[^>]+>').sub('',html)
    # Split words by all non-alpha characters
    words=re.compile(r'[^A-Z^a-z]+').split(txt)
    # Convert to lowercase
    return [word.lower( ) for word in words if word!='']

In [33]:
apcount={}
wordcounts={}
for feedurl in open('resources/feedlist.txt'):
    title,wc=getwordcounts(feedurl) 
    wordcounts[title]=wc
    for word,count in wc.items():
        apcount.setdefault(word,0)
        if count>1:
            apcount[word]+=1

In [89]:
wordlist = []
for word,wc in apcount.items():
    if ((wc/len(wordcounts) > 0.1) and (wc/len(wordcounts) < 0.4)):
        wordlist.append(word)

In [90]:
out=open('outputs/blogdata.txt','w')
out.write('Blog')
for word in wordlist: out.write('\t%s' % word) 
out.write('\n')
for blog,wc in wordcounts.items():
    out.write(blog)
    for word in wordlist:
        if word in wc: 
            out.write('\t%d' % wc[word])
        else: out.write('\t0')
    out.write('\n')

In [91]:
def readfile(filename):
    lines = [line for line in open(filename)]
    colnames=lines[0].strip( ).split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        p = line.strip().split('\t')
        rownames.append(p[0])
        data.append([int(n) for n in p[1:]])
    return rownames,colnames,data

In [92]:
from math import sqrt
def pearson(v1,v2):
    sum1 = sum(v1)
    sum2 = sum(v2)
    
    sum1Sq = sum([pow(n,2) for n in v1])
    sum2Sq = sum([pow(n,2) for n in v2])
    
    pSum = sum([v1[i] * v2[i] for i in range(len(v1))])
    
    num=pSum-(sum1*sum2/len(v1))
    den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
    if den==0: return 0
    return 1.0-num/den

In [93]:
class bicluster:
    def __init__(self,vec,left=None,right=None,distance=0.0,id=None):
        self.left=left
        self.right=right
        self.vec=vec
        self.id=id
        self.distance=distance

In [94]:
def hcluster(rows,distance=pearson):
    distances={}
    currentclustid=-1
    
    # Clusters are initially just the rows
    clust=[bicluster(rows[i],id=i) for i in range(len(rows))]
    while len(clust)>1:
        lowestpair=(0,1)
        closest=distance(clust[0].vec,clust[1].vec)
        
        # loop through every pair looking for the smallest distance
        for i in range(len(clust)):
            for j in range(i+1,len(clust)):
                
            # distances is the cache of distance calculations
                if (clust[i].id,clust[j].id) not in distances:
                    distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec)
                d=distances[(clust[i].id,clust[j].id)]
                if d<closest:
                    closest=d
                    lowestpair=(i,j)
                    
        # calculate the average of the two clusters
        mergevec=[
        (clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 for i in range(len(clust[0].vec))]
        
        # create the new cluster
        newcluster=bicluster(mergevec,left=clust[lowestpair[0]],right=clust[lowestpair[1]],distance=closest,id=currentclustid)
        
        # cluster ids that weren't in the original set are negative
        currentclustid-=1
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)
    return clust[0]

In [95]:
blognames,words,data=readfile('outputs/blogdata.txt')
clust=hcluster(data)

In [101]:
def printclust(clust,labels=None,n=0):
    # indent to make a hierarchy layout
    for i in range(n): print(' '),
    if clust.id<0:
        # negative id means that this is branch
        print('-')
    else:
        # positive id means that this is an endpoint
        if labels==None: print(clust.id)
        else: print(labels[clust.id])
    # now print the right and left branches
    if clust.left!=None: printclust(clust.left,labels=labels,n=n+1)
    if clust.right!=None: printclust(clust.right,labels=labels,n=n+1)

In [102]:
printclust(clust,labels=blognames)

-
 
-
 
 
COOL HUNTING
 
 
-
 
 
 
Captain's Quarters
 
 
 
-
 
 
 
 
-
 
 
 
 
 
ABC News: Investigative Unit
 
 
 
 
 
ThinkProgress
 
 
 
 
-
 
 
 
 
 
Instapundit
 
 
 
 
 
-
 
 
 
 
 
 
-
 
 
 
 
 
 
 
blog maverick
 
 
 
 
 
 
 
-
 
 
 
 
 
 
 
 
Joi Ito's Web
 
 
 
 
 
 
 
 
-
 
 
 
 
 
 
 
 
 
Pharyngula
 
 
 
 
 
 
 
 
 
-
 
 
 
 
 
 
 
 
 
 
-
 
 
 
 
 
 
 
 
 
 
 
The Viral Garden
 
 
 
 
 
 
 
 
 
 
 
-
 
 
 
 
 
 
 
 
 
 
 
 
NewsBusters
 
 
 
 
 
 
 
 
 
 
 
 
BuzzMachine
 
 
 
 
 
 
 
 
 
 
-
 
 
 
 
 
 
 
 
 
 
 
Eschaton
 
 
 
 
 
 
 
 
 
 
 
-
 
 
 
 
 
 
 
 
 
 
 
 
Boing Boing
 
 
 
 
 
 
 
 
 
 
 
 
-
 
 
 
 
 
 
 
 
 
 
 
 
 
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Signal v. Noise - Medium
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Unknown title
 
 
 
 
 
 
 
 
 
 
 
 
 
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
WIL WHEATON dot NET
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43 Folders
 
 
 
 
 
 
-
 
 
 
 
 
 
 
MichelleMalkin.com
 
 
 
 
 
 
 
Power LinePower Line
 
-
 
 
Guy Kawasaki
 
 
-
 
 
 
-
 
 
 
 
-
 
 
 

In [115]:
from PIL import Image, ImageDraw, ImageFont

In [105]:
def getheight(clust):
    if clust.left == None and clust.right == None:
        return 1
    else:
        return getheight(clust.left) + getheight(clust.right)

In [107]:
def getdepth(clust):
    # The distance of an endpoint is 0.0
    if clust.left==None and clust.right==None: return 0
    
    # The distance of a branch is the greater of its two sides
    # plus its own distance
    return max(getdepth(clust.left),getdepth(clust.right))+clust.distance

In [136]:
def drawdendrogram(clust,labels,jpeg='outputs/clusters.jpg'):
    # height and width
    h=getheight(clust)*20
    w=1200
    depth=getdepth(clust)
    
    # width is fixed, so scale distances accordingly
    scaling=float(w-150)/depth
    
    # Create a new image with a white background
    img=Image.new('RGB',(w,h),(255,255,255))
    draw=ImageDraw.Draw(img)
    text = u'\u2013'
    font = ImageFont.truetype('/Library/Fonts/Arial.ttf', 15)
    draw.text((0,0),text,"white",font=font)
    
    draw.line((0,h/2,10,h/2),fill=(255,0,0))
    
    # Draw the first node
    drawnode(draw,clust,10,(h/2),scaling,labels)
    img.save(jpeg,'JPEG')

In [137]:
def drawnode(draw,clust,x,y,scaling,labels):
    if clust.id<0:
        h1=getheight(clust.left)*20
        h2=getheight(clust.right)*20
        top=y-(h1+h2)/2
        bottom=y+(h1+h2)/2
        
        # Line length
        ll=clust.distance*scaling
        
        # Vertical line from this cluster to children
        draw.line((x,top+h1/2,x,bottom-h2/2),fill=(255,0,0))
        
        # Horizontal line to left item
        draw.line((x,top+h1/2,x+ll,top+h1/2),fill=(255,0,0))
        
        # Horizontal line to right item
        draw.line((x,bottom-h2/2,x+ll,bottom-h2/2),fill=(255,0,0))
        
        # Call the function to draw the left and right nodes
        drawnode(draw,clust.left,x+ll,top+h1/2,scaling,labels)
        drawnode(draw,clust.right,x+ll,bottom-h2/2,scaling,labels)
    else:
         # If this is an endpoint, draw the item label
        draw.text((x+5,y-7),labels[clust.id],(0,0,0))

In [138]:
drawdendrogram(clust,blognames,jpeg='outputs/blogclust.jpg')

UnicodeEncodeError: 'latin-1' codec can't encode character '\u2013' in position 9: ordinal not in range(256)