<b><font size ="6">Notebook 7 - solutions</font></b>

The following notebook contains the solutions for the seventh notebook in the tutorial. It is recommended that you check these solutions after you finish the whole notebook.

In [None]:
# Lexical analysis
# Non-OOP (object-oriented programming) definition


##############################
# Version of word counting function that does NOT
# use a stoplist to filter words that are counted

def countWords(filename):
    """Count how often each whitespace-separated word occurs in a file.

    Words are used exactly as produced by ``str.split()``: no case folding
    and no punctuation stripping is applied.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict mapping each word to the number of times it occurs.
    """
    counts = {}
    # The context manager closes the file even if reading fails part-way.
    with open(filename, 'r') as inf:
        for line in inf:
            for word in line.split():
                counts[word] = counts.get(word, 0) + 1
    return counts


# Version of word counting function that DOES use
# a stoplist, to filter words that are counted

def countWordsStoplist(filename, stops):
    """Count words in a file, ignoring any word that is in ``stops``.

    Words are used exactly as produced by ``str.split()``, so a stop word
    only filters tokens that match it character for character.

    Args:
        filename: Path of the text file to read.
        stops: Collection of words (e.g. a list or set of strings) that
            must not be counted.

    Returns:
        A dict mapping each counted word to its number of occurrences.
    """
    # Membership is tested once per token, so use a set for O(1) lookups
    # instead of scanning a list every time.
    if isinstance(stops, (set, frozenset)):
        stop_set = stops
    else:
        stop_set = set(stops)

    counts = {}
    # The context manager closes the file even if reading fails part-way.
    with open(filename, 'r') as inf:
        for line in inf:
            for word in line.split():
                if word not in stop_set:
                    counts[word] = counts.get(word, 0) + 1
    return counts


##############################
# Read in stop words, and return as a list

def readStopWords(stopfile):
    """Read a stop-word file that holds one word per line.

    Args:
        stopfile: Path of the stop-word file.

    Returns:
        A list with one entry per line of the file, in file order, with
        leading and trailing whitespace removed. A blank line yields an
        empty string.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(stopfile, 'r') as inf:
        return [line.strip() for line in inf]

##############################
# Takes a dictionary of counts (e.g. for words)
# and prints out the 20 most frequent keys

def printTop20(counts):
    """Print the 20 keys of ``counts`` with the highest values.

    Each line has the form ``key = value``; keys are listed from the highest
    count to the lowest.

    Args:
        counts: Dict mapping keys (e.g. words) to their counts.

    Raises:
        IndexError: If ``counts`` has fewer than 20 keys, because exactly
            20 positions of the sorted list are indexed.
    """
    # In Python 3 ``dict.keys()`` is a view without a ``sort`` method, so
    # copy the keys into a real list before sorting.
    words = list(counts.keys())
    words.sort(reverse=True, key=lambda v: counts[v])
    for i in range(20):
        word = words[i]
        print(word, '=', counts[word])

# The definition above would give an error if there
# were fewer than 20 words in the list of words.
# The alternative definition given next avoids this
# problem by using slicing to truncate the word list.

def printTop20Slicing(counts):
    """Print up to 20 keys of ``counts`` with the highest values.

    One ``key = value`` line is printed per key, from the highest count to
    the lowest, followed by a separator line of 22 dashes. A dictionary
    with fewer than 20 keys simply produces fewer lines.

    Args:
        counts: Dict mapping keys (e.g. words) to their counts.
    """
    ranked = sorted(counts.keys(), key=counts.__getitem__, reverse=True)
    # Slicing past the end of a list is safe, so short inputs need no check.
    for word in ranked[:20]:
        print(word, '=', counts[word])
    print('-' * 22)

##############################
# Compute similarity score between two dictionaries
# of counts

def similarity(counts1, counts2):
    """Score how similar two count dictionaries are by their shared keys.

    The score is the number of keys present in both dictionaries divided by
    the number of distinct keys across the two. The count values themselves
    are not used.

    Args:
        counts1: First dict of counts.
        counts2: Second dict of counts.

    Returns:
        A float between 0.0 and 1.0; 0.0 when both dictionaries are empty.
    """
    size1, size2 = len(counts1), len(counts2)
    # Guard clause: two empty dictionaries would mean dividing by zero.
    if size1 + size2 <= 0:
        return 0.0
    overlap = sum(1 for w in counts1 if w in counts2)
    return float(overlap) / (size1 + size2 - overlap)

# Test for the non-OOP definition
# Inputs: the two documents to compare and the stop-word file used to
# filter both of them.
file1 = 'files/george01.txt'
file2 = 'files/george02.txt'
stoplist = 'files/stopwords.txt'

# Read the stop words once; the same list is applied to both documents.
stops = readStopWords(stopfile=stoplist)

counts1 = countWordsStoplist(filename=file1, stops=stops)
counts2 = countWordsStoplist(filename=file2, stops=stops)

# Show the most frequent remaining words of each document.
printTop20Slicing(counts=counts1)
printTop20Slicing(counts=counts2)

# Vocabulary overlap between the two documents.
simscore = similarity(counts1=counts1, counts2=counts2)

print('Similarity:', simscore)

In [None]:
# Lexical analysis
# OOP (object-oriented programming) definition

class DocWordCounts:
    """Word counts of a single text file.

    Attributes are documented here rather than inline:
        filename: Path of the file the counts were read from.
        counts: Dict mapping each whitespace-separated word to the number
            of times it occurs.
    """

    def __init__(self, filename):
        """Store ``filename`` and immediately count the words in that file."""
        self.filename = filename
        self.counts = {}
        self.count()

    def count(self):
        """Read ``self.filename`` and add its word counts to ``self.counts``.

        Words are used exactly as produced by ``str.split()``. Counts are
        added to whatever ``self.counts`` already holds, so calling this
        method again accumulates on top of the existing counts.
        """
        # The context manager closes the file even if reading fails part-way.
        with open(self.filename, 'r') as infile:
            for line in infile:
                for word in line.split():
                    self.counts[word] = self.counts.get(word, 0) + 1

    def printTop20(self):
        """Print up to 20 words with the highest counts, then a separator.

        Each line has the form ``word : count``; the separator is a line of
        22 dashes.
        """
        words = list(self.counts.keys())
        words.sort(reverse=True, key=lambda v: self.counts[v])
        # Slicing is safe even when there are fewer than 20 words.
        for w in words[:20]:
            print(w, ':', self.counts[w])
        print('-' * 22)

    def similarity(self, otherdoc):
        """Score the vocabulary overlap between this document and another.

        The score is the number of words present in both documents divided
        by the number of distinct words across the two. The count values
        themselves are not used.

        Args:
            otherdoc: Object with a ``counts`` dict, e.g. another
                ``DocWordCounts``.

        Returns:
            A float between 0.0 and 1.0; 0.0 when both documents are empty.
        """
        size1 = len(self.counts)
        size2 = len(otherdoc.counts)
        if size1 + size2 == 0:  # avoid a divide-by-zero error
            return 0.0
        overlap = sum(1 for w in self.counts if w in otherdoc.counts)
        return float(overlap) / (size1 + size2 - overlap)

##################################################


# Test for the OOP definition
# The two documents to compare.
file1 = 'files/george01.txt'
file2 = 'files/george02.txt'

# Constructing a DocWordCounts reads and counts the file straight away.
doc1 = DocWordCounts(filename=file1)
doc2 = DocWordCounts(filename=file2)

# Show the most frequent words of each document.
doc1.printTop20()
doc2.printTop20()

# Vocabulary overlap between the two documents.
simscore = doc1.similarity(otherdoc=doc2)

print('Similarity:', simscore)

In [None]:
# Testing tasks 1-3

# Switches for the alternatives explored in tasks 1-3. Flip a switch instead
# of reordering assignments; the defaults reproduce the run on
# 'files/mobydick.txt' with an empty stoplist.
USE_FULL_TEXT = True   # False: read 'files/mobypara.txt' instead
USE_STOPLIST = False   # True: filter out the words in 'files/stopwords.txt'

infile = 'files/mobydick.txt' if USE_FULL_TEXT else 'files/mobypara.txt'
# The stop-word file is only read when it is actually going to be used.
stops = readStopWords('files/stopwords.txt') if USE_STOPLIST else []

moby_counts = countWordsStoplist(infile, stops)
printTop20Slicing(moby_counts)

print("\n")
################################
# Testing task 4
# Note: the same `stops` chosen above is applied to these files as well.

infiles = ['files/george01.txt',
           'files/george02.txt',
           'files/george03.txt',
           'files/george04.txt']

# One dictionary of counts per file, in the same order as `infiles`.
# The comprehension variable does not overwrite `infile` from tasks 1-3.
allcounts = [countWordsStoplist(path, stops) for path in infiles]

# Compare every unordered pair of documents exactly once (i < j).
for i in range(len(allcounts)):
    for j in range(i + 1, len(allcounts)):
        sim = similarity(allcounts[i], allcounts[j])
        print('compare: (%s <> %s) = %.3f' % (infiles[i], infiles[j], sim))