In [2]:
import re, pdb, sys, math
from collections import defaultdict

In [3]:
class Graph:
    """Holds the vertices of a sentence graph and the weighted edges between them."""

    def __init__(self):
        self.Vertices = []
        self.Edges = []

    def getRankedVertices(self):
        """Score each vertex by the summed weight of the edges leaving it.

        Returns:
            A list of ``(vertex, score)`` tuples ordered from the highest
            score to the lowest. Only vertices that occur as ``Vertex1`` of
            at least one edge are listed.
        """
        totals = defaultdict(float)
        for edge in self.Edges:
            totals[edge.Vertex1] = totals[edge.Vertex1] + edge.Weight
        ranking = list(totals.items())
        ranking.sort(key=lambda pair: pair[1], reverse=True)
        return ranking

In [4]:
class Vertex:
    """Graph node that carries one sentence."""

    def __init__(self):
        # Starts empty; the code that builds the graph assigns the sentence.
        self.Sentence = None

In [5]:
class Edge:
    """Weighted link going from ``Vertex1`` to ``Vertex2``."""

    def __init__(self):
        # Both endpoints are unset until the edge is wired into a graph.
        self.Vertex1 = self.Vertex2 = None
        self.Weight = 0

In [6]:
class WordType:
    """Integer codes for the categories a token can be tagged with."""

    # Content=0, Function=1, ContentPunctuation=2, FunctionPunctuation=3
    Content, Function, ContentPunctuation, FunctionPunctuation = range(4)

In [7]:
class Word:
    """A single token together with the category it was tagged with."""

    def __init__(self):
        # Both fields start as empty strings until the token is filled in.
        self.Text = self.Type = ''

In [8]:
class Sentence:
    """An ordered list of tagged words that together form one sentence."""

    def __init__(self):
        self.Words = []

    def getFullSentence(self):
        """Return the text of every word concatenated, with outer whitespace stripped."""
        return ''.join(w.Text for w in self.Words).strip()

    def getReducedSentence(self):
        """Return the sentence rebuilt from its content words only.

        Content words (``Type == WordType.Content``) are joined with single
        spaces, the first one is capitalised, and the text of the sentence's
        last word is appended unless it equals the last content word.

        Returns:
            The reduced sentence, or ``''`` when the sentence has no words or
            no content words.
        """
        if not self.Words:
            # Nothing to reduce; also avoids indexing into an empty list.
            return ''

        # Materialise a list: a bare filter() is an iterator on Python 3 and
        # supports neither len() nor indexing.
        contentTexts = [w.Text for w in self.Words if w.Type == WordType.Content]
        if not contentTexts:
            return ''

        sentenceEnd = self.Words[-1].Text
        lastContent = contentTexts[-1]

        # Capitalise on a local copy so the stored Word objects stay unchanged.
        first = contentTexts[0]
        contentTexts[0] = first[:1].upper() + first[1:]

        reduced = ' '.join(contentTexts)
        if sentenceEnd != lastContent:
            reduced += sentenceEnd
        return reduced



In [9]:
class Paragraph:
    """Groups the sentences that came from one line of input text."""

    def __init__(self):
        # Populated by the paragraph builder; empty on construction.
        self.Sentences = []

In [16]:
class Reduction:
    """Extractive text reducer.

    Splits text into paragraphs, sentences and tagged words, builds a graph
    whose edge weights measure content-word overlap between sentences, and
    keeps the highest-ranked share of sentences in their original order.
    """

    functionPunctuation = ' ,-'
    contentPunctuation = '.?!\n'
    punctuationCharacters = functionPunctuation+contentPunctuation
    sentenceEndCharacters = '.?!'

    def isContentPunctuation(self, text):
        """Return True when ``text`` is exactly one of the content punctuation characters."""
        lowered = text.lower()
        return any(lowered == c.lower() for c in self.contentPunctuation)

    def isFunctionPunctuation(self, text):
        """Return True when ``text`` is exactly one of the function punctuation characters."""
        lowered = text.lower()
        return any(lowered == c.lower() for c in self.functionPunctuation)

    def isFunction(self, text, stopWords):
        """Return True when ``text`` matches a stop word, ignoring case."""
        lowered = text.lower()
        return any(lowered == w.lower() for w in stopWords)

    def tag(self, sampleWords, stopWords):
        """Wrap each token in a ``Word`` and assign it a ``WordType``.

        Args:
            sampleWords: iterable of token strings.
            stopWords: iterable of stop-word strings (matched case-insensitively).

        Returns:
            A list of ``Word`` objects in token order.
        """
        # Lower-case the stop list once so each token costs a single set lookup
        # instead of a scan of the whole list.
        stopSet = {w.lower() for w in stopWords}
        taggedWords = []
        for token in sampleWords:
            tw = Word()
            tw.Text = token
            if self.isContentPunctuation(token):
                tw.Type = WordType.ContentPunctuation
            elif self.isFunctionPunctuation(token):
                tw.Type = WordType.FunctionPunctuation
            elif token.lower() in stopSet:
                tw.Type = WordType.Function
            else:
                tw.Type = WordType.Content
            taggedWords.append(tw)
        return taggedWords

    def tokenize(self, text):
        """Split ``text`` on punctuation characters, keeping them as tokens.

        Returns:
            A list of non-empty tokens; each punctuation character is its own token.
        """
        # re.escape keeps '-' a literal inside the character class rather than
        # relying on ',-.' happening to be a range that contains it.
        pattern = '([{0}])'.format(re.escape(self.punctuationCharacters))
        return [tok for tok in re.split(pattern, text) if tok != '']

    def getWords(self, sentenceText, stopWords):
        """Tokenize ``sentenceText`` and return the tagged ``Word`` list."""
        return self.tag(self.tokenize(sentenceText), stopWords)

    def getSentences(self, line, stopWords):
        """Split ``line`` at sentence-end characters into ``Sentence`` objects.

        Each non-blank piece of text is kept together with the terminator that
        directly follows it; pieces that are blank are skipped.
        """
        pattern = '([{0}])'.format(re.escape(self.sentenceEndCharacters))
        # With a capturing group re.split alternates text, terminator, text, ...
        # so the terminator of parts[i] (i even) is parts[i + 1] when present.
        parts = re.split(pattern, line)
        sentences = []
        for idx in range(0, len(parts), 2):
            body = parts[idx]
            if body.strip() == '':
                continue
            terminator = parts[idx + 1] if idx + 1 < len(parts) else ''
            sentence = Sentence()
            sentence.Words = self.getWords(body + terminator, stopWords)
            sentences.append(sentence)
        return sentences

    def getParagraphs(self, lines, stopWords):
        """Build one ``Paragraph`` per entry of ``lines``."""
        paragraphs = []
        for line in lines:
            paragraph = Paragraph()
            paragraph.Sentences = self.getSentences(line, stopWords)
            paragraphs.append(paragraph)
        return paragraphs

    def findWeight(self, sentence1, sentence2):
        """Return the content-word overlap between two sentences.

        The overlap counts every case-insensitive matching pair of content
        words and is divided by ``log(len1) + log(len2)``, where the lengths
        are the content-word counts. Returns 0 when either sentence has fewer
        than four content words.
        """
        texts1 = [w.Text.lower() for w in sentence1.Words if w.Type == WordType.Content]
        texts2 = [w.Text.lower() for w in sentence2.Words if w.Type == WordType.Content]
        if len(texts1) < 4 or len(texts2) < 4:
            return 0

        # Count occurrences in sentence2 once; summing those counts over the
        # words of sentence1 equals the number of matching pairs.
        counts2 = {}
        for t in texts2:
            counts2[t] = counts2.get(t, 0) + 1
        weight = sum(counts2.get(t, 0) for t in texts1)

        # Both lengths are at least 4 here, so the divisor is strictly positive.
        norm = math.log(len(texts1)) + math.log(len(texts2))
        return weight / float(norm)

    def buildGraph(self, sentences):
        """Create a graph with one vertex per sentence and an edge for every ordered pair."""
        g = Graph()
        for s in sentences:
            v = Vertex()
            v.Sentence = s
            g.Vertices.append(v)
        for i in g.Vertices:
            for j in g.Vertices:
                if i is not j:
                    e = Edge()
                    e.Vertex1 = i
                    e.Vertex2 = j
                    e.Weight = self.findWeight(i.Sentence, j.Sentence)
                    g.Edges.append(e)
        return g

    def sentenceRank(self, paragraphs):
        """Return the ranked ``(vertex, score)`` list for all sentences of ``paragraphs``."""
        sentences = [s for p in paragraphs for s in p.Sentences]
        return self.buildGraph(sentences).getRankedVertices()

    def reduce(self, text, reductionRatio, stopWordsFile='stopWords.txt'):
        """Return the top-ranked sentences of ``text`` in their original order.

        Args:
            text: input text; each non-blank line is treated as a paragraph.
            reductionRatio: share of the ranked sentences to keep; the count
                is ``trunc(ranked * ratio)``, capped at the number available.
            stopWordsFile: path of a file with one stop word per line.

        Returns:
            A list of sentence strings.
        """
        with open(stopWordsFile) as handle:
            stopWords = handle.read().splitlines()

        # NOTE: the print calls below are the original debug trace, kept as-is.
        lines = text.splitlines()
        print("lines", lines)
        contentLines = [line for line in lines if line.strip() != '']
        print("contentLines", contentLines)

        paragraphs = self.getParagraphs(contentLines, stopWords)
        print("paragraphs", paragraphs)

        rankedSentences = self.sentenceRank(paragraphs)

        # Remember where each sentence sits in the document so the selection
        # can be put back into reading order.
        orderedSentences = [s for p in paragraphs for s in p.Sentences]
        positions = {id(s): pos for pos, s in enumerate(orderedSentences)}

        # Cap the count so a ratio above 1 cannot index past the ranking.
        keep = min(math.trunc(len(rankedSentences) * reductionRatio),
                   len(rankedSentences))
        chosen = [rankedSentences[i][0].Sentence for i in range(keep)]
        chosen.sort(key=lambda s: positions[id(s)])

        return [s.getFullSentence() for s in chosen]

In [17]:
import string
# Driver cell: pull the title and abstract words out of one award file and
# print the reduced abstract.
reduction = Reduction()
filename = 'Part1/awards_1990/awd_1990_00/a9000006.txt'

# Read everything up front so the file handle is closed straight away.
with open(filename) as f:
    words = f.read().split()

addTitle = False
addTexts = False
title = []
text = []
for word in words:
    # str.split() never yields whitespace tokens, so there is no "\n" marker
    # to look for; section changes are driven by the keywords alone.
    if addTexts:
        # Once inside the abstract every word belongs to it, including a
        # literal "Title" or "Abstract".
        text.append(word)
        continue

    if word == "Abstract":
        # The abstract starts here, so stop collecting title words.
        addTitle = False
        addTexts = True
        continue

    if word == "Title":
        addTitle = True
        continue

    if addTitle:
        # NOTE(review): this still collects every word between "Title" and
        # "Abstract"; confirm against the file layout whether other fields
        # sit in between and need excluding.
        title.append(word)

title = ' '.join(title)
text = ' '.join(text)

reduction_ratio = 0.1
reduced_text = reduction.reduce(text, reduction_ratio)

print(reduced_text)


lines [': Commercial exploitation over the past two hundred years drove the great Mysticete whales to near extinction. Variation in the sizes of populations prior to exploitation, minimal population size during exploitation and current population sizes permit analyses of the effects of differing levels of exploitation on species with different biogeographical distributions and life-history characteristics. Dr. Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale. The effect of demographic history will be determined by comparing the genetic structure of the three species. Additional studies will be carried out on the Humpback Whale. The humpback has a world-wide distribution, but the Atlantic and Pacific populations of the northern hemisphere appear to be discrete populations, as is the population of the southern hemispheric oceans. Each of these oceanic po

In [12]:
# Show the abstract text that was joined from the words collected above.
print(text)

: Commercial exploitation over the past two hundred years drove the great Mysticete whales to near extinction. Variation in the sizes of populations prior to exploitation, minimal population size during exploitation and current population sizes permit analyses of the effects of differing levels of exploitation on species with different biogeographical distributions and life-history characteristics. Dr. Stephen Palumbi at the University of Hawaii will study the genetic population structure of three whale species in this context, the Humpback Whale, the Gray Whale and the Bowhead Whale. The effect of demographic history will be determined by comparing the genetic structure of the three species. Additional studies will be carried out on the Humpback Whale. The humpback has a world-wide distribution, but the Atlantic and Pacific populations of the northern hemisphere appear to be discrete populations, as is the population of the southern hemispheric oceans. Each of these oceanic population