In [14]:
from summarizer import Summarizer
import os
from nltk import sent_tokenize, word_tokenize
import re 
import numpy as np
import rouge
import time

In [15]:
def textWordCount(Text):
    number_of_words = word_tokenize(Text)
    count=(len(number_of_words))
    return count

In [16]:
def textSentenceCount(Text):
    number_of_sentences = sent_tokenize(Text)
    count=(len(number_of_sentences))
    return count

In [17]:
def findRatio(count,ratio):
    if(count*ratio<950):
        Ratio=np.round(ratio,3)
        return Ratio
    else:
        return findRatio(count,ratio-0.001)

In [18]:
def findRatioLong(count,ratio,limit):
    if(count*ratio<limit):
        Ratio=np.round(ratio,3)
        return Ratio
    else:
        return findRatioLong(count,ratio-0.001,limit)

In [19]:
def shortSentenceClean(Corpus,Limit):
    sentences = sent_tokenize(Corpus)
    for sentence in sentences:
        if (textWordCount(sentence)<Limit):
            sentences.remove(sentence)
#            print("DELETED SHORT SENTENCE, SIZE:",textWordCount(sentence))
#            print(sentence)
            
    newCorpus = ' '.join(sentences)
    return newCorpus

In [20]:
def longSentenceClean(Corpus,Limit):
    sentences = sent_tokenize(Corpus)
    for sentence in sentences:
        if (textWordCount(sentence)>Limit):
            sentences.remove(sentence)
#            print("****DELETED LONG SENTENCE, SIZE:",textWordCount(sentence))
#            print(sentence)
    
    newCorpus = ' '.join(sentences)
    return newCorpus

In [21]:
def cleanTitle(Corpus):
    Corpus = Corpus.replace("\n"," ")
    sentences = sent_tokenize(Corpus)
    for sentence in sentences:
        count=0
        for i in sentence:
            if(i.isupper()):
                count=count+1
        if(count*3>len(sentence)):
            sentences.remove(sentence)
            #print(sentence)
            #print("**** DELETED TITLE, SIZE:",textWordCount(sentence))
            #print(sentence)
    
    newCorpus = ' '.join(sentences)
    return newCorpus

In [22]:
def cleanManyCharacterandNumber(Corpus,rate):
    Corpus = Corpus.replace("\n"," ")
    sentences = sent_tokenize(Corpus)
    for sentence in sentences:        
        digit=letter=other=0
        for c in sentence:
            if c.isdigit():
                digit=digit+1
            elif c.isalpha():
                letter=letter+1
            else:
                other=other+1
        
        if(len(sentence)<(digit*rate)):
            sentences.remove(sentence)
#            print("****DELETED NUMBER, SIZE:",textWordCount(sentence))
#            print(sentence)
            continue
        
        if(len(sentence)<(other-sentence.count(' '))*10):
            sentences.remove(sentence)
#            print("****DELETED CHARACTER, SIZE:",textWordCount(sentence))
#            print(sentence)
        
    newCorpus = ' '.join(sentences)
    return newCorpus

In [23]:
def cleanParenthesis(Corpus):
    Corpus = Corpus.replace("\n"," ")
    cleaned=[]
    sentences = sent_tokenize(Corpus)
    for sentence in sentences:
        try:
            temp =re.sub("[\(\[].*?[\)\]]", "", sentence) #remove () and []
            cleaned.append(temp)
            #print(sentence)
            #print("---------")
            #print(temp)
            #print("---------------------------------------------")
            continue
        except:
            pass
    
    newCorpus = ' '.join(cleaned)
    #print("---------------------------------------------")
    return newCorpus

In [24]:
def preProcessingText(Corpus):
    
    cleanedText = Corpus.replace("\n"," ")    
    cleanedText=cleanParenthesis(cleanedText)
    cleanedText=cleanManyCharacterandNumber(cleanedText,10)    
    cleanedText=shortSentenceClean(cleanedText,7)
    cleanedText=longSentenceClean(cleanedText,80)
    
    return cleanedText
    

In [25]:
def preProcessingTextLong(Corpus):
    
    cleanedText = Corpus.replace("\n"," ")    
    cleanedText=cleanManyCharacterandNumber(cleanedText,12)    
    cleanedText=shortSentenceClean(cleanedText,6)
    cleanedText=longSentenceClean(cleanedText,40)
    cleanedText=cleanTitle(cleanedText)
    
    return cleanedText

In [None]:
startTimeforOverall = time.time()
inputs = os.listdir('annual_reports/')
cleanedReports=[]
DocumentSentenceCounts=[]
DocumentWordCounts=[]
AfterPreprocessingDocumentSentenceCounts=[]
AfterPreprocessingDocumentWordCounts=[]
SummarySentenceCounts=[]
SummaryWordCounts=[]

for x in range(len(inputs)):
    startTimeforDocument = time.time()
    #read files
    print('{0}. Document'.format(len(cleanedReports)+1))
    print('File Name:{0}'.format(inputs[x]))
    sourceFilePath='annual_reports/' + inputs[x]
    file = open(sourceFilePath, encoding="utf8")
    temp=file.read()
    file.close()
    
    OrginalTextSentenceCount=textSentenceCount(temp)
    OrginalTextWordCount=textWordCount(temp)
    print('Orginal Text Sentence Count:{0}, Orginal Text Word Count:{1}'.format(OrginalTextSentenceCount,OrginalTextWordCount))
    DocumentSentenceCounts.append(OrginalTextSentenceCount)
    DocumentWordCounts.append(OrginalTextWordCount)
    cleanedReport=preProcessingText(temp)
    
    
    PreprocessedTextLenght=len(cleanedReport)
    print('Text lenght before BERT:{0}'.format(PreprocessedTextLenght))
    
    if(PreprocessedTextLenght>1000000):
        cleanedReport=preProcessingTextLong(cleanedReport)
        PreprocessedTextLenght=len(cleanedReport)
        print('Second Text lenght before BERT:{0}'.format(PreprocessedTextLenght))
    
    cleanedReports.append(cleanedReport)
    PreprocessedTextSentenceCount=textSentenceCount(cleanedReport)
    PreprocessedTextWordCount=textWordCount(cleanedReport)
    #print(cleanedReport)
    print('Preprocessed Text Sentence Count:{0}, Preprocessed Text Word Count:{1}'.format(PreprocessedTextSentenceCount,PreprocessedTextWordCount))
    AfterPreprocessingDocumentSentenceCounts.append(PreprocessedTextSentenceCount)
    AfterPreprocessingDocumentWordCounts.append(PreprocessedTextWordCount)
    
    #BERT

    Ratio=findRatio(PreprocessedTextWordCount,1)
    print('Ratio:',Ratio)
    if (Ratio<=0.0):
        Ratio=0.001
        print('Second Ratio:',Ratio)
    model = Summarizer()
    result = model(cleanedReport,ratio=Ratio)
    summary = ''.join(result)
    #write
    destinationFilePath='summaries/' + inputs[x]
    file2 = open(destinationFilePath,"w",encoding='utf-8') 
    file2.write(summary)
    file2.close()
    SummaryTextSentenceCount=textSentenceCount(summary)
    SummaryTextWordCount=textWordCount(summary)
    print('Summary Sentence Count:{0}, Summary Word Count:{1}'.format(SummaryTextSentenceCount,SummaryTextWordCount))
    SummarySentenceCounts.append(SummaryTextSentenceCount)
    SummaryWordCounts.append(SummaryTextWordCount)
    
    Limit=930
    while True:        
        if(SummaryTextWordCount>999):
            if(SummaryTextWordCount>900 and SummaryTextWordCount<1100):
                Ratio=Ratio-0.001
            else:
                Ratio=Ratio-0.003
            Ratio=findRatioLong(PreprocessedTextWordCount,Ratio,Limit)
            print('Summary Word Limit:',Limit)
            print('New Ratio:',Ratio)
            if (Ratio<=0.0):
                Ratio=0.001
                print('New Ratio:',Ratio)
            model = Summarizer()
            result = model(cleanedReport,ratio=Ratio)
            summary = ''.join(result)
            #write
            destinationFilePath='summaries/' + inputs[x]
            file2 = open(destinationFilePath,"w",encoding='utf-8') 
            file2.write(summary)
            file2.close()
            SummaryTextSentenceCount=textSentenceCount(summary)
            SummaryTextWordCount=textWordCount(summary)
            print('New Summary Sentence Count:{0}, Summary Word Count:{1}'.format(SummaryTextSentenceCount,SummaryTextWordCount))
            SummarySentenceCounts.pop()
            SummaryWordCounts.pop()
            SummarySentenceCounts.append(SummaryTextSentenceCount)
            SummaryWordCounts.append(SummaryTextWordCount)
            Limit=Limit-20
        else:
            break
    
    elapsedTimeforDocument = time.time() - startTimeforDocument
    elapsedTimeforAll = time.time() - startTimeforOverall
    print('Document processing time: '+time.strftime("%M:%S", time.gmtime(elapsedTimeforDocument)))
    print('Total processing time: '+time.strftime("%d:%H:%M:%S", time.gmtime(elapsedTimeforAll)))
    print("######################################################################################")


1. Document
File Name:10023.txt
Orginal Text Sentence Count:937, Orginal Text Word Count:35691
Text lenght before BERT:139845
Preprocessed Text Sentence Count:813, Preprocessed Text Word Count:23703
Ratio: 0.04
Summary Sentence Count:32, Summary Word Count:1237
Summary Word Limit: 930
New Ratio: 0.037
New Summary Sentence Count:30, Summary Word Count:1028
Summary Word Limit: 910
New Ratio: 0.036
New Summary Sentence Count:28, Summary Word Count:978
Document processing time: 08:15
Total processing time: 01:00:08:15
######################################################################################
2. Document
File Name:10024.txt
Orginal Text Sentence Count:947, Orginal Text Word Count:37399
Text lenght before BERT:140583
Preprocessed Text Sentence Count:821, Preprocessed Text Word Count:23970
Ratio: 0.039
Summary Sentence Count:33, Summary Word Count:1114
Summary Word Limit: 930
New Ratio: 0.036
New Summary Sentence Count:30, Summary Word Count:940
Document processing time: 05:35
Tot

16. Document
File Name:10073.txt
Orginal Text Sentence Count:603, Orginal Text Word Count:23967
Text lenght before BERT:82440
Preprocessed Text Sentence Count:520, Preprocessed Text Word Count:14179
Ratio: 0.067
Summary Sentence Count:35, Summary Word Count:914
Document processing time: 01:44
Total processing time: 01:02:54:18
######################################################################################
17. Document
File Name:1008.txt
Orginal Text Sentence Count:1679, Orginal Text Word Count:62261
Text lenght before BERT:254753
Preprocessed Text Sentence Count:1455, Preprocessed Text Word Count:43850
Ratio: 0.021
Summary Sentence Count:30, Summary Word Count:1092
Summary Word Limit: 930
New Ratio: 0.02
New Summary Sentence Count:28, Summary Word Count:986
Document processing time: 09:27
Total processing time: 01:03:03:46
######################################################################################
18. Document
File Name:10120.txt
Orginal Text Sentence Count:493, Orgin

New Summary Sentence Count:31, Summary Word Count:1023
Summary Word Limit: 890
New Ratio: 0.027
New Summary Sentence Count:30, Summary Word Count:1009
Summary Word Limit: 870
New Ratio: 0.026
New Summary Sentence Count:27, Summary Word Count:854
Document processing time: 17:19
Total processing time: 01:04:28:58
######################################################################################
32. Document
File Name:10225.txt
Orginal Text Sentence Count:503, Orginal Text Word Count:18504
Text lenght before BERT:79248
Preprocessed Text Sentence Count:448, Preprocessed Text Word Count:13475
Ratio: 0.07
Summary Sentence Count:32, Summary Word Count:975
Document processing time: 01:33
Total processing time: 01:04:30:31
######################################################################################
33. Document
File Name:10226.txt
Orginal Text Sentence Count:575, Orginal Text Word Count:20423
Text lenght before BERT:84655
Preprocessed Text Sentence Count:497, Preprocessed Text Wor

New Summary Sentence Count:31, Summary Word Count:1067
Summary Word Limit: 890
New Ratio: 0.024
New Summary Sentence Count:29, Summary Word Count:986
Document processing time: 17:29
Total processing time: 01:07:28:29
######################################################################################
46. Document
File Name:10385.txt
Orginal Text Sentence Count:627, Orginal Text Word Count:23167
Text lenght before BERT:87909
Preprocessed Text Sentence Count:557, Preprocessed Text Word Count:15112
Ratio: 0.062
Summary Sentence Count:36, Summary Word Count:989
Document processing time: 01:59
Total processing time: 01:07:30:29
######################################################################################
47. Document
File Name:10386.txt
Orginal Text Sentence Count:782, Orginal Text Word Count:27369
Text lenght before BERT:103855
Preprocessed Text Sentence Count:714, Preprocessed Text Word Count:17887
Ratio: 0.053
Summary Sentence Count:38, Summary Word Count:985
Document processi

Preprocessed Text Sentence Count:1038, Preprocessed Text Word Count:29641
Ratio: 0.032
Summary Sentence Count:33, Summary Word Count:1066
Summary Word Limit: 930
New Ratio: 0.031
New Summary Sentence Count:32, Summary Word Count:1070
Summary Word Limit: 910
New Ratio: 0.03
New Summary Sentence Count:31, Summary Word Count:1086
Summary Word Limit: 890
New Ratio: 0.029
New Summary Sentence Count:30, Summary Word Count:1063
Summary Word Limit: 870
New Ratio: 0.028
New Summary Sentence Count:29, Summary Word Count:971
Document processing time: 19:22
Total processing time: 01:10:06:18
######################################################################################
62. Document
File Name:10501.txt
Orginal Text Sentence Count:1293, Orginal Text Word Count:44187
Text lenght before BERT:198311
Preprocessed Text Sentence Count:1179, Preprocessed Text Word Count:33475
Ratio: 0.028
Summary Sentence Count:33, Summary Word Count:992
Document processing time: 04:19
Total processing time: 01:10:

######################################################################################
76. Document
File Name:10547.txt
Orginal Text Sentence Count:1067, Orginal Text Word Count:36322
Text lenght before BERT:153633
Preprocessed Text Sentence Count:958, Preprocessed Text Word Count:25427
Ratio: 0.037
Summary Sentence Count:37, Summary Word Count:1025
Summary Word Limit: 930
New Ratio: 0.036
New Summary Sentence Count:35, Summary Word Count:933
Document processing time: 06:40
Total processing time: 01:12:06:42
######################################################################################
77. Document
File Name:10548.txt
Orginal Text Sentence Count:1048, Orginal Text Word Count:37339
Text lenght before BERT:153957
Preprocessed Text Sentence Count:957, Preprocessed Text Word Count:25551
Ratio: 0.037
Summary Sentence Count:34, Summary Word Count:1056
Summary Word Limit: 930
New Ratio: 0.036
New Summary Sentence Count:33, Summary Word Count:1056
Summary Word Limit: 910
New Ratio: 0.0

Summary Sentence Count:30, Summary Word Count:1044
Summary Word Limit: 930
New Ratio: 0.017
New Summary Sentence Count:29, Summary Word Count:1121
Summary Word Limit: 910
New Ratio: 0.014
New Summary Sentence Count:24, Summary Word Count:925
Document processing time: 18:44
Total processing time: 01:15:49:27
######################################################################################
90. Document
File Name:10579.txt
Orginal Text Sentence Count:1919, Orginal Text Word Count:69387
Text lenght before BERT:322406
Preprocessed Text Sentence Count:1722, Preprocessed Text Word Count:52882
Ratio: 0.017
Summary Sentence Count:29, Summary Word Count:1196
Summary Word Limit: 930
New Ratio: 0.014
New Summary Sentence Count:24, Summary Word Count:944
Document processing time: 12:52
Total processing time: 01:16:02:19
######################################################################################
91. Document
File Name:10585.txt
Orginal Text Sentence Count:1089, Orginal Text Word Coun

######################################################################################
105. Document
File Name:10696.txt
Orginal Text Sentence Count:513, Orginal Text Word Count:16868
Text lenght before BERT:69704
Preprocessed Text Sentence Count:442, Preprocessed Text Word Count:11773
Ratio: 0.08
Summary Sentence Count:34, Summary Word Count:888
Document processing time: 01:40
Total processing time: 01:17:48:18
######################################################################################
106. Document
File Name:10744.txt
Orginal Text Sentence Count:1188, Orginal Text Word Count:43970
Text lenght before BERT:172896
Preprocessed Text Sentence Count:1071, Preprocessed Text Word Count:29424
Ratio: 0.032
Summary Sentence Count:35, Summary Word Count:1028
Summary Word Limit: 930
New Ratio: 0.031
New Summary Sentence Count:34, Summary Word Count:993
Document processing time: 08:06
Total processing time: 01:17:56:24
####################################################################

Document processing time: 10:55
Total processing time: 01:20:04:50
######################################################################################
119. Document
File Name:10856.txt
Orginal Text Sentence Count:1606, Orginal Text Word Count:71236
Text lenght before BERT:244377
Preprocessed Text Sentence Count:1415, Preprocessed Text Word Count:41822
Ratio: 0.022
Summary Sentence Count:34, Summary Word Count:1125
Summary Word Limit: 930
New Ratio: 0.019
New Summary Sentence Count:28, Summary Word Count:1037
Summary Word Limit: 910
New Ratio: 0.018
New Summary Sentence Count:29, Summary Word Count:871
Document processing time: 16:08
Total processing time: 01:20:20:58
######################################################################################
120. Document
File Name:10862.txt
Orginal Text Sentence Count:2117, Orginal Text Word Count:82575
Text lenght before BERT:312170
Preprocessed Text Sentence Count:1891, Preprocessed Text Word Count:52442
Ratio: 0.018
Summary Sentence C

New Summary Sentence Count:33, Summary Word Count:941
Document processing time: 08:43
Total processing time: 01:23:01:03
######################################################################################
135. Document
File Name:10925.txt
Orginal Text Sentence Count:942, Orginal Text Word Count:29728
Text lenght before BERT:132699
Preprocessed Text Sentence Count:830, Preprocessed Text Word Count:22291
Ratio: 0.042
Summary Sentence Count:35, Summary Word Count:1097
Summary Word Limit: 930
New Ratio: 0.041
New Summary Sentence Count:34, Summary Word Count:996
Document processing time: 06:02
Total processing time: 01:23:07:06
######################################################################################
136. Document
File Name:10926.txt
Orginal Text Sentence Count:820, Orginal Text Word Count:28344
Text lenght before BERT:119861
Preprocessed Text Sentence Count:727, Preprocessed Text Word Count:20198
Ratio: 0.047
Summary Sentence Count:34, Summary Word Count:1162
Summary Word 

######################################################################################
150. Document
File Name:11118.txt
Orginal Text Sentence Count:1241, Orginal Text Word Count:53687
Text lenght before BERT:181254
Preprocessed Text Sentence Count:1080, Preprocessed Text Word Count:33133
Ratio: 0.028
Summary Sentence Count:33, Summary Word Count:991
Document processing time: 03:52
Total processing time: 02:00:50:03
######################################################################################
151. Document
File Name:11157.txt
Orginal Text Sentence Count:2011, Orginal Text Word Count:74391
Text lenght before BERT:317737
Preprocessed Text Sentence Count:1825, Preprocessed Text Word Count:53586
Ratio: 0.017
Summary Sentence Count:32, Summary Word Count:1141
Summary Word Limit: 930
New Ratio: 0.014
New Summary Sentence Count:27, Summary Word Count:939
Document processing time: 13:06
Total processing time: 02:01:03:10
################################################################

Document processing time: 18:56
Total processing time: 02:03:23:12
######################################################################################
166. Document
File Name:11357.txt
Orginal Text Sentence Count:497, Orginal Text Word Count:18488
Text lenght before BERT:68654
Preprocessed Text Sentence Count:432, Preprocessed Text Word Count:11855
Ratio: 0.08
Summary Sentence Count:35, Summary Word Count:939
Document processing time: 01:36
Total processing time: 02:03:24:48
######################################################################################
167. Document
File Name:11358.txt
Orginal Text Sentence Count:453, Orginal Text Word Count:17222
Text lenght before BERT:64322
Preprocessed Text Sentence Count:397, Preprocessed Text Word Count:11044
Ratio: 0.086
Summary Sentence Count:35, Summary Word Count:870
Document processing time: 01:30
Total processing time: 02:03:26:19
######################################################################################
168. Document

Summary Sentence Count:36, Summary Word Count:1228
Summary Word Limit: 930
New Ratio: 0.03
New Summary Sentence Count:32, Summary Word Count:966
Document processing time: 07:36
Total processing time: 02:05:53:34
######################################################################################
181. Document
File Name:11432.txt
Orginal Text Sentence Count:1205, Orginal Text Word Count:42858
Text lenght before BERT:175406
Preprocessed Text Sentence Count:1085, Preprocessed Text Word Count:29872
Ratio: 0.031
Summary Sentence Count:34, Summary Word Count:1080
Summary Word Limit: 930
New Ratio: 0.03
New Summary Sentence Count:32, Summary Word Count:1085
Summary Word Limit: 910
New Ratio: 0.029
New Summary Sentence Count:31, Summary Word Count:921
Document processing time: 11:37
Total processing time: 02:06:05:12
######################################################################################
182. Document
File Name:11433.txt
Orginal Text Sentence Count:1295, Orginal Text Word Coun

Summary Word Limit: 930
New Ratio: 0.036
New Summary Sentence Count:33, Summary Word Count:974
Document processing time: 07:20
Total processing time: 02:08:03:20
######################################################################################
194. Document
File Name:11522.txt
Orginal Text Sentence Count:1057, Orginal Text Word Count:37284
Text lenght before BERT:155386
Preprocessed Text Sentence Count:897, Preprocessed Text Word Count:25967
Ratio: 0.036
Summary Sentence Count:33, Summary Word Count:900
Document processing time: 03:39
Total processing time: 02:08:07:00
######################################################################################
195. Document
File Name:11523.txt
Orginal Text Sentence Count:1034, Orginal Text Word Count:36655
Text lenght before BERT:148617
Preprocessed Text Sentence Count:882, Preprocessed Text Word Count:24756
Ratio: 0.038
Summary Sentence Count:36, Summary Word Count:1072
Summary Word Limit: 930
New Ratio: 0.037
New Summary Sentence Coun

Preprocessed Text Sentence Count:965, Preprocessed Text Word Count:30709
Ratio: 0.03
Summary Sentence Count:35, Summary Word Count:1092
Summary Word Limit: 930
New Ratio: 0.029
New Summary Sentence Count:34, Summary Word Count:1011
Summary Word Limit: 910
New Ratio: 0.028
New Summary Sentence Count:33, Summary Word Count:1068
Summary Word Limit: 890
New Ratio: 0.027
New Summary Sentence Count:32, Summary Word Count:912
Document processing time: 19:59
Total processing time: 02:10:28:40
######################################################################################
210. Document
File Name:11645.txt
Orginal Text Sentence Count:1561, Orginal Text Word Count:51192
Text lenght before BERT:231946
Preprocessed Text Sentence Count:1404, Preprocessed Text Word Count:38280
Ratio: 0.024
Summary Sentence Count:34, Summary Word Count:1070
Summary Word Limit: 930
New Ratio: 0.023
New Summary Sentence Count:33, Summary Word Count:1094
Summary Word Limit: 910
New Ratio: 0.022
New Summary Sentenc

In [None]:
def Average(lst): 
    return sum(lst) / len(lst) 

In [None]:
print("#################################################################")
averageDocumentSentenceCounts = Average(DocumentSentenceCounts)
print("averageDocumentSentenceCounts:",averageDocumentSentenceCounts)

averageDocumentWordCounts = Average(DocumentWordCounts)
print("averageDocumentWordCounts:",averageDocumentWordCounts)

print("#################################################################")
averageAfterPreprocessingDocumentSentenceCounts = Average(AfterPreprocessingDocumentSentenceCounts)
print("averageAfterPreprocessingDocumentSentenceCounts:",averageAfterPreprocessingDocumentSentenceCounts)

averageAfterPreprocessingDocumentWordCounts = Average(AfterPreprocessingDocumentWordCounts)
print("averageAfterPreprocessingDocumentWordCounts:",averageAfterPreprocessingDocumentWordCounts)

print("#################################################################")
averageSummarySentenceCounts = Average(SummarySentenceCounts)
print("averageSummarySentenceCounts:",averageSummarySentenceCounts)

averageSummaryWordCounts = Average(SummaryWordCounts)
print("averageSummaryWordCounts:",averageSummaryWordCounts)

In [None]:
import os
entries1 = os.listdir('annual_reports')
entries2 = os.listdir('summaries')

def prepare_results(p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


for aggregator in ['Avg']:
    print('Evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                           max_n=4,
                           limit_length=True,
                           length_limit=1000,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)

    references=[]

    for x in range(len(entries2)):
        tempfilepath='summaries\\' + entries2[x]
        file = open(tempfilepath, encoding="utf8")
        references.append(file.read())
        
    
    hypothesis=[]

    for x in range(len(entries2)):
        tempfilepath='annual_reports\\' + entries2[x]
        file = open(tempfilepath, encoding="utf8")
        hypothesis.append(file.read())
    
    scores = evaluator.get_scores(hypothesis, references)


    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
            for hypothesis_id, results_per_ref in enumerate(results):
                nb_references = len(results_per_ref['p'])
                for reference_id in range(nb_references):
                    print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                    print('\t' + prepare_results(results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
            print()
        else:
            print(prepare_results(results['p'], results['r'], results['f']))
    print()

In [None]:
print("#################################################################")

In [None]:
goldreferences=[]

for x in range(len(entries2)):
    try:
        tempfilepathR='gold_summaries\\' + entries2[x].split(".")[0]+'_1.txt'
        fileR = open(tempfilepathR, encoding="utf8")
        goldreferences.append(fileR.read())


    except:
        print("Missing : ", tempfilepathR)


for aggregator in ['Avg']:
    print('Evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                           max_n=4,
                           limit_length=True,
                           length_limit=1000,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)
        
    scores = evaluator.get_scores(hypothesis, goldreferences)

    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
            for hypothesis_id, results_per_ref in enumerate(results):
                nb_references = len(results_per_ref['p'])
                for reference_id in range(nb_references):
                    print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                    print('\t' + prepare_results(results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
            print()
        else:
            print(prepare_results(results['p'], results['r'], results['f']))
    print()

In [None]:
tempListSentences=[]
tempListWords=[]

for i in range(len(goldreferences)):
    tempListWords.append(textWordCount(goldreferences[i]))
    tempListSentences.append(textSentenceCount(goldreferences[i]))
    

averagetempListSentences = Average(tempListSentences)
print("average gold Summaries_1 Sentence Count:",averagetempListSentences)

averagetempListWords = Average(tempListWords)
print("average gold Summaries_1 Words Count:",averagetempListWords)
print("####################################################################################")

In [None]:
goldreferences=[]

for x in range(len(entries2)):
    try:
        tempfilepathR='gold_summaries\\' + entries2[x].split(".")[0]+'_2.txt'
        fileR = open(tempfilepathR, encoding="utf8")
        goldreferences.append(fileR.read())


    except:
        print("Missing : ", tempfilepathR)


for aggregator in ['Avg']:
    print('Evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                           max_n=4,
                           limit_length=True,
                           length_limit=1000,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)
        
    scores = evaluator.get_scores(hypothesis, goldreferences)

    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
            for hypothesis_id, results_per_ref in enumerate(results):
                nb_references = len(results_per_ref['p'])
                for reference_id in range(nb_references):
                    print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                    print('\t' + prepare_results(results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
            print()
        else:
            print(prepare_results(results['p'], results['r'], results['f']))
    print()

In [None]:
tempListSentences=[]
tempListWords=[]

for i in range(len(goldreferences)):
    tempListWords.append(textWordCount(goldreferences[i]))
    tempListSentences.append(textSentenceCount(goldreferences[i]))
    

averagetempListSentences = Average(tempListSentences)
print("average gold Summaries_2 Sentence Count:",averagetempListSentences)

averagetempListWords = Average(tempListWords)
print("average gold Summaries_2 Words Count:",averagetempListWords)
print("####################################################################################")

In [None]:
goldreferences=[]

for x in range(len(entries2)):
    try:
        tempfilepathR='gold_summaries\\' + entries2[x].split(".")[0]+'_3.txt'
        fileR = open(tempfilepathR, encoding="utf8")
        goldreferences.append(fileR.read())


    except:
        print("Missing : ", tempfilepathR)


for aggregator in ['Avg']:
    print('Evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                           max_n=4,
                           limit_length=True,
                           length_limit=1000,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)
        
    scores = evaluator.get_scores(hypothesis, goldreferences)

    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
            for hypothesis_id, results_per_ref in enumerate(results):
                nb_references = len(results_per_ref['p'])
                for reference_id in range(nb_references):
                    print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                    print('\t' + prepare_results(results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
            print()
        else:
            print(prepare_results(results['p'], results['r'], results['f']))
    print()

In [None]:
tempListSentences=[]
tempListWords=[]

for i in range(len(goldreferences)):
    tempListWords.append(textWordCount(goldreferences[i]))
    tempListSentences.append(textSentenceCount(goldreferences[i]))
    

averagetempListSentences = Average(tempListSentences)
print("average gold Summaries_3 Sentence Count:",averagetempListSentences)

averagetempListWords = Average(tempListWords)
print("average gold Summaries_3 Words Count:",averagetempListWords)
print("####################################################################################")

In [None]:
import os
entries2 = os.listdir('summaries')
entries3 = os.listdir('gold_summaries')
hypothesisAll=[]
goldReferencesAll=[]
onlyName=[]

for item in range(len(entries2)):
    temp=entries2[item].split(".")[0]
    onlyName.append(temp)

count=0
for item in range(len(entries3)):
    if(entries3[item].split("_")[0] in onlyName):
        tempfilepathR='gold_summaries\\' + entries3[item]
        fileR = open(tempfilepathR, encoding="utf8")
        goldReferencesAll.append(fileR.read())
        fileR.close()

        tempfilepathH='annual_reports\\' + entries3[item].split("_")[0]+".txt"
        fileH = open(tempfilepathH, encoding="utf8")
        hypothesisAll.append(fileH.read())
        
for aggregator in ['Avg']:
    print('Evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                           max_n=4,
                           limit_length=True,
                           length_limit=1000,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)
        
    scores = evaluator.get_scores(hypothesisAll, goldReferencesAll)

    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
            for hypothesis_id, results_per_ref in enumerate(results):
                nb_references = len(results_per_ref['p'])
                for reference_id in range(nb_references):
                    print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                    print('\t' + prepare_results(results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id]))
            print()
        else:
            print(prepare_results(results['p'], results['r'], results['f']))
    print()

In [None]:
tempListSentences=[]
tempListWords=[]

for i in range(len(goldReferencesAll)):
    tempListWords.append(textWordCount(goldReferencesAll[i]))
    tempListSentences.append(textSentenceCount(goldReferencesAll[i]))
    

averagetempListSentences = Average(tempListSentences)
print("average gold Summaries All Sentence Count:",averagetempListSentences)

averagetempListWords = Average(tempListWords)
print("average gold Summaries All Words Count:",averagetempListWords)
print("####################################################################################")
    
