In [2]:
# import package
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
stopwords = list (STOP_WORDS)
punctuation += '\n'
from heapq import nlargest

In [3]:
# Defining a Class for extractive Summarization 

class Summarizer():
    
    ## constructor for data loading
    ## give either path or corpus itself
    def __init__(self, path = None, corpus = None):
        self.mCorpus = ''
        self.mWordFrequencies = {}
        self.mSentScore = {}
        self.mNumSentences = 0
        
        if path != None:
            file = open(path, "r")
            self.mCorpus = self.mCorpus + file.read()
        elif corpus != None:
            self.mCorpus = corpus
            
        self.mNlp = spacy.load('en_core_web_sm')
        self.mDoc = self.mNlp(self.mCorpus)
    
    
    # Function for Printing Corpus
    def PrintCorpus(self):
        print(self.mCorpus)
    
    
    ## defining a function for Calculating 
    ## word frequiences
    def WordFrequencyCalculator(self):
        for word in self.mDoc:
            wordInLowerCase = word.text.lower()
            
            if (wordInLowerCase not in stopwords) and (wordInLowerCase not in punctuation):
                if wordInLowerCase not in self.mWordFrequencies.keys():
                    self.mWordFrequencies[wordInLowerCase] = 1
                else:
                    self.mWordFrequencies[wordInLowerCase] += 1
    
        return
    
    
    ## Function for normalizing Frequency Values
    def WordFreqNormalizer(self):
        maxFreq = max(self.mWordFrequencies.values())
        
        for word in self.mWordFrequencies.keys():
            self.mWordFrequencies[word] = self.mWordFrequencies[word] / maxFreq
        
        return
    
    # Function For calculating  sentence score
    # based on computed normalized word Frequencies
    def CalSentScore(self):
        sentences = [sent for sent in self.mDoc.sents]
        self.mNumSentences = len(sentences)
        
        for sent in sentences:
            for word in sent:
                wordInLowerCase = word.text.lower()
                
                if wordInLowerCase in self.mWordFrequencies.keys():
                    if sent not in self.mSentScore.keys():
                        self.mSentScore[sent] = self.mWordFrequencies[wordInLowerCase]
                    else:
                        self.mSentScore[sent] += self.mWordFrequencies[wordInLowerCase]
        return
    
    ## Writting the MainFunction for this script now
    def SummarizeMyText(self, fractionToReduce = 0.2):
        self.WordFrequencyCalculator()
        self.WordFreqNormalizer()
        self.CalSentScore()
        
        reducedSentNum = int (self.mNumSentences * fractionToReduce)
        print ('Total number of Sentences = {}'.format(self.mNumSentences))
        print('Num of Sentences Reduced to {}'.format(reducedSentNum))
        print('Summary as follows : \n')
        
        summaryList = nlargest(reducedSentNum, self.mSentScore, key = self.mSentScore.get)
        for sent in summaryList:
            print(sent, end = '')
        

In [7]:
# Summarization w

for dirname, _, filenames in os.walk("D:/OneDrive - Data ScienceTech Institute/5 python Lab\Python Project/raw_data"):
    for filename in filenames:
        print('Printing Summary for {} \n'.format(filename))
        summarizer = Summarizer(path = os.path.join(dirname, filename) )
        summarizer.SummarizeMyText(fractionToReduce = 0.05)
        print('\n')

Printing Summary for content.txt 

Total number of Sentences = 65
Num of Sentences Reduced to 3
Summary as follows : 

Personally I feel like statistics and probability are subjects that hold strong intuitions behind that are only perceived by facing examples and exercises, however the content was mainly explained from a rather aseptic theoretical point of view.I found the last parts of the class really rushed and i had to work a lot after class just to understand what we saw earlier so i had no time to do exercices.He was very explicit and always motivate the class ahead about every topic to be addressed
"It was over complicated, the explanation was not clear to me, it was over complicated thought when I went to study alone from external resources it was easier for me.

Printing Summary for diffucult.txt 

Total number of Sentences = 53
Num of Sentences Reduced to 2
Summary as follows : 

The class content is easy but may have hard applications if the professor wants to
The content wa

## BLEU score 

In [5]:

from nltk.translate.bleu_score import sentence_bleu   # Library to compare two sentences and calculate score using BLEU
from nltk.tokenize import word_tokenize  # NLTK library to tokenize the expected and acquired results

def compare_bleu(result, expected):    
    token_res = word_tokenize(result)    # Tokenize the acquired results
    token_exp = word_tokenize(expected)  # Tokenize the expected results
    score = sentence_bleu(token_res, token_exp)   # Compare two sentences and calculate score
    print("The BLEU score of accuracy is: ", score)
    

In [12]:
# compare expected summary text from Jennifer with Summary_text about student expectation 
#from automatic summary 
Summary_text = """It was a statistics discovery, I would have appreciated to learn some basics, I had the feeling we went through tuff theroems straight away
the class had all the probabilityI was expecting a point of view more conceptually visual (with this I mean based on real examples rather than a series of slides displaying the theoretical part black over white with coins and dices examples from time to time).The class did cover everything about probability
Class was good but need some exercise sheet so that we can practice"""

#from Jennifer 
Expected_Summary = """In general, the professor was very knowledgeable; he 
explained all the concepts and gave lots of examples which was helpful. Some found the 
course too fast and would have liked more basic explanations. Some wanted more teaching 
on R as they didn’t know it beforehand"""


In [13]:
# BLeu score
compare_bleu(Summary_text, Expected_Summary)

The BLEU score of accuracy is:  8.972141065609098e-232


## ROUGE Score 


In [14]:
# !pip install rouge-score

#using rouge1 and rougeL

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score(Summary_text, Expected_Summary)
scores

{'rouge1': Score(precision=0.3695652173913043, recall=0.19101123595505617, fmeasure=0.2518518518518518),
 'rougeL': Score(precision=0.17391304347826086, recall=0.0898876404494382, fmeasure=0.11851851851851852)}